├── .editorconfig ├── .github ├── CODEOWNERS ├── ISSUE_TEMPLATE │ ├── bug_report.md │ ├── feature_request.md │ └── question.md └── workflows │ ├── dependency_checker.yml │ ├── install.yml │ ├── integration.yml │ ├── lint.yml │ ├── minimum.yml │ ├── prepare_release.yml │ ├── readme.yml │ └── unit.yml ├── .gitignore ├── AUTHORS.rst ├── CONTRIBUTING.rst ├── HISTORY.md ├── INSTALL.md ├── LICENSE ├── Makefile ├── README.md ├── RELEASE.md ├── codecov.yml ├── docs └── images │ ├── column_comparison.png │ └── column_pairs.png ├── latest_requirements.txt ├── pyproject.toml ├── resources └── visualize.png ├── scripts └── release_notes_generator.py ├── sdmetrics ├── __init__.py ├── _utils_metadata.py ├── base.py ├── column_pairs │ ├── __init__.py │ ├── base.py │ └── statistical │ │ ├── __init__.py │ │ ├── cardinality_boundary_adherence.py │ │ ├── contingency_similarity.py │ │ ├── correlation_similarity.py │ │ ├── inter_row_msas.py │ │ ├── kl_divergence.py │ │ ├── referential_integrity.py │ │ └── statistic_msas.py ├── demos.py ├── demos │ ├── multi_table │ │ ├── metadata.json │ │ ├── sessions_real.csv │ │ ├── sessions_synthetic.csv │ │ ├── transactions_real.csv │ │ ├── transactions_synthetic.csv │ │ ├── users_real.csv │ │ └── users_synthetic.csv │ ├── single_table │ │ ├── metadata.json │ │ ├── real.csv │ │ └── synthetic.csv │ └── timeseries │ │ ├── metadata.json │ │ ├── real.csv │ │ └── synthetic.csv ├── errors.py ├── goal.py ├── multi_table │ ├── README.md │ ├── __init__.py │ ├── base.py │ ├── detection │ │ ├── __init__.py │ │ ├── base.py │ │ └── parent_child.py │ ├── multi_single_table.py │ └── statistical │ │ ├── __init__.py │ │ ├── cardinality_shape_similarity.py │ │ └── cardinality_statistic_similarity.py ├── reports │ ├── __init__.py │ ├── base_report.py │ ├── multi_table │ │ ├── __init__.py │ │ ├── _properties │ │ │ ├── __init__.py │ │ │ ├── base.py │ │ │ ├── boundary.py │ │ │ ├── cardinality.py │ │ │ ├── column_pair_trends.py │ │ │ ├── column_shapes.py │ │ │ ├── coverage.py │ │ │ ├── data_validity.py │ │ │ ├── inter_table_trends.py │ │ │ ├── relationship_validity.py │ │ │ ├── structure.py │ │ │ └── synthesis.py │ │ ├── base_multi_table_report.py │ │ ├── diagnostic_report.py │ │ └── quality_report.py │ ├── single_table │ │ ├── __init__.py │ │ ├── _properties │ │ │ ├── __init__.py │ │ │ ├── base.py │ │ │ ├── boundary.py │ │ │ ├── column_pair_trends.py │ │ │ ├── column_shapes.py │ │ │ ├── coverage.py │ │ │ ├── data_validity.py │ │ │ ├── structure.py │ │ │ └── synthesis.py │ │ ├── diagnostic_report.py │ │ ├── plot_utils.py │ │ └── quality_report.py │ └── utils.py ├── single_column │ ├── README.md │ ├── __init__.py │ ├── base.py │ └── statistical │ │ ├── __init__.py │ │ ├── boundary_adherence.py │ │ ├── category_adherence.py │ │ ├── category_coverage.py │ │ ├── cstest.py │ │ ├── key_uniqueness.py │ │ ├── kscomplement.py │ │ ├── missing_value_similarity.py │ │ ├── range_coverage.py │ │ ├── sequence_length_similarity.py │ │ ├── statistic_similarity.py │ │ └── tv_complement.py ├── single_table │ ├── README.md │ ├── __init__.py │ ├── base.py │ ├── bayesian_network.py │ ├── data_augmentation │ │ ├── __init__.py │ │ ├── base.py │ │ ├── binary_classifier_precision_efficacy.py │ │ ├── binary_classifier_recall_efficacy.py │ │ └── utils.py │ ├── detection │ │ ├── __init__.py │ │ ├── base.py │ │ └── sklearn.py │ ├── efficacy │ │ ├── __init__.py │ │ ├── base.py │ │ ├── binary.py │ │ ├── mlefficacy.py │ │ ├── multiclass.py │ │ └── regression.py │ ├── gaussian_mixture.py │ ├── multi_column_pairs.py │ ├── 
multi_single_column.py │ ├── new_row_synthesis.py │ ├── privacy │ │ ├── __init__.py │ │ ├── base.py │ │ ├── cap.py │ │ ├── categorical_sklearn.py │ │ ├── dcr_baseline_protection.py │ │ ├── dcr_overfitting_protection.py │ │ ├── dcr_utils.py │ │ ├── disclosure_protection.py │ │ ├── ensemble.py │ │ ├── loss.py │ │ ├── numerical_sklearn.py │ │ ├── radius_nearest_neighbor.py │ │ └── util.py │ └── table_structure.py ├── timeseries │ ├── README.md │ ├── __init__.py │ ├── base.py │ ├── detection.py │ ├── efficacy │ │ ├── __init__.py │ │ ├── base.py │ │ └── classification.py │ └── ml_scorers.py ├── utils.py ├── visualization.py └── warnings.py ├── static_code_analysis.txt ├── tasks.py ├── tests ├── __init__.py ├── integration │ ├── __init__.py │ ├── column_pairs │ │ ├── __init__.py │ │ └── statistical │ │ │ ├── __init__.py │ │ │ ├── test_contingency_similarity.py │ │ │ └── test_kl_divergence.py │ ├── multi_table │ │ ├── __init__.py │ │ ├── test_multi_single_table.py │ │ ├── test_multi_table.py │ │ ├── test_parent_child.py │ │ └── test_statistical_metrics.py │ ├── reports │ │ ├── __init__.py │ │ ├── multi_table │ │ │ ├── __init__.py │ │ │ ├── _properties │ │ │ │ ├── __init__.py │ │ │ │ ├── test_boundary.py │ │ │ │ ├── test_cardinality.py │ │ │ │ ├── test_column_pair_trends.py │ │ │ │ ├── test_column_shapes.py │ │ │ │ ├── test_coverage.py │ │ │ │ ├── test_data_validity.py │ │ │ │ ├── test_inter_table_trends.py │ │ │ │ ├── test_relationship_validity.py │ │ │ │ ├── test_structure.py │ │ │ │ └── test_synthesis.py │ │ │ ├── test_diagnostic_report.py │ │ │ └── test_quality_report.py │ │ └── single_table │ │ │ ├── __init__.py │ │ │ ├── _properties │ │ │ ├── test_boundary.py │ │ │ ├── test_column_pair_trends.py │ │ │ ├── test_column_shapes.py │ │ │ ├── test_coverage.py │ │ │ ├── test_data_validity.py │ │ │ ├── test_structure.py │ │ │ └── test_synthesis.py │ │ │ ├── test_diagnostic_report.py │ │ │ └── test_quality_report.py │ ├── single_column │ │ ├── __init__.py │ │ └── statistical │ │ │ ├── __init__.py │ │ │ ├── test_cstest.py │ │ │ └── test_kscomplement.py │ ├── single_table │ │ ├── __init__.py │ │ ├── data_augmentation │ │ │ ├── __init__.py │ │ │ ├── test_binary_classifier_precision_efficacy.py │ │ │ └── test_binary_classifier_recall_efficacy.py │ │ ├── efficacy │ │ │ ├── __init__.py │ │ │ ├── test_binary.py │ │ │ ├── test_detection.py │ │ │ ├── test_multiclass.py │ │ │ └── test_regression.py │ │ ├── privacy │ │ │ ├── __init__.py │ │ │ ├── test_dcr_baseline_protection.py │ │ │ ├── test_dcr_overfitting_protection.py │ │ │ ├── test_dcr_utils.py │ │ │ ├── test_disclosure_protection.py │ │ │ └── test_privacy.py │ │ ├── test_gaussian_mixture.py │ │ └── test_single_table.py │ ├── test_base.py │ ├── test_property.py │ └── timeseries │ │ ├── __init__.py │ │ ├── efficacy │ │ ├── __init__.py │ │ └── test_classification.py │ │ └── test_timeseries.py ├── test_tasks.py ├── unit │ ├── __init__.py │ ├── column_pairs │ │ ├── __init__.py │ │ ├── statistical │ │ │ ├── __init__.py │ │ │ ├── test_cardinality_boundary_adherence.py │ │ │ ├── test_contingency_similarity.py │ │ │ ├── test_correlation_similarity.py │ │ │ ├── test_inter_row_msas.py │ │ │ ├── test_referential_integrity.py │ │ │ └── test_statistic_msas.py │ │ └── test_base.py │ ├── multi_table │ │ ├── __init__.py │ │ ├── statistical │ │ │ ├── __init__.py │ │ │ ├── test_cardinality_shape_similarity.py │ │ │ └── test_cardinality_statistic_similarity.py │ │ └── test_multi_single_table.py │ ├── reports │ │ ├── __init__.py │ │ ├── multi_table │ │ │ ├── __init__.py │ │ │ 
├── _properties │ │ │ │ ├── __init__.py │ │ │ │ ├── test_base.py │ │ │ │ ├── test_boundary.py │ │ │ │ ├── test_cardinality.py │ │ │ │ ├── test_column_pair_trends.py │ │ │ │ ├── test_column_shapes.py │ │ │ │ ├── test_coverage.py │ │ │ │ ├── test_inter_table_trends.py │ │ │ │ ├── test_relationship_validity.py │ │ │ │ ├── test_structure.py │ │ │ │ ├── test_synthesis.py │ │ │ │ └── test_validity.py │ │ │ ├── test_base_multi_table_report.py │ │ │ ├── test_diagnostic_report.py │ │ │ └── test_quality_report.py │ │ ├── single_table │ │ │ ├── __init__.py │ │ │ ├── _properties │ │ │ │ ├── __init__.py │ │ │ │ ├── test_base.py │ │ │ │ ├── test_boundary.py │ │ │ │ ├── test_column_pair_trends.py │ │ │ │ ├── test_column_shapes.py │ │ │ │ ├── test_coverage.py │ │ │ │ ├── test_data_validity.py │ │ │ │ ├── test_structure.py │ │ │ │ └── test_synthesis.py │ │ │ ├── test_diagnostic_report.py │ │ │ ├── test_quality_report.py │ │ │ └── test_single_table_plot_utils.py │ │ ├── test_base_report.py │ │ └── test_utils.py │ ├── single_column │ │ ├── __init__.py │ │ ├── statistical │ │ │ ├── __init__.py │ │ │ ├── test_boundary_adherence.py │ │ │ ├── test_category_adherence.py │ │ │ ├── test_category_coverage.py │ │ │ ├── test_key_uniqueness.py │ │ │ ├── test_missing_value_similarity.py │ │ │ ├── test_range_coverage.py │ │ │ ├── test_sequence_length_similarity.py │ │ │ ├── test_statistic_similarity.py │ │ │ └── test_tv_complement.py │ │ └── test_base.py │ ├── single_table │ │ ├── __init__.py │ │ ├── data_augmentation │ │ │ ├── __init__.py │ │ │ ├── test_base.py │ │ │ ├── test_binary_classifier_precision_efficacy.py │ │ │ ├── test_binary_classifier_recall_efficacy.py │ │ │ └── test_utils.py │ │ ├── detection │ │ │ ├── __init__.py │ │ │ └── test_detection.py │ │ ├── privacy │ │ │ ├── __init__.py │ │ │ ├── test_cap.py │ │ │ ├── test_dcr_baseline_protection.py │ │ │ ├── test_dcr_overfitting_protection.py │ │ │ ├── test_dcr_utils.py │ │ │ ├── test_disclosure_protection.py │ │ │ └── test_util.py │ │ ├── test_base.py │ │ ├── test_bayesian_network.py │ │ ├── test_multi_single_column.py │ │ ├── test_new_row_synthesis.py │ │ └── test_table_structure.py │ ├── test___init__.py │ ├── test__utils_metadata.py │ ├── test_base.py │ ├── test_demos.py │ ├── test_utils.py │ ├── test_visualization.py │ └── timeseries │ │ ├── __init__.py │ │ └── test_timeseries.py └── utils.py └── tox.ini /.editorconfig: -------------------------------------------------------------------------------- 1 | # http://editorconfig.org 2 | 3 | root = true 4 | 5 | [*] 6 | indent_style = space 7 | indent_size = 4 8 | trim_trailing_whitespace = true 9 | insert_final_newline = true 10 | charset = utf-8 11 | end_of_line = lf 12 | 13 | [*.py] 14 | max_line_length = 99 15 | 16 | [*.bat] 17 | indent_style = tab 18 | end_of_line = crlf 19 | 20 | [LICENSE] 21 | insert_final_newline = false 22 | 23 | [Makefile] 24 | indent_style = tab 25 | -------------------------------------------------------------------------------- /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | # Global rule: 2 | * @sdv-dev/core-contributors 3 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Report an error that you found when using SDMetrics 4 | title: '' 5 | labels: bug, new 6 | assignees: '' 7 | 8 | --- 9 | 10 | ### Environment Details 11 | 12 | Please 
indicate the following details about the environment in which you found the bug: 13 | 14 | * SDMetrics version: 15 | * Python version: 16 | * Operating System: 17 | 18 | ### Error Description 19 | 20 | 22 | 23 | ### Steps to reproduce 24 | 25 | 29 | 30 | ``` 31 | Paste the command(s) you ran and the output. 32 | If there was a crash, please include the traceback here. 33 | ``` 34 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Request a new feature that you would like to see implemented in SDMetrics 4 | title: '' 5 | labels: feature request, new 6 | assignees: '' 7 | 8 | --- 9 | 10 | ### Problem Description 11 | 12 | 14 | 15 | ### Expected behavior 16 | 17 | 20 | 21 | ### Additional context 22 | 23 | 25 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/question.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Question 3 | about: Doubts about SDMetrics usage 4 | title: '' 5 | labels: question, new 6 | assignees: '' 7 | 8 | --- 9 | 10 | ### Environment details 11 | 12 | If you are already running SDMetrics, please indicate the following details about the environment in 13 | which you are running it: 14 | 15 | * SDMetrics version: 16 | * Python version: 17 | * Operating System: 18 | 19 | ### Problem description 20 | 21 | 24 | 25 | ### What I already tried 26 | 27 | 29 | 30 | ``` 31 | Paste the command(s) you ran and the output. 32 | If there was a crash, please include the traceback here. 33 | ``` 34 | -------------------------------------------------------------------------------- /.github/workflows/dependency_checker.yml: -------------------------------------------------------------------------------- 1 | name: Dependency Checker 2 | on: 3 | workflow_dispatch: 4 | schedule: 5 | - cron: '0 0 * * 1' 6 | jobs: 7 | build: 8 | runs-on: ubuntu-latest 9 | steps: 10 | - uses: actions/checkout@v4 11 | - name: Set up Python 3.9 12 | uses: actions/setup-python@v5 13 | with: 14 | python-version: 3.9 15 | - name: Install dependencies 16 | run: | 17 | python -m pip install .[dev] 18 | make check-deps OUTPUT_FILEPATH=latest_requirements.txt 19 | make fix-lint 20 | - name: Create pull request 21 | id: cpr 22 | uses: peter-evans/create-pull-request@v4 23 | with: 24 | token: ${{ secrets.GH_ACCESS_TOKEN }} 25 | commit-message: Update latest dependencies 26 | author: "github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>" 27 | committer: "github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>" 28 | title: Automated Latest Dependency Updates 29 | body: "This is an auto-generated PR with **latest** dependency updates." 
30 | branch: latest-dependency-update 31 | branch-suffix: short-commit-hash 32 | base: main 33 | -------------------------------------------------------------------------------- /.github/workflows/install.yml: -------------------------------------------------------------------------------- 1 | name: Install Tests 2 | on: 3 | pull_request: 4 | types: [opened, synchronize] 5 | push: 6 | branches: 7 | - main 8 | 9 | concurrency: 10 | group: ${{ github.workflow }}-${{ github.ref }} 11 | cancel-in-progress: true 12 | 13 | jobs: 14 | install: 15 | name: ${{ matrix.python_version }} install 16 | strategy: 17 | fail-fast: true 18 | matrix: 19 | python_version: ["3.8", "3.13"] 20 | runs-on: ubuntu-latest 21 | steps: 22 | - name: Set up python ${{ matrix.python_version }} 23 | uses: actions/setup-python@v5 24 | with: 25 | python-version: ${{ matrix.python_version }} 26 | - uses: actions/checkout@v4 27 | - name: Build package 28 | run: | 29 | make package 30 | - name: Install package 31 | run: | 32 | python -m pip install "unpacked_sdist/." 33 | - name: Test by importing packages 34 | run: | 35 | python -c "import sdmetrics" 36 | - name: Check package conflicts 37 | run: | 38 | python -m pip check -------------------------------------------------------------------------------- /.github/workflows/integration.yml: -------------------------------------------------------------------------------- 1 | name: Integration Tests 2 | 3 | on: 4 | push: 5 | pull_request: 6 | types: [opened, reopened] 7 | 8 | concurrency: 9 | group: ${{ github.workflow }}-${{ github.ref }} 10 | cancel-in-progress: true 11 | 12 | jobs: 13 | integration: 14 | runs-on: ${{ matrix.os }} 15 | strategy: 16 | matrix: 17 | python-version: ['3.8', '3.9', '3.10', '3.11', '3.12', '3.13'] 18 | os: [ubuntu-latest, windows-latest] 19 | include: 20 | - os: macos-latest 21 | python-version: '3.8' 22 | - os: macos-latest 23 | python-version: '3.13' 24 | steps: 25 | - uses: actions/checkout@v4 26 | - name: Set up Python ${{ matrix.python-version }} 27 | uses: actions/setup-python@v5 28 | with: 29 | python-version: ${{ matrix.python-version }} 30 | - name: Install libomp (macOS only) 31 | if: matrix.os == 'macos-latest' 32 | run: | 33 | brew install libomp 34 | echo 'export DYLD_LIBRARY_PATH=$(brew --prefix libomp)/lib:$DYLD_LIBRARY_PATH' >> $GITHUB_ENV 35 | - name: Install dependencies 36 | run: | 37 | python -m pip install --upgrade pip 38 | python -m pip install invoke .[test] 39 | - name: Run integration tests 40 | run: invoke integration 41 | 42 | - if: matrix.os == 'ubuntu-latest' && matrix.python-version == 3.12 43 | name: Upload integration codecov report 44 | uses: codecov/codecov-action@v4 45 | with: 46 | flags: integration 47 | file: ${{ github.workspace }}/integration_cov.xml 48 | fail_ci_if_error: true 49 | token: ${{ secrets.CODECOV_TOKEN }} 50 | -------------------------------------------------------------------------------- /.github/workflows/lint.yml: -------------------------------------------------------------------------------- 1 | name: Style Checks 2 | 3 | on: 4 | push: 5 | pull_request: 6 | types: [opened, reopened] 7 | 8 | jobs: 9 | lint: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: actions/checkout@v4 13 | - name: Set up Python 3.9 14 | uses: actions/setup-python@v5 15 | with: 16 | python-version: 3.9 17 | - name: Install dependencies 18 | run: | 19 | python -m pip install --upgrade pip 20 | python -m pip install invoke .[dev] 21 | - name: Run lint checks 22 | run: invoke lint 23 | 
-------------------------------------------------------------------------------- /.github/workflows/minimum.yml: -------------------------------------------------------------------------------- 1 | name: Unit Tests Minimum Versions 2 | 3 | on: 4 | push: 5 | pull_request: 6 | types: [opened, reopened] 7 | 8 | concurrency: 9 | group: ${{ github.workflow }}-${{ github.ref }} 10 | cancel-in-progress: true 11 | 12 | jobs: 13 | minimum: 14 | runs-on: ${{ matrix.os }} 15 | strategy: 16 | matrix: 17 | python-version: ['3.8', '3.9', '3.10', '3.11', '3.12', '3.13'] 18 | os: [ubuntu-latest, windows-latest] 19 | include: 20 | - os: macos-latest 21 | python-version: '3.8' 22 | - os: macos-latest 23 | python-version: '3.13' 24 | steps: 25 | - uses: actions/checkout@v4 26 | - name: Set up Python ${{ matrix.python-version }} 27 | uses: actions/setup-python@v5 28 | with: 29 | python-version: ${{ matrix.python-version }} 30 | - name: Install libomp (macOS only) 31 | if: matrix.os == 'macos-latest' 32 | run: | 33 | brew install libomp 34 | echo 'export DYLD_LIBRARY_PATH=$(brew --prefix libomp)/lib:$DYLD_LIBRARY_PATH' >> $GITHUB_ENV 35 | - name: Install dependencies 36 | run: | 37 | python -m pip install --upgrade pip 38 | python -m pip install invoke .[test] 39 | - name: Test with minimum versions 40 | run: invoke minimum 41 | -------------------------------------------------------------------------------- /.github/workflows/prepare_release.yml: -------------------------------------------------------------------------------- 1 | name: Release Prep 2 | 3 | on: 4 | workflow_dispatch: 5 | inputs: 6 | branch: 7 | description: 'Branch to merge release notes and code analysis into.' 8 | required: true 9 | default: 'main' 10 | version: 11 | description: 12 | 'Version to use for the release. Must be in format: X.Y.Z.' 13 | date: 14 | description: 15 | 'Date of the release. Must be in format YYYY-MM-DD.' 16 | 17 | jobs: 18 | preparerelease: 19 | runs-on: ubuntu-latest 20 | steps: 21 | - uses: actions/checkout@v4 22 | - name: Set up Python 3.10 23 | uses: actions/setup-python@v5 24 | with: 25 | python-version: '3.10' 26 | 27 | - name: Install dependencies 28 | run: | 29 | python -m pip install --upgrade pip 30 | python -m pip install requests==2.31.0 31 | python -m pip install bandit==1.7.7 32 | python -m pip install .[test] 33 | 34 | - name: Generate release notes 35 | env: 36 | GH_ACCESS_TOKEN: ${{ secrets.GH_ACCESS_TOKEN }} 37 | run: > 38 | python scripts/release_notes_generator.py 39 | -v ${{ inputs.version }} 40 | -d ${{ inputs.date }} 41 | 42 | - name: Save static code analysis 43 | run: bandit -r . -x ./tests,./scripts,./build -f txt -o static_code_analysis.txt --exit-zero 44 | 45 | - name: Create pull request 46 | id: cpr 47 | uses: peter-evans/create-pull-request@v4 48 | with: 49 | token: ${{ secrets.GH_ACCESS_TOKEN }} 50 | commit-message: Prepare release for v${{ inputs.version }} 51 | author: "github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>" 52 | committer: "github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>" 53 | title: v${{ inputs.version }} Release Preparation 54 | body: "This is an auto-generated PR to prepare the release." 
55 | branch: prepared-release 56 | branch-suffix: short-commit-hash 57 | base: ${{ inputs.branch }} 58 | -------------------------------------------------------------------------------- /.github/workflows/readme.yml: -------------------------------------------------------------------------------- 1 | name: Test README 2 | 3 | on: 4 | push: 5 | pull_request: 6 | types: [opened, reopened] 7 | 8 | jobs: 9 | readme: 10 | runs-on: ${{ matrix.os }} 11 | strategy: 12 | matrix: 13 | python-version: ['3.8', '3.9', '3.10', '3.11', '3.12', '3.13'] 14 | os: [ubuntu-latest, macos-latest] # skip windows bc rundoc fails 15 | steps: 16 | - uses: actions/checkout@v4 17 | - name: Set up Python ${{ matrix.python-version }} 18 | uses: actions/setup-python@v5 19 | with: 20 | python-version: ${{ matrix.python-version }} 21 | - name: Install dependencies 22 | run: | 23 | python -m pip install --upgrade pip 24 | python -m pip install invoke rundoc . 25 | python -m pip install tomli 26 | python -m pip install packaging 27 | - name: Run the README.md 28 | run: invoke readme 29 | -------------------------------------------------------------------------------- /.github/workflows/unit.yml: -------------------------------------------------------------------------------- 1 | name: Unit Tests 2 | 3 | on: 4 | push: 5 | pull_request: 6 | types: [opened, reopened] 7 | 8 | concurrency: 9 | group: ${{ github.workflow }}-${{ github.ref }} 10 | cancel-in-progress: true 11 | 12 | jobs: 13 | unit: 14 | runs-on: ${{ matrix.os }} 15 | strategy: 16 | matrix: 17 | python-version: ['3.8', '3.9', '3.10', '3.11', '3.12', '3.13'] 18 | os: [ubuntu-latest, windows-latest] 19 | include: 20 | - os: macos-latest 21 | python-version: '3.8' 22 | - os: macos-latest 23 | python-version: '3.13' 24 | steps: 25 | - uses: actions/checkout@v4 26 | - name: Set up Python ${{ matrix.python-version }} 27 | uses: actions/setup-python@v5 28 | with: 29 | python-version: ${{ matrix.python-version }} 30 | - name: Install libomp (macOS only) 31 | if: matrix.os == 'macos-latest' 32 | run: | 33 | brew install libomp 34 | echo 'export DYLD_LIBRARY_PATH=$(brew --prefix libomp)/lib:$DYLD_LIBRARY_PATH' >> $GITHUB_ENV 35 | - name: Install dependencies 36 | run: | 37 | python -m pip install --upgrade pip 38 | python -m pip install invoke .[test] 39 | - name: Run unit tests 40 | run: invoke unit 41 | 42 | - if: matrix.os == 'ubuntu-latest' && matrix.python-version == 3.13 43 | name: Upload unit codecov report 44 | uses: codecov/codecov-action@v4 45 | with: 46 | flags: unit 47 | file: ${{ github.workspace }}/unit_cov.xml 48 | fail_ci_if_error: true 49 | token: ${{ secrets.CODECOV_TOKEN }} 50 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | *_cov.xml 50 | 51 | # Translations 52 | *.mo 53 | *.pot 54 | 55 | # Django stuff: 56 | *.log 57 | local_settings.py 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | docs/api/ 69 | docs/tutorials/ 70 | 71 | # PyBuilder 72 | target/ 73 | 74 | # Jupyter Notebook 75 | .ipynb_checkpoints 76 | 77 | # pyenv 78 | .python-version 79 | 80 | # celery beat schedule file 81 | celerybeat-schedule 82 | 83 | # SageMath parsed files 84 | *.sage.py 85 | 86 | # dotenv 87 | .env 88 | 89 | # virtualenv 90 | .venv 91 | venv/ 92 | ENV/ 93 | 94 | # Spyder project settings 95 | .spyderproject 96 | .spyproject 97 | 98 | # Rope project settings 99 | .ropeproject 100 | 101 | # mkdocs documentation 102 | /site 103 | 104 | # mypy 105 | .mypy_cache/ 106 | 107 | # Vim 108 | .*.swp 109 | 110 | # OS Files 111 | .DS_Store 112 | -------------------------------------------------------------------------------- /AUTHORS.rst: -------------------------------------------------------------------------------- 1 | ======= 2 | Credits 3 | ======= 4 | 5 | * Kevin Alex Zhang 6 | * Kalyan Veeramachaneni 7 | * Carles Sala 8 | -------------------------------------------------------------------------------- /INSTALL.md: -------------------------------------------------------------------------------- 1 | # Installing SDMetrics 2 | 3 | ## Requirements 4 | 5 | **SDMetrics** has been developed and tested on [Python 3.8, 3.9, 3.10, 3.11, 3.12 and 3.13](https://www.python.org/downloads/) 6 | 7 | Also, although it is not strictly required, the usage of a [virtualenv]( 8 | https://virtualenv.pypa.io/en/latest/) is highly recommended in order to avoid 9 | interfering with other software installed in the system where **SDMetrics** is run. 10 | 11 | ## Install with pip 12 | 13 | The easiest and recommended way to install **SDMetrics** is using [pip]( 14 | https://pip.pypa.io/en/stable/): 15 | 16 | ```bash 17 | pip install sdmetrics 18 | ``` 19 | 20 | This will pull and install the latest stable release from [PyPi](https://pypi.org/). 21 | 22 | ## Install with conda 23 | 24 | **SDMetrics** can also be installed using [conda](https://docs.conda.io/en/latest/): 25 | 26 | ```bash 27 | conda install -c sdv-dev -c conda-forge sdmetrics 28 | ``` 29 | 30 | This will pull and install the latest stable release from [Anaconda](https://anaconda.org/). 31 | 32 | ## Install from source 33 | 34 | If you want to install **SDMetrics** from source you need to first clone the repository 35 | and then execute the `make install` command inside the `stable` branch. Note that this 36 | command works only on Unix based systems like GNU/Linux and macOS: 37 | 38 | ```bash 39 | git clone https://github.com/sdv-dev/SDMetrics 40 | cd SDMetrics 41 | git checkout stable 42 | make install 43 | ``` 44 | 45 | ## Install for development 46 | 47 | If you intend to modify the source code or contribute to the project you will need to 48 | install it from the source using the `make install-develop` command. 
In this case, we 49 | recommend branching from `main` first: 50 | 51 | ```bash 52 | git clone git@github.com:sdv-dev/SDMetrics 53 | cd SDMetrics 54 | git checkout main 55 | git checkout -b <branch-name> 56 | make install-develop 57 | ``` 58 | 59 | For more details about how to contribute to the project please visit the [Contributing Guide]( 60 | CONTRIBUTING.rst). 61 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020, MIT Data To AI Lab 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 23 | -------------------------------------------------------------------------------- /codecov.yml: -------------------------------------------------------------------------------- 1 | coverage: 2 | precision: 2 3 | range: "90...100" 4 | status: 5 | project: 6 | default: false 7 | patch: 8 | default: false -------------------------------------------------------------------------------- /docs/images/column_comparison.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdv-dev/SDMetrics/d52733646855d9d4606f95235f5c65e10afdc439/docs/images/column_comparison.png -------------------------------------------------------------------------------- /docs/images/column_pairs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdv-dev/SDMetrics/d52733646855d9d4606f95235f5c65e10afdc439/docs/images/column_pairs.png -------------------------------------------------------------------------------- /latest_requirements.txt: -------------------------------------------------------------------------------- 1 | copulas==0.12.2 2 | numpy==2.0.2 3 | pandas==2.2.3 4 | plotly==6.0.1 5 | scikit-learn==1.6.1 6 | scipy==1.13.1 7 | tqdm==4.67.1 8 | -------------------------------------------------------------------------------- /resources/visualize.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdv-dev/SDMetrics/d52733646855d9d4606f95235f5c65e10afdc439/resources/visualize.png -------------------------------------------------------------------------------- /sdmetrics/column_pairs/__init__.py: -------------------------------------------------------------------------------- 1 | """Metrics to compare column pairs.""" 2 | 3 | from
sdmetrics.column_pairs.base import ColumnPairsMetric 4 | from sdmetrics.column_pairs.statistical.cardinality_boundary_adherence import ( 5 | CardinalityBoundaryAdherence, 6 | ) 7 | from sdmetrics.column_pairs.statistical.contingency_similarity import ContingencySimilarity 8 | from sdmetrics.column_pairs.statistical.correlation_similarity import CorrelationSimilarity 9 | from sdmetrics.column_pairs.statistical.kl_divergence import ( 10 | ContinuousKLDivergence, 11 | DiscreteKLDivergence, 12 | ) 13 | from sdmetrics.column_pairs.statistical.referential_integrity import ReferentialIntegrity 14 | from sdmetrics.column_pairs.statistical.inter_row_msas import InterRowMSAS 15 | from sdmetrics.column_pairs.statistical.statistic_msas import StatisticMSAS 16 | 17 | __all__ = [ 18 | 'CardinalityBoundaryAdherence', 19 | 'ColumnPairsMetric', 20 | 'ContingencySimilarity', 21 | 'ContinuousKLDivergence', 22 | 'CorrelationSimilarity', 23 | 'DiscreteKLDivergence', 24 | 'ReferentialIntegrity', 25 | 'InterRowMSAS', 26 | 'StatisticMSAS', 27 | ] 28 | -------------------------------------------------------------------------------- /sdmetrics/column_pairs/base.py: -------------------------------------------------------------------------------- 1 | """Base class for metrics that compare pairs of columns.""" 2 | 3 | from sdmetrics.base import BaseMetric 4 | 5 | 6 | class ColumnPairsMetric(BaseMetric): 7 | """Base class for metrics that compare pairs of columns. 8 | 9 | Attributes: 10 | name (str): 11 | Name to use when reports about this metric are printed. 12 | goal (sdmetrics.goal.Goal): 13 | The goal of this metric. 14 | min_value (Union[float, tuple[float]]): 15 | Minimum value or values that this metric can take. 16 | max_value (Union[float, tuple[float]]): 17 | Maximum value or values that this metric can take. 18 | """ 19 | 20 | name = None 21 | goal = None 22 | min_value = None 23 | max_value = None 24 | 25 | @staticmethod 26 | def compute(real_data, synthetic_data): 27 | """Compute this metric. 28 | 29 | Args: 30 | real_data (pandas.DataFrame): 31 | The values from the real dataset, passed as pandas.DataFrame 32 | with 2 columns. 33 | synthetic_data (pandas.DataFrame): 34 | The values from the synthetic dataset, passed as a 35 | pandas.DataFrame with 2 columns. 36 | 37 | Returns: 38 | float: 39 | Metric output. 40 | """ 41 | raise NotImplementedError() 42 | 43 | @classmethod 44 | def compute_breakdown(cls, real_data, synthetic_data): 45 | """Compute the breakdown of this metric. 46 | 47 | Args: 48 | real_data (pandas.DataFrame): 49 | The values from the real dataset, passed as pandas.DataFrame 50 | with 2 columns. 51 | synthetic_data (pandas.DataFrame): 52 | The values from the synthetic dataset, passed as a 53 | pandas.DataFrame with 2 columns. 54 | 55 | Returns: 56 | dict 57 | A mapping of the metric output. Must contain the key 'score'. 
58 | """ 59 | return {'score': cls.compute(real_data, synthetic_data)} 60 | -------------------------------------------------------------------------------- /sdmetrics/column_pairs/statistical/__init__.py: -------------------------------------------------------------------------------- 1 | """Statistical Metrics to compare column pairs.""" 2 | 3 | from sdmetrics.column_pairs.statistical.cardinality_boundary_adherence import ( 4 | CardinalityBoundaryAdherence, 5 | ) 6 | from sdmetrics.column_pairs.statistical.contingency_similarity import ContingencySimilarity 7 | from sdmetrics.column_pairs.statistical.correlation_similarity import CorrelationSimilarity 8 | from sdmetrics.column_pairs.statistical.kl_divergence import ( 9 | ContinuousKLDivergence, 10 | DiscreteKLDivergence, 11 | ) 12 | from sdmetrics.column_pairs.statistical.referential_integrity import ReferentialIntegrity 13 | from sdmetrics.column_pairs.statistical.inter_row_msas import InterRowMSAS 14 | from sdmetrics.column_pairs.statistical.statistic_msas import StatisticMSAS 15 | 16 | __all__ = [ 17 | 'CardinalityBoundaryAdherence', 18 | 'ContingencySimilarity', 19 | 'ContinuousKLDivergence', 20 | 'CorrelationSimilarity', 21 | 'DiscreteKLDivergence', 22 | 'ReferentialIntegrity', 23 | 'InterRowMSAS', 24 | 'StatisticMSAS', 25 | ] 26 | -------------------------------------------------------------------------------- /sdmetrics/column_pairs/statistical/referential_integrity.py: -------------------------------------------------------------------------------- 1 | """Referential Integrity Metric.""" 2 | 3 | import logging 4 | 5 | import pandas as pd 6 | 7 | from sdmetrics.column_pairs.base import ColumnPairsMetric 8 | from sdmetrics.goal import Goal 9 | 10 | LOGGER = logging.getLogger(__name__) 11 | 12 | 13 | class ReferentialIntegrity(ColumnPairsMetric): 14 | """Referential Integrity metric. 15 | 16 | Compute the fraction of foreign key values that reference a value in the primary key column 17 | in the synthetic data. 18 | 19 | Attributes: 20 | name (str): 21 | Name to use when reports about this metric are printed. 22 | goal (sdmetrics.goal.Goal): 23 | The goal of this metric. 24 | min_value (Union[float, tuple[float]]): 25 | Minimum value or values that this metric can take. 26 | max_value (Union[float, tuple[float]]): 27 | Maximum value or values that this metric can take. 28 | """ 29 | 30 | name = 'ReferentialIntegrity' 31 | goal = Goal.MAXIMIZE 32 | min_value = 0.0 33 | max_value = 1.0 34 | 35 | @classmethod 36 | def compute_breakdown(cls, real_data, synthetic_data): 37 | """Compute the score breakdown of the referential integrity metric. 38 | 39 | Args: 40 | real_data (tuple of 2 pandas.Series): 41 | (primary_key, foreign_key) columns from the real data. 42 | synthetic_data (tuple of 2 pandas.Series): 43 | (primary_key, foreign_key) columns from the synthetic data. 44 | 45 | Returns: 46 | dict: 47 | The score breakdown of the key uniqueness metric. 48 | """ 49 | if pd.isna(real_data[1]).any(): 50 | synthetic_data = list(synthetic_data) 51 | synthetic_data[1] = synthetic_data[1].dropna() 52 | 53 | missing_parents = not real_data[1].isin(real_data[0]).all() 54 | if missing_parents: 55 | LOGGER.info("The real data has foreign keys that don't reference any primary key.") 56 | 57 | score = synthetic_data[1].isin(synthetic_data[0]).mean() 58 | 59 | return {'score': score} 60 | 61 | @classmethod 62 | def compute(cls, real_data, synthetic_data): 63 | """Compute the referential integrity of two columns. 
64 | 65 | Args: 66 | real_data (tuple of 2 pandas.Series): 67 | (primary_key, foreign_key) columns from the real data. 68 | synthetic_data (tuple of 2 pandas.Series): 69 | (primary_key, foreign_key) columns from the synthetic data. 70 | 71 | Returns: 72 | float: 73 | The key uniqueness of the two columns. 74 | """ 75 | return cls.compute_breakdown(real_data, synthetic_data)['score'] 76 | -------------------------------------------------------------------------------- /sdmetrics/demos/multi_table/metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "tables": { 3 | "users": { 4 | "primary_key": "user_id", 5 | "columns": { 6 | "user_id": { 7 | "sdtype": "id", 8 | "regex_format": "\\d{30}" 9 | }, 10 | "country": { 11 | "sdtype": "categorical" 12 | }, 13 | "gender": { 14 | "sdtype": "categorical" 15 | }, 16 | "age": { 17 | "sdtype": "numerical", 18 | "computer_representation": "Int64" 19 | } 20 | } 21 | }, 22 | "sessions": { 23 | "primary_key": "session_id", 24 | "columns": { 25 | "session_id": { 26 | "sdtype": "id", 27 | "regex_format": "\\d{30}" 28 | }, 29 | "user_id": { 30 | "sdtype": "id", 31 | "regex_format": "\\d{30}" 32 | }, 33 | "device": { 34 | "sdtype": "categorical" 35 | }, 36 | "os": { 37 | "sdtype": "categorical" 38 | } 39 | } 40 | }, 41 | "transactions": { 42 | "primary_key": "transaction_id", 43 | "columns": { 44 | "transaction_id": { 45 | "sdtype": "id", 46 | "regex_format": "\\d{30}" 47 | }, 48 | "session_id": { 49 | "sdtype": "id", 50 | "regex_format": "\\d{30}" 51 | }, 52 | "timestamp": { 53 | "sdtype": "datetime", 54 | "datetime_format": "%Y-%m-%d %H:%M:%S" 55 | }, 56 | "amount": { 57 | "sdtype": "numerical", 58 | "computer_representation": "Float" 59 | }, 60 | "approved": { 61 | "sdtype": "boolean" 62 | } 63 | } 64 | } 65 | }, 66 | "relationships": [ 67 | { 68 | "parent_table_name": "users", 69 | "parent_primary_key": "user_id", 70 | "child_table_name": "sessions", 71 | "child_foreign_key": "user_id" 72 | }, 73 | { 74 | "parent_table_name": "sessions", 75 | "parent_primary_key": "session_id", 76 | "child_table_name": "transactions", 77 | "child_foreign_key": "session_id" 78 | } 79 | ], 80 | "METADATA_SPEC_VERSION": "MULTI_TABLE_V1" 81 | } -------------------------------------------------------------------------------- /sdmetrics/demos/multi_table/sessions_real.csv: -------------------------------------------------------------------------------- 1 | session_id,user_id,device,os 2 | 0,0,mobile,android 3 | 1,1,tablet,ios 4 | 2,1,tablet,android 5 | 3,2,mobile,android 6 | 4,4,mobile,ios 7 | 5,5,mobile,android 8 | 6,6,mobile,ios 9 | 7,6,tablet,ios 10 | 8,6,mobile,ios 11 | 9,8,tablet,ios 12 | -------------------------------------------------------------------------------- /sdmetrics/demos/multi_table/sessions_synthetic.csv: -------------------------------------------------------------------------------- 1 | session_id,user_id,device,os 2 | 0,1,mobile,ios 3 | 1,4,mobile,android 4 | 2,0,mobile,ios 5 | 3,8,mobile,ios 6 | 4,9,tablet,android 7 | 5,5,tablet,ios 8 | 6,9,mobile,ios 9 | 7,8,mobile,ios 10 | 8,3,mobile,android 11 | 9,8,mobile,ios 12 | -------------------------------------------------------------------------------- /sdmetrics/demos/multi_table/transactions_real.csv: -------------------------------------------------------------------------------- 1 | transaction_id,session_id,timestamp,amount,approved 2 | 0,0,2019-01-01 12:34:32,100.0,True 3 | 1,0,2019-01-01 12:42:21,55.3,True 4 | 2,1,2019-01-07 17:23:11,79.5,True 5 | 
3,3,2019-01-10 11:08:57,112.1,False 6 | 4,5,2019-01-10 21:54:08,110.0,False 7 | 5,5,2019-01-11 11:21:20,76.3,True 8 | 6,7,2019-01-22 14:44:10,89.5,True 9 | 7,8,2019-01-23 10:14:09,132.1,False 10 | 8,9,2019-01-27 16:09:17,68.0,True 11 | 9,9,2019-01-29 12:10:48,99.9,True 12 | -------------------------------------------------------------------------------- /sdmetrics/demos/multi_table/transactions_synthetic.csv: -------------------------------------------------------------------------------- 1 | transaction_id,session_id,timestamp,amount,approved 2 | 0,4,2019-01-25 18:21:18,115.2,True 3 | 1,1,2019-01-08 09:17:33,75.0,True 4 | 2,8,2019-01-26 05:55:58,77.7,True 5 | 3,2,2019-01-12 14:32:23,102.3,True 6 | 4,7,2019-01-10 00:55:37,75.2,True 7 | 5,7,2019-01-20 03:21:23,72.7,True 8 | 6,9,2019-01-13 16:09:43,81.9,True 9 | 7,3,2019-01-16 17:56:36,73.7,True 10 | 8,6,2019-01-04 22:04:02,120.6,False 11 | 9,1,2019-01-11 01:00:26,110.9,False 12 | -------------------------------------------------------------------------------- /sdmetrics/demos/multi_table/users_real.csv: -------------------------------------------------------------------------------- 1 | user_id,country,gender,age 2 | 0,US,M,34 3 | 1,UK,F,23 4 | 2,ES,,44 5 | 3,UK,M,22 6 | 4,US,F,54 7 | 5,DE,M,57 8 | 6,BG,F,45 9 | 7,ES,,41 10 | 8,FR,F,23 11 | 9,UK,,30 12 | -------------------------------------------------------------------------------- /sdmetrics/demos/multi_table/users_synthetic.csv: -------------------------------------------------------------------------------- 1 | user_id,country,gender,age 2 | 0,UK,M,47 3 | 1,UK,,29 4 | 2,US,F,41 5 | 3,ES,F,36 6 | 4,US,F,42 7 | 5,US,F,45 8 | 6,ES,F,32 9 | 7,UK,F,35 10 | 8,DE,F,28 11 | 9,ES,F,34 12 | -------------------------------------------------------------------------------- /sdmetrics/demos/single_table/metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "primary_key": "student_id", 3 | "columns": { 4 | "start_date": { 5 | "sdtype": "datetime", 6 | "datetime_format": "%Y-%m-%d" 7 | }, 8 | "end_date": { 9 | "sdtype": "datetime", 10 | "datetime_format": "%Y-%m-%d" 11 | }, 12 | "salary": { 13 | "sdtype": "numerical", 14 | "computer_representation": "Int64" 15 | }, 16 | "duration": { 17 | "sdtype": "numerical", 18 | "computer_representation": "Int64" 19 | }, 20 | "student_id": { 21 | "sdtype": "id", 22 | "regex_format": "\\d{30}" 23 | }, 24 | "high_perc": { 25 | "sdtype": "numerical", 26 | "computer_representation": "Float" 27 | }, 28 | "high_spec": { 29 | "sdtype": "categorical" 30 | }, 31 | "mba_spec": { 32 | "sdtype": "categorical" 33 | }, 34 | "second_perc": { 35 | "sdtype": "numerical", 36 | "computer_representation": "Float" 37 | }, 38 | "gender": { 39 | "sdtype": "categorical" 40 | }, 41 | "degree_perc": { 42 | "sdtype": "numerical", 43 | "computer_representation": "Float" 44 | }, 45 | "placed": { 46 | "sdtype": "boolean" 47 | }, 48 | "experience_years": { 49 | "sdtype": "numerical", 50 | "computer_representation": "Float" 51 | }, 52 | "employability_perc": { 53 | "sdtype": "numerical", 54 | "computer_representation": "Float" 55 | }, 56 | "mba_perc": { 57 | "sdtype": "numerical", 58 | "computer_representation": "Float" 59 | }, 60 | "work_experience": { 61 | "sdtype": "boolean" 62 | }, 63 | "degree_type": { 64 | "sdtype": "categorical" 65 | } 66 | }, 67 | "METADATA_SPEC_VERSION": "SINGLE_TABLE_V1" 68 | } -------------------------------------------------------------------------------- /sdmetrics/demos/timeseries/metadata.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "columns": { 3 | "region": { 4 | "sdtype": "categorical" 5 | }, 6 | "store_id": { 7 | "sdtype": "numerical", 8 | "computer_representation": "Int64" 9 | }, 10 | "nb_customers": { 11 | "sdtype": "numerical", 12 | "computer_representation": "Int64" 13 | }, 14 | "total_sales": { 15 | "sdtype": "numerical", 16 | "computer_representation": "Float" 17 | }, 18 | "date": { 19 | "sdtype": "datetime" 20 | }, 21 | "day_of_week": { 22 | "sdtype": "numerical", 23 | "computer_representation": "Int64" 24 | } 25 | }, 26 | "sequence_key": "store_id", 27 | "sequence_index": "date", 28 | "METADATA_SPEC_VERSION": "SINGLE_TABLE_V1" 29 | } -------------------------------------------------------------------------------- /sdmetrics/errors.py: -------------------------------------------------------------------------------- 1 | """Custom errors for SDMetrics.""" 2 | 3 | 4 | class VisualizationUnavailableError(Exception): 5 | """Raised when a visualization is not available.""" 6 | 7 | 8 | class IncomputableMetricError(Exception): 9 | """Raised when a metric cannot be computed.""" 10 | 11 | 12 | class ConstantInputError(Exception): 13 | """Thrown when the input data has all the same values.""" 14 | 15 | 16 | class InvalidDataError(Exception): 17 | """Error to raise when data is not valid.""" 18 | -------------------------------------------------------------------------------- /sdmetrics/goal.py: -------------------------------------------------------------------------------- 1 | """SDMetrics Goal Enumeration.""" 2 | 3 | from enum import Enum 4 | 5 | 6 | class Goal(Enum): 7 | """Goal Enumeration. 8 | 9 | This enumerates the ``goal`` for a metric; the value of a metric can be ignored, 10 | minimized, or maximized. 
11 | """ 12 | 13 | IGNORE = 'ignore' 14 | MAXIMIZE = 'maximize' 15 | MINIMIZE = 'minimize' 16 | -------------------------------------------------------------------------------- /sdmetrics/multi_table/__init__.py: -------------------------------------------------------------------------------- 1 | """Metrics for multi table datasets.""" 2 | 3 | from sdmetrics.multi_table import detection, multi_single_table 4 | from sdmetrics.multi_table.base import MultiTableMetric 5 | from sdmetrics.multi_table.detection.base import DetectionMetric 6 | from sdmetrics.multi_table.detection.parent_child import ( 7 | LogisticParentChildDetection, 8 | ParentChildDetectionMetric, 9 | SVCParentChildDetection, 10 | ) 11 | from sdmetrics.multi_table.multi_single_table import ( 12 | BNLikelihood, 13 | BNLogLikelihood, 14 | BoundaryAdherence, 15 | CategoryCoverage, 16 | ContingencySimilarity, 17 | CorrelationSimilarity, 18 | CSTest, 19 | KSComplement, 20 | LogisticDetection, 21 | MissingValueSimilarity, 22 | MultiSingleTableMetric, 23 | NewRowSynthesis, 24 | RangeCoverage, 25 | StatisticSimilarity, 26 | SVCDetection, 27 | TVComplement, 28 | ) 29 | from sdmetrics.multi_table.statistical.cardinality_shape_similarity import ( 30 | CardinalityShapeSimilarity, 31 | ) 32 | from sdmetrics.multi_table.statistical.cardinality_statistic_similarity import ( 33 | CardinalityStatisticSimilarity, 34 | ) 35 | 36 | __all__ = [ 37 | 'detection', 38 | 'multi_single_table', 39 | 'MultiTableMetric', 40 | 'DetectionMetric', 41 | 'ParentChildDetectionMetric', 42 | 'LogisticParentChildDetection', 43 | 'SVCParentChildDetection', 44 | 'BNLikelihood', 45 | 'BNLogLikelihood', 46 | 'CSTest', 47 | 'KSComplement', 48 | 'LogisticDetection', 49 | 'SVCDetection', 50 | 'MultiSingleTableMetric', 51 | 'CardinalityShapeSimilarity', 52 | 'CardinalityStatisticSimilarity', 53 | 'BoundaryAdherence', 54 | 'CategoryCoverage', 55 | 'CorrelationSimilarity', 56 | 'ContingencySimilarity', 57 | 'MissingValueSimilarity', 58 | 'StatisticSimilarity', 59 | 'TVComplement', 60 | 'RangeCoverage', 61 | 'NewRowSynthesis', 62 | ] 63 | -------------------------------------------------------------------------------- /sdmetrics/multi_table/base.py: -------------------------------------------------------------------------------- 1 | """Base Multi Table metric class.""" 2 | 3 | from sdmetrics.base import BaseMetric 4 | 5 | 6 | class MultiTableMetric(BaseMetric): 7 | """Base class for metrics that apply to multiple tables. 8 | 9 | Attributes: 10 | name (str): 11 | Name to use when reports about this metric are printed. 12 | goal (sdmetrics.goal.Goal): 13 | The goal of this metric. 14 | min_value (Union[float, tuple[float]]): 15 | Minimum value or values that this metric can take. 16 | max_value (Union[float, tuple[float]]): 17 | Maximum value or values that this metric can take. 18 | """ 19 | 20 | name = None 21 | goal = None 22 | min_value = None 23 | max_value = None 24 | 25 | @staticmethod 26 | def compute(real_data, synthetic_data, metadata=None): 27 | """Compute this metric. 28 | 29 | Args: 30 | real_data (dict[str, pandas.DataFrame]): 31 | The tables from the real dataset, passed as a dictionary of 32 | table names and pandas.DataFrames. 33 | synthetic_data (dict[str, pandas.DataFrame]): 34 | The tables from the synthetic dataset, passed as a dictionary of 35 | table names and pandas.DataFrames. 36 | metadata (dict): 37 | Multi-table metadata dict. If not passed, it is build based on the 38 | real_data fields and dtypes. 
39 | 40 | Returns: 41 | Union[float, tuple[float]]: 42 | Metric output. 43 | """ 44 | raise NotImplementedError() 45 | -------------------------------------------------------------------------------- /sdmetrics/multi_table/detection/__init__.py: -------------------------------------------------------------------------------- 1 | """Machine Learning Detection metrics that work on multiple tables.""" 2 | -------------------------------------------------------------------------------- /sdmetrics/multi_table/detection/base.py: -------------------------------------------------------------------------------- 1 | """Base class for Machine Learning Detection metrics that work on multiple tables.""" 2 | 3 | from sdmetrics.multi_table.base import MultiTableMetric 4 | 5 | 6 | class DetectionMetric(MultiTableMetric): 7 | """Base class for Machine Learning Detection based metrics on multiple tables. 8 | 9 | These metrics build a Machine Learning Classifier that learns to tell the synthetic 10 | data apart from the real data, which later on is evaluated using Cross Validation. 11 | 12 | The output of the metric is one minus the average ROC AUC score obtained. 13 | 14 | Attributes: 15 | name (str): 16 | Name to use when reports about this metric are printed. 17 | goal (sdmetrics.goal.Goal): 18 | The goal of this metric. 19 | min_value (Union[float, tuple[float]]): 20 | Minimum value or values that this metric can take. 21 | max_value (Union[float, tuple[float]]): 22 | Maximum value or values that this metric can take. 23 | """ 24 | 25 | name = None 26 | goal = None 27 | min_value = None 28 | max_value = None 29 | 30 | @classmethod 31 | def compute(cls, real_data, synthetic_data, metadata=None): 32 | """Compute this metric. 33 | 34 | Args: 35 | real_data (dict[str, pandas.DataFrame]): 36 | The tables from the real dataset. 37 | synthetic_data (dict[str, pandas.DataFrame]): 38 | The tables from the synthetic dataset. 39 | metadata (dict): 40 | Multi-table metadata dict. If not passed, it is build based on the 41 | real_data fields and dtypes. 42 | 43 | Returns: 44 | Union[float, tuple[float]]: 45 | Metric output. 46 | """ 47 | raise NotImplementedError() 48 | 49 | @classmethod 50 | def normalize(cls, raw_score): 51 | """Return the `raw_score` as is, since it is already normalized. 52 | 53 | Args: 54 | raw_score (float): 55 | The value of the metric from `compute`. 
56 | 57 | Returns: 58 | float: 59 | The normalized value of the metric 60 | """ 61 | return super().normalize(raw_score) 62 | -------------------------------------------------------------------------------- /sdmetrics/multi_table/statistical/__init__.py: -------------------------------------------------------------------------------- 1 | """Multi table statistical metrics.""" 2 | 3 | from sdmetrics.multi_table.statistical.cardinality_shape_similarity import ( 4 | CardinalityShapeSimilarity, 5 | ) 6 | from sdmetrics.multi_table.statistical.cardinality_statistic_similarity import ( 7 | CardinalityStatisticSimilarity, 8 | ) 9 | 10 | __all__ = ['CardinalityShapeSimilarity', 'CardinalityStatisticSimilarity'] 11 | -------------------------------------------------------------------------------- /sdmetrics/reports/__init__.py: -------------------------------------------------------------------------------- 1 | """Reports for sdmetrics.""" 2 | 3 | from sdmetrics.reports.multi_table import DiagnosticReport as MultiTableDiagnosticReport 4 | from sdmetrics.reports.multi_table import QualityReport as MultiTableQualityReport 5 | from sdmetrics.reports.single_table import DiagnosticReport as SingleTableDiagnosticReport 6 | from sdmetrics.reports.single_table import QualityReport as SingleTableQualityReport 7 | 8 | __all__ = [ 9 | 'SingleTableQualityReport', 10 | 'SingleTableDiagnosticReport', 11 | 'MultiTableQualityReport', 12 | 'MultiTableDiagnosticReport', 13 | ] 14 | -------------------------------------------------------------------------------- /sdmetrics/reports/multi_table/__init__.py: -------------------------------------------------------------------------------- 1 | """Multi table reports for sdmetrics.""" 2 | 3 | from sdmetrics.reports.multi_table.diagnostic_report import DiagnosticReport 4 | from sdmetrics.reports.multi_table.quality_report import QualityReport 5 | 6 | __all__ = [ 7 | 'DiagnosticReport', 8 | 'QualityReport', 9 | ] 10 | -------------------------------------------------------------------------------- /sdmetrics/reports/multi_table/_properties/__init__.py: -------------------------------------------------------------------------------- 1 | """Multi table properties for sdmetrics.""" 2 | 3 | from sdmetrics.reports.multi_table._properties.base import BaseMultiTableProperty 4 | from sdmetrics.reports.multi_table._properties.boundary import Boundary 5 | from sdmetrics.reports.multi_table._properties.cardinality import Cardinality 6 | from sdmetrics.reports.multi_table._properties.column_pair_trends import ColumnPairTrends 7 | from sdmetrics.reports.multi_table._properties.column_shapes import ColumnShapes 8 | from sdmetrics.reports.multi_table._properties.coverage import Coverage 9 | from sdmetrics.reports.multi_table._properties.data_validity import DataValidity 10 | from sdmetrics.reports.multi_table._properties.inter_table_trends import InterTableTrends 11 | from sdmetrics.reports.multi_table._properties.relationship_validity import RelationshipValidity 12 | from sdmetrics.reports.multi_table._properties.structure import Structure 13 | from sdmetrics.reports.multi_table._properties.synthesis import Synthesis 14 | 15 | __all__ = [ 16 | 'BaseMultiTableProperty', 17 | 'Boundary', 18 | 'Cardinality', 19 | 'ColumnShapes', 20 | 'ColumnPairTrends', 21 | 'Coverage', 22 | 'InterTableTrends', 23 | 'Synthesis', 24 | 'Structure', 25 | 'DataValidity', 26 | 'RelationshipValidity', 27 | ] 28 | -------------------------------------------------------------------------------- 
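For orientation, here is a minimal usage sketch of the two multi-table reports exported above. It is not taken from the repository itself: it assumes that `sdmetrics/demos.py` exposes a `load_demo(modality='multi_table')` helper returning the demo tables shipped under `sdmetrics/demos/multi_table/` together with their metadata, and that both report classes follow a `generate()` / `get_score()` interface defined in `reports/base_report.py`; check those modules for the exact names and signatures.

```python
# Hedged sketch: `load_demo` and the generate()/get_score() interface are assumptions
# based on the files listed in this repository, not verbatim API documentation.
from sdmetrics.demos import load_demo
from sdmetrics.reports.multi_table import DiagnosticReport, QualityReport

# Demo data: the users/sessions/transactions tables and their multi-table metadata.
real_data, synthetic_data, metadata = load_demo(modality='multi_table')

# Diagnostic report: data validity, data structure and relationship validity properties.
diagnostic = DiagnosticReport()
diagnostic.generate(real_data, synthetic_data, metadata)
print(diagnostic.get_score())

# Quality report: column shapes, column pair trends, cardinality and inter-table trends.
quality = QualityReport()
quality.generate(real_data, synthetic_data, metadata)
print(quality.get_score())
```

A per-property breakdown and plots are typically available through methods such as `get_details()` and `get_visualization()` on the same report objects (see `base_report.py` and the `_properties` classes below).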
/sdmetrics/reports/multi_table/_properties/boundary.py: -------------------------------------------------------------------------------- 1 | """Boundary property for multi-table.""" 2 | 3 | from sdmetrics.reports.multi_table._properties import BaseMultiTableProperty 4 | from sdmetrics.reports.single_table._properties import Boundary as SingleTableBoundary 5 | 6 | 7 | class Boundary(BaseMultiTableProperty): 8 | """Boundary property class for multi-table. 9 | 10 | This property assesses the boundary adherence of the synthetic data over the real data. 11 | The ``BoundaryAdherence`` metric is computed column-wise and the final score is the average 12 | over all columns. This metric is computed over numerical and datetime columns only. 13 | The other column types are ignored by this property. 14 | """ 15 | 16 | _single_table_property = SingleTableBoundary 17 | _num_iteration_case = 'column' 18 | -------------------------------------------------------------------------------- /sdmetrics/reports/multi_table/_properties/column_pair_trends.py: -------------------------------------------------------------------------------- 1 | """Column pair trends property for multi-table.""" 2 | 3 | from sdmetrics.reports.multi_table._properties import BaseMultiTableProperty 4 | from sdmetrics.reports.single_table._properties import ( 5 | ColumnPairTrends as SingleTableColumnPairTrends, 6 | ) 7 | 8 | 9 | class ColumnPairTrends(BaseMultiTableProperty): 10 | """Column pair trends property for multi-table. 11 | 12 | This property evaluates the matching in trends between pairs of real 13 | and synthetic data columns. Each pair's correlation is calculated and 14 | the final score represents the average of these measures across all column pairs 15 | """ 16 | 17 | _single_table_property = SingleTableColumnPairTrends 18 | _num_iteration_case = 'column_pair' 19 | -------------------------------------------------------------------------------- /sdmetrics/reports/multi_table/_properties/column_shapes.py: -------------------------------------------------------------------------------- 1 | """Column shapes property for multi-table.""" 2 | 3 | from sdmetrics.reports.multi_table._properties import BaseMultiTableProperty 4 | from sdmetrics.reports.single_table._properties import ColumnShapes as SingleTableColumnShapes 5 | 6 | 7 | class ColumnShapes(BaseMultiTableProperty): 8 | """Column Shapes property class for multi-table. 9 | 10 | This property assesses the shape similarity between the real and synthetic data. 11 | A metric score is computed column-wise and the final score is the average over all columns. 12 | The KSComplement metric is used for numerical and datetime columns while the TVComplement 13 | is used for categorical and boolean columns. 14 | The other column types are ignored by this property. 15 | """ 16 | 17 | _single_table_property = SingleTableColumnShapes 18 | _num_iteration_case = 'column' 19 | -------------------------------------------------------------------------------- /sdmetrics/reports/multi_table/_properties/coverage.py: -------------------------------------------------------------------------------- 1 | """Coverage property for multi-table.""" 2 | 3 | from sdmetrics.reports.multi_table._properties import BaseMultiTableProperty 4 | from sdmetrics.reports.single_table._properties import Coverage as SingleTableCoverage 5 | 6 | 7 | class Coverage(BaseMultiTableProperty): 8 | """Coverage property class for multi-table. 9 | 10 | This property assesses data coverage between the real and synthetic data. 
11 | A metric score is computed column-wise and the final score is the average over all columns. 12 | The ``RangeCoverage`` metric is used for numerical and datetime columns while the 13 | ``CategoryCoverage`` is used for categorical and boolean columns. 14 | The other column types are ignored by this property. 15 | """ 16 | 17 | _single_table_property = SingleTableCoverage 18 | _num_iteration_case = 'column' 19 | -------------------------------------------------------------------------------- /sdmetrics/reports/multi_table/_properties/data_validity.py: -------------------------------------------------------------------------------- 1 | """Data validity property for multi-table.""" 2 | 3 | from sdmetrics.reports.multi_table._properties import BaseMultiTableProperty 4 | from sdmetrics.reports.single_table._properties import DataValidity as SingleTableDataValidity 5 | 6 | 7 | class DataValidity(BaseMultiTableProperty): 8 | """Data Validity property class for multi-table. 9 | 10 | This property computes, at base, whether each column contains valid data. 11 | The metric is based on the data type of each column. 12 | A metric score is computed column-wise and the final score is the average over all columns. 13 | The BoundaryAdherence metric is used for numerical and datetime columns, the CategoryAdherence 14 | is used for categorical and boolean columns and the KeyUniqueness for primary and 15 | alternate keys. The other column types are ignored by this property. 16 | """ 17 | 18 | _single_table_property = SingleTableDataValidity 19 | _num_iteration_case = 'column' 20 | -------------------------------------------------------------------------------- /sdmetrics/reports/multi_table/_properties/structure.py: -------------------------------------------------------------------------------- 1 | """Structure property for multi-table.""" 2 | 3 | import plotly.express as px 4 | 5 | from sdmetrics.errors import VisualizationUnavailableError 6 | from sdmetrics.reports.multi_table._properties import BaseMultiTableProperty 7 | from sdmetrics.reports.single_table._properties import Structure as SingleTableStructure 8 | from sdmetrics.reports.utils import PlotConfig 9 | 10 | 11 | class Structure(BaseMultiTableProperty): 12 | """Structure property class for multi-table. 13 | 14 | This property checks to see whether the overall structure of the synthetic 15 | data is the same as the real data. The property is calculated for each table. 16 | """ 17 | 18 | _single_table_property = SingleTableStructure 19 | _num_iteration_case = 'table' 20 | 21 | def get_visualization(self, table_name=None): 22 | """Return a visualization for each score in the property. 23 | 24 | Args: 25 | table_name: 26 | If a table name is provided, an error is raised. 27 | 28 | Returns: 29 | plotly.graph_objects._figure.Figure 30 | The visualization for the property. 31 | """ 32 | if table_name: 33 | raise VisualizationUnavailableError( 34 | 'The Structure property does not have a supported visualization for' 35 | ' individual tables.'
36 | ) 37 | 38 | average_score = self._compute_average() 39 | fig = px.bar( 40 | data_frame=self.details, 41 | x='Table', 42 | y='Score', 43 | title=f'Data Diagnostic: Structure (Average Score={average_score})', 44 | category_orders={'group': list(self.details['Table'])}, 45 | color='Metric', 46 | color_discrete_map={ 47 | 'TableStructure': PlotConfig.DATACEBO_DARK, 48 | }, 49 | pattern_shape='Metric', 50 | pattern_shape_sequence=[''], 51 | hover_name='Table', 52 | hover_data={ 53 | 'Table': False, 54 | 'Metric': True, 55 | 'Score': True, 56 | }, 57 | ) 58 | 59 | fig.update_yaxes(range=[0, 1]) 60 | 61 | fig.update_layout( 62 | xaxis_categoryorder='total ascending', 63 | plot_bgcolor=PlotConfig.BACKGROUND_COLOR, 64 | margin={'t': 150}, 65 | font={'size': PlotConfig.FONT_SIZE}, 66 | ) 67 | 68 | return fig 69 | -------------------------------------------------------------------------------- /sdmetrics/reports/multi_table/_properties/synthesis.py: -------------------------------------------------------------------------------- 1 | """Synthesis property for multi-table.""" 2 | 3 | from sdmetrics.reports.multi_table._properties import BaseMultiTableProperty 4 | from sdmetrics.reports.single_table._properties import Synthesis as SingleTableSynthesis 5 | 6 | 7 | class Synthesis(BaseMultiTableProperty): 8 | """Synthesis property class for multi-table. 9 | 10 | This property assesses the novelty of the synthetic data over the real data. 11 | The ``NewRowSynthesis`` metric is computed over the real and synthetic data for each table 12 | to score the proportion of new rows in the synthetic data. 13 | The final score is the average over all tables. 14 | """ 15 | 16 | _single_table_property = SingleTableSynthesis 17 | _num_iteration_case = 'table' 18 | -------------------------------------------------------------------------------- /sdmetrics/reports/multi_table/diagnostic_report.py: -------------------------------------------------------------------------------- 1 | """Multi table diagnostic report.""" 2 | 3 | from sdmetrics.reports.multi_table._properties import DataValidity, RelationshipValidity, Structure 4 | from sdmetrics.reports.multi_table.base_multi_table_report import BaseMultiTableReport 5 | 6 | 7 | class DiagnosticReport(BaseMultiTableReport): 8 | """Multi table diagnostic report. 9 | 10 | This class creates a diagnostic report for multi-table data. It calculates the diagnostic 11 | score along three properties - Relationship Validity, Data Structure, and Data Validity. 12 | """ 13 | 14 | def __init__(self): 15 | super().__init__() 16 | self._properties = { 17 | 'Data Validity': DataValidity(), 18 | 'Data Structure': Structure(), 19 | 'Relationship Validity': RelationshipValidity(), 20 | } 21 | 22 | def _validate_metadata_matches_data(self, real_data, synthetic_data, metadata): 23 | self._validate_relationships(real_data, synthetic_data, metadata) 24 | -------------------------------------------------------------------------------- /sdmetrics/reports/multi_table/quality_report.py: -------------------------------------------------------------------------------- 1 | """Multi table quality report.""" 2 | 3 | from sdmetrics.reports.multi_table._properties import ( 4 | Cardinality, 5 | ColumnPairTrends, 6 | ColumnShapes, 7 | InterTableTrends, 8 | ) 9 | from sdmetrics.reports.multi_table.base_multi_table_report import BaseMultiTableReport 10 | 11 | 12 | class QualityReport(BaseMultiTableReport): 13 | """Multi table quality report. 14 | 15 | This class creates a quality report for multi-table data.
It calculates the quality 16 | score along four properties - Column Shapes, Column Pair Trends, Cardinality, and Intertable Trends. 17 | """ 18 | 19 | def __init__(self): 20 | super().__init__() 21 | self._properties = { 22 | 'Column Shapes': ColumnShapes(), 23 | 'Column Pair Trends': ColumnPairTrends(), 24 | 'Cardinality': Cardinality(), 25 | 'Intertable Trends': InterTableTrends(), 26 | } 27 | -------------------------------------------------------------------------------- /sdmetrics/reports/single_table/__init__.py: -------------------------------------------------------------------------------- 1 | """Single table reports for sdmetrics.""" 2 | 3 | from sdmetrics.reports.single_table.diagnostic_report import DiagnosticReport 4 | from sdmetrics.reports.single_table.quality_report import QualityReport 5 | 6 | __all__ = [ 7 | 'DiagnosticReport', 8 | 'QualityReport', 9 | ] 10 | -------------------------------------------------------------------------------- /sdmetrics/reports/single_table/_properties/__init__.py: -------------------------------------------------------------------------------- 1 | """Single table properties for sdmetrics.""" 2 | 3 | from sdmetrics.reports.single_table._properties.base import BaseSingleTableProperty 4 | from sdmetrics.reports.single_table._properties.boundary import Boundary 5 | from sdmetrics.reports.single_table._properties.column_pair_trends import ColumnPairTrends 6 | from sdmetrics.reports.single_table._properties.column_shapes import ColumnShapes 7 | from sdmetrics.reports.single_table._properties.coverage import Coverage 8 | from sdmetrics.reports.single_table._properties.data_validity import DataValidity 9 | from sdmetrics.reports.single_table._properties.structure import Structure 10 | from sdmetrics.reports.single_table._properties.synthesis import Synthesis 11 | 12 | __all__ = [ 13 | 'BaseSingleTableProperty', 14 | 'ColumnShapes', 15 | 'ColumnPairTrends', 16 | 'Coverage', 17 | 'Boundary', 18 | 'Synthesis', 19 | 'Structure', 20 | 'DataValidity', 21 | ] 22 | -------------------------------------------------------------------------------- /sdmetrics/reports/single_table/_properties/base.py: -------------------------------------------------------------------------------- 1 | """Single table base property class.""" 2 | 3 | import pandas as pd 4 | 5 | 6 | class BaseSingleTableProperty: 7 | """Base class for single table properties. 8 | 9 | A property is a higher-level concept for a class that loops through all the base-level data 10 | and applies different base-level metrics based on the data type.
11 | """ 12 | 13 | _num_iteration_case = None 14 | 15 | def __init__(self): 16 | self.details = pd.DataFrame() 17 | 18 | def _compute_average(self): 19 | """Average the scores for each column.""" 20 | if not isinstance(self.details, pd.DataFrame) or 'Score' not in self.details.columns: 21 | raise ValueError("The property details must be a DataFrame with a 'Score' column.") 22 | 23 | return self.details['Score'].mean() 24 | 25 | def _generate_details(self, real_data, synthetic_data, metadata, progress_bar=None): 26 | """Generate the _details dataframe for the property.""" 27 | raise NotImplementedError() 28 | 29 | def _get_num_iterations(self, metadata): 30 | """Get the number of iterations for the property.""" 31 | if self._num_iteration_case == 'column': 32 | return len(metadata['columns']) 33 | elif self._num_iteration_case == 'table': 34 | return 1 35 | elif self._num_iteration_case == 'column_pair': 36 | return int(len(metadata['columns']) * (len(metadata['columns']) - 1) / 2) 37 | 38 | def get_score(self, real_data, synthetic_data, metadata, progress_bar=None): 39 | """Get the average score for the property on the data. 40 | 41 | Args: 42 | real_data (pandas.DataFrame): 43 | The real data. 44 | synthetic_data (pandas.DataFrame): 45 | The synthetic data. 46 | metadata (dict): 47 | The metadata, which contains each column's data type as well as relationships. 48 | progress_bar (tqdm.tqdm or None): 49 | The progress bar object. Defaults to None. 50 | 51 | Returns: 52 | float: 53 | The average score for the property. 54 | """ 55 | self.details = self._generate_details(real_data, synthetic_data, metadata, progress_bar) 56 | return self._compute_average() 57 | 58 | def get_visualization(self): 59 | """Return a visualization for each score in the property. 60 | 61 | Returns: 62 | plotly.graph_objects._figure.Figure 63 | The visualization for the property. 64 | """ 65 | raise NotImplementedError() 66 | -------------------------------------------------------------------------------- /sdmetrics/reports/single_table/_properties/structure.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | from sdmetrics.errors import VisualizationUnavailableError 5 | from sdmetrics.reports.single_table._properties import BaseSingleTableProperty 6 | from sdmetrics.single_table import TableStructure 7 | 8 | 9 | class Structure(BaseSingleTableProperty): 10 | """Structure property class for single table. 11 | 12 | This property checks to see whether the overall structure of the synthetic 13 | data is the same as the real data. 14 | """ 15 | 16 | _num_iteration_case = 'table' 17 | 18 | def _generate_details(self, real_data, synthetic_data, metadata, progress_bar=None): 19 | """Generate the _details dataframe for the structure property. 20 | 21 | Args: 22 | real_data (pandas.DataFrame): 23 | The real data. 24 | synthetic_data (pandas.DataFrame): 25 | The synthetic data. 26 | metadata (dict): 27 | The metadata of the table 28 | progress_bar (tqdm.tqdm or None): 29 | The progress bar to use. Defaults to None. 
30 | 31 | Returns: 32 | pandas.DataFrame 33 | """ 34 | try: 35 | score = TableStructure.compute(real_data, synthetic_data) 36 | error_message = None 37 | 38 | except Exception as e: 39 | score = np.nan 40 | error_message = f'{type(e).__name__}: {e}' 41 | 42 | finally: 43 | if progress_bar: 44 | progress_bar.update() 45 | 46 | result = pd.DataFrame( 47 | { 48 | 'Metric': 'TableStructure', 49 | 'Score': score, 50 | 'Error': error_message, 51 | }, 52 | index=[0], 53 | ) 54 | 55 | if result['Error'].isna().all(): 56 | result = result.drop('Error', axis=1) 57 | 58 | return result 59 | 60 | def get_visualization(self): 61 | """Return the visualization for the property. 62 | 63 | Raise an error in this case because the single table Structure property 64 | does not have a supported visualization. 65 | """ 66 | raise VisualizationUnavailableError( 67 | 'The single table Structure property does not have a supported visualization.' 68 | ) 69 | -------------------------------------------------------------------------------- /sdmetrics/reports/single_table/diagnostic_report.py: -------------------------------------------------------------------------------- 1 | """Single table diagnostic report.""" 2 | 3 | from sdmetrics.reports.base_report import BaseReport 4 | from sdmetrics.reports.single_table._properties import DataValidity, Structure 5 | 6 | 7 | class DiagnosticReport(BaseReport): 8 | """Single table diagnostic report. 9 | 10 | This class creates a diagnostic report for single-table data. It calculates the diagnostic 11 | score along two properties - Data Structure and Data Validity. 12 | """ 13 | 14 | def __init__(self): 15 | super().__init__() 16 | self._properties = { 17 | 'Data Validity': DataValidity(), 18 | 'Data Structure': Structure(), 19 | } 20 | 21 | def _validate_metadata_matches_data(self, real_data, synthetic_data, metadata): 22 | return 23 | -------------------------------------------------------------------------------- /sdmetrics/reports/single_table/quality_report.py: -------------------------------------------------------------------------------- 1 | """Single table quality report.""" 2 | 3 | from sdmetrics.reports.base_report import BaseReport 4 | from sdmetrics.reports.single_table._properties import ColumnPairTrends, ColumnShapes 5 | 6 | 7 | class QualityReport(BaseReport): 8 | """Single table quality report. 9 | 10 | This class creates a quality report for single-table data. It calculates the quality 11 | score along two properties - Column Shapes and Column Pair Trends. 12 | """ 13 | 14 | def __init__(self): 15 | super().__init__() 16 | self._properties = { 17 | 'Column Shapes': ColumnShapes(), 18 | 'Column Pair Trends': ColumnPairTrends(), 19 | } 20 | -------------------------------------------------------------------------------- /sdmetrics/single_column/README.md: -------------------------------------------------------------------------------- 1 | # Single Column Metrics 2 | 3 | The metrics found in this folder operate on individual columns (or univariate random variables), 4 | passed as two 1-dimensional arrays. 5 | 6 | Implemented metrics: 7 | 8 | * Statistical: Metrics that compare the arrays using statistical tests 9 | * `CSTest`: Chi-Squared test to compare the distributions of two categorical columns. 10 | * `KSComplement`: Complement to the Kolmogorov-Smirnov statistic to compare the distributions 11 | of two numerical columns using their empirical CDF.
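Both of the above can be called directly on two 1-dimensional arrays or `pandas.Series`; a minimal, self-contained sketch with toy values (not the demo data used in the examples below):

```python3
import pandas as pd

from sdmetrics.single_column import CSTest, KSComplement

# Categorical columns: CSTest returns the Chi-Squared p-value, in [0, 1]
real_categories = pd.Series(['a', 'b', 'b', 'c', 'a', 'b'])
synthetic_categories = pd.Series(['a', 'b', 'c', 'c', 'b', 'b'])
CSTest.compute(real_categories, synthetic_categories)

# Numerical columns: KSComplement returns 1 - KS D statistic, in [0, 1]
real_numbers = pd.Series([1.1, 2.4, 2.6, 3.0])
synthetic_numbers = pd.Series([1.0, 2.5, 2.7, 3.3])
KSComplement.compute(real_numbers, synthetic_numbers)
```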
12 | 13 | ## SingleColumnMetric 14 | 15 | All the single column metrics are subclasses of the `sdmetrics.single_column.SingleColumnMetric` 16 | class, which can be used to locate all of them: 17 | 18 | ```python3 19 | In [1]: from sdmetrics.single_column import SingleColumnMetric 20 | 21 | In [2]: SingleColumnMetric.get_subclasses() 22 | Out[2]: 23 | {'CSTest': sdmetrics.single_column.statistical.cstest.CSTest, 24 | 'KSComplement': sdmetrics.single_column.statistical.kscomplement.KSComplement} 25 | ``` 26 | 27 | ## Single Column Inputs and Outputs 28 | 29 | All the single column metrics operate on just two inputs: 30 | 31 | * `real_data`: A 1d numpy array, coming from the real dataset. 32 | * `synthetic_data`: A 1d numpy array, coming from the synthetic dataset. 33 | 34 | For example, this is how the KSComplement metric can be computed for the `age` column 35 | from the demo data: 36 | 37 | ```python3 38 | In [3]: from sdmetrics import load_demo 39 | 40 | In [4]: real_data, synthetic_data, metadata = load_demo() 41 | 42 | In [5]: from sdmetrics.single_column import KSComplement 43 | 44 | In [6]: real_column = real_data['users']['age'].to_numpy() 45 | 46 | In [7]: synthetic_column = synthetic_data['users']['age'].to_numpy() 47 | 48 | In [8]: KSComplement.compute(real_column, synthetic_column) 49 | Out[8]: 0.8 50 | ``` 51 | -------------------------------------------------------------------------------- /sdmetrics/single_column/__init__.py: -------------------------------------------------------------------------------- 1 | """Metrics for Single columns.""" 2 | 3 | from sdmetrics.single_column import base 4 | from sdmetrics.single_column.base import SingleColumnMetric 5 | from sdmetrics.single_column.statistical.boundary_adherence import BoundaryAdherence 6 | from sdmetrics.single_column.statistical.category_adherence import CategoryAdherence 7 | from sdmetrics.single_column.statistical.category_coverage import CategoryCoverage 8 | from sdmetrics.single_column.statistical.cstest import CSTest 9 | from sdmetrics.single_column.statistical.key_uniqueness import KeyUniqueness 10 | from sdmetrics.single_column.statistical.kscomplement import KSComplement 11 | from sdmetrics.single_column.statistical.missing_value_similarity import MissingValueSimilarity 12 | from sdmetrics.single_column.statistical.range_coverage import RangeCoverage 13 | from sdmetrics.single_column.statistical.statistic_similarity import StatisticSimilarity 14 | from sdmetrics.single_column.statistical.tv_complement import TVComplement 15 | from sdmetrics.single_column.statistical.sequence_length_similarity import SequenceLengthSimilarity 16 | 17 | __all__ = [ 18 | 'base', 19 | 'SingleColumnMetric', 20 | 'BoundaryAdherence', 21 | 'CategoryCoverage', 22 | 'CategoryAdherence', 23 | 'CSTest', 24 | 'KeyUniqueness', 25 | 'KSComplement', 26 | 'MissingValueSimilarity', 27 | 'RangeCoverage', 28 | 'StatisticSimilarity', 29 | 'TVComplement', 30 | 'SequenceLengthSimilarity', 31 | ] 32 | -------------------------------------------------------------------------------- /sdmetrics/single_column/base.py: -------------------------------------------------------------------------------- 1 | """Base SingleColumnMetric class.""" 2 | 3 | from sdmetrics.base import BaseMetric 4 | 5 | 6 | class SingleColumnMetric(BaseMetric): 7 | """Base class for metrics that apply to individual columns. 8 | 9 | Attributes: 10 | name (str): 11 | Name to use when reports about this metric are printed. 12 | goal (sdmetrics.goal.Goal): 13 | The goal of this metric.
14 | min_value (Union[float, tuple[float]]): 15 | Minimum value or values that this metric can take. 16 | max_value (Union[float, tuple[float]]): 17 | Maximum value or values that this metric can take. 18 | """ 19 | 20 | name = None 21 | goal = None 22 | min_value = None 23 | max_value = None 24 | 25 | @staticmethod 26 | def compute(real_data, synthetic_data): 27 | """Compute this metric. 28 | 29 | Args: 30 | real_data (Union[numpy.ndarray, pandas.Series]): 31 | The values from the real dataset, passed as a 1d numpy 32 | array or as a pandas.Series. 33 | synthetic_data (Union[numpy.ndarray, pandas.Series]): 34 | The values from the synthetic dataset, passed as a 1d numpy 35 | array or as a pandas.Series. 36 | 37 | Returns: 38 | float 39 | Metric output. 40 | """ 41 | raise NotImplementedError() 42 | 43 | @classmethod 44 | def compute_breakdown(cls, real_data, synthetic_data): 45 | """Compute this metric breakdown. 46 | 47 | Args: 48 | real_data (Union[numpy.ndarray, pandas.Series]): 49 | The values from the real dataset, passed as a 1d numpy 50 | array or as a pandas.Series. 51 | synthetic_data (Union[numpy.ndarray, pandas.Series]): 52 | The values from the synthetic dataset, passed as a 1d numpy 53 | array or as a pandas.Series. 54 | 55 | Returns: 56 | dict 57 | Mapping of the metric output. Must include the key 'score'. 58 | """ 59 | return {'score': cls.compute(real_data, synthetic_data)} 60 | -------------------------------------------------------------------------------- /sdmetrics/single_column/statistical/__init__.py: -------------------------------------------------------------------------------- 1 | """Univariate goodness-of-fit tests.""" 2 | 3 | from sdmetrics.single_column.statistical.boundary_adherence import BoundaryAdherence 4 | from sdmetrics.single_column.statistical.category_adherence import CategoryAdherence 5 | from sdmetrics.single_column.statistical.category_coverage import CategoryCoverage 6 | from sdmetrics.single_column.statistical.cstest import CSTest 7 | from sdmetrics.single_column.statistical.key_uniqueness import KeyUniqueness 8 | from sdmetrics.single_column.statistical.kscomplement import KSComplement 9 | from sdmetrics.single_column.statistical.missing_value_similarity import MissingValueSimilarity 10 | from sdmetrics.single_column.statistical.range_coverage import RangeCoverage 11 | from sdmetrics.single_column.statistical.statistic_similarity import StatisticSimilarity 12 | from sdmetrics.single_column.statistical.tv_complement import TVComplement 13 | from sdmetrics.single_column.statistical.sequence_length_similarity import SequenceLengthSimilarity 14 | 15 | __all__ = [ 16 | 'BoundaryAdherence', 17 | 'CategoryCoverage', 18 | 'CategoryAdherence', 19 | 'CSTest', 20 | 'KeyUniqueness', 21 | 'KSComplement', 22 | 'MissingValueSimilarity', 23 | 'RangeCoverage', 24 | 'StatisticSimilarity', 25 | 'TVComplement', 26 | 'SequenceLengthSimilarity', 27 | ] 28 | -------------------------------------------------------------------------------- /sdmetrics/single_column/statistical/boundary_adherence.py: -------------------------------------------------------------------------------- 1 | """Boundary Adherence Metric.""" 2 | 3 | import pandas as pd 4 | 5 | from sdmetrics.goal import Goal 6 | from sdmetrics.single_column.base import SingleColumnMetric 7 | from sdmetrics.utils import is_datetime 8 | 9 | 10 | class BoundaryAdherence(SingleColumnMetric): 11 | """Boundary adherence metric. 
12 | 13 | Compute the fraction of rows in the synthetic data that are within the min and max 14 | bounds of the real data 15 | 16 | Attributes: 17 | name (str): 18 | Name to use when reports about this metric are printed. 19 | goal (sdmetrics.goal.Goal): 20 | The goal of this metric. 21 | min_value (Union[float, tuple[float]]): 22 | Minimum value or values that this metric can take. 23 | max_value (Union[float, tuple[float]]): 24 | Maximum value or values that this metric can take. 25 | """ 26 | 27 | name = 'BoundaryAdherence' 28 | goal = Goal.MAXIMIZE 29 | min_value = 0.0 30 | max_value = 1.0 31 | 32 | @classmethod 33 | def compute(cls, real_data, synthetic_data): 34 | """Compute the boundary adherence of two continuous columns. 35 | 36 | Args: 37 | real_data (Union[numpy.ndarray, pandas.Series]): 38 | The values from the real dataset. 39 | synthetic_data (Union[numpy.ndarray, pandas.Series]): 40 | The values from the synthetic dataset. 41 | 42 | Returns: 43 | float: 44 | The boundary adherence of the two columns. 45 | """ 46 | real_data = pd.Series(real_data) 47 | synthetic_data = pd.Series(synthetic_data) 48 | if any(pd.isna(real_data)): 49 | real_data = real_data.dropna() 50 | synthetic_data = synthetic_data.dropna() 51 | 52 | if is_datetime(real_data): 53 | real_data = pd.to_numeric(real_data) 54 | synthetic_data = pd.to_numeric(synthetic_data) 55 | 56 | valid = synthetic_data.between(real_data.min(), real_data.max()) 57 | 58 | return valid.sum() / len(synthetic_data) 59 | 60 | @classmethod 61 | def normalize(cls, raw_score): 62 | """Return the `raw_score` as is, since it is already normalized. 63 | 64 | Args: 65 | raw_score (float): 66 | The value of the metric from `compute`. 67 | 68 | Returns: 69 | float: 70 | The normalized value of the metric 71 | """ 72 | return super().normalize(raw_score) 73 | -------------------------------------------------------------------------------- /sdmetrics/single_column/statistical/category_adherence.py: -------------------------------------------------------------------------------- 1 | """Category Adherence Metric.""" 2 | 3 | import numpy as np 4 | 5 | from sdmetrics.goal import Goal 6 | from sdmetrics.single_column.base import SingleColumnMetric 7 | 8 | 9 | class CategoryAdherence(SingleColumnMetric): 10 | """Category adherence metric. 11 | 12 | The proportion of synthetic data points that match an existing category from the real data. 13 | 14 | Attributes: 15 | name (str): 16 | Name to use when reports about this metric are printed. 17 | goal (sdmetrics.goal.Goal): 18 | The goal of this metric. 19 | min_value (Union[float, tuple[float]]): 20 | Minimum value or values that this metric can take. 21 | max_value (Union[float, tuple[float]]): 22 | Maximum value or values that this metric can take. 23 | """ 24 | 25 | name = 'CategoryAdherence' 26 | goal = Goal.MAXIMIZE 27 | min_value = 0.0 28 | max_value = 1.0 29 | 30 | @classmethod 31 | def compute_breakdown(cls, real_data, synthetic_data): 32 | """Compute the score breakdown of the category adherence metric. 33 | 34 | Args: 35 | real_data (pandas.Series): 36 | The real data. 37 | synthetic_data (pandas.Series): 38 | The synthetic data. 39 | 40 | Returns: 41 | dict: 42 | The score breakdown of the category adherence metric. 
43 | """ 44 | real_data = real_data.fillna(np.nan) 45 | synthetic_data = synthetic_data.fillna(np.nan) 46 | score = synthetic_data.isin(real_data).mean() 47 | 48 | return {'score': score} 49 | 50 | @classmethod 51 | def compute(cls, real_data, synthetic_data): 52 | """Compute the category adherence of two columns. 53 | 54 | Args: 55 | real_data (pandas.Series): 56 | The real data. 57 | synthetic_data (pandas.Series): 58 | The synthetic data. 59 | 60 | Returns: 61 | float: 62 | The category adherence metric score. 63 | """ 64 | return cls.compute_breakdown(real_data, synthetic_data)['score'] 65 | -------------------------------------------------------------------------------- /sdmetrics/single_column/statistical/category_coverage.py: -------------------------------------------------------------------------------- 1 | """Category Coverage Metric.""" 2 | 3 | import pandas as pd 4 | 5 | from sdmetrics.goal import Goal 6 | from sdmetrics.single_column.base import SingleColumnMetric 7 | 8 | 9 | class CategoryCoverage(SingleColumnMetric): 10 | """Category coverage metric. 11 | 12 | Compute the fraction of real data categories that are present in the synthetic data. 13 | 14 | Attributes: 15 | name (str): 16 | Name to use when reports about this metric are printed. 17 | goal (sdmetrics.goal.Goal): 18 | The goal of this metric. 19 | min_value (Union[float, tuple[float]]): 20 | Minimum value or values that this metric can take. 21 | max_value (Union[float, tuple[float]]): 22 | Maximum value or values that this metric can take. 23 | """ 24 | 25 | name = 'CategoryCoverage' 26 | goal = Goal.MAXIMIZE 27 | min_value = 0.0 28 | max_value = 1.0 29 | 30 | @classmethod 31 | def compute(cls, real_data, synthetic_data): 32 | """Compare the category coverage of two continuous columns. 33 | 34 | Args: 35 | real_data (Union[numpy.ndarray, pandas.Series]): 36 | The values from the real dataset. 37 | synthetic_data (Union[numpy.ndarray, pandas.Series]): 38 | The values from the synthetic dataset. 39 | 40 | Returns: 41 | float: 42 | The category coverage ratio of the two columns. 43 | """ 44 | results = cls.compute_breakdown(real_data, synthetic_data) 45 | return results['score'] 46 | 47 | @classmethod 48 | def compute_breakdown(cls, real_data, synthetic_data): 49 | """Compare the category coverage of two continuous columns. 50 | 51 | Args: 52 | real_data (Union[numpy.ndarray, pandas.Series]): 53 | The values from the real dataset. 54 | synthetic_data (Union[numpy.ndarray, pandas.Series]): 55 | The values from the synthetic dataset. 56 | 57 | Returns: 58 | dict: 59 | A mapping of the category coverage results. 60 | """ 61 | real_data = pd.Series(real_data).dropna() 62 | synthetic_data = pd.Series(synthetic_data).dropna() 63 | 64 | real_data_values = set(real_data.value_counts().index) 65 | synthetic_data_values = set(synthetic_data.value_counts().index) 66 | synthetic_coverage = synthetic_data_values.intersection(real_data_values) 67 | 68 | return { 69 | 'score': len(synthetic_coverage) / len(real_data_values), 70 | 'real': len(real_data_values), 71 | 'synthetic': len(synthetic_coverage), 72 | } 73 | 74 | @classmethod 75 | def normalize(cls, raw_score): 76 | """Return the `raw_score` as is, since it is already normalized. 77 | 78 | Args: 79 | raw_score (float): 80 | The value of the metric from `compute`. 
81 | 82 | Returns: 83 | float: 84 | The normalized value of the metric 85 | """ 86 | return super().normalize(raw_score) 87 | -------------------------------------------------------------------------------- /sdmetrics/single_column/statistical/cstest.py: -------------------------------------------------------------------------------- 1 | """Chi-Squared test based metric.""" 2 | 3 | from scipy.stats import chisquare 4 | 5 | from sdmetrics.goal import Goal 6 | from sdmetrics.single_column.base import SingleColumnMetric 7 | from sdmetrics.utils import get_frequencies 8 | 9 | 10 | class CSTest(SingleColumnMetric): 11 | """Chi-Squared test based metric. 12 | 13 | This metric uses the Chi-Squared test to compare the distributions 14 | of the two categorical columns. It returns the resulting p-value so that 15 | a small value indicates that we can reject the null hypothesis (i.e. it 16 | suggests that the distributions are different). 17 | 18 | Attributes: 19 | name (str): 20 | Name to use when reports about this metric are printed. 21 | goal (sdmetrics.goal.Goal): 22 | The goal of this metric. 23 | min_value (Union[float, tuple[float]]): 24 | Minimum value or values that this metric can take. 25 | max_value (Union[float, tuple[float]]): 26 | Maximum value or values that this metric can take. 27 | """ 28 | 29 | name = 'Chi-Squared' 30 | goal = Goal.MAXIMIZE 31 | min_value = 0.0 32 | max_value = 1.0 33 | 34 | @staticmethod 35 | def compute(real_data, synthetic_data): 36 | """Compare two discrete columns using a Chi-Squared test. 37 | 38 | Args: 39 | real_data (Union[numpy.ndarray, pandas.Series]): 40 | The values from the real dataset. 41 | synthetic_data (Union[numpy.ndarray, pandas.Series]): 42 | The values from the synthetic dataset. 43 | 44 | Returns: 45 | float: 46 | The Chi-Squared test p-value 47 | """ 48 | f_obs, f_exp = get_frequencies(real_data, synthetic_data) 49 | if len(f_obs) == len(f_exp) == 1: 50 | pvalue = 1.0 51 | else: 52 | _, pvalue = chisquare(f_obs, f_exp) 53 | 54 | return pvalue 55 | 56 | @classmethod 57 | def normalize(cls, raw_score): 58 | """Return the `raw_score` as is, since it is already normalized. 59 | 60 | Args: 61 | raw_score (float): 62 | The value of the metric from `compute`. 63 | 64 | Returns: 65 | float: 66 | The normalized value of the metric 67 | """ 68 | return super().normalize(raw_score) 69 | -------------------------------------------------------------------------------- /sdmetrics/single_column/statistical/key_uniqueness.py: -------------------------------------------------------------------------------- 1 | """Key Uniqueness Metric.""" 2 | 3 | import logging 4 | 5 | from sdmetrics.goal import Goal 6 | from sdmetrics.single_column.base import SingleColumnMetric 7 | 8 | LOGGER = logging.getLogger(__name__) 9 | 10 | 11 | class KeyUniqueness(SingleColumnMetric): 12 | """Key uniqueness metric. 13 | 14 | The proportion of data points in the synthetic data that are unique. 15 | 16 | Attributes: 17 | name (str): 18 | Name to use when reports about this metric are printed. 19 | goal (sdmetrics.goal.Goal): 20 | The goal of this metric. 21 | min_value (Union[float, tuple[float]]): 22 | Minimum value or values that this metric can take. 23 | max_value (Union[float, tuple[float]]): 24 | Maximum value or values that this metric can take.
25 | """ 26 | 27 | name = 'KeyUniqueness' 28 | goal = Goal.MAXIMIZE 29 | min_value = 0.0 30 | max_value = 1.0 31 | 32 | @classmethod 33 | def compute_breakdown(cls, real_data, synthetic_data): 34 | """Compute the score breakdown of the key uniqueness metric. 35 | 36 | Args: 37 | real_data (pandas.Series): 38 | The real data. 39 | synthetic_data (pandas.Series): 40 | The synthetic data. 41 | 42 | Returns: 43 | dict: 44 | The score breakdown of the key uniqueness metric. 45 | """ 46 | has_duplicates = real_data.duplicated().any() 47 | has_nans = real_data.isna().any() 48 | if has_duplicates or has_nans: 49 | LOGGER.info('The real data contains NA or duplicate values.') 50 | 51 | nans_or_duplicates_synthetic = synthetic_data.duplicated() | synthetic_data.isna() 52 | score = 1 - nans_or_duplicates_synthetic.sum() / len(synthetic_data) 53 | 54 | return {'score': score} 55 | 56 | @classmethod 57 | def compute(cls, real_data, synthetic_data): 58 | """Compute the key uniqueness metric. 59 | 60 | Args: 61 | real_data (pandas.Series): 62 | The real data. 63 | synthetic_data (pandas.Series): 64 | The synthetic data. 65 | 66 | Returns: 67 | float: 68 | The proportion of data points in the synthetic data that are unique. 69 | """ 70 | return cls.compute_breakdown(real_data, synthetic_data)['score'] 71 | -------------------------------------------------------------------------------- /sdmetrics/single_column/statistical/kscomplement.py: -------------------------------------------------------------------------------- 1 | """Kolmogorov-Smirnov test based Metric.""" 2 | 3 | import sys 4 | 5 | import numpy as np 6 | import pandas as pd 7 | from scipy.stats import ks_2samp 8 | 9 | from sdmetrics.goal import Goal 10 | from sdmetrics.single_column.base import SingleColumnMetric 11 | from sdmetrics.utils import is_datetime 12 | 13 | MAX_DECIMALS = sys.float_info.dig - 1 14 | 15 | 16 | class KSComplement(SingleColumnMetric): 17 | """Kolmogorov-Smirnov statistic based metric. 18 | 19 | This function uses the two-sample Kolmogorov–Smirnov test to compare 20 | the distributions of the two continuous columns using the empirical CDF. 21 | It returns 1 minus the KS Test D statistic, which indicates the maximum 22 | distance between the expected CDF and the observed CDF values. 23 | 24 | As a result, the output value is 1.0 if the distributions are identical 25 | and 0.0 if they are completely different. 26 | 27 | Attributes: 28 | name (str): 29 | Name to use when reports about this metric are printed. 30 | goal (sdmetrics.goal.Goal): 31 | The goal of this metric. 32 | min_value (Union[float, tuple[float]]): 33 | Minimum value or values that this metric can take. 34 | max_value (Union[float, tuple[float]]): 35 | Maximum value or values that this metric can take. 36 | """ 37 | 38 | name = 'Inverted Kolmogorov-Smirnov D statistic' 39 | goal = Goal.MAXIMIZE 40 | min_value = 0.0 41 | max_value = 1.0 42 | 43 | @staticmethod 44 | def compute(real_data, synthetic_data): 45 | """Compare two continuous columns using a Kolmogorov–Smirnov test. 46 | 47 | Args: 48 | real_data (Union[numpy.ndarray, pandas.Series]): 49 | The values from the real dataset. 50 | synthetic_data (Union[numpy.ndarray, pandas.Series]): 51 | The values from the synthetic dataset. 52 | 53 | Returns: 54 | float: 55 | 1 minus the Kolmogorov–Smirnov D statistic. 
56 | """ 57 | real_data = pd.Series(real_data).dropna() 58 | synthetic_data = pd.Series(synthetic_data).dropna() 59 | 60 | if is_datetime(real_data): 61 | real_data = pd.to_numeric(real_data) 62 | synthetic_data = pd.to_numeric(synthetic_data) 63 | 64 | real_data = real_data.round(MAX_DECIMALS) 65 | synthetic_data = synthetic_data.round(MAX_DECIMALS) 66 | 67 | try: 68 | statistic, _ = ks_2samp(real_data, synthetic_data) 69 | except ValueError as e: 70 | if str(e) == 'Data passed to ks_2samp must not be empty': 71 | return np.nan 72 | else: 73 | raise ValueError(e) 74 | 75 | return 1 - statistic 76 | 77 | @classmethod 78 | def normalize(cls, raw_score): 79 | """Return the `raw_score` as is, since it is already normalized. 80 | 81 | Args: 82 | raw_score (float): 83 | The value of the metric from `compute`. 84 | 85 | Returns: 86 | float: 87 | The normalized value of the metric 88 | """ 89 | return super().normalize(raw_score) 90 | -------------------------------------------------------------------------------- /sdmetrics/single_column/statistical/missing_value_similarity.py: -------------------------------------------------------------------------------- 1 | """Missing Value Similarity Metric.""" 2 | 3 | import pandas as pd 4 | 5 | from sdmetrics.goal import Goal 6 | from sdmetrics.single_column.base import SingleColumnMetric 7 | 8 | 9 | class MissingValueSimilarity(SingleColumnMetric): 10 | """Missing value similarity metric. 11 | 12 | Compute the percentage of missing values between the real and synthetic data. 13 | 14 | Attributes: 15 | name (str): 16 | Name to use when reports about this metric are printed. 17 | goal (sdmetrics.goal.Goal): 18 | The goal of this metric. 19 | min_value (Union[float, tuple[float]]): 20 | Minimum value or values that this metric can take. 21 | max_value (Union[float, tuple[float]]): 22 | Maximum value or values that this metric can take. 23 | """ 24 | 25 | name = 'MissingValueSimilarity' 26 | goal = Goal.MAXIMIZE 27 | min_value = 0.0 28 | max_value = 1.0 29 | 30 | @classmethod 31 | def compute_breakdown(cls, real_data, synthetic_data): 32 | """Compare the missing value similarity of two continuous columns. 33 | 34 | Args: 35 | real_data (Union[numpy.ndarray, pandas.Series]): 36 | The values from the real dataset. 37 | synthetic_data (Union[numpy.ndarray, pandas.Series]): 38 | The values from the synthetic dataset. 39 | 40 | Returns: 41 | dict: 42 | A mapping of the missing value similarity results. 43 | """ 44 | real_data = pd.Series(real_data) 45 | synthetic_data = pd.Series(synthetic_data) 46 | 47 | real_data_value = real_data.isna().sum() / len(real_data) 48 | synthetic_data_value = synthetic_data.isna().sum() / len(synthetic_data) 49 | 50 | return { 51 | 'score': 1 - abs(real_data_value - synthetic_data_value), 52 | 'real': real_data_value, 53 | 'synthetic': synthetic_data_value, 54 | } 55 | 56 | @classmethod 57 | def compute(cls, real_data, synthetic_data): 58 | """Compare the missing value similarity of two continuous columns. 59 | 60 | Args: 61 | real_data (Union[numpy.ndarray, pandas.Series]): 62 | The values from the real dataset. 63 | synthetic_data (Union[numpy.ndarray, pandas.Series]): 64 | The values from the synthetic dataset. 65 | 66 | Returns: 67 | float: 68 | The missing value similarity of the two columns. 69 | """ 70 | results = cls.compute_breakdown(real_data, synthetic_data) 71 | return results['score'] 72 | 73 | @classmethod 74 | def normalize(cls, raw_score): 75 | """Return the `raw_score` as is, since it is already normalized. 
76 | 77 | Args: 78 | raw_score (float): 79 | The value of the metric from `compute`. 80 | 81 | Returns: 82 | float: 83 | The normalized value of the metric 84 | """ 85 | return super().normalize(raw_score) 86 | -------------------------------------------------------------------------------- /sdmetrics/single_column/statistical/range_coverage.py: -------------------------------------------------------------------------------- 1 | """Range Coverage Metric.""" 2 | 3 | import numpy as np 4 | import pandas as pd 5 | 6 | from sdmetrics.goal import Goal 7 | from sdmetrics.single_column.base import SingleColumnMetric 8 | 9 | 10 | class RangeCoverage(SingleColumnMetric): 11 | """Range coverage metric. 12 | 13 | Compute whether a synthetic column covers the full range of values that are 14 | present in a real column 15 | 16 | Attributes: 17 | name (str): 18 | Name to use when reports about this metric are printed. 19 | goal (sdmetrics.goal.Goal): 20 | The goal of this metric. 21 | min_value (Union[float, tuple[float]]): 22 | Minimum value or values that this metric can take. 23 | max_value (Union[float, tuple[float]]): 24 | Maximum value or values that this metric can take. 25 | """ 26 | 27 | name = 'RangeCoverage' 28 | goal = Goal.MAXIMIZE 29 | min_value = 0.0 30 | max_value = 1.0 31 | 32 | @classmethod 33 | def compute(cls, real_data, synthetic_data): 34 | """Compute the range coverage of synthetic columns over the real column. 35 | 36 | Args: 37 | real_data (Union[numpy.ndarray, pandas.Series]): 38 | The values from the real dataset. 39 | synthetic_data (Union[numpy.ndarray, pandas.Series]): 40 | The values from the synthetic dataset. 41 | 42 | Returns: 43 | float: 44 | The range coverage of the synthetic data over the real data. 45 | """ 46 | if not isinstance(real_data, pd.Series): 47 | real_data = pd.Series(real_data) 48 | 49 | if not isinstance(synthetic_data, pd.Series): 50 | synthetic_data = pd.Series(synthetic_data) 51 | 52 | min_r = real_data.min() 53 | max_r = real_data.max() 54 | min_s = synthetic_data.min() 55 | max_s = synthetic_data.max() 56 | 57 | if min_r == max_r: 58 | return np.nan 59 | 60 | normalized_min = max((min_s - min_r) / (max_r - min_r), 0) 61 | normalized_max = max((max_r - max_s) / (max_r - min_r), 0) 62 | return max(1 - (normalized_min + normalized_max), 0) 63 | 64 | @classmethod 65 | def normalize(cls, raw_score): 66 | """Return the `raw_score` as is, since it is already normalized. 67 | 68 | Args: 69 | raw_score (float): 70 | The value of the metric from `compute`. 71 | 72 | Returns: 73 | float: 74 | The normalized value of the metric 75 | """ 76 | return super().normalize(raw_score) 77 | -------------------------------------------------------------------------------- /sdmetrics/single_column/statistical/sequence_length_similarity.py: -------------------------------------------------------------------------------- 1 | """SequenceLengthSimilarity module.""" 2 | 3 | import pandas as pd 4 | 5 | from sdmetrics.goal import Goal 6 | from sdmetrics.single_column.base import SingleColumnMetric 7 | from sdmetrics.single_column.statistical.kscomplement import KSComplement 8 | 9 | 10 | class SequenceLengthSimilarity(SingleColumnMetric): 11 | """Sequence Length Similarity metric. 12 | 13 | Attributes: 14 | name (str): 15 | Name to use when reports about this metric are printed. 16 | goal (sdmetrics.goal.Goal): 17 | The goal of this metric. 18 | min_value (Union[float, tuple[float]]): 19 | Minimum value or values that this metric can take. 
20 | max_value (Union[float, tuple[float]]): 21 | Maximum value or values that this metric can take. 22 | """ 23 | 24 | name = 'Sequence Length Similarity' 25 | goal = Goal.MAXIMIZE 26 | min_value = 0.0 27 | max_value = 1.0 28 | 29 | @staticmethod 30 | def compute(real_data: pd.Series, synthetic_data: pd.Series) -> float: 31 | """Compute this metric. 32 | 33 | The length of a sequence is determined by the number of times the same sequence key occurs. 34 | For example if id_09231 appeared 150 times in the sequence key, then the sequence is of 35 | length 150. This metric compares the lengths of all sequence keys in the 36 | real data vs. the synthetic data. 37 | 38 | It works as follows: 39 | - Calculate the length of each sequence in the real data 40 | - Calculate the length of each sequence in the synthetic data 41 | - Apply the KSComplement metric to compare the similarities of the distributions 42 | - Return this score 43 | 44 | Args: 45 | real_data (pd.Series): 46 | The values from the real dataset. 47 | synthetic_data (pd.Series): 48 | The values from the synthetic dataset. 49 | 50 | Returns: 51 | float: 52 | The score. 53 | """ 54 | return KSComplement.compute(real_data.value_counts(), synthetic_data.value_counts()) 55 | -------------------------------------------------------------------------------- /sdmetrics/single_column/statistical/tv_complement.py: -------------------------------------------------------------------------------- 1 | """Total Variation Complement Metric.""" 2 | 3 | import pandas as pd 4 | 5 | from sdmetrics.errors import IncomputableMetricError 6 | from sdmetrics.goal import Goal 7 | from sdmetrics.single_column.base import SingleColumnMetric 8 | from sdmetrics.utils import get_frequencies 9 | 10 | 11 | class TVComplement(SingleColumnMetric): 12 | """Total Variation Complement metric. 13 | 14 | The complement of the total variation distance. 15 | 16 | Attributes: 17 | name (str): 18 | Name to use when reports about this metric are printed. 19 | goal (sdmetrics.goal.Goal): 20 | The goal of this metric. 21 | min_value (Union[float, tuple[float]]): 22 | Minimum value or values that this metric can take. 23 | max_value (Union[float, tuple[float]]): 24 | Maximum value or values that this metric can take. 25 | """ 26 | 27 | name = 'TVComplement' 28 | goal = Goal.MAXIMIZE 29 | min_value = 0.0 30 | max_value = 1.0 31 | 32 | @classmethod 33 | def compute(cls, real_data, synthetic_data): 34 | """Compute the complement of the total variation distance of two discrete columns. 35 | 36 | Args: 37 | real_data (Union[numpy.ndarray, pandas.Series]): 38 | The values from the real dataset. 39 | synthetic_data (Union[numpy.ndarray, pandas.Series]): 40 | The values from the synthetic dataset. 41 | 42 | Returns: 43 | float: 44 | The complement of the total variation distance. 45 | """ 46 | real_data = pd.Series(real_data).dropna() 47 | synthetic_data = pd.Series(synthetic_data).dropna() 48 | 49 | if len(synthetic_data) == 0 or len(real_data) == 0: 50 | raise IncomputableMetricError( 51 | 'The TVComplement metric must have 1 or more non-null values.' 52 | ) 53 | 54 | f_obs, f_exp = get_frequencies(real_data, synthetic_data) 55 | total_variation = 0 56 | for i in range(len(f_obs)): 57 | total_variation += abs(f_obs[i] - f_exp[i]) 58 | 59 | return 1 - 0.5 * total_variation 60 | 61 | @classmethod 62 | def normalize(cls, raw_score): 63 | """Return the `raw_score` as is, since it is already normalized. 64 | 65 | Args: 66 | raw_score (float): 67 | The value of the metric from `compute`. 
68 | 69 | Returns: 70 | float: 71 | The normalized value of the metric 72 | """ 73 | return super().normalize(raw_score) 74 | -------------------------------------------------------------------------------- /sdmetrics/single_table/data_augmentation/__init__.py: -------------------------------------------------------------------------------- 1 | """Data Augmentation Metric for single table datasets.""" 2 | 3 | from sdmetrics.single_table.data_augmentation.binary_classifier_precision_efficacy import ( 4 | BinaryClassifierPrecisionEfficacy, 5 | ) 6 | from sdmetrics.single_table.data_augmentation.binary_classifier_recall_efficacy import ( 7 | BinaryClassifierRecallEfficacy, 8 | ) 9 | 10 | __all__ = ['BinaryClassifierPrecisionEfficacy', 'BinaryClassifierRecallEfficacy'] 11 | -------------------------------------------------------------------------------- /sdmetrics/single_table/data_augmentation/binary_classifier_precision_efficacy.py: -------------------------------------------------------------------------------- 1 | """Binary classifier precision efficacy metric.""" 2 | 3 | from sdmetrics.single_table.data_augmentation.base import BaseDataAugmentationMetric 4 | 5 | 6 | class BinaryClassifierPrecisionEfficacy(BaseDataAugmentationMetric): 7 | """Binary classifier precision efficacy metric.""" 8 | 9 | name = 'Binary Classifier Precision Efficacy' 10 | metric_name = 'precision' 11 | 12 | @classmethod 13 | def compute_breakdown( 14 | cls, 15 | real_training_data, 16 | synthetic_data, 17 | real_validation_data, 18 | metadata, 19 | prediction_column_name, 20 | minority_class_label, 21 | classifier='XGBoost', 22 | fixed_recall_value=0.9, 23 | ): 24 | """Compute the score breakdown of the metric.""" 25 | return super().compute_breakdown( 26 | real_training_data, 27 | synthetic_data, 28 | real_validation_data, 29 | metadata, 30 | prediction_column_name, 31 | minority_class_label, 32 | classifier, 33 | fixed_recall_value, 34 | ) 35 | 36 | @classmethod 37 | def compute( 38 | cls, 39 | real_training_data, 40 | synthetic_data, 41 | real_validation_data, 42 | metadata, 43 | prediction_column_name, 44 | minority_class_label, 45 | classifier='xgboost', 46 | fixed_recall_value=0.9, 47 | ): 48 | """Compute the score of the metric. 49 | 50 | Args: 51 | real_training_data (pandas.DataFrame): 52 | The real training data. 53 | synthetic_data (pandas.DataFrame): 54 | The synthetic data. 55 | real_validation_data (pandas.DataFrame): 56 | The real validation data. 57 | metadata (dict): 58 | The metadata dictionary describing the table of data. 59 | prediction_column_name (str): 60 | The name of the column to be predicted. 61 | minority_class_label (int): 62 | The minority class label. 63 | classifier (str): 64 | The ML algorithm to use when building a Binary Classification model. 65 | Supported options are ``XGBoost``. Defaults to ``XGBoost``. 66 | fixed_recall_value (float): 67 | The fixed recall value to be used when calculating the precision score. Defaults to 0.9. 68 | 69 | Returns: 70 | float: 71 | The score of the metric.
72 | """ 73 | return super().compute( 74 | real_training_data, 75 | synthetic_data, 76 | real_validation_data, 77 | metadata, 78 | prediction_column_name, 79 | minority_class_label, 80 | classifier, 81 | fixed_recall_value, 82 | ) 83 | -------------------------------------------------------------------------------- /sdmetrics/single_table/data_augmentation/binary_classifier_recall_efficacy.py: -------------------------------------------------------------------------------- 1 | """Binary classifier recall efficacy metric.""" 2 | 3 | from sdmetrics.single_table.data_augmentation.base import BaseDataAugmentationMetric 4 | 5 | 6 | class BinaryClassifierRecallEfficacy(BaseDataAugmentationMetric): 7 | """Binary classifier recall efficacy metric.""" 8 | 9 | name = 'Binary Classifier Recall Efficacy' 10 | metric_name = 'recall' 11 | 12 | @classmethod 13 | def compute_breakdown( 14 | cls, 15 | real_training_data, 16 | synthetic_data, 17 | real_validation_data, 18 | metadata, 19 | prediction_column_name, 20 | minority_class_label, 21 | classifier='XGBoost', 22 | fixed_precision_value=0.9, 23 | ): 24 | """Compute the score breakdown of the metric.""" 25 | return super().compute_breakdown( 26 | real_training_data, 27 | synthetic_data, 28 | real_validation_data, 29 | metadata, 30 | prediction_column_name, 31 | minority_class_label, 32 | classifier, 33 | fixed_precision_value, 34 | ) 35 | 36 | @classmethod 37 | def compute( 38 | cls, 39 | real_training_data, 40 | synthetic_data, 41 | real_validation_data, 42 | metadata, 43 | prediction_column_name, 44 | minority_class_label, 45 | classifier='XGBoost', 46 | fixed_precision_value=0.9, 47 | ): 48 | """Compute the score of the metric. 49 | 50 | Args: 51 | real_training_data (pandas.DataFrame): 52 | The real training data. 53 | synthetic_data (pandas.DataFrame): 54 | The synthetic data. 55 | real_validation_data (pandas.DataFrame): 56 | The real validation data. 57 | metadata (dict): 58 | The metadata dictionary describing the table of data. 59 | prediction_column_name (str): 60 | The name of the column to be predicted. 61 | minority_class_label (int): 62 | The minority class label. 63 | classifier (str): 64 | The ML algorithm to use when building a Binary Classification model. 65 | Supported options are ``XGBoost``. Defaults to ``XGBoost``. 66 | fixed_precision_value (float): 67 | The fixed precision value to be used when calculating the recall score. 68 | Defaults to 0.9. 69 | 70 | Returns: 71 | float: 72 | The score of the metric.
73 | """ 74 | return super().compute( 75 | real_training_data, 76 | synthetic_data, 77 | real_validation_data, 78 | metadata, 79 | prediction_column_name, 80 | minority_class_label, 81 | classifier, 82 | fixed_precision_value, 83 | ) 84 | -------------------------------------------------------------------------------- /sdmetrics/single_table/detection/__init__.py: -------------------------------------------------------------------------------- 1 | """Machine Learning Detection metrics for single table datasets.""" 2 | 3 | from sdmetrics.single_table.detection.sklearn import LogisticDetection, SVCDetection 4 | 5 | __all__ = ['LogisticDetection', 'SVCDetection'] 6 | -------------------------------------------------------------------------------- /sdmetrics/single_table/detection/sklearn.py: -------------------------------------------------------------------------------- 1 | """scikit-learn based DetectionMetrics for single table datasets.""" 2 | 3 | from sklearn.impute import SimpleImputer 4 | from sklearn.linear_model import LogisticRegression 5 | from sklearn.pipeline import Pipeline 6 | from sklearn.preprocessing import RobustScaler 7 | from sklearn.svm import SVC 8 | 9 | from sdmetrics.single_table.detection.base import DetectionMetric 10 | 11 | 12 | class ScikitLearnClassifierDetectionMetric(DetectionMetric): 13 | """Base class for Detection metrics built using Scikit Learn Classifiers. 14 | 15 | The base class for these metrics makes a prediction using a scikit-learn 16 | pipeline which contains a SimpleImputer, a RobustScaler and finally 17 | the classifier, which is defined in the subclasses. 18 | """ 19 | 20 | name = 'Scikit-Learn Detection' 21 | 22 | @staticmethod 23 | def _get_classifier(): 24 | """Build and return an instance of a scikit-learn Classifier.""" 25 | raise NotImplementedError() 26 | 27 | @classmethod 28 | def _fit_predict(cls, X_train, y_train, X_test): 29 | """Fit a pipeline to the training data and then use it to make predictions on the test data.""" 30 | model = Pipeline([ 31 | ('imputer', SimpleImputer()), 32 | ('scalar', RobustScaler()), 33 | ('classifier', cls._get_classifier()), 34 | ]) 35 | model.fit(X_train, y_train) 36 | 37 | return model.predict_proba(X_test)[:, 1] 38 | 39 | 40 | class LogisticDetection(ScikitLearnClassifierDetectionMetric): 41 | """ScikitLearnClassifierDetectionMetric based on a LogisticRegression. 42 | 43 | This metric builds a LogisticRegression Classifier that learns to tell the synthetic 44 | data apart from the real data, which later on is evaluated using Cross Validation. 45 | 46 | The output of the metric is one minus the average ROC AUC score obtained. 47 | """ 48 | 49 | name = 'LogisticRegression Detection' 50 | 51 | @staticmethod 52 | def _get_classifier(): 53 | return LogisticRegression(solver='lbfgs') 54 | 55 | 56 | class SVCDetection(ScikitLearnClassifierDetectionMetric): 57 | """ScikitLearnClassifierDetectionMetric based on a SVC. 58 | 59 | This metric builds a SVC Classifier that learns to tell the synthetic 60 | data apart from the real data, which later on is evaluated using Cross Validation. 61 | 62 | The output of the metric is one minus the average ROC AUC score obtained.
63 | """ 64 | 65 | name = 'SVC Detection' 66 | 67 | @staticmethod 68 | def _get_classifier(): 69 | return SVC(probability=True, gamma='scale') 70 | -------------------------------------------------------------------------------- /sdmetrics/single_table/efficacy/__init__.py: -------------------------------------------------------------------------------- 1 | """Single table efficacy metrics module.""" 2 | 3 | from sdmetrics.single_table.efficacy import binary, multiclass, regression 4 | from sdmetrics.single_table.efficacy.base import MLEfficacyMetric 5 | from sdmetrics.single_table.efficacy.binary import ( 6 | BinaryAdaBoostClassifier, 7 | BinaryDecisionTreeClassifier, 8 | BinaryEfficacyMetric, 9 | BinaryLogisticRegression, 10 | BinaryMLPClassifier, 11 | ) 12 | from sdmetrics.single_table.efficacy.multiclass import ( 13 | MulticlassDecisionTreeClassifier, 14 | MulticlassEfficacyMetric, 15 | MulticlassMLPClassifier, 16 | ) 17 | from sdmetrics.single_table.efficacy.regression import ( 18 | LinearRegression, 19 | MLPRegressor, 20 | RegressionEfficacyMetric, 21 | ) 22 | 23 | __all__ = [ 24 | 'binary', 25 | 'multiclass', 26 | 'regression', 27 | 'MLEfficacyMetric', 28 | 'BinaryEfficacyMetric', 29 | 'BinaryDecisionTreeClassifier', 30 | 'BinaryAdaBoostClassifier', 31 | 'BinaryLogisticRegression', 32 | 'BinaryMLPClassifier', 33 | 'MulticlassEfficacyMetric', 34 | 'MulticlassDecisionTreeClassifier', 35 | 'MulticlassMLPClassifier', 36 | 'RegressionEfficacyMetric', 37 | 'LinearRegression', 38 | 'MLPRegressor', 39 | ] 40 | -------------------------------------------------------------------------------- /sdmetrics/single_table/efficacy/multiclass.py: -------------------------------------------------------------------------------- 1 | """Base class for Multiclass Classification Efficacy Metrics for single table datasets.""" 2 | 3 | from sklearn.metrics import f1_score 4 | from sklearn.neural_network import MLPClassifier 5 | from sklearn.tree import DecisionTreeClassifier 6 | 7 | from sdmetrics.goal import Goal 8 | from sdmetrics.single_table.efficacy.base import MLEfficacyMetric 9 | 10 | 11 | def f1_macro(test_target, predictions): 12 | """Return the `f1_score` of the passed data.""" 13 | return f1_score(test_target, predictions, average='macro') 14 | 15 | 16 | class MulticlassEfficacyMetric(MLEfficacyMetric): 17 | """Base class for Multiclass Classification Efficacy Metrics.""" 18 | 19 | name = None 20 | goal = Goal.MAXIMIZE 21 | min_value = 0 22 | max_value = 1 23 | SCORER = f1_macro 24 | 25 | @classmethod 26 | def normalize(cls, raw_score): 27 | """Return the `raw_score` as is, since it is already normalized. 28 | 29 | Args: 30 | raw_score (float): 31 | The value of the metric from `compute`. 32 | 33 | Returns: 34 | float: 35 | The normalized value of the metric 36 | """ 37 | return super().normalize(raw_score) 38 | 39 | 40 | class MulticlassDecisionTreeClassifier(MulticlassEfficacyMetric): 41 | """Multiclass DecisionTreeClassifier Efficacy based metric. 42 | 43 | This fits a DecisionTreeClassifier to the training data and 44 | then evaluates it making predictions on the test data. 45 | """ 46 | 47 | MODEL = DecisionTreeClassifier 48 | MODEL_KWARGS = { 49 | 'max_depth': 30, 50 | 'class_weight': 'balanced', 51 | } 52 | 53 | 54 | class MulticlassMLPClassifier(MulticlassEfficacyMetric): 55 | """Multiclass MLPClassifier Efficacy based metric. 56 | 57 | This fits a MLPClassifier to the training data and 58 | then evaluates it making predictions on the test data. 
59 | """ 60 | 61 | MODEL = MLPClassifier 62 | MODEL_KWARGS = {'hidden_layer_sizes': (100,), 'max_iter': 50} 63 | -------------------------------------------------------------------------------- /sdmetrics/single_table/efficacy/regression.py: -------------------------------------------------------------------------------- 1 | """Regression Efficacy based metrics.""" 2 | 3 | import numpy as np 4 | from sklearn import linear_model, neural_network 5 | from sklearn.metrics import r2_score 6 | 7 | from sdmetrics.goal import Goal 8 | from sdmetrics.single_table.efficacy.base import MLEfficacyMetric 9 | 10 | 11 | class RegressionEfficacyMetric(MLEfficacyMetric): 12 | """RegressionEfficacy base class.""" 13 | 14 | name = None 15 | goal = Goal.MAXIMIZE 16 | min_value = -np.inf 17 | max_value = 1 18 | SCORER = r2_score 19 | 20 | @classmethod 21 | def normalize(cls, raw_score): 22 | """Return a normalized version of the R^2 score. 23 | 24 | Args: 25 | raw_score (float): 26 | The value of the metric from `compute`. 27 | 28 | Returns: 29 | float: 30 | The normalized value of the metric 31 | """ 32 | return super().normalize(raw_score) 33 | 34 | 35 | class LinearRegression(RegressionEfficacyMetric): 36 | """LinearRegression Efficacy based metric. 37 | 38 | This fits a LinearRegression to the training data and 39 | then evaluates it making predictions on the test data. 40 | """ 41 | 42 | MODEL = linear_model.LinearRegression 43 | 44 | 45 | class MLPRegressor(RegressionEfficacyMetric): 46 | """MLPRegressor Efficacy based metric. 47 | 48 | This fits a MLPRegressor to the training data and 49 | then evaluates it making predictions on the test data. 50 | """ 51 | 52 | MODEL = neural_network.MLPRegressor 53 | MODEL_KWARGS = {'hidden_layer_sizes': (100,), 'max_iter': 50} 54 | -------------------------------------------------------------------------------- /sdmetrics/single_table/privacy/__init__.py: -------------------------------------------------------------------------------- 1 | """Privacy metrics module.""" 2 | 3 | from sdmetrics.single_table.privacy.base import CategoricalPrivacyMetric, NumericalPrivacyMetric 4 | from sdmetrics.single_table.privacy.cap import ( 5 | CategoricalCAP, 6 | CategoricalGeneralizedCAP, 7 | CategoricalZeroCAP, 8 | ) 9 | from sdmetrics.single_table.privacy.categorical_sklearn import ( 10 | CategoricalKNN, 11 | CategoricalNB, 12 | CategoricalRF, 13 | CategoricalSVM, 14 | ) 15 | from sdmetrics.single_table.privacy.disclosure_protection import ( 16 | DisclosureProtection, 17 | DisclosureProtectionEstimate, 18 | ) 19 | from sdmetrics.single_table.privacy.dcr_baseline_protection import DCRBaselineProtection 20 | from sdmetrics.single_table.privacy.dcr_overfitting_protection import DCROverfittingProtection 21 | from sdmetrics.single_table.privacy.ensemble import CategoricalEnsemble 22 | from sdmetrics.single_table.privacy.numerical_sklearn import NumericalLR, NumericalMLP, NumericalSVR 23 | from sdmetrics.single_table.privacy.radius_nearest_neighbor import NumericalRadiusNearestNeighbor 24 | 25 | __all__ = [ 26 | 'CategoricalCAP', 27 | 'CategoricalEnsemble', 28 | 'CategoricalGeneralizedCAP', 29 | 'CategoricalKNN', 30 | 'CategoricalNB', 31 | 'CategoricalPrivacyMetric', 32 | 'CategoricalRF', 33 | 'CategoricalSVM', 34 | 'CategoricalZeroCAP', 35 | 'DisclosureProtection', 36 | 'DisclosureProtectionEstimate', 37 | 'NumericalLR', 38 | 'NumericalMLP', 39 | 'NumericalPrivacyMetric', 40 | 'NumericalRadiusNearestNeighbor', 41 | 'NumericalSVR', 42 | 'DCRBaselineProtection', 43 | 
'DCROverfittingProtection', 44 | ] 45 | -------------------------------------------------------------------------------- /sdmetrics/single_table/privacy/loss.py: -------------------------------------------------------------------------------- 1 | """Utilities for the single_table.privacy modules.""" 2 | 3 | import numpy as np 4 | from copulas.univariate.base import Univariate 5 | 6 | 7 | class LossFunction: 8 | """Base class for a loss function.""" 9 | 10 | def fit(self, data, cols): 11 | """Learn the metric on the value space. 12 | 13 | Args: 14 | real_data (pandas.DataFrame): 15 | The real data table. 16 | cols (list[str]): 17 | The names for the target columns (usually the sensitive cols). 18 | """ 19 | 20 | def measure(self, pred, real): 21 | """Calculate the loss of a single prediction. 22 | 23 | Args: 24 | pred (tuple): 25 | The predicted value. 26 | real (tuple): 27 | The actual value. 28 | """ 29 | raise NotImplementedError('Please implement the loss measuring algorithm!') 30 | 31 | 32 | class InverseCDFDistance(LossFunction): 33 | """Measure the distance between continuous key fields. 34 | 35 | This loss function first applies the fitted cdfs to every single entry (i.e. turning 36 | the numerical values into their respective percentiles) and then measures the Lp distance 37 | to the pth power, between the predicted value and the real value. 38 | 39 | Args: 40 | p (float): 41 | The p parameter in L_p metric. Must be positive. 42 | """ 43 | 44 | def __init__(self, p=2): 45 | self.p = p 46 | self.cdfs = [] 47 | 48 | def fit(self, data, cols): 49 | """Fits univariate distributions (automatically selected). 50 | 51 | Args: 52 | data (DataFrame): 53 | Data, where each column in `cols` is a continuous column. 54 | cols (list[str]): 55 | Column names. 56 | """ 57 | for col in cols: 58 | col_data = np.array(data[col]) 59 | dist_model = Univariate() 60 | dist_model.fit(col_data) 61 | self.cdfs.append(dist_model) 62 | 63 | def measure(self, pred, real): 64 | """Compute the distance (L_p norm) between the pred and real values. 65 | 66 | This uses the probability integral transform to map the pred/real values 67 | to a CDF value (between 0.0 and 1.0). Then, it computes the L_p norm 68 | between the CDF(pred) and CDF(real). 69 | 70 | Args: 71 | pred (tuple): 72 | Predicted value(s) corresponding to the columns specified in fit. 73 | real (tuple): 74 | Real value(s) corresponding to the columns specified in fit. 75 | 76 | Returns: 77 | float: 78 | The L_p norm of the CDF value. 79 | """ 80 | assert len(pred) == len(real) 81 | 82 | dist = 0 83 | for idx in range(len(real)): 84 | percentiles = self.cdfs[idx].cdf(np.array([pred[idx], real[idx]])) 85 | dist += abs(percentiles[0] - percentiles[1]) ** self.p 86 | 87 | return dist 88 | -------------------------------------------------------------------------------- /sdmetrics/single_table/table_structure.py: -------------------------------------------------------------------------------- 1 | """Table Format metric.""" 2 | 3 | from sdmetrics.goal import Goal 4 | from sdmetrics.single_table.base import SingleTableMetric 5 | 6 | 7 | class TableStructure(SingleTableMetric): 8 | """TableStructure Single Table metric. 9 | 10 | This metric computes whether the names and data types of each column are 11 | the same in the real and synthetic data. 12 | 13 | Attributes: 14 | name (str): 15 | Name to use when reports about this metric are printed. 16 | goal (sdmetrics.goal.Goal): 17 | The goal of this metric. 
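Because the percentile arithmetic in `InverseCDFDistance.measure` above is easy to misread, here is a small sketch of that loss function in isolation (not part of the library source). The salary values are invented, and `copulas` — already imported by the loss module — selects the univariate distribution automatically.

```python
import pandas as pd

from sdmetrics.single_table.privacy.loss import InverseCDFDistance

# Invented continuous sensitive column.
data = pd.DataFrame({'salary': [30_000, 45_000, 52_000, 61_000, 75_000, 90_000]})

loss = InverseCDFDistance(p=2)
loss.fit(data, ['salary'])

# Both values are mapped through the fitted CDF; the absolute difference of their
# percentiles, raised to the power p, is the reported distance.
print(loss.measure((50_000,), (62_000,)))
```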
18 | min_value (Union[float, tuple[float]]): 19 | Minimum value or values that this metric can take. 20 | max_value (Union[float, tuple[float]]): 21 | Maximum value or values that this metric can take. 22 | """ 23 | 24 | name = 'TableStructure' 25 | goal = Goal.MAXIMIZE 26 | min_value = 0 27 | max_value = 1 28 | 29 | @classmethod 30 | def compute_breakdown(cls, real_data, synthetic_data): 31 | """Compute the score breakdown of the table format metric. 32 | 33 | Args: 34 | real_data (pandas.DataFrame): 35 | The real data. 36 | synthetic_data (pandas.DataFrame): 37 | The synthetic data. 38 | """ 39 | real_columns_dtypes = set(zip(real_data.columns, map(str, real_data.dtypes))) 40 | synthetic_columns_dtypes = set(zip(synthetic_data.columns, map(str, synthetic_data.dtypes))) 41 | 42 | intersection = real_columns_dtypes & synthetic_columns_dtypes 43 | union = real_columns_dtypes | synthetic_columns_dtypes 44 | score = len(intersection) / len(union) 45 | 46 | return {'score': score} 47 | 48 | @classmethod 49 | def compute(cls, real_data, synthetic_data): 50 | """Compute the table format metric score. 51 | 52 | Args: 53 | real_data (pandas.DataFrame): 54 | The real data. 55 | synthetic_data (pandas.DataFrame): 56 | The synthetic data. 57 | 58 | Returns: 59 | float: 60 | The metric score. 61 | """ 62 | return cls.compute_breakdown(real_data, synthetic_data)['score'] 63 | -------------------------------------------------------------------------------- /sdmetrics/timeseries/__init__.py: -------------------------------------------------------------------------------- 1 | """Metrics for timeseries datasets.""" 2 | 3 | from sdmetrics.timeseries import base, detection, efficacy, ml_scorers 4 | from sdmetrics.timeseries.base import TimeSeriesMetric 5 | from sdmetrics.timeseries.detection import LSTMDetection, TimeSeriesDetectionMetric 6 | from sdmetrics.timeseries.efficacy import TimeSeriesEfficacyMetric 7 | from sdmetrics.timeseries.efficacy.classification import LSTMClassifierEfficacy 8 | 9 | __all__ = [ 10 | 'base', 11 | 'detection', 12 | 'efficacy', 13 | 'ml_scorers', 14 | 'TimeSeriesMetric', 15 | 'TimeSeriesDetectionMetric', 16 | 'LSTMDetection', 17 | 'TimeSeriesEfficacyMetric', 18 | 'LSTMClassifierEfficacy', 19 | ] 20 | -------------------------------------------------------------------------------- /sdmetrics/timeseries/efficacy/__init__.py: -------------------------------------------------------------------------------- 1 | """Machine Learning Efficacy metrics for Time Series.""" 2 | 3 | from sdmetrics.timeseries.efficacy.base import TimeSeriesEfficacyMetric 4 | from sdmetrics.timeseries.efficacy.classification import ( 5 | LSTMClassifierEfficacy, 6 | TimeSeriesClassificationEfficacyMetric, 7 | ) 8 | 9 | __all__ = [ 10 | 'TimeSeriesEfficacyMetric', 11 | 'TimeSeriesClassificationEfficacyMetric', 12 | 'LSTMClassifierEfficacy', 13 | ] 14 | -------------------------------------------------------------------------------- /sdmetrics/timeseries/efficacy/classification.py: -------------------------------------------------------------------------------- 1 | """Machine Learning Classification Efficacy based metrics for Time Series.""" 2 | 3 | from sdmetrics.timeseries import ml_scorers 4 | from sdmetrics.timeseries.efficacy.base import TimeSeriesEfficacyMetric 5 | 6 | 7 | class TimeSeriesClassificationEfficacyMetric(TimeSeriesEfficacyMetric): 8 | """TimeSeriesEfficacy metrics for Time Series Classification problems.""" 9 | 10 | 11 | class LSTMClassifierEfficacy(TimeSeriesClassificationEfficacyMetric): 12 | 
"""TimeSeriesEfficacy metric based on an LSTM Classifier.""" 13 | 14 | _scorer = ml_scorers.lstm_classifier 15 | -------------------------------------------------------------------------------- /sdmetrics/timeseries/ml_scorers.py: -------------------------------------------------------------------------------- 1 | """Machine Learning Detection based metrics for Time Series.""" 2 | 3 | import numpy as np 4 | import pandas as pd 5 | from sklearn.preprocessing import LabelEncoder 6 | 7 | 8 | def _stack(row): 9 | return np.stack(row.to_numpy()) # noqa 10 | 11 | 12 | def _to_numpy(dataframe): 13 | return np.stack(dataframe.apply(_stack, axis=1)) # noqa 14 | 15 | 16 | def _x_to_packed_sequence(X, torch): 17 | sequences = [] 18 | for _, row in X.iterrows(): 19 | sequence = [] 20 | for _, values in row.items(): 21 | sequence.append(values) 22 | 23 | sequences.append(torch.FloatTensor(np.array(sequence)).T) 24 | 25 | return torch.nn.utils.rnn.pack_sequence(sequences, enforce_sorted=False) 26 | 27 | 28 | def lstm_classifier(X_train, X_test, y_train, y_test): 29 | """ML Scorer based on a simple LSTM based NN implemented using torch.""" 30 | try: 31 | import torch 32 | except ImportError: 33 | raise ImportError('Please install torch with `pip install torch`') 34 | 35 | input_dim = len(X_train.columns) 36 | output_dim = len(set(y_train)) 37 | hidden_dim = 32 38 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 39 | 40 | lstm = torch.nn.LSTM(input_dim, hidden_dim).to(device) 41 | linear = torch.nn.Linear(hidden_dim, output_dim).to(device) 42 | 43 | X_train = _x_to_packed_sequence(X_train, torch).to(device) 44 | X_test = _x_to_packed_sequence(X_test, torch).to(device) 45 | 46 | transformer = LabelEncoder() 47 | column = 'target' 48 | y_train = pd.DataFrame(y_train, columns=[column]) 49 | y_test = pd.DataFrame(y_test, columns=[column]) 50 | 51 | y_train = transformer.fit_transform(y_train[column]) 52 | y_train = torch.LongTensor(y_train).to(device) 53 | y_test = torch.LongTensor(transformer.transform(y_test[column])).to(device) 54 | 55 | optimizer = torch.optim.Adam(list(lstm.parameters()) + list(linear.parameters()), lr=1e-2) 56 | 57 | for _ in range(1024): 58 | _, (y, _) = lstm(X_train) 59 | y_pred = linear(y[0]) 60 | loss = torch.nn.functional.cross_entropy(y_pred, y_train) 61 | 62 | optimizer.zero_grad() 63 | loss.backward() 64 | optimizer.step() 65 | 66 | _, (y, _) = lstm(X_test) 67 | y_pred = linear(y[0]) 68 | y_pred = torch.argmax(y_pred, axis=1) 69 | return (y_test == y_pred).sum().item() / len(y_test) 70 | -------------------------------------------------------------------------------- /sdmetrics/warnings.py: -------------------------------------------------------------------------------- 1 | """Warnings for sdmetrics.""" 2 | 3 | 4 | class SDMetricsWarning(RuntimeWarning): 5 | """Class to represent SDMetrics warnings.""" 6 | 7 | 8 | class ConstantInputWarning(SDMetricsWarning): 9 | """Thrown when the input data has all the same values.""" 10 | 11 | def __init__(self, message): 12 | self.message = message 13 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | """SDMetrics testing package.""" 2 | -------------------------------------------------------------------------------- /tests/integration/__init__.py: -------------------------------------------------------------------------------- 1 | """SDMetrics integration testing package.""" 2 | 
-------------------------------------------------------------------------------- /tests/integration/column_pairs/__init__.py: -------------------------------------------------------------------------------- 1 | """SDMetrics integration testing for the column_pairs module.""" 2 | -------------------------------------------------------------------------------- /tests/integration/column_pairs/statistical/__init__.py: -------------------------------------------------------------------------------- 1 | """SDMetrics integration testing for the column_pairs statistical module.""" 2 | -------------------------------------------------------------------------------- /tests/integration/column_pairs/statistical/test_contingency_similarity.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from sdmetrics.column_pairs.statistical import ContingencySimilarity 4 | from sdmetrics.demos import load_demo 5 | 6 | 7 | def test_with_num_rows_subsample(): 8 | """Test the metric with `num_rows_subsample`. 9 | 10 | Here the `real_data` and `syntehtic_data` have 218 rows. 11 | """ 12 | # Setup 13 | np.random.seed(42) 14 | real_data, synthetic_data, _ = load_demo('single_table') 15 | real_data = real_data[['degree_type', 'high_spec']] 16 | synthetic_data = synthetic_data[['degree_type', 'high_spec']] 17 | num_rows_subsample = 100 18 | 19 | # Run 20 | result_1 = ContingencySimilarity.compute( 21 | real_data=real_data, 22 | synthetic_data=synthetic_data, 23 | num_rows_subsample=num_rows_subsample, 24 | ) 25 | result_2 = ContingencySimilarity.compute( 26 | real_data=real_data, 27 | synthetic_data=synthetic_data, 28 | num_rows_subsample=num_rows_subsample, 29 | ) 30 | result_entire_data = ContingencySimilarity.compute( 31 | real_data=real_data, 32 | synthetic_data=synthetic_data, 33 | num_rows_subsample=None, 34 | ) 35 | 36 | # Assert 37 | assert result_1 != result_2 38 | assert result_1 != result_entire_data 39 | assert result_2 != result_entire_data 40 | assert np.isclose(result_1, result_entire_data, atol=0.1) 41 | assert np.isclose(result_2, result_entire_data, atol=0.1) 42 | -------------------------------------------------------------------------------- /tests/integration/multi_table/__init__.py: -------------------------------------------------------------------------------- 1 | """SDMetrics integration testing for the multi_table module.""" 2 | -------------------------------------------------------------------------------- /tests/integration/multi_table/test_multi_single_table.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import pytest 4 | 5 | from sdmetrics.multi_table.multi_single_table import ( 6 | CSTest, 7 | KSComplement, 8 | LogisticDetection, 9 | SVCDetection, 10 | ) 11 | 12 | METRICS = [CSTest, KSComplement, LogisticDetection, SVCDetection] 13 | 14 | 15 | @pytest.fixture 16 | def ones(): 17 | data = pd.DataFrame({ 18 | 'a': [1] * 100, 19 | 'b': [True] * 100, 20 | }) 21 | return {'a': data, 'b': data.copy()} 22 | 23 | 24 | @pytest.fixture 25 | def zeros(): 26 | data = pd.DataFrame({ 27 | 'a': [0] * 100, 28 | 'b': [False] * 100, 29 | }) 30 | return {'a': data, 'b': data.copy()} 31 | 32 | 33 | @pytest.fixture 34 | def real_data(): 35 | data = pd.DataFrame({ 36 | 'a': np.random.normal(size=600), 37 | 'b': np.random.randint(0, 10, size=600), 38 | 'c': ['a', 'b', 'b', 'c', 'c', 'c'] * 100, 39 | 'd': [True, True, True, True, True, False] * 100, 40 | }) 41 | 
return {'a': data, 'b': data.copy()} 42 | 43 | 44 | @pytest.fixture 45 | def good_data(): 46 | data = pd.DataFrame({ 47 | 'a': np.random.normal(loc=0.01, size=600), 48 | 'b': np.random.randint(0, 10, size=600), 49 | 'c': ['a', 'b', 'b', 'b', 'c', 'c'] * 100, 50 | 'd': [True, True, True, True, False, False] * 100, 51 | }) 52 | return {'a': data, 'b': data.copy()} 53 | 54 | 55 | @pytest.fixture 56 | def bad_data(): 57 | data = pd.DataFrame({ 58 | 'a': np.random.normal(loc=5, scale=3, size=600), 59 | 'b': np.random.randint(5, 15, size=600), 60 | 'c': ['a', 'a', 'a', 'a', 'b', 'b'] * 100, 61 | 'd': [True, False, False, False, False, False] * 100, 62 | }) 63 | return {'a': data, 'b': data.copy()} 64 | 65 | 66 | @pytest.mark.parametrize('metric', METRICS) 67 | def test_max(metric, ones): 68 | output = metric.compute(ones, ones.copy()) 69 | normalized = metric.normalize(output) 70 | 71 | assert output == 1 72 | assert normalized == 1 73 | 74 | 75 | @pytest.mark.parametrize('metric', METRICS) 76 | def test_min(metric, ones, zeros): 77 | output = metric.compute(ones, zeros) 78 | normalized = metric.normalize(output) 79 | 80 | assert np.round(output, decimals=5) == 0 81 | assert np.round(normalized, decimals=5) == 0 82 | 83 | 84 | @pytest.mark.parametrize('metric', METRICS) 85 | def test_good(metric, real_data, good_data): 86 | output = metric.compute(real_data, good_data) 87 | normalized = metric.normalize(output) 88 | 89 | assert 0.5 < output <= 1 90 | assert 0.5 < normalized <= 1 91 | 92 | 93 | @pytest.mark.parametrize('metric', METRICS) 94 | def test_bad(metric, real_data, bad_data): 95 | output = metric.compute(real_data, bad_data) 96 | normalized = metric.normalize(output) 97 | 98 | assert 0 <= output < 0.5 99 | assert 0 <= normalized < 0.5 100 | 101 | 102 | @pytest.mark.parametrize('metric', METRICS) 103 | def test_fail(metric): 104 | error_msg = '`real_data` and `synthetic_data` must have the same tables' 105 | with pytest.raises(ValueError, match=error_msg): 106 | metric.compute({'a': None, 'b': None}, {'a': None}) 107 | -------------------------------------------------------------------------------- /tests/integration/multi_table/test_multi_table.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | from sdmetrics import compute_metrics 4 | from sdmetrics.demos import load_multi_table_demo 5 | from sdmetrics.multi_table.base import MultiTableMetric 6 | 7 | 8 | def test_compute_all(): 9 | real_data, synthetic_data, metadata = load_multi_table_demo() 10 | 11 | output = compute_metrics( 12 | MultiTableMetric.get_subclasses(), real_data, synthetic_data, metadata=metadata 13 | ) 14 | 15 | assert not pd.isna(output.raw_score.mean()) 16 | 17 | scores = output[output.raw_score.notna()] 18 | 19 | assert scores.raw_score.between(scores.min_value, scores.max_value).all() 20 | -------------------------------------------------------------------------------- /tests/integration/reports/__init__.py: -------------------------------------------------------------------------------- 1 | """SDMetrics integration testing for the reports module.""" 2 | -------------------------------------------------------------------------------- /tests/integration/reports/multi_table/__init__.py: -------------------------------------------------------------------------------- 1 | """SDMetrics integration testing for the multi-table reports module.""" 2 | -------------------------------------------------------------------------------- 
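Outside of the `test_compute_all` test shown above, the same `compute_metrics` call is the usual entry point for scoring a whole multi-table dataset at once. A hedged sketch that relies only on the columns the test itself asserts on (`raw_score`, `min_value`, `max_value`):

```python
from sdmetrics import compute_metrics
from sdmetrics.demos import load_multi_table_demo
from sdmetrics.multi_table.base import MultiTableMetric

real_data, synthetic_data, metadata = load_multi_table_demo()

output = compute_metrics(
    MultiTableMetric.get_subclasses(), real_data, synthetic_data, metadata=metadata
)

# Drop metrics that could not be computed and inspect each raw score against its range.
scores = output[output.raw_score.notna()]
print(scores[['raw_score', 'min_value', 'max_value']])
```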
/tests/integration/reports/multi_table/_properties/__init__.py: -------------------------------------------------------------------------------- 1 | """SDMetrics integration testing for the reports multi_table _properties module.""" 2 | -------------------------------------------------------------------------------- /tests/integration/reports/multi_table/_properties/test_boundary.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import Mock 2 | 3 | from tqdm import tqdm 4 | 5 | from sdmetrics.demos import load_demo 6 | from sdmetrics.reports.multi_table._properties import Boundary 7 | 8 | 9 | class TestBoundary: 10 | def test_end_to_end(self): 11 | """Test the ``Boundary`` multi-table property end to end.""" 12 | # Setup 13 | real_data, synthetic_data, metadata = load_demo(modality='multi_table') 14 | boundary = Boundary() 15 | 16 | # Run 17 | result = boundary.get_score(real_data, synthetic_data, metadata) 18 | 19 | # Assert 20 | assert result == 1.0 21 | 22 | def test_with_progress_bar(self): 23 | """Test that the progress bar is correctly updated.""" 24 | # Setup 25 | real_data, synthetic_data, metadata = load_demo(modality='multi_table') 26 | boundary = Boundary() 27 | num_columns = sum(len(table['columns']) for table in metadata['tables'].values()) 28 | 29 | progress_bar = tqdm(total=num_columns) 30 | mock_update = Mock() 31 | progress_bar.update = mock_update 32 | 33 | # Run 34 | result = boundary.get_score(real_data, synthetic_data, metadata, progress_bar) 35 | 36 | # Assert 37 | assert result == 1.0 38 | assert mock_update.call_count == num_columns 39 | -------------------------------------------------------------------------------- /tests/integration/reports/multi_table/_properties/test_cardinality.py: -------------------------------------------------------------------------------- 1 | """Test multi-table cardinality properties.""" 2 | 3 | import pandas as pd 4 | from plotly.graph_objs._figure import Figure 5 | 6 | from sdmetrics.demos import load_multi_table_demo 7 | from sdmetrics.reports.multi_table._properties import Cardinality 8 | 9 | 10 | def test_cardinality_property(): 11 | """Test the ``Cardinality`` with the multi table demo.""" 12 | # Setup 13 | cardinality_property = Cardinality() 14 | real_data, synthetic_data, metadata = load_multi_table_demo() 15 | 16 | # Run 17 | score = cardinality_property.get_score(real_data, synthetic_data, metadata) 18 | figure = cardinality_property.get_visualization('users') 19 | 20 | # Assert 21 | assert score == 0.95 22 | assert isinstance(figure, Figure) 23 | 24 | 25 | def test_with_multi_foreign_key(): 26 | """Test the ``Cardinality`` with multiple foreign keys.""" 27 | # Setup 28 | real_data = { 29 | 'bank': pd.DataFrame({ 30 | 'primary_key': [1, 2, 3, 4, 5], 31 | 'category': ['a', 'b', 'c', 'd', 'e'], 32 | 'numerical': [1, 2, 3, 4, 5], 33 | }), 34 | 'transactions': pd.DataFrame({ 35 | 'f_key_1': [1, 2, 3, 2, 1], 36 | 'f_key_2': [1, 5, 3, 2, 4], 37 | }), 38 | } 39 | 40 | synthetic_data = { 41 | 'bank': pd.DataFrame({ 42 | 'primary_key': [1, 2, 3, 4, 5], 43 | 'category': ['a', 'b', 'c', 'd', 'e'], 44 | 'numerical': [1, 2, 3, 4, 5], 45 | }), 46 | 'transactions': pd.DataFrame({ 47 | 'f_key_1': [5, 2, 3, 4, 1], 48 | 'f_key_2': [1, 5, 5, 2, 4], 49 | }), 50 | } 51 | 52 | metadata = { 53 | 'tables': { 54 | 'bank': { 55 | 'primary_key': 'primary_key', 56 | 'columns': { 57 | 'primary_key': {'sdtype': 'id'}, 58 | 'category': {'sdtype': 'categorical'}, 59 | 'numerical': {'sdtype': 
'numerical'}, 60 | }, 61 | }, 62 | 'transactions': {'columns': {'f_key_1': {'sdtype': 'id'}, 'f_key_2': {'sdtype': 'id'}}}, 63 | }, 64 | 'relationships': [ 65 | { 66 | 'parent_table_name': 'bank', 67 | 'child_table_name': 'transactions', 68 | 'parent_primary_key': 'primary_key', 69 | 'child_foreign_key': 'f_key_1', 70 | }, 71 | { 72 | 'parent_table_name': 'bank', 73 | 'child_table_name': 'transactions', 74 | 'parent_primary_key': 'primary_key', 75 | 'child_foreign_key': 'f_key_2', 76 | }, 77 | ], 78 | } 79 | 80 | cardinality_property = Cardinality() 81 | 82 | # Run 83 | cardinality_property.get_score(real_data, synthetic_data, metadata) 84 | fig = cardinality_property.get_visualization('bank') 85 | 86 | # Assert 87 | expected_labels = ['transactions (f_key_1) → bank', 'transactions (f_key_2) → bank'] 88 | assert fig.data[0].x.tolist() == expected_labels 89 | -------------------------------------------------------------------------------- /tests/integration/reports/multi_table/_properties/test_column_pair_trends.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import Mock 2 | 3 | import numpy as np 4 | from tqdm import tqdm 5 | 6 | from sdmetrics.demos import load_demo 7 | from sdmetrics.reports.multi_table._properties import ColumnPairTrends 8 | 9 | 10 | class TestColumnPairTrends: 11 | def test_end_to_end(self): 12 | """Test ``ColumnPairTrends`` multi-table property end to end.""" 13 | # Setup 14 | real_data, synthetic_data, metadata = load_demo(modality='multi_table') 15 | column_pair_trends = ColumnPairTrends() 16 | 17 | # Run 18 | result = column_pair_trends.get_score(real_data, synthetic_data, metadata) 19 | 20 | # Assert 21 | assert np.isclose(result, 0.45654629583521095, atol=1e-8) 22 | 23 | def test_with_progress_bar(self): 24 | """Test that the progress bar is correctly updated.""" 25 | # Setup 26 | real_data, synthetic_data, metadata = load_demo(modality='multi_table') 27 | column_pair_trends = ColumnPairTrends() 28 | num_iter = sum( 29 | int(0.5 * len(table['columns']) * (len(table['columns']) - 1)) 30 | for table in metadata['tables'].values() 31 | ) 32 | 33 | progress_bar = tqdm(total=num_iter) 34 | mock_update = Mock() 35 | progress_bar.update = mock_update 36 | 37 | # Run 38 | result = column_pair_trends.get_score(real_data, synthetic_data, metadata, progress_bar) 39 | 40 | # Assert 41 | assert np.isclose(result, 0.45654629583521095, atol=1e-8) 42 | assert mock_update.call_count == num_iter 43 | -------------------------------------------------------------------------------- /tests/integration/reports/multi_table/_properties/test_column_shapes.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import Mock 2 | 3 | from tqdm import tqdm 4 | 5 | from sdmetrics.demos import load_demo 6 | from sdmetrics.reports.multi_table._properties import ColumnShapes 7 | 8 | 9 | class TestColumnShapes: 10 | def test_end_to_end(self): 11 | """Test the ``ColumnShapes`` multi-table property end to end.""" 12 | # Setup 13 | real_data, synthetic_data, metadata = load_demo(modality='multi_table') 14 | column_shapes = ColumnShapes() 15 | 16 | # Run 17 | result = column_shapes.get_score(real_data, synthetic_data, metadata) 18 | 19 | # Assert 20 | assert result == 0.7978174603174604 21 | 22 | def test_with_progress_bar(self): 23 | """Test that the progress bar is correctly updated.""" 24 | # Setup 25 | real_data, synthetic_data, metadata = load_demo(modality='multi_table') 26 | 
column_shapes = ColumnShapes() 27 | num_columns = sum(len(table['columns']) for table in metadata['tables'].values()) 28 | 29 | progress_bar = tqdm(total=num_columns) 30 | mock_update = Mock() 31 | progress_bar.update = mock_update 32 | 33 | # Run 34 | result = column_shapes.get_score(real_data, synthetic_data, metadata, progress_bar) 35 | 36 | # Assert 37 | assert result == 0.7978174603174604 38 | assert mock_update.call_count == num_columns 39 | -------------------------------------------------------------------------------- /tests/integration/reports/multi_table/_properties/test_coverage.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import Mock 2 | 3 | from tqdm import tqdm 4 | 5 | from sdmetrics.demos import load_demo 6 | from sdmetrics.reports.multi_table._properties import Coverage 7 | 8 | 9 | class TestCoverage: 10 | def test_end_to_end(self): 11 | """Test the ``Coverage`` multi-table property end to end.""" 12 | # Setup 13 | real_data, synthetic_data, metadata = load_demo(modality='multi_table') 14 | coverage = Coverage() 15 | 16 | # Run 17 | result = coverage.get_score(real_data, synthetic_data, metadata) 18 | 19 | # Assert 20 | assert result == 0.8244218804937835 21 | 22 | def test_with_progress_bar(self): 23 | """Test that the progress bar is correctly updated.""" 24 | # Setup 25 | real_data, synthetic_data, metadata = load_demo(modality='multi_table') 26 | coverage = Coverage() 27 | num_columns = sum(len(table['columns']) for table in metadata['tables'].values()) 28 | 29 | progress_bar = tqdm(total=num_columns) 30 | mock_update = Mock() 31 | progress_bar.update = mock_update 32 | 33 | # Run 34 | result = coverage.get_score(real_data, synthetic_data, metadata, progress_bar) 35 | 36 | # Assert 37 | assert result == 0.8244218804937835 38 | assert mock_update.call_count == num_columns 39 | -------------------------------------------------------------------------------- /tests/integration/reports/multi_table/_properties/test_data_validity.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import Mock 2 | 3 | from tqdm import tqdm 4 | 5 | from sdmetrics.demos import load_demo 6 | from sdmetrics.reports.multi_table._properties import DataValidity 7 | 8 | 9 | class TestDataValidity: 10 | def test_end_to_end(self): 11 | """Test the ``DataValidity`` multi-table property end to end.""" 12 | # Setup 13 | real_data, synthetic_data, metadata = load_demo(modality='multi_table') 14 | column_shapes = DataValidity() 15 | 16 | # Run 17 | result = column_shapes.get_score(real_data, synthetic_data, metadata) 18 | 19 | # Assert 20 | assert result == 1.0 21 | 22 | def test_with_progress_bar(self): 23 | """Test that the progress bar is correctly updated.""" 24 | # Setup 25 | real_data, synthetic_data, metadata = load_demo(modality='multi_table') 26 | column_shapes = DataValidity() 27 | num_columns = sum(len(table['columns']) for table in metadata['tables'].values()) 28 | 29 | progress_bar = tqdm(total=num_columns) 30 | mock_update = Mock() 31 | progress_bar.update = mock_update 32 | 33 | # Run 34 | result = column_shapes.get_score(real_data, synthetic_data, metadata, progress_bar) 35 | 36 | # Assert 37 | assert result == 1.0 38 | assert mock_update.call_count == num_columns 39 | -------------------------------------------------------------------------------- /tests/integration/reports/multi_table/_properties/test_inter_table_trends.py: 
-------------------------------------------------------------------------------- 1 | from unittest.mock import Mock 2 | 3 | from tqdm import tqdm 4 | 5 | from sdmetrics.demos import load_demo 6 | from sdmetrics.reports.multi_table._properties import InterTableTrends 7 | 8 | 9 | class TestInterTableTrends: 10 | def test_end_to_end(self): 11 | """Test ``ColumnPairTrends`` multi-table property end to end.""" 12 | # Setup 13 | real_data, synthetic_data, metadata = load_demo(modality='multi_table') 14 | inter_table_trends = InterTableTrends() 15 | 16 | # Run 17 | result = inter_table_trends.get_score(real_data, synthetic_data, metadata) 18 | 19 | # Assert 20 | assert result == 0.4416666666666666 21 | 22 | def test_with_progress_bar(self): 23 | """Test that the progress bar is correctly updated.""" 24 | # Setup 25 | real_data, synthetic_data, metadata = load_demo(modality='multi_table') 26 | inter_table_trends = InterTableTrends() 27 | num_iter = sum( 28 | len(metadata['tables'][relationship['parent_table_name']]['columns']) 29 | * len(metadata['tables'][relationship['child_table_name']]['columns']) 30 | for relationship in metadata['relationships'] 31 | ) 32 | 33 | progress_bar = tqdm(total=num_iter) 34 | mock_update = Mock() 35 | progress_bar.update = mock_update 36 | 37 | # Run 38 | result = inter_table_trends.get_score(real_data, synthetic_data, metadata, progress_bar) 39 | 40 | # Assert 41 | assert result == 0.4416666666666666 42 | assert mock_update.call_count == num_iter 43 | -------------------------------------------------------------------------------- /tests/integration/reports/multi_table/_properties/test_relationship_validity.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | from tqdm import tqdm 4 | 5 | from sdmetrics.demos import load_demo 6 | from sdmetrics.reports.multi_table._properties import RelationshipValidity 7 | 8 | 9 | class TestRelationshipValidity: 10 | def test_end_to_end(self): 11 | """Test the ``RelationshipValidity`` multi-table property end to end.""" 12 | # Setup 13 | real_data, synthetic_data, metadata = load_demo(modality='multi_table') 14 | relationship_validity = RelationshipValidity() 15 | 16 | # Run 17 | result = relationship_validity.get_score(real_data, synthetic_data, metadata) 18 | 19 | # Assert 20 | assert result == 1.0 21 | 22 | def test_with_progress_bar(self, capsys): 23 | """Test that the progress bar is correctly updated.""" 24 | # Setup 25 | real_data, synthetic_data, metadata = load_demo(modality='multi_table') 26 | relationship_validity = RelationshipValidity() 27 | num_relationship = 2 28 | 29 | progress_bar = tqdm(total=num_relationship, file=sys.stdout) 30 | 31 | # Run 32 | result = relationship_validity.get_score(real_data, synthetic_data, metadata, progress_bar) 33 | progress_bar.close() 34 | captured = capsys.readouterr() 35 | output = captured.out 36 | 37 | # Assert 38 | assert result == 1.0 39 | assert '100%' in output 40 | assert f'{num_relationship}/{num_relationship}' in output 41 | -------------------------------------------------------------------------------- /tests/integration/reports/multi_table/_properties/test_structure.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import Mock 2 | 3 | import pandas as pd 4 | from tqdm import tqdm 5 | 6 | from sdmetrics.demos import load_demo 7 | from sdmetrics.reports.multi_table._properties import Structure 8 | 9 | 10 | class TestStructure: 11 | def test_end_to_end(self): 12 
| """Test Structure multi-table.""" 13 | # Setup 14 | real_data, synthetic_data, metadata = load_demo(modality='multi_table') 15 | structure = Structure() 16 | 17 | # Run 18 | result = structure.get_score(real_data, synthetic_data, metadata) 19 | 20 | # Assert 21 | assert result == 1.0 22 | 23 | expected_details = pd.DataFrame({ 24 | 'Table': ['users', 'sessions', 'transactions'], 25 | 'Metric': ['TableStructure', 'TableStructure', 'TableStructure'], 26 | 'Score': [1.0, 1.0, 1.0], 27 | }) 28 | pd.testing.assert_frame_equal(structure.details, expected_details) 29 | 30 | def test_with_progress_bar(self): 31 | """Test that the progress bar is correctly updated.""" 32 | # Setup 33 | real_data, synthetic_data, metadata = load_demo(modality='multi_table') 34 | structure = Structure() 35 | num_tables = len(metadata['tables']) 36 | 37 | progress_bar = tqdm(total=num_tables) 38 | mock_update = Mock() 39 | progress_bar.update = mock_update 40 | 41 | # Run 42 | result = structure.get_score(real_data, synthetic_data, metadata, progress_bar) 43 | 44 | # Assert 45 | assert result == 1.0 46 | assert mock_update.call_count == num_tables 47 | -------------------------------------------------------------------------------- /tests/integration/reports/multi_table/_properties/test_synthesis.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import Mock 2 | 3 | from tqdm import tqdm 4 | 5 | from sdmetrics.demos import load_demo 6 | from sdmetrics.reports.multi_table._properties import Synthesis 7 | 8 | 9 | class TestSynthesis: 10 | def test_end_to_end(self): 11 | """Test Synthesis multi-table.""" 12 | # Setup 13 | real_data, synthetic_data, metadata = load_demo(modality='multi_table') 14 | synthesis = Synthesis() 15 | 16 | # Run 17 | result = synthesis.get_score(real_data, synthetic_data, metadata) 18 | 19 | # Assert 20 | assert result == 0.6333333333333333 21 | 22 | def test_with_progress_bar(self): 23 | """Test that the progress bar is correctly updated.""" 24 | # Setup 25 | real_data, synthetic_data, metadata = load_demo(modality='multi_table') 26 | synthesis = Synthesis() 27 | num_tables = len(metadata['tables']) 28 | 29 | progress_bar = tqdm(total=num_tables) 30 | mock_update = Mock() 31 | progress_bar.update = mock_update 32 | 33 | # Run 34 | result = synthesis.get_score(real_data, synthetic_data, metadata, progress_bar) 35 | 36 | # Assert 37 | assert result == 0.6333333333333333 38 | assert mock_update.call_count == num_tables 39 | -------------------------------------------------------------------------------- /tests/integration/reports/single_table/__init__.py: -------------------------------------------------------------------------------- 1 | """SDMetrics integration testing for the single-table reports module.""" 2 | -------------------------------------------------------------------------------- /tests/integration/reports/single_table/_properties/test_boundary.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | from sdmetrics.demos import load_demo 5 | from sdmetrics.reports.single_table._properties import Boundary 6 | 7 | 8 | class TestBoundary: 9 | def test_get_score(self): 10 | """Test the ``get_score`` method.""" 11 | # Setup 12 | real_data, synthetic_data, metadata = load_demo(modality='single_table') 13 | boundary_property = Boundary() 14 | 15 | # Run 16 | score = boundary_property.get_score(real_data, synthetic_data, metadata) 17 | 18 | # Assert 19 
| assert score == 1.0 20 | expected_details = pd.DataFrame({ 21 | 'Column': [ 22 | 'start_date', 23 | 'end_date', 24 | 'salary', 25 | 'duration', 26 | 'high_perc', 27 | 'second_perc', 28 | 'degree_perc', 29 | 'experience_years', 30 | 'employability_perc', 31 | 'mba_perc', 32 | ], 33 | 'Metric': ['BoundaryAdherence'] * 10, 34 | 'Score': [1.0] * 10, 35 | }) 36 | 37 | pd.testing.assert_frame_equal(boundary_property.details, expected_details) 38 | 39 | def test_get_score_error(self): 40 | """Test the ``get_score`` method with errors.""" 41 | # Setup 42 | real_data, synthetic_data, metadata = load_demo(modality='single_table') 43 | real_data['start_date'].iloc[0] = 0 44 | real_data['employability_perc'].iloc[2] = 'a' 45 | real_data['salary'] = np.nan 46 | 47 | boundary_property = Boundary() 48 | 49 | # Run 50 | score = boundary_property.get_score(real_data, synthetic_data, metadata) 51 | 52 | # Assert 53 | expected_message_1 = ( 54 | "TypeError: '<=' not supported between instances of 'int' and 'Timestamp'" 55 | ) 56 | expected_message_2 = 'InvalidDataError: All NaN values in real data.' 57 | expected_message_3 = "TypeError: '<=' not supported between instances of 'float' and 'str'" 58 | 59 | details = boundary_property.details 60 | details_nan = details.loc[pd.isna(details['Score'])] 61 | column_names_nan = details_nan['Column'].tolist() 62 | error_messages = details_nan['Error'].tolist() 63 | assert column_names_nan == ['start_date', 'salary', 'employability_perc'] 64 | assert error_messages[0] == expected_message_1 65 | assert error_messages[1] == expected_message_2 66 | assert error_messages[2] == expected_message_3 67 | assert score == 1.0 68 | -------------------------------------------------------------------------------- /tests/integration/reports/single_table/_properties/test_structure.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | from sdmetrics.demos import load_demo 5 | from sdmetrics.reports.single_table._properties import Structure 6 | 7 | 8 | class TestStructure: 9 | def test_get_score(self): 10 | """Test the ``get_score`` method.""" 11 | # Setup 12 | real_data, synthetic_data, metadata = load_demo('single_table') 13 | 14 | # Run 15 | synthesis_property = Structure() 16 | score = synthesis_property.get_score(real_data, synthetic_data, metadata) 17 | 18 | # Assert 19 | assert score == 1.0 20 | 21 | expected_details = pd.DataFrame( 22 | { 23 | 'Metric': 'TableStructure', 24 | 'Score': 1.0, 25 | }, 26 | index=[0], 27 | ) 28 | 29 | pd.testing.assert_frame_equal(synthesis_property.details, expected_details) 30 | 31 | def test_get_score_error(self): 32 | """Test the ``get_score`` method with an error. 33 | Give an empty synthetic data to get an error. 
34 | """ 35 | # Setup 36 | real_data, _, metadata = load_demo('single_table') 37 | 38 | # Run 39 | synthesis_property = Structure() 40 | score = synthesis_property.get_score(real_data.iloc[:20], [], metadata) 41 | 42 | # Assert 43 | assert pd.isna(score) 44 | 45 | expected_details = pd.DataFrame( 46 | { 47 | 'Metric': 'TableStructure', 48 | 'Score': np.nan, 49 | 'Error': "AttributeError: 'list' object has no attribute 'columns'", 50 | }, 51 | index=[0], 52 | ) 53 | 54 | pd.testing.assert_frame_equal(synthesis_property.details, expected_details) 55 | -------------------------------------------------------------------------------- /tests/integration/reports/single_table/_properties/test_synthesis.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | from sdmetrics.demos import load_demo 5 | from sdmetrics.reports.single_table._properties import Synthesis 6 | 7 | 8 | class TestSynthesis: 9 | def test_get_score(self): 10 | """Test the ``get_score`` method.""" 11 | # Setup 12 | real_data, _, metadata = load_demo('single_table') 13 | 14 | # Run 15 | synthesis_property = Synthesis() 16 | score = synthesis_property.get_score(real_data.iloc[:20], real_data.iloc[10:30], metadata) 17 | 18 | # Assert 19 | assert score == 0.5 20 | 21 | def test_get_score_error(self): 22 | """Test the ``get_score`` method with an error. 23 | 24 | Give an empty synthetic data to get an error. 25 | """ 26 | # Setup 27 | real_data, _, metadata = load_demo('single_table') 28 | 29 | # Run 30 | synthesis_property = Synthesis() 31 | score = synthesis_property.get_score(real_data.iloc[:20], [], metadata) 32 | 33 | # Assert 34 | assert pd.isna(score) 35 | 36 | expected_details = pd.DataFrame( 37 | { 38 | 'Metric': 'NewRowSynthesis', 39 | 'Score': np.nan, 40 | 'Num Matched Rows': np.nan, 41 | 'Num New Rows': np.nan, 42 | 'Error': "AttributeError: 'list' object has no attribute 'columns'", 43 | }, 44 | index=[0], 45 | ) 46 | 47 | pd.testing.assert_frame_equal(synthesis_property.details, expected_details) 48 | -------------------------------------------------------------------------------- /tests/integration/single_column/__init__.py: -------------------------------------------------------------------------------- 1 | """SDMetrics integration testing for the single_column module.""" 2 | -------------------------------------------------------------------------------- /tests/integration/single_column/statistical/__init__.py: -------------------------------------------------------------------------------- 1 | """SDMetrics integration testing for the single_column module.""" 2 | -------------------------------------------------------------------------------- /tests/integration/single_column/statistical/test_cstest.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import pytest 4 | 5 | from sdmetrics.single_column.statistical.cstest import CSTest 6 | 7 | 8 | @pytest.mark.parametrize('array_like', [np.array, pd.Series]) 9 | def test_max(array_like): 10 | data = array_like(['a', 'b', 'b', 'c', 'c', 'c'] * 100) 11 | output = CSTest.compute(data, data) 12 | normalized = CSTest.normalize(output) 13 | 14 | assert output == 1 15 | assert normalized == 1 16 | 17 | 18 | @pytest.mark.parametrize('array_like', [np.array, pd.Series]) 19 | def test_min(array_like): 20 | real = array_like(['a', 'b', 'b', 'c', 'c', 'c'] * 100) 21 | synth = array_like(['d', 'e', 'e', 'f', 'f', 'f'] * 
100) 22 | output = CSTest.compute(real, synth) 23 | normalized = CSTest.normalize(output) 24 | 25 | assert output == 0 26 | assert normalized == 0 27 | 28 | 29 | @pytest.mark.parametrize('array_like', [np.array, pd.Series]) 30 | def test_good(array_like): 31 | real = array_like(['a', 'b', 'b', 'c', 'c', 'c'] * 100) 32 | synth = array_like(['a', 'b', 'b', 'b', 'c', 'c'] * 100) 33 | output = CSTest.compute(real, synth) 34 | normalized = CSTest.normalize(output) 35 | 36 | assert 0.5 < output <= 1.0 37 | assert 0.5 < normalized <= 1.0 38 | 39 | 40 | @pytest.mark.parametrize('array_like', [np.array, pd.Series]) 41 | def test_bad(array_like): 42 | real = array_like(['a', 'b', 'b', 'c', 'c', 'c'] * 100) 43 | synth = array_like(['a', 'a', 'a', 'a', 'b', 'c'] * 100) 44 | output = CSTest.compute(real, synth) 45 | normalized = CSTest.normalize(output) 46 | 47 | assert 0.0 <= output < 0.5 48 | assert 0.0 <= normalized < 0.5 49 | -------------------------------------------------------------------------------- /tests/integration/single_column/statistical/test_kscomplement.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import pytest 4 | 5 | from sdmetrics.single_column.statistical.kscomplement import KSComplement 6 | 7 | 8 | @pytest.mark.parametrize('array_like', [np.array, pd.Series]) 9 | def test_max(array_like): 10 | data = array_like(np.random.normal(size=1000)) 11 | output = KSComplement.compute(data, data) 12 | normalized = KSComplement.normalize(output) 13 | 14 | assert output == 1 15 | assert normalized == 1 16 | 17 | 18 | @pytest.mark.parametrize('array_like', [np.array, pd.Series]) 19 | def test_min(array_like): 20 | real = array_like(np.random.normal(size=1000)) 21 | synth = array_like(np.random.normal(loc=1000, scale=10, size=1000)) 22 | output = KSComplement.compute(real, synth) 23 | normalized = KSComplement.normalize(output) 24 | 25 | assert output == 0 26 | assert normalized == 0 27 | 28 | 29 | @pytest.mark.parametrize('array_like', [np.array, pd.Series]) 30 | def test_good(array_like): 31 | real = array_like(np.random.normal(size=1000)) 32 | synth = array_like(np.random.normal(loc=0.1, size=1000)) 33 | output = KSComplement.compute(real, synth) 34 | normalized = KSComplement.normalize(output) 35 | 36 | assert 0.5 < output <= 1.0 37 | assert 0.5 < normalized <= 1.0 38 | 39 | 40 | @pytest.mark.parametrize('array_like', [np.array, pd.Series]) 41 | def test_bad(array_like): 42 | real = array_like(np.random.normal(size=1000)) 43 | synth = array_like(np.random.normal(loc=3, scale=3, size=1000)) 44 | output = KSComplement.compute(real, synth) 45 | normalized = KSComplement.normalize(output) 46 | 47 | assert 0.0 <= output < 0.5 48 | assert 0.0 <= normalized < 0.5 49 | 50 | 51 | def test_one_float_value(): 52 | """Test KSComplement.compute when both data have the same float values GH#652.""" 53 | # Setup 54 | real = pd.Series([0.3 - 0.2]) 55 | synth = pd.Series([0.2 - 0.1]) 56 | 57 | # Run 58 | output = KSComplement.compute(real, synth) 59 | 60 | # Assert 61 | assert output == 1 62 | -------------------------------------------------------------------------------- /tests/integration/single_table/__init__.py: -------------------------------------------------------------------------------- 1 | """SDMetrics integration testing for the single_table module.""" 2 | -------------------------------------------------------------------------------- /tests/integration/single_table/data_augmentation/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdv-dev/SDMetrics/d52733646855d9d4606f95235f5c65e10afdc439/tests/integration/single_table/data_augmentation/__init__.py -------------------------------------------------------------------------------- /tests/integration/single_table/efficacy/__init__.py: -------------------------------------------------------------------------------- 1 | """SDMetrics integration testing for the single_table efficacy module.""" 2 | -------------------------------------------------------------------------------- /tests/integration/single_table/efficacy/test_binary.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import pytest 4 | from sklearn.datasets import load_breast_cancer 5 | 6 | from sdmetrics.single_table.efficacy.binary import ( 7 | BinaryAdaBoostClassifier, 8 | BinaryDecisionTreeClassifier, 9 | BinaryLogisticRegression, 10 | BinaryMLPClassifier, 11 | ) 12 | 13 | METRICS = [ 14 | BinaryAdaBoostClassifier, 15 | BinaryDecisionTreeClassifier, 16 | BinaryLogisticRegression, 17 | BinaryMLPClassifier, 18 | ] 19 | 20 | 21 | @pytest.fixture 22 | def test_data(): 23 | return load_breast_cancer(as_frame=True).frame 24 | 25 | 26 | @pytest.fixture 27 | def good_data(): 28 | breast_cancer = load_breast_cancer(as_frame=True) 29 | data = breast_cancer.data 30 | stds = data.std(axis=0) * 2.5 31 | columns = len(data.columns) 32 | rows = len(data) 33 | zeros = np.zeros(columns) 34 | noise = np.random.normal(loc=zeros, scale=stds, size=(rows, columns)) 35 | good = data + noise 36 | good['target'] = breast_cancer.target 37 | return good 38 | 39 | 40 | @pytest.fixture 41 | def bad_data(): 42 | breast_cancer = load_breast_cancer(as_frame=True) 43 | data = breast_cancer.data 44 | stds = data.std(axis=0) 45 | mus = data.mean(axis=0) 46 | columns = len(data.columns) 47 | rows = len(data) 48 | bad = np.random.normal(loc=mus, scale=stds, size=(rows, columns)) 49 | bad = pd.DataFrame(bad, columns=data.columns) 50 | bad['target'] = breast_cancer.target 51 | 52 | return bad 53 | 54 | 55 | @pytest.mark.parametrize('metric', METRICS) 56 | def test_rank(metric, test_data, bad_data, good_data): 57 | bad = metric.compute(test_data, bad_data, target='target') 58 | good = metric.compute(test_data, good_data, target='target') 59 | test = metric.compute(test_data, test_data, target='target') 60 | 61 | normalized_bad = metric.normalize(bad) 62 | normalized_good = metric.normalize(good) 63 | normalized_test = metric.normalize(test) 64 | 65 | assert metric.min_value <= bad < good <= test <= metric.max_value 66 | assert 0.0 <= normalized_bad < normalized_good <= normalized_test <= 1.0 67 | 68 | 69 | @pytest.mark.parametrize('metric', METRICS) 70 | def test_rank_object(metric, test_data, bad_data, good_data): 71 | bad = metric.compute(test_data, bad_data, target='target') 72 | good = metric.compute(test_data, good_data, target='target') 73 | test = metric.compute(test_data, test_data, target='target') 74 | 75 | normalized_bad = metric.normalize(bad) 76 | normalized_good = metric.normalize(good) 77 | normalized_test = metric.normalize(test) 78 | 79 | assert metric.min_value <= bad < good <= test <= metric.max_value 80 | assert 0.0 <= normalized_bad < normalized_good <= normalized_test <= 1.0 81 | -------------------------------------------------------------------------------- /tests/integration/single_table/efficacy/test_detection.py: 
-------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from sdmetrics import load_demo 4 | from sdmetrics.single_table.detection import LogisticDetection, SVCDetection 5 | 6 | METRICS = [LogisticDetection, SVCDetection] 7 | 8 | 9 | @pytest.mark.parametrize('metric', METRICS) 10 | def test_primary_key(metric): 11 | """Test that primary keys don't affect detection metric.""" 12 | real_data_with_primary_key, synthetic_data_with_primary_key, metadata = load_demo( 13 | modality='single_table' 14 | ) 15 | 16 | real_data_sin_primary_key = real_data_with_primary_key.drop(metadata['primary_key'], axis=1) 17 | synthetic_data_sin_primary_key = synthetic_data_with_primary_key.drop( 18 | metadata['primary_key'], axis=1 19 | ) 20 | 21 | test_with_primary_key = metric.compute( 22 | real_data_with_primary_key, synthetic_data_with_primary_key, metadata 23 | ) 24 | test_sin_primary_key = metric.compute(real_data_sin_primary_key, synthetic_data_sin_primary_key) 25 | 26 | normalized_with_primary_key = metric.normalize(test_with_primary_key) 27 | normalized_sin_primary_key = metric.normalize(test_sin_primary_key) 28 | 29 | # Approximately equal because detection metrics vary when receiving the same data. 30 | assert pytest.approx(normalized_with_primary_key, abs=0.06) == normalized_sin_primary_key 31 | -------------------------------------------------------------------------------- /tests/integration/single_table/efficacy/test_multiclass.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import pytest 4 | from sklearn.datasets import load_wine 5 | 6 | from sdmetrics.single_table.efficacy.multiclass import ( 7 | MulticlassDecisionTreeClassifier, 8 | MulticlassMLPClassifier, 9 | ) 10 | 11 | METRICS = [ 12 | MulticlassDecisionTreeClassifier, 13 | MulticlassMLPClassifier, 14 | ] 15 | 16 | 17 | @pytest.fixture 18 | def test_data(): 19 | return load_wine(as_frame=True).frame 20 | 21 | 22 | @pytest.fixture 23 | def good_data(): 24 | wine = load_wine(as_frame=True) 25 | data = wine.data 26 | stds = data.std(axis=0) * 2.5 27 | columns = len(data.columns) 28 | rows = len(data) 29 | zeros = np.zeros(columns) 30 | noise = np.random.normal(loc=zeros, scale=stds, size=(rows, columns)) 31 | good = data + noise 32 | good['target'] = wine.target 33 | return good 34 | 35 | 36 | @pytest.fixture 37 | def bad_data(): 38 | wine = load_wine(as_frame=True) 39 | data = wine.data 40 | stds = data.std(axis=0) 41 | mus = data.mean(axis=0) 42 | columns = len(data.columns) 43 | rows = len(data) 44 | bad = np.random.normal(loc=mus, scale=stds, size=(rows, columns)) 45 | bad = pd.DataFrame(bad, columns=data.columns) 46 | bad['target'] = wine.target 47 | 48 | return bad 49 | 50 | 51 | @pytest.mark.parametrize('metric', METRICS) 52 | def test_rank(metric, test_data, good_data, bad_data): 53 | bad = metric.compute(test_data, bad_data, target='target') 54 | good = metric.compute(test_data, good_data, target='target') 55 | test = metric.compute(test_data, test_data, target='target') 56 | 57 | normalized_bad = metric.normalize(bad) 58 | normalized_good = metric.normalize(good) 59 | normalized_test = metric.normalize(test) 60 | 61 | assert metric.min_value <= bad < good < test <= metric.max_value 62 | assert 0.0 <= normalized_bad < normalized_good <= normalized_test <= 1.0 63 | -------------------------------------------------------------------------------- 
/tests/integration/single_table/efficacy/test_regression.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import pytest 4 | from sklearn.datasets import load_diabetes 5 | 6 | from sdmetrics.single_table.efficacy.regression import LinearRegression, MLPRegressor 7 | 8 | METRICS = [ 9 | LinearRegression, 10 | MLPRegressor, 11 | ] 12 | 13 | 14 | @pytest.fixture 15 | def test_data(): 16 | boston = load_diabetes() 17 | data = pd.DataFrame(boston.data, columns=boston.feature_names) 18 | data['target'] = boston.target 19 | return data 20 | 21 | 22 | @pytest.fixture 23 | def good_data(): 24 | boston = load_diabetes() 25 | data = pd.DataFrame(boston.data, columns=boston.feature_names) 26 | 27 | columns = len(data.columns) 28 | rows = len(data) 29 | data = boston.data 30 | 31 | stds = data.std(axis=0) / 4 32 | zeros = np.zeros(columns) 33 | noise = np.random.normal(loc=zeros, scale=stds, size=(rows, columns)) 34 | good = data + noise * 4 35 | 36 | good = pd.DataFrame(good, columns=boston.feature_names) 37 | good['target'] = boston.target 38 | return good 39 | 40 | 41 | @pytest.fixture 42 | def bad_data(): 43 | boston = load_diabetes() 44 | data = pd.DataFrame(boston.data, columns=boston.feature_names) 45 | 46 | stds = data.std(axis=0) 47 | mus = data.mean(axis=0) 48 | columns = len(data.columns) 49 | rows = len(data) 50 | bad = np.random.normal(loc=mus, scale=stds, size=(rows, columns)) 51 | bad = pd.DataFrame(bad, columns=data.columns) 52 | 53 | bad['target'] = boston.target 54 | 55 | return bad 56 | 57 | 58 | @pytest.mark.parametrize('metric', METRICS) 59 | def test_rank(metric, test_data, good_data, bad_data): 60 | bad = metric.compute(test_data, bad_data, target='target') 61 | good = metric.compute(test_data, good_data, target='target') 62 | test = metric.compute(test_data, test_data, target='target') 63 | 64 | normalized_bad = metric.normalize(bad) 65 | normalized_good = metric.normalize(good) 66 | normalized_test = metric.normalize(test) 67 | 68 | assert metric.min_value <= bad < good < test <= metric.max_value 69 | assert 0.0 <= normalized_bad <= normalized_good <= normalized_test <= 1.0 70 | -------------------------------------------------------------------------------- /tests/integration/single_table/privacy/__init__.py: -------------------------------------------------------------------------------- 1 | """SDMetrics integration testing for the single_table privacy module.""" 2 | -------------------------------------------------------------------------------- /tests/integration/single_table/privacy/test_dcr_utils.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | 3 | import pandas as pd 4 | 5 | from sdmetrics.demos import load_single_table_demo 6 | from sdmetrics.single_table.privacy.dcr_utils import ( 7 | calculate_dcr, 8 | ) 9 | 10 | 11 | def test_calculate_dcr(): 12 | """Test calculate_dcr with numerical values.""" 13 | # Setup 14 | real_data_num = [0, 5, 8, 9, 10] 15 | synthetic_data_num_diff = [3, 5] 16 | 17 | real_df = pd.DataFrame({'num_col': real_data_num}) 18 | synthetic_df_diff = pd.DataFrame({ 19 | 'num_col': synthetic_data_num_diff, 20 | }) 21 | metadata = {'columns': {'num_col': {'sdtype': 'numerical'}}} 22 | 23 | # Run 24 | result = calculate_dcr(reference_dataset=real_df, dataset=synthetic_df_diff, metadata=metadata) 25 | 26 | # Assert 27 | expected_result = pd.Series([0.2, 0.0]) 28 | 
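# Worked arithmetic for the expected values above: the real column spans 0 to 10, so
# distances are scaled by that range. The closest real value to 3 is 5, giving
# |3 - 5| / 10 = 0.2, while 5 matches a real record exactly, giving 0.0.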
pd.testing.assert_series_equal(result, expected_result) 29 | 30 | 31 | def test_calculate_dcr_with_zero_col_range(): 32 | """Test calculate_dcr with a range of zero.""" 33 | # Setup 34 | real_data_num = [5.0] 35 | real_data_date = [datetime(2025, 1, 5)] 36 | synthetic_data_num_diff = [1, 2, 3, 5, 5] 37 | synthetic_data_date_diff = [ 38 | datetime(2025, 1, 1), 39 | datetime(2025, 1, 2), 40 | datetime(2025, 1, 3), 41 | datetime(2025, 1, 4), 42 | datetime(2025, 1, 5), 43 | ] 44 | 45 | real_df = pd.DataFrame({'num_col': real_data_num, 'date_col': real_data_date}) 46 | synthetic_df_diff = pd.DataFrame({ 47 | 'num_col': synthetic_data_num_diff, 48 | 'date_col': synthetic_data_date_diff, 49 | }) 50 | metadata = {'columns': {'num_col': {'sdtype': 'numerical'}, 'date_col': {'sdtype': 'datetime'}}} 51 | 52 | # Run 53 | result = calculate_dcr(reference_dataset=real_df, dataset=synthetic_df_diff, metadata=metadata) 54 | 55 | # Assert 56 | expected_result = pd.Series([1.0, 1.0, 1.0, 0.5, 0.0]) 57 | pd.testing.assert_series_equal(result, expected_result) 58 | 59 | 60 | def test_calculate_dcr_chunked(): 61 | """Test calculate_dcr with chunking calculations.""" 62 | # Setup 63 | real_data, synthetic_data, metadata = load_single_table_demo() 64 | 65 | # Run 66 | result = calculate_dcr( 67 | reference_dataset=real_data, 68 | dataset=synthetic_data, 69 | metadata=metadata, 70 | chunk_size=1000, 71 | ) 72 | chunked_result = calculate_dcr( 73 | reference_dataset=real_data, 74 | dataset=synthetic_data, 75 | metadata=metadata, 76 | chunk_size=50, 77 | ) 78 | 79 | # Assert 80 | assert len(result) == len(real_data) 81 | pd.testing.assert_series_equal(result, chunked_result) 82 | -------------------------------------------------------------------------------- /tests/integration/single_table/test_gaussian_mixture.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import pytest 4 | 5 | from sdmetrics.single_table.gaussian_mixture import GMLogLikelihood 6 | 7 | 8 | @pytest.fixture 9 | def ones(): 10 | return pd.DataFrame({ 11 | 'a': [1] * 300, 12 | 'b': [True] * 300, 13 | 'c': [1.0] * 300, 14 | 'd': [True] * 300, 15 | }) 16 | 17 | 18 | @pytest.fixture 19 | def zeros(): 20 | return pd.DataFrame({ 21 | 'a': [0] * 300, 22 | 'b': [False] * 300, 23 | 'c': [0.0] * 300, 24 | 'd': [False] * 300, 25 | }) 26 | 27 | 28 | @pytest.fixture 29 | def real_data(): 30 | return pd.DataFrame({ 31 | 'a': np.random.normal(size=1800), 32 | 'b': np.random.randint(0, 10, size=1800), 33 | 'c': ['a', 'b', 'b', 'c', 'c', 'c'] * 300, 34 | 'd': [True, True, True, True, True, False] * 300, 35 | }) 36 | 37 | 38 | @pytest.fixture 39 | def good_data(): 40 | return pd.DataFrame({ 41 | 'a': np.random.normal(loc=0.01, size=1800), 42 | 'b': np.random.randint(0, 10, size=1800), 43 | 'c': ['a', 'b', 'b', 'b', 'c', 'c'] * 300, 44 | 'd': [True, True, True, True, False, False] * 300, 45 | }) 46 | 47 | 48 | @pytest.fixture 49 | def bad_data(): 50 | return pd.DataFrame({ 51 | 'a': np.random.normal(loc=5, scale=3, size=1800), 52 | 'b': np.random.randint(5, 15, size=1800), 53 | 'c': ['a', 'a', 'a', 'a', 'b', 'b'] * 300, 54 | 'd': [True, False, False, False, False, False] * 300, 55 | }) 56 | 57 | 58 | def test_rank(ones, zeros, real_data, good_data, bad_data): 59 | worst = GMLogLikelihood.compute(ones, zeros) 60 | normalized_worst = GMLogLikelihood.normalize(worst) 61 | best = GMLogLikelihood.compute(ones, ones) 62 | normalized_best = GMLogLikelihood.normalize(best) 63 | 64 | 
assert GMLogLikelihood.min_value <= worst < best <= GMLogLikelihood.max_value 65 | assert 0.0 <= normalized_worst < normalized_best <= 1.0 66 | -------------------------------------------------------------------------------- /tests/integration/test_base.py: -------------------------------------------------------------------------------- 1 | from sdmetrics.single_table import SingleTableMetric 2 | 3 | SINGLE_TABLE_METRICS = [ 4 | 'BNLikelihood', 5 | 'BNLogLikelihood', 6 | 'LogisticDetection', 7 | 'SVCDetection', 8 | 'BinaryDecisionTreeClassifier', 9 | 'BinaryAdaBoostClassifier', 10 | 'BinaryLogisticRegression', 11 | 'BinaryMLPClassifier', 12 | 'MulticlassDecisionTreeClassifier', 13 | 'MulticlassMLPClassifier', 14 | 'LinearRegression', 15 | 'MLPRegressor', 16 | 'GMLogLikelihood', 17 | 'CSTest', 18 | 'KSComplement', 19 | 'StatisticSimilarity', 20 | 'BoundaryAdherence', 21 | 'MissingValueSimilarity', 22 | 'CategoryCoverage', 23 | 'TVComplement', 24 | 'RangeCoverage', 25 | 'CategoricalCAP', 26 | 'CategoricalZeroCAP', 27 | 'CategoricalGeneralizedCAP', 28 | 'CategoricalNB', 29 | 'CategoricalKNN', 30 | 'CategoricalRF', 31 | 'CategoricalSVM', 32 | 'CategoricalEnsemble', 33 | 'NumericalLR', 34 | 'NumericalMLP', 35 | 'NumericalSVR', 36 | 'NumericalRadiusNearestNeighbor', 37 | 'ContinuousKLDivergence', 38 | 'DiscreteKLDivergence', 39 | 'ContingencySimilarity', 40 | 'CorrelationSimilarity', 41 | 'NewRowSynthesis', 42 | ] 43 | 44 | 45 | def test_get_single_table_subclasses(): 46 | single_table_metrics = SingleTableMetric.get_subclasses() 47 | for single_table_metric in SINGLE_TABLE_METRICS: 48 | assert single_table_metric in single_table_metrics 49 | -------------------------------------------------------------------------------- /tests/integration/test_property.py: -------------------------------------------------------------------------------- 1 | """Tests that are common to all properties.""" 2 | 3 | import numpy as np 4 | import pytest 5 | 6 | from sdmetrics.demos import load_demo 7 | from sdmetrics.reports.multi_table import _properties as multi_table_properties 8 | from sdmetrics.reports.single_table import _properties as single_table_properties 9 | 10 | REAL_DATA_ST, SYNTHETIC_DATA_ST, METADATA_ST = load_demo(modality='single_table') 11 | REAL_DATA_MT, SYNTHETIC_DATA_MT, METADATA_MT = load_demo(modality='multi_table') 12 | SINGLE_TABLE_PROPERTIES = [ 13 | property 14 | for property_name, property in vars(single_table_properties).items() 15 | if property_name != 'BaseSingleTableProperty' and isinstance(property, type) 16 | ] 17 | MULTI_TABLE_PROPERTIES = [ 18 | property 19 | for property_name, property in vars(multi_table_properties).items() 20 | if property_name != 'BaseMultiTableProperty' and isinstance(property, type) 21 | ] 22 | 23 | 24 | @pytest.mark.parametrize('property', SINGLE_TABLE_PROPERTIES) 25 | def test_shuffling_data_single_table(property): 26 | """Test the property score is the same when shuffling the data for single-table.""" 27 | # Setup 28 | property_instance = property() 29 | 30 | # Run 31 | score = property_instance.get_score(REAL_DATA_ST, SYNTHETIC_DATA_ST, METADATA_ST) 32 | score_shuffled = property_instance.get_score( 33 | REAL_DATA_ST.sample(frac=1), SYNTHETIC_DATA_ST.sample(frac=1), METADATA_ST 34 | ) 35 | 36 | # Assert 37 | assert score_shuffled == score 38 | 39 | 40 | @pytest.mark.parametrize('property', MULTI_TABLE_PROPERTIES) 41 | def test_shuffling_data_multi_table(property): 42 | """Test the property score is the same when shuffling the data for multi-table.""" 43 | 
# Setup 44 | property_instance = property() 45 | real_data_shuffled = { 46 | table_name: table.sample(frac=1) for table_name, table in REAL_DATA_MT.items() 47 | } 48 | synthetic_data_shuffled = { 49 | table_name: SYNTHETIC_DATA_MT[table_name].sample(frac=1) for table_name in SYNTHETIC_DATA_MT 50 | } 51 | 52 | # Run 53 | score = property_instance.get_score(REAL_DATA_MT, SYNTHETIC_DATA_MT, METADATA_MT) 54 | score_shuffled = property_instance.get_score( 55 | real_data_shuffled, synthetic_data_shuffled, METADATA_MT 56 | ) 57 | 58 | # Assert 59 | assert np.isclose(score, score_shuffled, rtol=1e-12) 60 | -------------------------------------------------------------------------------- /tests/integration/timeseries/__init__.py: -------------------------------------------------------------------------------- 1 | """SDMetrics integration testing for the timeseries module.""" 2 | -------------------------------------------------------------------------------- /tests/integration/timeseries/efficacy/__init__.py: -------------------------------------------------------------------------------- 1 | """SDMetrics integration testing for the timeseries efficacy module.""" 2 | -------------------------------------------------------------------------------- /tests/integration/timeseries/efficacy/test_classification.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from sdmetrics.demos import load_timeseries_demo 4 | from sdmetrics.timeseries.efficacy.classification import LSTMClassifierEfficacy 5 | 6 | METRICS = [ 7 | LSTMClassifierEfficacy, 8 | ] 9 | 10 | 11 | @pytest.mark.parametrize('metric', METRICS) 12 | def test_rank(metric): 13 | real_data, synthetic_data, metadata = load_timeseries_demo() 14 | 15 | real_score = metric.compute(real_data, real_data, metadata, target='region') 16 | synthetic_score = metric.compute(real_data, synthetic_data, metadata, target='region') 17 | 18 | normalized_real_score = metric.normalize(real_score) 19 | normalized_synthetic_score = metric.normalize(synthetic_score) 20 | 21 | assert metric.min_value <= synthetic_score <= real_score <= metric.max_value 22 | assert 0.0 <= normalized_synthetic_score <= normalized_real_score <= 1.0 23 | -------------------------------------------------------------------------------- /tests/test_tasks.py: -------------------------------------------------------------------------------- 1 | """Tests for the ``tasks.py`` file.""" 2 | 3 | from tasks import _get_minimum_versions 4 | 5 | 6 | def test_get_minimum_versions(): 7 | """Test the ``_get_minimum_versions`` function. 8 | 9 | The function should return the minimum versions of the dependencies for the given Python version. 10 | If a library is linked to a URL, the minimum version should be the URL. 
11 | """ 12 | # Setup 13 | dependencies = [ 14 | "numpy>=1.20.0,<2;python_version<'3.10'", 15 | "numpy>=1.23.3,<2;python_version>='3.10'", 16 | "pandas>=1.2.0,<2;python_version<'3.10'", 17 | "pandas>=1.3.0,<2;python_version>='3.10'", 18 | 'humanfriendly>=8.2,<11', 19 | 'pandas @ git+https://github.com/pandas-dev/pandas.git@master', 20 | ] 21 | 22 | # Run 23 | minimum_versions_39 = _get_minimum_versions(dependencies, '3.9') 24 | minimum_versions_310 = _get_minimum_versions(dependencies, '3.10') 25 | 26 | # Assert 27 | expected_versions_39 = [ 28 | 'numpy==1.20.0', 29 | 'git+https://github.com/pandas-dev/pandas.git@master#egg=pandas', 30 | 'humanfriendly==8.2', 31 | ] 32 | expected_versions_310 = [ 33 | 'numpy==1.23.3', 34 | 'git+https://github.com/pandas-dev/pandas.git@master#egg=pandas', 35 | 'humanfriendly==8.2', 36 | ] 37 | 38 | assert minimum_versions_39 == expected_versions_39 39 | assert minimum_versions_310 == expected_versions_310 40 | -------------------------------------------------------------------------------- /tests/unit/__init__.py: -------------------------------------------------------------------------------- 1 | """SDMetrics unit testing package.""" 2 | -------------------------------------------------------------------------------- /tests/unit/column_pairs/__init__.py: -------------------------------------------------------------------------------- 1 | """Unit tests for the column pairs module.""" 2 | -------------------------------------------------------------------------------- /tests/unit/column_pairs/statistical/__init__.py: -------------------------------------------------------------------------------- 1 | """Unit tests for the column pairs statistical metrics.""" 2 | -------------------------------------------------------------------------------- /tests/unit/column_pairs/statistical/test_cardinality_boundary_adherence.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | from sdmetrics.column_pairs.statistical import CardinalityBoundaryAdherence 4 | 5 | 6 | class TestCardinalityBoundaryAdherence: 7 | def test_compute_breakdown(self): 8 | """Test the ``compute_breakdown`` method.""" 9 | # Setup 10 | real_parent_keys = pd.Series([1, 2, 3, 4, 5]) 11 | real_foreign_keys = pd.Series([1, 1, 2, 3, 4, 5, 5]) 12 | real_data = (real_parent_keys, real_foreign_keys) 13 | synthetic_parent_keys = pd.Series([1, 2, 3, 4, 5]) 14 | synthetic_foreign_keys = pd.Series([2, 2, 2, 3, 4, 5]) 15 | synthetic_data = (synthetic_parent_keys, synthetic_foreign_keys) 16 | 17 | metric = CardinalityBoundaryAdherence() 18 | 19 | # Run 20 | result = metric.compute_breakdown(real_data, synthetic_data) 21 | 22 | # Assert 23 | assert result == {'score': 0.6} 24 | 25 | def test_compute(self): 26 | """Test the ``compute`` method.""" 27 | # Setup 28 | real_parent_keys = pd.Series([1, 2, 3, 4, 5]) 29 | real_foreign_keys = pd.Series([1, 1, 2, 3, 4, 5, 5]) 30 | real_data = (real_parent_keys, real_foreign_keys) 31 | synthetic_parent_keys = pd.Series([1, 2, 3, 4, 5]) 32 | synthetic_foreign_keys = pd.Series([2, 2, 2, 3, 4, 5]) 33 | synthetic_data = (synthetic_parent_keys, synthetic_foreign_keys) 34 | 35 | metric = CardinalityBoundaryAdherence() 36 | 37 | # Run 38 | result = metric.compute(real_data, synthetic_data) 39 | 40 | # Assert 41 | assert result == 0.6 42 | -------------------------------------------------------------------------------- /tests/unit/column_pairs/test_base.py: 
-------------------------------------------------------------------------------- 1 | from unittest.mock import Mock, patch 2 | 3 | from sdmetrics.column_pairs.base import ColumnPairsMetric 4 | 5 | 6 | class TestColumnPairsMetric: 7 | def test_compute_breakdown(self): 8 | """Test the ``compute_breakdown`` method. 9 | 10 | Expect a breakdown dictionary is returned that contains the score. 11 | 12 | Setup: 13 | - Mock the ``compute`` method to return a fake score. 14 | 15 | Input: 16 | - Real data. 17 | - Synthetic data. 18 | 19 | Output: 20 | - The evaluated metric. 21 | """ 22 | # Setup 23 | metric = ColumnPairsMetric() 24 | test_metric_score = 0.5 25 | 26 | # Run 27 | with patch.object(ColumnPairsMetric, 'compute', return_value=test_metric_score): 28 | result = metric.compute_breakdown(Mock(), Mock()) 29 | 30 | # Assert 31 | assert result == {'score': test_metric_score} 32 | -------------------------------------------------------------------------------- /tests/unit/multi_table/__init__.py: -------------------------------------------------------------------------------- 1 | """SDMetrics unit testing for the multi_table module.""" 2 | -------------------------------------------------------------------------------- /tests/unit/multi_table/statistical/__init__.py: -------------------------------------------------------------------------------- 1 | """SDMetrics unit testing for the multi_table statistical module.""" 2 | -------------------------------------------------------------------------------- /tests/unit/reports/__init__.py: -------------------------------------------------------------------------------- 1 | """SDMetrics unit testing for the reports module.""" 2 | -------------------------------------------------------------------------------- /tests/unit/reports/multi_table/__init__.py: -------------------------------------------------------------------------------- 1 | """SDMetrics unit testing for the reports multi_table module.""" 2 | -------------------------------------------------------------------------------- /tests/unit/reports/multi_table/_properties/__init__.py: -------------------------------------------------------------------------------- 1 | """SDMetrics unit testing for the reports multi_table _properties module.""" 2 | -------------------------------------------------------------------------------- /tests/unit/reports/multi_table/_properties/test_boundary.py: -------------------------------------------------------------------------------- 1 | """Test Boundary multi-table class.""" 2 | 3 | from sdmetrics.reports.multi_table._properties import Boundary 4 | from sdmetrics.reports.single_table._properties import Boundary as SingleTableBoundary 5 | 6 | 7 | def test__init__(): 8 | """Test the ``__init__`` method.""" 9 | # Setup 10 | boundary = Boundary() 11 | 12 | # Assert 13 | assert boundary._properties == {} 14 | assert boundary._single_table_property == SingleTableBoundary 15 | assert boundary._num_iteration_case == 'column' 16 | -------------------------------------------------------------------------------- /tests/unit/reports/multi_table/_properties/test_column_pair_trends.py: -------------------------------------------------------------------------------- 1 | """Test ColumnPairTrends multi-table class.""" 2 | 3 | from sdmetrics.reports.multi_table._properties import ColumnPairTrends 4 | from sdmetrics.reports.single_table._properties import ( 5 | ColumnPairTrends as SingleTableColumnPairTrends, 6 | ) 7 | 8 | 9 | def test__init__(): 10 | """Test the ``__init__`` 
method.""" 11 | # Setup 12 | column_pair_trends = ColumnPairTrends() 13 | 14 | # Assert 15 | assert column_pair_trends._properties == {} 16 | assert column_pair_trends._single_table_property == SingleTableColumnPairTrends 17 | assert column_pair_trends._num_iteration_case == 'column_pair' 18 | -------------------------------------------------------------------------------- /tests/unit/reports/multi_table/_properties/test_column_shapes.py: -------------------------------------------------------------------------------- 1 | """Test ColumnShapes multi-table class.""" 2 | 3 | from sdmetrics.reports.multi_table._properties import ColumnShapes 4 | from sdmetrics.reports.single_table._properties import ColumnShapes as SingleTableColumnShapes 5 | 6 | 7 | def test__init__(): 8 | """Test the ``__init__`` method.""" 9 | # Setup 10 | column_shapes = ColumnShapes() 11 | 12 | # Assert 13 | assert column_shapes._properties == {} 14 | assert column_shapes._single_table_property == SingleTableColumnShapes 15 | assert column_shapes._num_iteration_case == 'column' 16 | -------------------------------------------------------------------------------- /tests/unit/reports/multi_table/_properties/test_coverage.py: -------------------------------------------------------------------------------- 1 | """Test Coverage multi-table class.""" 2 | 3 | from sdmetrics.reports.multi_table._properties import Coverage 4 | from sdmetrics.reports.single_table._properties import Coverage as SingleTableCoverage 5 | 6 | 7 | def test__init__(): 8 | """Test the ``__init__`` method.""" 9 | # Setup 10 | coverage = Coverage() 11 | 12 | # Assert 13 | assert coverage._properties == {} 14 | assert coverage._single_table_property == SingleTableCoverage 15 | assert coverage._num_iteration_case == 'column' 16 | -------------------------------------------------------------------------------- /tests/unit/reports/multi_table/_properties/test_synthesis.py: -------------------------------------------------------------------------------- 1 | """Test Synthesis multi-table class.""" 2 | 3 | from sdmetrics.reports.multi_table._properties import Synthesis 4 | from sdmetrics.reports.single_table._properties import Synthesis as SingleTableSynthesis 5 | 6 | 7 | def test__init__(): 8 | """Test the ``__init__`` method.""" 9 | # Setup 10 | synthesis = Synthesis() 11 | 12 | # Assert 13 | assert synthesis._properties == {} 14 | assert synthesis._single_table_property == SingleTableSynthesis 15 | assert synthesis._num_iteration_case == 'table' 16 | -------------------------------------------------------------------------------- /tests/unit/reports/multi_table/_properties/test_validity.py: -------------------------------------------------------------------------------- 1 | """Test Data Validity multi-table class.""" 2 | 3 | from sdmetrics.reports.multi_table._properties import DataValidity 4 | from sdmetrics.reports.single_table._properties import DataValidity as SingleTableDataValidity 5 | 6 | 7 | def test__init__(): 8 | """Test the ``__init__`` method.""" 9 | # Setup 10 | column_shapes = DataValidity() 11 | 12 | # Assert 13 | assert column_shapes._properties == {} 14 | assert column_shapes._single_table_property == SingleTableDataValidity 15 | assert column_shapes._num_iteration_case == 'column' 16 | -------------------------------------------------------------------------------- /tests/unit/reports/multi_table/test_diagnostic_report.py: -------------------------------------------------------------------------------- 1 | from sdmetrics.reports.multi_table 
import DiagnosticReport 2 | from sdmetrics.reports.multi_table._properties import DataValidity, RelationshipValidity, Structure 3 | 4 | 5 | class TestDiagnosticReport: 6 | def test___init__(self): 7 | """Test the ``__init__`` method.""" 8 | # Setup 9 | report = DiagnosticReport() 10 | 11 | # Assert 12 | assert report._overall_score is None 13 | assert report.is_generated is False 14 | assert report.table_names == [] 15 | assert isinstance(report._properties['Data Validity'], DataValidity) 16 | assert isinstance(report._properties['Data Structure'], Structure) 17 | assert isinstance(report._properties['Relationship Validity'], RelationshipValidity) 18 | -------------------------------------------------------------------------------- /tests/unit/reports/multi_table/test_quality_report.py: -------------------------------------------------------------------------------- 1 | from sdmetrics.reports.multi_table import QualityReport 2 | from sdmetrics.reports.multi_table._properties import ( 3 | Cardinality, 4 | ColumnPairTrends, 5 | ColumnShapes, 6 | InterTableTrends, 7 | ) 8 | 9 | 10 | class TestQualityReport: 11 | def test___init__(self): 12 | """Test the ``__init__`` method.""" 13 | # Setup 14 | report = QualityReport() 15 | 16 | # Assert 17 | assert report._overall_score is None 18 | assert report.is_generated is False 19 | assert report.table_names == [] 20 | assert isinstance(report._properties['Column Shapes'], ColumnShapes) 21 | assert isinstance(report._properties['Column Pair Trends'], ColumnPairTrends) 22 | assert isinstance(report._properties['Cardinality'], Cardinality) 23 | assert isinstance(report._properties['Intertable Trends'], InterTableTrends) 24 | -------------------------------------------------------------------------------- /tests/unit/reports/single_table/__init__.py: -------------------------------------------------------------------------------- 1 | """SDMetrics unit testing for the reports single_table module.""" 2 | -------------------------------------------------------------------------------- /tests/unit/reports/single_table/_properties/__init__.py: -------------------------------------------------------------------------------- 1 | """Test package for single table properties.""" 2 | -------------------------------------------------------------------------------- /tests/unit/reports/single_table/test_diagnostic_report.py: -------------------------------------------------------------------------------- 1 | from sdmetrics.reports.single_table import DiagnosticReport 2 | from sdmetrics.reports.single_table._properties import DataValidity, Structure 3 | 4 | 5 | class TestDiagnosticReport: 6 | def test___init__(self): 7 | """Test the ``__init__`` method.""" 8 | # Run 9 | report = DiagnosticReport() 10 | 11 | # Assert 12 | assert report._overall_score is None 13 | assert report.is_generated is False 14 | assert isinstance(report._properties['Data Validity'], DataValidity) 15 | assert isinstance(report._properties['Data Structure'], Structure) 16 | -------------------------------------------------------------------------------- /tests/unit/reports/single_table/test_quality_report.py: -------------------------------------------------------------------------------- 1 | from sdmetrics.reports.single_table import QualityReport 2 | from sdmetrics.reports.single_table._properties import ColumnPairTrends, ColumnShapes 3 | 4 | 5 | class TestQualityReport: 6 | def test___init__(self): 7 | """Test the ``__init__`` method.""" 8 | # Run 9 | report = QualityReport() 10 | 11 | 
# Assert 12 | assert report._overall_score is None 13 | assert not report.is_generated 14 | assert isinstance(report._properties['Column Shapes'], ColumnShapes) 15 | assert isinstance(report._properties['Column Pair Trends'], ColumnPairTrends) 16 | -------------------------------------------------------------------------------- /tests/unit/single_column/__init__.py: -------------------------------------------------------------------------------- 1 | """Unit tests for the single column module.""" 2 | -------------------------------------------------------------------------------- /tests/unit/single_column/statistical/__init__.py: -------------------------------------------------------------------------------- 1 | """Unit tests for the statistical single column metrics.""" 2 | -------------------------------------------------------------------------------- /tests/unit/single_column/statistical/test_category_adherence.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import patch 2 | 3 | import numpy as np 4 | import pandas as pd 5 | 6 | from sdmetrics.single_column.statistical import CategoryAdherence 7 | 8 | 9 | class TestCategoryAdherence: 10 | def test_compute_breakdown(self): 11 | """Test the ``compute_breakdown`` method.""" 12 | # Setup 13 | real_data = pd.Series(['A', 'B', 'C', 'B', 'A']) 14 | synthetic_data = pd.Series(['A', 'B', 'C', 'D', 'E']) 15 | 16 | metric = CategoryAdherence() 17 | 18 | # Run 19 | result = metric.compute_breakdown(real_data, synthetic_data) 20 | 21 | # Assert 22 | assert result == {'score': 0.6} 23 | 24 | def test_compute_breakdown_with_nans(self): 25 | """Test the ``compute_breakdown`` method with NaNs.""" 26 | # Setup 27 | real_data = pd.Series(['A', 'B', 'C', 'B', 'A', None]) 28 | synthetic_data = pd.Series(['A', 'B', np.nan, 'C', np.nan, 'B', 'A', None, 'D', 'C']) 29 | 30 | metric = CategoryAdherence() 31 | 32 | # Run 33 | result = metric.compute_breakdown(real_data, synthetic_data) 34 | 35 | # Assert 36 | assert result == {'score': 0.9} 37 | 38 | @patch( 39 | 'sdmetrics.single_column.statistical.category_adherence.CategoryAdherence.compute_breakdown' 40 | ) 41 | def test_compute(self, compute_breakdown_mock): 42 | """Test the ``compute`` method.""" 43 | # Setup 44 | real_data = pd.Series(['A', 'B', 'C', 'B', 'A']) 45 | synthetic_data = pd.Series(['A', 'B', 'C', 'D', 'E']) 46 | metric = CategoryAdherence() 47 | compute_breakdown_mock.return_value = {'score': 0.6} 48 | 49 | # Run 50 | result = metric.compute(real_data, synthetic_data) 51 | 52 | # Assert 53 | compute_breakdown_mock.assert_called_once_with(real_data, synthetic_data) 54 | assert result == 0.6 55 | -------------------------------------------------------------------------------- /tests/unit/single_column/statistical/test_category_coverage.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import Mock, patch 2 | 3 | import pandas as pd 4 | 5 | from sdmetrics.single_column.statistical import CategoryCoverage 6 | 7 | 8 | class TestCategoryCoverage: 9 | def test_compute_breakdown(self): 10 | """Test the ``compute_breakdown`` method. 11 | 12 | Expect that the number of unique categories is computed for both real and synthetic data. 13 | 14 | Input: 15 | - Real data. 16 | - Synthetic data. 17 | 18 | Output: 19 | - A mapping of the metric results, containing the score and the real and synthetic results. 
20 | """ 21 | # Setup 22 | real_data = pd.Series(['a', 'b', 'a', 'b', 'c']) 23 | synthetic_data = pd.Series(['a', 'a', 'a', 'b', 'b']) 24 | 25 | metric = CategoryCoverage() 26 | 27 | # Run 28 | result = metric.compute_breakdown(real_data, synthetic_data) 29 | 30 | # Assert 31 | assert result == {'score': 2 / 3, 'real': 3, 'synthetic': 2} 32 | 33 | def test_compute_breakdown_missing_categories(self): 34 | """Test the ``compute_breakdown`` method with missing categorical values. 35 | 36 | Expect that the number of unique categories is computed for both real and synthetic data. 37 | """ 38 | # Setup 39 | real_data = pd.Series(['a', 'b', 'a', 'b', 'c']) 40 | synthetic_data = pd.Series(['d', 'e', 'f', 'f', 'e']) 41 | 42 | metric = CategoryCoverage() 43 | 44 | # Run 45 | result = metric.compute_breakdown(real_data, synthetic_data) 46 | 47 | # Assert 48 | assert result == {'score': 0, 'real': 3, 'synthetic': 0} 49 | 50 | def test_compute(self): 51 | """Test the ``compute`` method. 52 | 53 | Expect that the number of unique categories is computed for both real and synthetic data. 54 | 55 | Setup: 56 | - Patch the ``compute_breakdown`` method to return a mapping of the metric results. 57 | 58 | Input: 59 | - Real data. 60 | - Synthetic data. 61 | 62 | Output: 63 | - The evaluated metric. 64 | """ 65 | # Setup 66 | metric_breakdown = {'score': 2 / 3, 'real': 3, 'synthetic': 2} 67 | 68 | metric = CategoryCoverage() 69 | 70 | # Run 71 | with patch.object(CategoryCoverage, 'compute_breakdown', return_value=metric_breakdown): 72 | result = metric.compute(Mock(), Mock()) 73 | 74 | # Assert 75 | assert result == 2 / 3 76 | 77 | @patch('sdmetrics.single_column.statistical.category_coverage.SingleColumnMetric.normalize') 78 | def test_normalize(self, normalize_mock): 79 | """Test the ``normalize`` method. 80 | 81 | Expect that the inherited ``normalize`` method is called. 82 | 83 | Input: 84 | - Raw score 85 | 86 | Output: 87 | - The output of the inherited ``normalize`` method. 
88 | """ 89 | # Setup 90 | metric = CategoryCoverage() 91 | raw_score = 0.9 92 | 93 | # Run 94 | result = metric.normalize(raw_score) 95 | 96 | # Assert 97 | normalize_mock.assert_called_once_with(raw_score) 98 | assert result == normalize_mock.return_value 99 | -------------------------------------------------------------------------------- /tests/unit/single_column/statistical/test_key_uniqueness.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import patch 2 | 3 | import numpy as np 4 | import pandas as pd 5 | 6 | from sdmetrics.single_column.statistical import KeyUniqueness 7 | 8 | 9 | class TestKeyUniqueness: 10 | def test_compute_breakdown(self): 11 | """Test the ``compute_breakdown`` method.""" 12 | # Setup 13 | real_data = pd.Series([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]) 14 | synthetic_data = pd.Series([1, 2, np.nan, 3, np.nan, 5, 2, np.nan, 6, None]) 15 | 16 | metric = KeyUniqueness() 17 | 18 | # Run 19 | result = metric.compute_breakdown(real_data, synthetic_data) 20 | 21 | # Assert 22 | assert result == {'score': 0.5} 23 | 24 | @patch('sdmetrics.single_column.statistical.key_uniqueness.LOGGER') 25 | def test_compute_breakdown_with_duplicates_in_real_data(self, logger_mock): 26 | """Test the ``compute_breakdown`` method with duplicates in the real data.""" 27 | # Setup 28 | real_data = pd.Series([1, 2, 3, 4, 5, 6, 7, 8, 9, 10] * 2) 29 | synthetic_data = pd.Series([1, 2, np.nan, 3, np.nan, 5, 2, np.nan, 6, None]) 30 | metric = KeyUniqueness() 31 | 32 | # Run 33 | metric.compute_breakdown(real_data, synthetic_data) 34 | 35 | # Assert 36 | expected_message = 'The real data contains NA or duplicate values.' 37 | logger_mock.info.assert_called_once_with(expected_message) 38 | 39 | @patch('sdmetrics.single_column.statistical.key_uniqueness.KeyUniqueness.compute_breakdown') 40 | def test_compute(self, compute_breakdown_mock): 41 | """Test the ``compute`` method.""" 42 | # Setup 43 | real_data = pd.Series([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]) 44 | synthetic_data = pd.Series([1, 2, np.nan, 3, np.nan, 5, 2, np.nan, 6, None]) 45 | metric = KeyUniqueness() 46 | compute_breakdown_mock.return_value = {'score': 0.6} 47 | 48 | # Run 49 | result = metric.compute(real_data, synthetic_data) 50 | 51 | # Assert 52 | compute_breakdown_mock.assert_called_once_with(real_data, synthetic_data) 53 | assert result == 0.6 54 | -------------------------------------------------------------------------------- /tests/unit/single_column/statistical/test_missing_value_similarity.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import Mock, patch 2 | 3 | import numpy as np 4 | import pandas as pd 5 | 6 | from sdmetrics.single_column.statistical import MissingValueSimilarity 7 | 8 | 9 | class TestMissingValueSimilarity: 10 | def test_compute_breakdown(self): 11 | """Test the ``compute_breakdown`` method. 12 | 13 | Expect that the number of missing values is computed for both real and synthetic data. 14 | 15 | Input: 16 | - Real data. 17 | - Synthetic data. 18 | 19 | Output: 20 | - A mapping of the metric results, containing the score and the real and synthetic results. 
21 | """ 22 | # Setup 23 | real_data = pd.Series([1.0, np.nan, 2.6, 0.8]) 24 | synthetic_data = pd.Series([0.9, 1.8, None, None]) 25 | 26 | metric = MissingValueSimilarity() 27 | 28 | # Run 29 | result = metric.compute_breakdown(real_data, synthetic_data) 30 | 31 | # Assert 32 | assert result == {'score': 0.75, 'real': 0.25, 'synthetic': 0.5} 33 | 34 | def test_compute(self): 35 | """Test the ``compute`` method. 36 | 37 | Expect that the number of missing values is computed for both real and synthetic data. 38 | 39 | Setup: 40 | - Patch the ``compute_breakdown`` method to return a mapping of the metric results. 41 | 42 | Input: 43 | - Real data. 44 | - Synthetic data. 45 | 46 | Output: 47 | - The evaluated metric. 48 | """ 49 | # Setup 50 | metric_breakdown = {'score': 0.75, 'real': 0.25, 'synthetic': 0.75} 51 | 52 | metric = MissingValueSimilarity() 53 | 54 | # Run 55 | with patch.object( 56 | MissingValueSimilarity, 57 | 'compute_breakdown', 58 | return_value=metric_breakdown, 59 | ): 60 | result = metric.compute(Mock(), Mock()) 61 | 62 | # Assert 63 | assert result == 0.75 64 | 65 | @patch( 66 | 'sdmetrics.single_column.statistical.missing_value_similarity.SingleColumnMetric.normalize' 67 | ) 68 | def test_normalize(self, normalize_mock): 69 | """Test the ``normalize`` method. 70 | 71 | Expect that the inherited ``normalize`` method is called. 72 | 73 | Input: 74 | - Raw score 75 | 76 | Output: 77 | - The output of the inherited ``normalize`` method. 78 | """ 79 | # Setup 80 | metric = MissingValueSimilarity() 81 | raw_score = 0.9 82 | 83 | # Run 84 | result = metric.normalize(raw_score) 85 | 86 | # Assert 87 | normalize_mock.assert_called_once_with(raw_score) 88 | assert result == normalize_mock.return_value 89 | -------------------------------------------------------------------------------- /tests/unit/single_column/statistical/test_sequence_length_similarity.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | from sdmetrics.single_column import SequenceLengthSimilarity 4 | 5 | 6 | class TestSequenceLengthSimilarity: 7 | def test_compute_breakdown(self): 8 | """Test `compute_breakdown` works.""" 9 | # Setup 10 | real_data = pd.Series([1, 1, 2, 2, 2]) 11 | synthetic_data = pd.Series([3, 4, 5, 6, 6]) 12 | 13 | metric = SequenceLengthSimilarity() 14 | 15 | # Run 16 | result = metric.compute_breakdown(real_data, synthetic_data) 17 | 18 | # Assert 19 | assert result == {'score': 0.25} 20 | 21 | def test_compute(self): 22 | """Test it runs.""" 23 | # Setup 24 | real_data = pd.Series(['id1', 'id2', 'id2', 'id3']) 25 | synthetic_data = pd.Series(['id4', 'id5', 'id6']) 26 | 27 | # Run 28 | score = SequenceLengthSimilarity.compute(real_data, synthetic_data) 29 | 30 | # Assert 31 | assert score == 0.6666666666666667 32 | 33 | def test_compute_one(self): 34 | """Test it returns 1 when real and synthetic data have the same distribution.""" 35 | # Setup 36 | real_data = pd.Series(['id1', 'id1', 'id2', 'id2', 'id2', 'id3']) 37 | synthetic_data = pd.Series(['id4', 'id4', 'id5', 'id6', 'id6', 'id6']) 38 | 39 | # Run 40 | score = SequenceLengthSimilarity.compute(real_data, synthetic_data) 41 | 42 | # Assert 43 | assert score == 1 44 | 45 | def test_compute_low_score(self): 46 | """Test it for distinct distributions.""" 47 | # Setup 48 | real_data = pd.Series([f'id{i}' for i in range(100)]) 49 | synthetic_data = pd.Series(['id100'] * 100) 50 | 51 | # Run 52 | score = SequenceLengthSimilarity.compute(real_data, synthetic_data) 53 | 54 | # 
Assert 55 | assert score == 0 56 | -------------------------------------------------------------------------------- /tests/unit/single_column/test_base.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import Mock, patch 2 | 3 | from sdmetrics.single_column.base import SingleColumnMetric 4 | 5 | 6 | class TestSingleColumnMetric: 7 | def test_compute_breakdown(self): 8 | """Test the ``compute_breakdown`` method. 9 | 10 | Expect a breakdown dictionary is returned that contains the score. 11 | 12 | Setup: 13 | - Mock the ``compute`` method to return a fake score. 14 | 15 | Input: 16 | - Real data. 17 | - Synthetic data. 18 | 19 | Output: 20 | - The evaluated metric. 21 | """ 22 | # Setup 23 | metric = SingleColumnMetric() 24 | test_metric_score = 0.5 25 | 26 | # Run 27 | with patch.object(SingleColumnMetric, 'compute', return_value=test_metric_score): 28 | result = metric.compute_breakdown(Mock(), Mock()) 29 | 30 | # Assert 31 | assert result == {'score': test_metric_score} 32 | -------------------------------------------------------------------------------- /tests/unit/single_table/__init__.py: -------------------------------------------------------------------------------- 1 | """SDMetrics unit testing for the single_table module.""" 2 | -------------------------------------------------------------------------------- /tests/unit/single_table/data_augmentation/__init__.py: -------------------------------------------------------------------------------- 1 | """SDMetrics unit testing for the single_table data_augmentation module.""" 2 | -------------------------------------------------------------------------------- /tests/unit/single_table/detection/__init__.py: -------------------------------------------------------------------------------- 1 | """SDMetrics unit testing for the single_table detection module.""" 2 | -------------------------------------------------------------------------------- /tests/unit/single_table/privacy/__init__.py: -------------------------------------------------------------------------------- 1 | """SDMetrics unit testing for the single_table privacy module.""" 2 | -------------------------------------------------------------------------------- /tests/unit/single_table/privacy/test_cap.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | import pandas as pd 4 | import pytest 5 | 6 | from sdmetrics.single_table.privacy.cap import ( 7 | CategoricalCAP, 8 | CategoricalGeneralizedCAP, 9 | CategoricalZeroCAP, 10 | ) 11 | 12 | 13 | @pytest.mark.parametrize('metric', [CategoricalCAP, CategoricalZeroCAP, CategoricalGeneralizedCAP]) 14 | def test_CAP_deprecation_message(metric): 15 | """Test deprecation warning is raised when running the metric directly.""" 16 | # Setup 17 | real_data = pd.DataFrame({'col1': range(5), 'col2': ['A', 'B', 'C', 'A', 'B']}) 18 | synthetic_data = pd.DataFrame({'col1': range(5), 'col2': ['C', 'A', 'A', 'B', 'C']}) 19 | 20 | # Run and Assert 21 | expected_warning = re.escape( 22 | 'Computing CAP metrics directly is deprecated. For improved privacy metrics, ' 23 | "please use the 'DisclosureProtection' and 'DisclosureProtectionEstimate' " 24 | 'metrics instead.' 
25 | ) 26 | with pytest.warns(DeprecationWarning, match=expected_warning): 27 | metric.compute(real_data, synthetic_data, key_fields=['col1'], sensitive_fields=['col2']) 28 | -------------------------------------------------------------------------------- /tests/unit/single_table/privacy/test_util.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | import pytest 4 | 5 | from sdmetrics.single_table.privacy.util import ( 6 | closest_neighbors, 7 | validate_num_samples_num_iteration, 8 | ) 9 | 10 | 11 | def test_closest_neighbors_exact(): 12 | samples = [ 13 | ('a', '1'), 14 | ('a', '2'), 15 | ('a', '3'), 16 | ('b', '1'), 17 | ('b', '2'), 18 | ('b', '3'), 19 | ] 20 | target = ('a', '2') 21 | results = closest_neighbors(samples, target) 22 | assert len(results) == 1 23 | assert results[0] == ('a', '2') 24 | 25 | 26 | def test_closest_neighbors_non_exact(): 27 | samples = [ 28 | ('a', '1'), 29 | ('a', '3'), 30 | ('b', '1'), 31 | ('b', '2'), 32 | ('b', '3'), 33 | ] 34 | target = ('a', '2') 35 | results = closest_neighbors(samples, target) 36 | assert len(results) == 3 37 | assert ('a', '1') in results 38 | assert ('a', '3') in results 39 | assert ('b', '2') in results 40 | 41 | 42 | def test_validate_num_samples_num_iteration(): 43 | # Run and Assert 44 | num_subsample_error_post = re.escape('must be an integer greater than 1.') 45 | 46 | with pytest.raises(ValueError, match=num_subsample_error_post): 47 | validate_num_samples_num_iteration(0, 1) 48 | 49 | with pytest.raises(ValueError, match=num_subsample_error_post): 50 | validate_num_samples_num_iteration('X', 1) 51 | 52 | subsample_none_msg = re.escape( 53 | 'num_iterations should not be greater than 1 if there is no subsampling.' 54 | ) 55 | num_iterations = 3 56 | with pytest.raises(ValueError, match=subsample_none_msg): 57 | validate_num_samples_num_iteration(None, num_iterations) 58 | 59 | zero_iteration_msg = re.escape('num_iterations (0) must be an integer greater than 1.') 60 | with pytest.raises(ValueError, match=zero_iteration_msg): 61 | validate_num_samples_num_iteration(1, 0) 62 | -------------------------------------------------------------------------------- /tests/unit/test_base.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from sdmetrics.base import BaseMetric 4 | from sdmetrics.goal import Goal 5 | 6 | 7 | class TestBaseMetric: 8 | def test_normalize_bounded(self): 9 | BaseMetric.max_value = 1 10 | BaseMetric.min_value = -1 11 | BaseMetric.goal = Goal.MAXIMIZE 12 | 13 | raw_score = 0 14 | normalized = BaseMetric.normalize(raw_score) 15 | 16 | assert normalized == 0.5 17 | 18 | def test_normalize_high_bound(self): 19 | BaseMetric.max_value = 1 20 | BaseMetric.min_value = float('-inf') 21 | BaseMetric.goal = Goal.MAXIMIZE 22 | 23 | raw_score = 1 24 | normalized = BaseMetric.normalize(raw_score) 25 | 26 | assert normalized == 1 27 | 28 | def test_normalize_low_bound(self): 29 | BaseMetric.max_value = float('inf') 30 | BaseMetric.min_value = -1 31 | BaseMetric.goal = Goal.MAXIMIZE 32 | 33 | raw_score = -1 34 | normalized = BaseMetric.normalize(raw_score) 35 | 36 | assert normalized == 0 37 | 38 | def test_normalize_unbounded(self): 39 | BaseMetric.max_value = float('inf') 40 | BaseMetric.min_value = float('-inf') 41 | BaseMetric.goal = Goal.MAXIMIZE 42 | 43 | raw_score = 0 44 | normalized = BaseMetric.normalize(raw_score) 45 | 46 | assert normalized == 0.5 47 | 48 | def test_normalize_minimize(self): 49 | 
BaseMetric.max_value = 1 50 | BaseMetric.min_value = -1 51 | BaseMetric.goal = Goal.MINIMIZE 52 | 53 | raw_score = 1 54 | normalized = BaseMetric.normalize(raw_score) 55 | 56 | assert normalized == 0 57 | 58 | def test_normalize_out_of_bounds(self): 59 | BaseMetric.max_value = 1 60 | BaseMetric.min_value = -1 61 | BaseMetric.goal = Goal.MAXIMIZE 62 | 63 | raw_score = 2 64 | error_msg = '`raw_score` must be between `min_value` and `max_value`.' 65 | with pytest.raises(ValueError, match=error_msg): 66 | BaseMetric.normalize(raw_score) 67 | -------------------------------------------------------------------------------- /tests/unit/test_demos.py: -------------------------------------------------------------------------------- 1 | from sdmetrics.demos import load_demo 2 | 3 | 4 | def test_load_single_table_demo(): 5 | """Test loading the single table demo data and expect the correct demo data to be returned.""" 6 | # Setup 7 | modality = 'single_table' 8 | 9 | # Run 10 | real_data, synthetic_data, metadata = load_demo(modality) 11 | 12 | # Assert 13 | assert metadata['columns']['duration'] == { 14 | 'sdtype': 'numerical', 15 | 'computer_representation': 'Int64', 16 | } 17 | assert real_data['duration'].dtype == 'float64' 18 | assert synthetic_data['duration'].dtype == 'float64' 19 | 20 | 21 | def test_load_multi_table_demo(): 22 | """Test loading the multi table demo data and expect the correct demo data to be returned.""" 23 | # Setup 24 | modality = 'multi_table' 25 | 26 | # Run 27 | real_data, synthetic_data, metadata = load_demo(modality) 28 | 29 | # Assert 30 | assert metadata['tables']['transactions']['columns']['timestamp'] == { 31 | 'sdtype': 'datetime', 32 | 'datetime_format': '%Y-%m-%d %H:%M:%S', 33 | } 34 | assert real_data['transactions']['timestamp'].dtype == 'datetime64[ns]' 35 | assert synthetic_data['transactions']['timestamp'].dtype == 'datetime64[ns]' 36 | -------------------------------------------------------------------------------- /tests/unit/timeseries/__init__.py: -------------------------------------------------------------------------------- 1 | """SDMetrics unit testing for the timeseries module.""" 2 | -------------------------------------------------------------------------------- /tests/unit/timeseries/test_timeseries.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | import pandas as pd 4 | import pytest 5 | 6 | from sdmetrics.timeseries.base import TimeSeriesMetric 7 | 8 | 9 | def test__validate_inputs_for_TimeSeriesMetric(): 10 | """Test that ``_validate_inputs`` crashes when the datetime column doesn't match the metadata.""" 11 | # Setup 12 | df1 = pd.DataFrame({ 13 | 's_key': [1, 2, 3], 14 | 'visits': pd.to_datetime(['1/1/2019', '1/2/2019', '1/3/2019']), 15 | }) 16 | df1['visits'] = df1['visits'].dt.date 17 | df2 = pd.DataFrame({ 18 | 's_key': [1, 2, 3], 19 | 'visits': ['not', 'a', 'datetime'], 20 | }) 21 | metadata = { 22 | 'columns': { 23 | 's_key': {'sdtype': 'numerical'}, 24 | 'visits': {'sdtype': 'datetime', 'datetime_format': '%Y-%m-%d %H:%M:%S'}, 25 | }, 26 | 'sequence_key': 's_key', 27 | } 28 | 29 | # Run and Assert 30 | expected_msg = re.escape("Error converting column 'visits' to timestamp: ") 31 | with pytest.raises(ValueError, match=expected_msg): 32 | TimeSeriesMetric._validate_inputs( 33 | real_data=df1, synthetic_data=df2, sequence_key=['s_key'], metadata=metadata 34 | ) 35 | -------------------------------------------------------------------------------- /tests/utils.py: 
-------------------------------------------------------------------------------- 1 | """Utils for testing.""" 2 | 3 | import pandas as pd 4 | 5 | 6 | class DataFrameMatcher: 7 | """Match a given Pandas DataFrame in a mock function call.""" 8 | 9 | def __init__(self, df): 10 | """Initialize the DataFrame.""" 11 | self.df = df 12 | 13 | def __eq__(self, other): 14 | """Assert equality using pandas testing module.""" 15 | pd.testing.assert_frame_equal(self.df, other) 16 | return True 17 | 18 | 19 | class SeriesMatcher: 20 | """Match a given Pandas Series in a mock function call.""" 21 | 22 | def __init__(self, data): 23 | """Initialize the Series.""" 24 | self.data = data 25 | 26 | def __eq__(self, other): 27 | """Assert equality using pandas testing module.""" 28 | pd.testing.assert_series_equal(self.data, other) 29 | return True 30 | 31 | 32 | class IteratorMatcher: 33 | """Match a given iterator in a mock function call.""" 34 | 35 | def __init__(self, iterator): 36 | """Initialize the iterator.""" 37 | self.iterator = iterator 38 | 39 | def __eq__(self, other): 40 | """Assert equality by expanding the iterator.""" 41 | assert all(x == y for x, y in zip(self.iterator, other)) 42 | return True 43 | 44 | 45 | def get_error_type(error): 46 | if error is not None: 47 | colon_index = error.find(':') 48 | return error[:colon_index] 49 | return None 50 | 51 | 52 | def check_if_value_in_threshold(value, expected_value, threshold): 53 | assert abs(value - expected_value) < threshold 54 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = py39-lint, py3{8,9,10,11,12,13}-{readme,integration,unit,minimum} 3 | 4 | [testenv] 5 | skipsdist = false 6 | skip_install = false 7 | deps = 8 | invoke 9 | readme: rundoc 10 | extras = 11 | lint: dev 12 | unit: test 13 | integration: test 14 | minimum: test 15 | commands = 16 | lint: invoke lint 17 | readme: invoke readme 18 | unit: invoke unit 19 | integration: invoke integration 20 | minimum: invoke minimum 21 | invoke rmdir --path {envdir} 22 | --------------------------------------------------------------------------------