├── .github └── workflows │ ├── pythonpackage.yml │ └── pythonpublish.yml ├── .gitignore ├── LICENSE.txt ├── README.md ├── VERSION ├── docs ├── Makefile ├── api.rst ├── conf.py ├── index.rst └── make.bat ├── pysynth ├── __init__.py ├── __main__.py ├── catdecat.py ├── ipf.py └── similarity.py ├── requirements.txt ├── requirements_test.txt ├── setup.py └── tests ├── test_catdecat.py ├── test_data.py ├── test_init.py ├── test_ipf.py └── test_similarity.py /.github/workflows/pythonpackage.yml: -------------------------------------------------------------------------------- 1 | name: Python package 2 | 3 | on: [push] 4 | 5 | jobs: 6 | build: 7 | 8 | runs-on: ubuntu-latest 9 | strategy: 10 | max-parallel: 4 11 | matrix: 12 | python-version: [3.7] 13 | 14 | steps: 15 | - uses: actions/checkout@v1 16 | - name: Set up Python ${{ matrix.python-version }} 17 | uses: actions/setup-python@v1 18 | with: 19 | python-version: ${{ matrix.python-version }} 20 | - name: Install dependencies 21 | run: | 22 | python -m pip install --upgrade pip 23 | pip install -r requirements.txt 24 | - name: Lint with flake8 25 | run: | 26 | pip install flake8 27 | # stop the build if there are Python syntax errors or undefined names 28 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 29 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 30 | flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 31 | - name: Test with pytest 32 | run: | 33 | pip install pytest 34 | pytest tests 35 | -------------------------------------------------------------------------------- /.github/workflows/pythonpublish.yml: -------------------------------------------------------------------------------- 1 | name: Upload Python Package to PyPI 2 | 3 | on: 4 | release: 5 | types: [created] 6 | 7 | jobs: 8 | deploy: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - uses: actions/checkout@master 12 | - name: Set up Python 13 | uses: actions/setup-python@v1 14 | with: 15 | python-version: '3.7' 16 | - name: Install dependencies 17 | run: | 18 | python -m pip install --upgrade pip 19 | pip install setuptools wheel twine 20 | python setup.py sdist bdist_wheel 21 | - name: Build and publish to Test PyPI 22 | uses: pypa/gh-action-pypi-publish@master 23 | with: 24 | password: ${{ secrets.test_pypi_apitoken }} 25 | repository_url: https://test.pypi.org/legacy/ 26 | - name: Build and publish 27 | uses: pypa/gh-action-pypi-publish@master 28 | with: 29 | password: ${{ secrets.pypi_apitoken }} 30 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright 2019, Jan Šimbera 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | SOFTWARE. 20 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # PySynth: Dataset Synthesis for Python 2 | 3 | PySynth is a package to create synthetic datasets - that is, datasets that look 4 | just like the original in terms of statistical properties, variable values, 5 | distributions and correlations, but do not have exactly the same contents 6 | so are safe against data disclosure. An alternative to R's 7 | [Synthpop](https://www.r-bloggers.com/generating-synthetic-data-sets-with-synthpop-in-r/) 8 | with a more permissive license. 
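Once installed (see below), a minimal round trip might look like this (a rough
sketch; see the Usage section for details):

    import pandas as pd
    import pysynth

    orig = pd.read_csv('source.csv')
    synth = pysynth.synthesize(orig, n_rows=1000)  # uses the IPF synthesizer by default
    synth.to_csv('synthesized.csv', index=False)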
9 | 
10 | ## Installation
11 | You can get PySynth from PyPI by using the obvious
12 | 
13 |     pip install pysynth
14 | 
15 | ## Usage
16 | You can perform the synthesis with basic settings directly on a CSV file:
17 | 
18 |     python -m pysynth source.csv synthesized.csv
19 | 
20 | This produces a `synthesized.csv` file that will look a lot like the original
21 | (variable names, values, distributions, correlations) but will (most likely)
22 | not be the same.
23 | 
24 | For better control, it is best to use the synthesizer objects. They follow the
25 | scikit-learn interface for Pandas dataframes so you `fit()` them on the
26 | original and then `sample(n)` to get a synthetic dataframe of `n` rows.
27 | 
28 | So far, only a synthesizer based on iterative proportional fitting
29 | (`pysynth.ipf.IPFSynthesizer`) is available. This synthesis bins continuous
30 | variables to categories and reconstructs them using fitted univariate
31 | distributions. Missing values (`NaN`) are preserved.
32 | 
33 | Synthesis quality measurement modules are yet to be added.
34 | 
35 | ## Contributors
36 | Feedback, additions, suggestions, issues and pull requests are welcome and much
37 | appreciated on [GitHub](https://github.com/simberaj/pysynth).
38 | 
39 | How to add features:
40 | 
41 | 1. Fork it (https://github.com/simberaj/pysynth/fork)
42 | 2. Create your feature branch (`git checkout -b feature/feature-name`)
43 | 3. Commit your changes (`git commit -am "feature-name added"`)
44 | 4. Push to the branch (`git push origin feature/feature-name`)
45 | 5. Create a new pull request
46 | 
47 | Development requires `pytest` for testing and `sphinx` to generate
48 | documentation. Tests can be run using a simple
49 | 
50 |     pytest tests
51 | 
52 | ### Intended development directions
53 | - Synthesis quality measurement in terms of anonymization/similarity
54 | - Model-based synthesis along the lines of R's Synthpop
55 | 
56 | ## License and author info
57 | PySynth is developed by Jan Šimbera.
58 | 
59 | PySynth is available under the MIT license. See `LICENSE.txt` for more details.
--------------------------------------------------------------------------------
/VERSION:
--------------------------------------------------------------------------------
1 | 0.0.4
2 | 
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 | 
4 | # You can set these variables from the command line, and also
5 | # from the environment for the first two.
6 | SPHINXOPTS ?=
7 | SPHINXBUILD ?= sphinx-build
8 | SOURCEDIR = .
9 | BUILDDIR = _build
10 | 
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | 	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 | 
15 | .PHONY: help Makefile
16 | 
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | 	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 | 
--------------------------------------------------------------------------------
/docs/api.rst:
--------------------------------------------------------------------------------
1 | PySynth API
2 | ==========================
3 | 
4 | Entry points
5 | --------------
6 | 
7 | .. automodule:: pysynth
8 |    :members:
9 | 
10 | 
11 | IPF synthesis
12 | --------------
13 | 
14 | .. 
automodule:: pysynth.ipf 15 | :members: 16 | 17 | 18 | Categorization and continuous variable reconstruction 19 | ----------------------------------------------------- 20 | 21 | .. automodule:: pysynth.catdecat 22 | :members: 23 | :exclude-members: Binner, Distributor 24 | 25 | 26 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | 4 | # import recommonmark 5 | # from recommonmark.transform import AutoStructify 6 | 7 | # to allow autodoc to discover the documented modules 8 | sys.path.append(os.path.join(os.path.dirname(__file__), '..')) 9 | 10 | project = 'pysynth' 11 | copyright = '2019, Jan Šimbera' 12 | author = 'Jan Šimbera' 13 | 14 | extensions = [ 15 | 'sphinx.ext.autodoc', 16 | 'recommonmark', 17 | ] 18 | 19 | source_suffix = { 20 | '.rst': 'restructuredtext', 21 | '.md': 'markdown', 22 | } 23 | 24 | templates_path = ['_templates'] 25 | 26 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 27 | 28 | html_theme = 'sphinxdoc' 29 | 30 | html_static_path = ['_static'] 31 | 32 | # At the bottom of conf.py 33 | # def setup(app): 34 | # app.add_config_value('recommonmark_config', { 35 | # # 'url_resolver': (lambda url: github_doc_root + url), 36 | # 'auto_toc_tree_section': 'Contents', 37 | # }, True) 38 | # app.add_transform(AutoStructify) 39 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | PySynth: Dataset Synthesis for Python 2 | ===================================== 3 | 4 | PySynth is a package to create synthetic datasets - that is, datasets that look 5 | just like the original in terms of statistical properties, variable values, 6 | distributions and correlations, but do not have exactly the same contents 7 | so are safe against data disclosure. 8 | 9 | 10 | .. toctree:: 11 | :maxdepth: 2 12 | :caption: Contents: 13 | 14 | api 15 | 16 | 17 | Indices and tables 18 | ================== 19 | 20 | * :ref:`genindex` 21 | * :ref:`modindex` 22 | * :ref:`search` 23 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 
23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /pysynth/__init__.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional 2 | 3 | import pandas as pd 4 | 5 | from . import ipf 6 | 7 | SYNTHESIZERS = { 8 | 'ipf': ipf.IPFSynthesizer, 9 | } 10 | 11 | DEFAULT_METHOD = 'ipf' 12 | 13 | 14 | def synthesize(dataframe: pd.DataFrame, 15 | n_rows: Optional[int] = None, 16 | method: str = DEFAULT_METHOD, 17 | ignore_cols: List[str] = [], 18 | **kwargs) -> pd.DataFrame: 19 | '''Synthesize an analog to a given dataframe. 20 | 21 | Optional keyword arguments are passed to the selected synthesizer. 22 | 23 | :param dataframe: Data to be synthesized. 24 | :param n_rows: Number of output rows. If omitted, the same 25 | length as the input dataframe will be used. 26 | :param method: Method to use for synthesis. So far, only the `ipf` method 27 | using :class:`ipf.IPFSynthesizer` is available. 28 | :param ignore_cols: Columns not to be synthesized in the output (such as 29 | personal identifiers). 30 | ''' 31 | synther = SYNTHESIZERS[method](ignore_cols=ignore_cols, **kwargs) 32 | synther.fit(dataframe) 33 | return synther.sample(n_rows) 34 | 35 | 36 | def main(in_file: str, 37 | out_file: str, 38 | n_rows: str = None, 39 | method: str = DEFAULT_METHOD 40 | ) -> None: 41 | '''Synthesize an analog to a given CSV file. 42 | 43 | :param in_file: A CSV file with data to serve as basis for synthesis. 44 | :param out_file: A path to output the synthesized CSV. Will be 45 | semicolon-delimited. 46 | :param n_rows: Number of rows for the output file. If omitted, the same 47 | length as the input file will be used. 48 | :param method: Synthesis method to be used (see :func:`synthesize`). 49 | ''' 50 | if n_rows is not None: 51 | n_rows = int(n_rows) 52 | orig_df = pd.read_csv(in_file, sep=None, engine='python') 53 | synth_df = synthesize(orig_df, n_rows=n_rows, method=method) 54 | synth_df.to_csv(out_file, sep=';', index=False) 55 | -------------------------------------------------------------------------------- /pysynth/__main__.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import pysynth 4 | 5 | if __name__ == '__main__': 6 | pysynth.main(*sys.argv[1:]) 7 | -------------------------------------------------------------------------------- /pysynth/catdecat.py: -------------------------------------------------------------------------------- 1 | '''Bin continuous variables to categorical and reconstruct them back. 2 | 3 | An auxiliary module that enables categorical-only synthesizers to work with 4 | continuous variables by binning them to categories for the synthesis while 5 | remembering the value distributions within each category, and then converting 6 | the synthesized categories back to continuous values using those distributions. 7 | 8 | The main work is done by the :class:`Categorizer` that does this trick for a 9 | single variable (pandas Series). 
It might be further configured by using an 10 | appropriate *binner* such as :class:`QuantileBinner` to choose the numeric 11 | bounds for the categories 12 | and an appropriate *distributor* such as :class:`FittingDistributor` 13 | to remember and regenerate the intra-category value distribution. 14 | ''' 15 | 16 | from __future__ import annotations 17 | from typing import Union, Optional, List, Dict, Callable 18 | 19 | import numpy as np 20 | import pandas as pd 21 | import scipy.stats 22 | import sklearn.model_selection 23 | import sklearn.neighbors 24 | 25 | 26 | class Binner: 27 | '''Interface for numeric variable interval boundary determiners.''' 28 | def get(self, data: pd.Series) -> List[float]: 29 | '''Return a list of right-inclusive cut values, without endpoints.''' 30 | raise NotImplementedError 31 | 32 | 33 | class QuantileBinner(Binner): 34 | '''A binner that gives quantile cuts. 35 | 36 | :param bins: Number of quantiles to bin to. 37 | ''' 38 | def __init__(self, bins: int): 39 | self.bins = bins 40 | 41 | def get(self, data: pd.Series) -> List[float]: 42 | return data.quantile( 43 | (np.arange(self.bins - 1) + 1) / self.bins 44 | ).drop_duplicates().tolist() 45 | 46 | 47 | class EqualRangeBinner(Binner): 48 | '''A binner that gives equal-range cuts. 49 | 50 | :param bins: Number of bins to bin to. 51 | ''' 52 | def __init__(self, bins: int): 53 | self.bins = bins 54 | 55 | def get(self, data: pd.Series) -> List[float]: 56 | return np.linspace(data.min(), data.max(), self.bins + 1)[1:-1].tolist() 57 | 58 | 59 | class AprioriBinner(Binner): 60 | '''A dummy binner that returns cut values it was initialized with.''' 61 | def __init__(self, bins: List[float]): 62 | self.bins = bins 63 | 64 | def get(self, data: pd.Series) -> List[float]: 65 | return self.bins 66 | 67 | 68 | BINNERS = { 69 | 'quantile': QuantileBinner, 70 | 'equalrange': EqualRangeBinner, 71 | } 72 | 73 | 74 | class Distributor: 75 | '''Interface for numeric variable reconstructors. 76 | 77 | Fits itself on values for a single interval, and reproduces the 78 | distribution for a given number of output values by random sampling. 79 | ''' 80 | def copy(self) -> Distributor: 81 | raise NotImplementedError 82 | 83 | def fit(self, values: np.ndarray) -> None: 84 | '''Fit a distribution on the values for a given interval.''' 85 | raise NotImplementedError 86 | 87 | def sample(self, n: int) -> np.ndarray: 88 | '''Generate a given count of random values from the fitted distribution.''' 89 | raise NotImplementedError 90 | 91 | @classmethod 92 | def create(cls, code: str, *args, **kwargs): 93 | return cls.CODES[code](*args, **kwargs) 94 | 95 | 96 | class SelectingDistributor: 97 | '''Randomly sample from a value set according to value frequencies. 98 | 99 | Useful for variables with a small number of unique values. 
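    For illustration, a rough sketch of the intended use (sampling is random,
    so the output shown is only one possibility)::

        d = SelectingDistributor()
        d.fit(np.array([1, 1, 1, 2]))   # observed frequencies: 1 -> 0.75, 2 -> 0.25
        d.sample(4)                     # e.g. array([1, 2, 1, 1])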
100 | ''' 101 | def __init__(self, seed: Optional[int] = None): 102 | self.seed = seed 103 | 104 | def copy(self) -> SelectingDistributor: 105 | return SelectingDistributor(seed=self.seed) 106 | 107 | def fit(self, values: np.ndarray) -> SelectingDistributor: 108 | valcounts = pd.Series(values).value_counts() 109 | self.targets = valcounts.index.values 110 | self.probs = valcounts.values.astype(float) 111 | self.probs /= self.probs.sum() 112 | 113 | def sample(self, n: int) -> np.ndarray: 114 | return np.random.choice(self.targets, size=n, p=self.probs) 115 | 116 | 117 | class DiscreteDistributor(Distributor): 118 | CODES = { 119 | 'select': SelectingDistributor, 120 | } 121 | 122 | 123 | class MeanDistributor: 124 | '''Reproduce the values as a constant value of their mean.''' 125 | def __init__(self, seed=None): 126 | pass 127 | 128 | def copy(self) -> MeanDistributor: 129 | return MeanDistributor() 130 | 131 | def fit(self, values: np.ndarray) -> MeanDistributor: 132 | self.mean = values.mean() 133 | return self 134 | 135 | def sample(self, n: int) -> np.ndarray: 136 | return np.full(n, self.mean) 137 | 138 | 139 | class StatisticalDistributor: 140 | '''Reproduce the values from a univariate distribution fitted to the originals. 141 | 142 | Find the continuous distribution from a provided list that 143 | approximates the distribution of the input values the best according to 144 | the Kolmogorov-Smirnov two-sample statistic, fit its parameters and sample 145 | from it. Values outside the range of fitting data are discarded and 146 | re-sampled. 147 | 148 | :param distributions: `scipy.stats`-like continuous distributions. Need to 149 | support a class method `fit()` that produces all required constructor 150 | arguments as a tuple, and a `rvs(int)` method to generate random 151 | samples. Defaults to `DEFAULT_DISTRIBUTIONS`. The distributions should 152 | be approximately truncated, otherwise convergence is not guaranteed. 153 | :param min_samples: Minimum number of generated samples to use when 154 | evaluating the KS fit statistic. 155 | :param seed: Random generator seed, applied both before fitting and before 156 | each generator run. 
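    For illustration, a rough usage sketch (the selected distribution and the
    sampled values depend on the random input, so results will vary)::

        d = StatisticalDistributor(seed=42)
        d.fit(np.random.uniform(10, 20, size=500))
        d.sample(3)   # three values, all within the [10, 20] range of the input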
157 | ''' 158 | DEFAULT_DISTRIBUTIONS: List[scipy.stats._distn_infrastructure.rv_continuous] = [ 159 | scipy.stats.uniform, 160 | scipy.stats.truncnorm, 161 | scipy.stats.truncexpon, 162 | scipy.stats.triang, 163 | ] 164 | 165 | def __init__(self, 166 | distributions: List[scipy.stats._distn_infrastructure.rv_continuous] = DEFAULT_DISTRIBUTIONS, 167 | min_samples: int = 100, 168 | seed: Optional[int] = None, 169 | ): 170 | self.distributions = distributions 171 | self.min_samples = min_samples 172 | self.seed = seed 173 | 174 | def copy(self) -> StatisticalDistributor: 175 | return StatisticalDistributor( 176 | self.distributions, 177 | self.min_samples, 178 | self.seed, 179 | ) 180 | 181 | def fit(self, values: np.ndarray) -> StatisticalDistributor: 182 | self.minval = values.min() 183 | self.maxval = values.max() 184 | self.valrange = self.maxval - self.minval 185 | if self.valrange == 0: 186 | # does not matter what goes here, will be multiplied by zero anyway 187 | best_distro = scipy.stats.norm(0, 1) 188 | else: 189 | best_distro = None 190 | normalized = (values - self.minval).astype(float) / self.valrange 191 | best_fit = 1 192 | test_size = max(len(normalized), self.min_samples) 193 | for distro in self.distributions: 194 | distro_obj = self._fit_distribution(distro, normalized) 195 | if distro_obj is not None: 196 | fit_est = scipy.stats.ks_2samp( 197 | normalized, 198 | distro_obj.rvs(test_size) 199 | )[0] 200 | if fit_est < best_fit: 201 | best_distro = distro_obj 202 | best_fit = fit_est 203 | if best_distro is None: 204 | raise ValueError('no distribution could be estimated') 205 | self.distribution = best_distro 206 | self.generator = restricted_sampler( 207 | lambda n: self.minval + self.valrange * self.distribution.rvs(n), 208 | self.minval, 209 | self.maxval, 210 | ) 211 | return self 212 | 213 | def _fit_distribution(self, 214 | distribution: scipy.stats._distn_infrastructure.rv_continuous, 215 | values: np.ndarray, 216 | ) -> Optional[scipy.stats._distn_infrastructure.rv_frozen]: 217 | try: 218 | old_setting = np.seterr(all='raise') 219 | args = distribution.fit(values) 220 | except FloatingPointError: 221 | return None 222 | finally: 223 | np.seterr(**old_setting) 224 | if np.isnan(args).any(): # invalid distribution estimated 225 | return None 226 | else: 227 | return distribution(*args) 228 | 229 | def sample(self, n: int) -> np.ndarray: 230 | np.random.seed(self.seed) 231 | return self.generator(n) 232 | 233 | 234 | class KDEDistributor: 235 | '''Reproduce the values from a kernel density estimate fitted to the originals. 236 | 237 | Find the continuous distribution from a provided list that 238 | approximates the distribution of the input values the best according to 239 | the Kolmogorov-Smirnov two-sample statistic, fit its parameters and sample 240 | from it. Values outside the range of fitting data are discarded and 241 | re-sampled. 242 | 243 | Warning, this is apparently highly computationally demanding for large 244 | datasets. 245 | 246 | :param n_bandwidths: Number of tries for the KDE bandwidth estimation, 247 | in a logarithmic range between .1 and 10. The best fitting output is 248 | kept using grid search. 249 | :param seed: Random generator seed, applied both before fitting and before 250 | each generator run. 
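    For illustration, a rough usage sketch (a ``sklearn.neighbors.KernelDensity``
    model is grid-searched over the bandwidths and then sampled, so results
    will vary)::

        d = KDEDistributor(n_bandwidths=5, seed=0)
        d.fit(np.random.normal(50., 5., size=200))
        d.sample(10)   # ten values within the range of the fitted data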
251 | ''' 252 | def __init__(self, 253 | n_bandwidths: int = 10, 254 | seed: Optional[int] = None, 255 | ): 256 | self.n_bandwidths = n_bandwidths 257 | self.seed = seed 258 | 259 | def copy(self) -> KDEDistributor: 260 | return KDEDistributor( 261 | self.min_unique_continuous, 262 | self.max_iter, 263 | self.n_bandwidths, 264 | self.seed, 265 | ) 266 | 267 | def fit(self, values: np.ndarray) -> KDEDistributor: 268 | np.random.seed(self.seed) 269 | bandwidths = 10 ** np.linspace(-1, 1, self.n_bandwidths) 270 | grid = sklearn.model_selection.GridSearchCV( 271 | sklearn.neighbors.KernelDensity(), 272 | {'bandwidth': bandwidths} 273 | ) 274 | grid.fit(values.reshape(-1, 1)) 275 | self.kde = grid.best_estimator_ 276 | self.generator = restricted_sampler( 277 | self.kde.sample, 278 | values.min(), 279 | values.max(), 280 | ) 281 | 282 | def sample(self, n: int) -> np.ndarray: 283 | np.random.seed(self.seed) 284 | return self.generator(n) 285 | 286 | 287 | def restricted_sampler(generator: Callable[int], 288 | minval: float, 289 | maxval: float, 290 | max_iter: int = 10, 291 | ) -> Callable[int]: 292 | '''Restrict a value generator to a specified range of values. 293 | 294 | :param generator: A function generating random values in specified counts. 295 | :param minval: Minimum value to generate. 296 | :param maxval: Maximum value to generate. 297 | :param max_iter: Maximum number of iterations. If unable to generate enough 298 | values within range by generating this times more values from the 299 | underlying generator, fail. 300 | :raises ValueError: If max_iter is exceeded. 301 | ''' 302 | def sampler(n): 303 | g = 0 304 | i = 0 305 | results = [] 306 | while g < n: 307 | if i == max_iter: 308 | raise ValueError('faulty generator, could not get values in range') 309 | vals = generator(n) 310 | sel_vals = vals[(minval <= vals) & (vals <= maxval)] 311 | results.append(sel_vals) 312 | g += len(sel_vals) 313 | i += 1 314 | return np.hstack(results)[:n] 315 | return sampler 316 | 317 | 318 | class ContinuousDistributor(Distributor): 319 | CODES = { 320 | 'mean': MeanDistributor, 321 | 'statdist': StatisticalDistributor, 322 | 'kde': KDEDistributor, 323 | } 324 | 325 | 326 | class SeriesDiscretizer: 327 | '''Discretize a continuous series to categorical. 328 | 329 | Able to reconstruct variables to their continuous form by estimating 330 | distributions within bins. 331 | 332 | :param binner: Method to use to determine interval boundaries for 333 | discretization. Use a :class:`Binner` instance or one of the following 334 | strings: 335 | 336 | - `'quantile'` for binning into quantiles (:class:`QuantileBinner`), 337 | - `'equalrange'` for binning into equally sized bins 338 | (:class:`EqualRangeBinner`). 339 | 340 | :param bins: Number of intervals to which to bin non-categorical variables, 341 | or boundaries of the intervals as a list. If a list is given, it 342 | overrides the *binner* argument and uses :class:`AprioriBinner`. In 343 | that case, do not specify the minimum or maximum in the list, just the 344 | intermediate cuts. 345 | :param min_unique_continuous: Minimum number of unique values in the input 346 | to regard a distribution as continuous and not discrete. 347 | :param discrete_distributor: Method to use to reconstruct numeric values 348 | for a given category if there is less unique values than 349 | `min_unique_continuous`. Use a Distributor instance 350 | or one of the following strings: 351 | 352 | - `'select'` for :class:`SelectingDistributor` (weighted random sampling). 
353 | 354 | :param continuous_distributor: Like `discrete_distributor`, but for cases 355 | when there is many unique values. Use a Distributor instance 356 | or one of the following strings: 357 | 358 | - `'mean'` for :class:`MeanDistributor` (constant mean value), 359 | - `'statdist'` for :class:`StatisticalDistributor` (simple estimated distribution), 360 | - `'kde'` for :class:`KDEDistributor` (KDE-estimated distribution). 361 | 362 | :param seed: Seed for the variable reconstruction. 363 | ''' 364 | def __init__(self, 365 | binner: Union[str, Binner] = 'quantile', 366 | bins: Union[int, List[float]] = 10, 367 | min_for_bin: Optional[int] = 10, 368 | min_unique_continuous: int = 10, 369 | discrete_distributor: Union[str, Distributor] = 'select', 370 | continuous_distributor: Union[str, Distributor] = 'statdist', 371 | seed: Optional[int] = None, 372 | ): 373 | if isinstance(binner, str): 374 | if isinstance(bins, int): 375 | self.binner = BINNERS[binner](bins) 376 | else: 377 | self.binner = AprioriBinner(bins) 378 | else: 379 | self.binner = binner 380 | self.min_for_bin = min_for_bin 381 | self.min_unique_continuous = min_unique_continuous 382 | self.discrete_distributor = DiscreteDistributor.create( 383 | discrete_distributor, seed=seed 384 | ) if isinstance(discrete_distributor, str) else discrete_distributor 385 | self.continuous_distributor = ContinuousDistributor.create( 386 | continuous_distributor, seed=seed 387 | ) if isinstance(continuous_distributor, str) else continuous_distributor 388 | self.active = False 389 | 390 | def copy(self) -> SeriesDiscretizer: 391 | return SeriesDiscretizer( 392 | binner=self.binner, 393 | min_for_bin=self.min_for_bin, 394 | discrete_distributor=self.discrete_distributor, 395 | continuous_distributor=self.continuous_distributor, 396 | ) 397 | 398 | def fit(self, series: pd.Series) -> SeriesDiscretizer: 399 | '''Fit the discretizer on a given series. 400 | 401 | Get cut values from the underlying binner, fit distributors for the bins 402 | and prepare the mapping. 403 | 404 | :raises TypeError: If the series is not numeric. 
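        A minimal sketch of a fit-and-reconstruct round trip (the input is
        random, so the reconstructed values will vary)::

            disc = SeriesDiscretizer(bins=4, seed=0)
            cats = disc.fit_transform(pd.Series(np.random.rand(100)))  # categorical bins
            nums = disc.inverse_transform(cats)                        # continuous again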
405 | ''' 406 | if not pd.api.types.is_numeric_dtype(series): 407 | raise TypeError(f'cannot discretize a non-numeric series of dtype {series.dtype}') 408 | n_unique = series.nunique() 409 | if self.min_for_bin is None or n_unique >= self.min_for_bin: 410 | cuts = self._get_cuts(series) 411 | if n_unique >= len(frozenset(cuts)): 412 | transformed = pd.cut(series, cuts, include_lowest=True) 413 | self.active = True 414 | self.index = transformed.cat.categories 415 | self.distributors = self._fit_distributors(series, transformed) 416 | self.dtype = series.dtype 417 | return self 418 | 419 | def _get_cuts(self, series: pd.Series) -> List[float]: 420 | cuts = self.binner.get(series) 421 | minval = series.min() 422 | if cuts[0] != minval: 423 | cuts.insert(0, minval) 424 | cuts.append(series.max() + 1) 425 | return cuts 426 | 427 | def _fit_distributors(self, 428 | original: pd.Series, 429 | transformed: pd.Series, 430 | ) -> List[Distributor]: 431 | distributors = [] 432 | for cat, bin_vals in original.groupby(transformed): 433 | if len(bin_vals.index) > 0: 434 | n_unique = bin_vals.nunique() 435 | if n_unique < self.min_unique_continuous: 436 | d = self.discrete_distributor.copy() 437 | else: 438 | d = self.continuous_distributor.copy() 439 | d.fit(bin_vals.values) 440 | else: 441 | # no values in bin, return a mean-producing distributor 442 | # at the center of the interval 443 | d = MeanDistributor(seed=self.continuous_distributor.seed) 444 | d.fit(np.array([cat.left, cat.right])) 445 | distributors.append(d) 446 | return distributors 447 | 448 | def transform(self, series: pd.Series) -> pd.Series: 449 | '''Discretize the series to a dtype of :class:`pd.Categorical`.''' 450 | if self.active: 451 | return pd.cut(series, self.index) 452 | else: 453 | return series 454 | 455 | def fit_transform(self, series: pd.Series) -> pd.Series: 456 | self.fit(series) 457 | return self.transform(series) 458 | 459 | def inverse_transform(self, series: pd.Series) -> pd.Series: 460 | '''De-discretize the series to a continuous one. 461 | 462 | For each bin, use the fitted distributor to produce continuous values 463 | to fill the series. 464 | ''' 465 | if self.active: 466 | reconstructed = pd.Series(0, dtype=self.dtype, index=series.index) 467 | for category, distributor in zip(self.index, self.distributors): 468 | locator = (series == category) 469 | reconstructed[locator] = distributor.sample(locator.sum()) 470 | na_loc = series.isna() 471 | if na_loc.any(): 472 | reconstructed[na_loc] = np.nan 473 | return reconstructed.astype(self.dtype) 474 | else: 475 | return series 476 | 477 | 478 | class DataFrameDiscretizer: 479 | '''Discretize all continuous columns in a dataframe to categorical. 480 | 481 | Categorical variables are left untouched. 482 | 483 | :param series_discretizer: A discretizer with setup to use for individual 484 | series. If this is None, any remaining constructor parameters are 485 | passed to the constructor of :class:`SeriesDiscretizer`. 486 | If this is a dictionary, the discretizers are applied to the columns 487 | denoted by the dictionary keys and the remaining columns are not 488 | discretized. 489 | :param max_num_cats: Maximum number of categories to accept. High numbers 490 | of categories make categorical synthesizers unstable. If any of the 491 | variables has more distinct values than this number after 492 | categorization, a ValueError is raised. 
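    For instance, a rough sketch (assuming ``df`` is a dataframe mixing numeric
    and categorical columns)::

        disc = DataFrameDiscretizer(max_num_cats=30)
        binned = disc.fit_transform(df)            # numeric columns become categorical
        restored = disc.inverse_transform(binned)  # and are reconstructed back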
493 | ''' 494 | def __init__(self, 495 | series_discretizer: Union[ 496 | None, SeriesDiscretizer, Dict[str, SeriesDiscretizer] 497 | ] = None, 498 | max_num_cats: Optional[int] = 50, 499 | **kwargs 500 | ): 501 | if isinstance(series_discretizer, dict): 502 | self.discretizers = series_discretizer 503 | self.pattern = None 504 | else: 505 | self.discretizers = None 506 | if series_discretizer is None: 507 | self.pattern = SeriesDiscretizer(**kwargs) 508 | else: 509 | self.pattern = series_discretizer 510 | self.max_num_cats = max_num_cats 511 | 512 | def fit(self, dataframe: pd.DataFrame) -> DataFrameDiscretizer: 513 | '''Fit series discretizers for all non-categorical columns of the dataframe. 514 | 515 | If per-column discretizers were specified, other columns are ignored. 516 | 517 | :raises TypeError: If any column with an explicitly specified 518 | per-column discretizer in a constructor dict is not numeric. 519 | ''' 520 | self.fit_transform(dataframe) 521 | return self 522 | 523 | def fit_transform(self, dataframe: pd.DataFrame) -> pd.DataFrame: 524 | if self.discretizers is None: 525 | self.discretizers = { 526 | col: self.pattern.copy() 527 | for col in dataframe.columns 528 | } 529 | transformed = dataframe.copy() 530 | for col in dataframe.columns: 531 | if col in self.discretizers: 532 | if pd.api.types.is_numeric_dtype(dataframe[col]): 533 | transformed[col] = self.discretizers[col].fit_transform( 534 | dataframe[col] 535 | ) 536 | else: 537 | if self.pattern is None: 538 | raise TypeError(f'column {col} is not numeric but explicit discretizer provided') 539 | else: 540 | del self.discretizers[col] 541 | if self.max_num_cats is not None: 542 | n_after = transformed[col].nunique() 543 | if n_after > self.max_num_cats: 544 | raise ValueError(f'too many categories for {col} ({n_after})') 545 | return transformed 546 | 547 | def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame: 548 | '''Discretize all non-categorical columns.''' 549 | dataframe = dataframe.copy() 550 | for col in self.discretizers: 551 | dataframe[col] = self.discretizers[col].transform(dataframe[col]) 552 | return dataframe 553 | 554 | def inverse_transform(self, dataframe: pd.DataFrame) -> pd.DataFrame: 555 | '''Return all formerly non-categorical columns to continuous.''' 556 | dataframe = dataframe.copy() 557 | for col in self.discretizers: 558 | dataframe[col] = self.discretizers[col].inverse_transform( 559 | dataframe[col] 560 | ) 561 | return dataframe 562 | -------------------------------------------------------------------------------- /pysynth/ipf.py: -------------------------------------------------------------------------------- 1 | from typing import Union, Optional, List, Dict, Tuple 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import itertools 6 | 7 | from . import catdecat 8 | 9 | 10 | class MatrixRounder: 11 | def round(self, matrix: np.ndarray) -> np.ndarray: 12 | '''Round a matrix to integers, preserving its grand total.''' 13 | raise NotImplementedError 14 | 15 | 16 | class LargestRemainderRounder(MatrixRounder): 17 | '''Round a matrix to integers using the largest-remainder method. 18 | 19 | The largest-remainder method (Hare quota) is deterministic and allocates 20 | roundings to the largest remainders. Ties are broken by selecting the cells 21 | with largest indices. 22 | 23 | :param seed: Meaningless, this method is deterministic. 
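    A tiny worked example (the grand total of 5.0 is preserved)::

        r = LargestRemainderRounder()
        r.round(np.array([[1.4, 2.3],
                          [0.2, 1.1]]))
        # floors sum to 4, so the one remaining unit goes to the largest
        # remainder (0.4), giving array([[2, 2],
        #                                [0, 1]])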
24 | ''' 25 | def __init__(self, seed: Optional[int] = None): 26 | pass # this is a deterministic rounder 27 | 28 | def round(self, matrix: np.ndarray) -> np.ndarray: 29 | # round down to integers, those are sure hits 30 | rounded = matrix.astype(int) 31 | # compute remainders to be distributed 32 | remainders = matrix - rounded 33 | sum_remaining = int(np.round(remainders.sum())) 34 | # locate sum_remaining largest remainders 35 | ind_add = np.argsort( 36 | remainders, axis=None, kind='stable' 37 | )[::-1][:sum_remaining] 38 | rounded[np.unravel_index(ind_add, matrix.shape)] += 1 39 | return rounded 40 | 41 | 42 | class RandomSamplingRounder(MatrixRounder): 43 | '''Round a matrix to integers using random sampling. 44 | 45 | Randomly sample from matrix cells, using their values as probabilities, 46 | until the sum is matched. 47 | 48 | :param seed: Seed for the random sampler. 49 | ''' 50 | def __init__(self, seed: Optional[int] = None): 51 | self.seed = seed 52 | 53 | def round(self, matrix: np.ndarray) -> np.ndarray: 54 | matrix_sum = matrix.sum() 55 | final_total = int(np.round(matrix_sum)) 56 | probs = (matrix / matrix_sum).flatten() 57 | # print('PROBS', probs.sum()) 58 | np.random.seed(self.seed) 59 | # randomly select cells to be included 60 | bucket_is = np.random.choice(len(probs), size=final_total, p=probs) 61 | # count the cells 62 | cell_counts = np.bincount(bucket_is) 63 | return np.hstack(( 64 | cell_counts, 65 | np.zeros(matrix.size - len(cell_counts), dtype=cell_counts.dtype) 66 | )).reshape(*matrix.shape) 67 | 68 | 69 | ROUNDERS = { 70 | 'lrem': LargestRemainderRounder, 71 | 'random': RandomSamplingRounder, 72 | } 73 | 74 | 75 | class IPFSynthesizer: 76 | '''Synthesize a dataframe using iterative proportional fitting. 77 | 78 | Creates a dataframe that has similar statistical properties to the original 79 | but does not replicate its rows directly. Preserves univariate 80 | distributions and covariate distributions to a chosen degree. 81 | Non-categorical variables are converted to categorical for synthesis 82 | and then reconstructed using estimated distributions. 83 | 84 | :param cond_dim: Degree to which to match covariate distributions. 85 | By default, covariates to degree two (two variables' cross-tables) 86 | will be preserved. If you set this higher than the number of columns in 87 | the dataframe, the dataframe will be replicated exactly (except for 88 | the categorization and decategorization of non-categorical variables). 89 | :param discretizer: A :class:`catdecat.DataFrameDiscretizer` instance to 90 | convert numeric variables to and from categorical ones. 91 | Can be specified as a single instance or per variable in a dictionary. 92 | If not given, a single instance with default setup will be created. 93 | :param rounder: Method to use to round the IPF matrix to integer counts to 94 | enable row generation. Use a MatrixRounder instance or one of the 95 | following strings: 96 | 97 | - `'lrem'` uses the deterministic largest remainder method (see 98 | :class:`LargestRemainderRounder` for details) which is more suited 99 | to small datasets. 100 | - `'random'` uses the non-deterministic random generation method (see 101 | :class:`RandomSamplingRounder` for details), more suited to larger 102 | datasets. 103 | 104 | :param ignore_cols: Columns from the input dataframe to not synthesize 105 | (identifiers etc.); will be omitted from the output. 106 | :param seed: Random generator seed for the discretizer and unroller. 
107 | (If a custom discretizer is specified, its seed is not overwritten by 108 | this setting.) 109 | ''' 110 | def __init__(self, 111 | cond_dim: int = 2, 112 | discretizer: Optional[catdecat.DataFrameDiscretizer] = None, 113 | rounder: Union[str, MatrixRounder] = 'lrem', 114 | ignore_cols: List[str] = [], 115 | seed: Optional[int] = None, 116 | ): 117 | if cond_dim < 1: 118 | raise ValueError('cannot preserve less than one-dimensional sums') 119 | self.cond_dim = cond_dim 120 | self.rounder = ( 121 | ROUNDERS[rounder](seed=seed) if isinstance(rounder, str) 122 | else rounder 123 | ) 124 | self.discretizer = ( 125 | discretizer if discretizer is not None 126 | else catdecat.DataFrameDiscretizer(seed=seed) 127 | ) 128 | self.ignore_cols = ignore_cols 129 | 130 | def fit(self, dataframe: pd.DataFrame) -> None: 131 | '''Prepare the synthesis according to the provided dataframe. 132 | 133 | :param dataframe: Dataframe to synthesize. Every column is replicated; 134 | if there are any identifier columns that should not be replicated, 135 | remove them beforehand. 136 | ''' 137 | discrete = self.discretizer.fit_transform( 138 | dataframe.drop(self.ignore_cols, axis=1) 139 | ) 140 | # marginals, axis_values = get_marginals(discrete) 141 | self.axis_values = get_axis_values(discrete) 142 | self.synthed_matrix = obscure_seed( 143 | self.calc_true_matrix(discrete), self.cond_dim 144 | ) 145 | self.original_n_rows = dataframe.shape[0] 146 | 147 | def sample(self, n: Optional[int] = None) -> pd.DataFrame: 148 | '''Generate a synthetic dataframe with a given number of rows. 149 | 150 | :param n: Number of rows for the output dataframe. If not given, 151 | it will match the fitting dataframe. 152 | ''' 153 | matrix = self.synthed_matrix 154 | if n is not None: 155 | matrix *= (n / self.original_n_rows) 156 | return self.discretizer.inverse_transform( 157 | map_axes(unroll(self.rounder.round(matrix)), self.axis_values) 158 | ) 159 | 160 | def fit_transform(self, dataframe: pd.DataFrame) -> pd.DataFrame: 161 | '''Fit the synthesizer and synthesize an equal-size dataframe.''' 162 | self.fit(dataframe) 163 | return self.sample() 164 | 165 | def calc_true_matrix(self, dataframe: pd.DataFrame) -> np.ndarray: 166 | '''Calculate a IPF matrix reflecting true observation frequencies.''' 167 | for col, mapper in self.axis_values.items(): 168 | dataframe[col] = dataframe[col].map( 169 | pd.Series(mapper.index, index=mapper.values) 170 | ) 171 | true_seed = np.zeros(tuple(len(mapper) for mapper in self.axis_values.values())) 172 | for indices in dataframe.itertuples(index=False, name=None): 173 | true_seed[indices] += 1 174 | return true_seed 175 | 176 | 177 | def ipf(marginals: List[np.ndarray], 178 | seed_matrix: Optional[np.ndarray] = None, 179 | precision: float = 1e-9 180 | ) -> np.ndarray: 181 | '''Perform iterative proportional fitting (IPF) on 1D marginal sums. 182 | 183 | Reformats the marginals to a generic n-D format and then delegates to 184 | :func:`ipf_multidim`. 185 | 186 | :param marginals: Marginal sums for the IPF dimensions. The marginal sums 187 | of the output matrix will match these. The list should contain 188 | one-dimensional arrays that sum to the same number. 189 | :param seed_matrix: Seed matrix, shows a-priori conditional probabilities 190 | across dimensions. 191 | :param precision: Terminate IPF when the largest difference of an 192 | individual cell value between two iterations drops below this 193 | threshold. 
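    As a small worked illustration, fitting a 2x2 table to row sums (30, 70)
    and column sums (40, 60) from a uniform seed converges to the independence
    solution::

        ipf([np.array([30., 70.]), np.array([40., 60.])])
        # -> approximately array([[12., 18.],
        #                         [28., 42.]])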
194 | ''' 195 | n_dim = len(marginals) 196 | if seed_matrix is not None and len(seed_matrix.shape) != n_dim: 197 | raise ValueError('marginal dimensions do not match IPF seed') 198 | return ipf_multidim( 199 | [ 200 | marginal.reshape([ 201 | -1 if i == dim_i else 1 for i in range(n_dim) 202 | ]) 203 | for dim_i, marginal in enumerate(marginals) 204 | ], 205 | seed_matrix, 206 | precision=precision 207 | ) 208 | 209 | 210 | def ipf_multidim(marginals: List[np.ndarray], 211 | seed_matrix: Optional[np.ndarray] = None, 212 | precision: float = 1e-9 213 | ) -> np.ndarray: 214 | '''Perform iterative proportional fitting (IPF) on arbitrary marginal sums. 215 | 216 | :param marginals: Marginal sums for the final matrix. The list should 217 | contain arrays with equal sums. Their dimensions should correspond to 218 | the seed matrix or be 1 - at dimensions for which the given marginal 219 | sum is summed (contracted). 220 | :param seed_matrix: Seed matrix, shows a-priori conditional probabilities 221 | across dimensions. If not given, the matrix shape will be computed from 222 | the marginals and it will be initialized by ones. 223 | :param precision: Terminate IPF when the largest difference of an 224 | individual cell value between two iterations drops below this 225 | threshold. 226 | ''' 227 | if seed_matrix is None: 228 | shape = tuple( 229 | max(marg.shape[i] for marg in marginals) 230 | for i in range(min(marg.ndim for marg in marginals)) 231 | ) 232 | matrix = np.ones(shape) 233 | else: 234 | matrix = seed_matrix.astype(float) 235 | shape = matrix.shape 236 | ipf_check_marginals(marginals, shape) 237 | other_dims = [ 238 | tuple( 239 | dim_i for dim_i in range(len(shape)) 240 | if marginal.shape[dim_i] == 1 241 | ) 242 | for marginal in marginals 243 | ] 244 | diff = precision + 1 245 | while diff > precision: 246 | previous = matrix 247 | for marginal, other_dimtup in zip(marginals, other_dims): 248 | dim_sums = matrix.sum(axis=other_dimtup).reshape(marginal.shape) 249 | matrix = matrix / np.where(dim_sums == 0, 1, dim_sums) * marginal 250 | diff = abs(matrix - previous).max() 251 | return matrix 252 | 253 | 254 | def ipf_check_marginals(marginals: List[np.ndarray], shape: Tuple[int]) -> None: 255 | '''Checks whether the marginal sums are valid for IPF of given shape. 256 | 257 | Used internally by :func:`ipf_multidim` so uses the format of marginals 258 | required by that function. 259 | 260 | :param marginals: List of marginal sum arrays to be checked. 261 | :param shape: Shape of the resulting matrix. 262 | ''' 263 | total = marginals[0].sum() 264 | for i, marginal in enumerate(marginals): 265 | if i != 0 and not np.isclose(marginal.sum(), total): 266 | raise ValueError('marginal sum totals do not match') 267 | if marginal.ndim != len(shape): 268 | raise ValueError('marginal dimensions do not match seed') 269 | for j, mshape in enumerate(marginal.shape): 270 | if mshape != 1 and mshape != shape[j]: 271 | raise ValueError('marginal shape does not match seed') 272 | 273 | 274 | def unroll(matrix: np.ndarray) -> np.ndarray: 275 | '''Convert a matrix of cell counts to a matrix of cell indices with those counts. 276 | 277 | :param matrix: A matrix of non-negative integers denoting counts of 278 | observations. Each cell will generate this many rows with its positional 279 | indices. 
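    For example, a count of 2 in cell (0, 0) and 1 in cell (1, 1) unrolls into
    three index rows::

        unroll(np.array([[2, 0],
                         [0, 1]]))
        # -> array([[0, 0],
        #           [0, 0],
        #           [1, 1]])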
280 | ''' 281 | cumcounts = np.cumsum(matrix) 282 | inds = np.zeros(cumcounts[-1], dtype=int) 283 | np.add.at(inds, cumcounts[:np.searchsorted(cumcounts, cumcounts[-1])], 1) 284 | return np.stack(np.unravel_index( 285 | np.cumsum(inds), matrix.shape 286 | )).transpose() 287 | 288 | 289 | def map_axes(indices: np.ndarray, 290 | axis_values: Dict[str, pd.Series], 291 | ) -> pd.DataFrame: 292 | '''Convert a category index array to a dataframe with categories. 293 | 294 | :param indices: A 2-D integer array. 295 | :param axis_values: A dictionary with length matching the column count of 296 | `indices`. Its keys are names of the columns to be assigned to the 297 | dataframe, while values map the category indices from the given column 298 | of the integer array to the expected dataframe values. 299 | ''' 300 | dataframe = pd.DataFrame(indices, columns=list(axis_values.keys())) 301 | for col, mapper in axis_values.items(): 302 | dataframe[col] = dataframe[col].map(mapper) 303 | return dataframe 304 | 305 | 306 | def obscure_seed(true: np.ndarray, 307 | cond_dim: int = 2 308 | ) -> np.ndarray: 309 | '''Produce a matrix preserving some cross-sums of the original. 310 | 311 | :param true: The matrix to be obscured. The output matrix will match 312 | sums of its cells as aggregated to each combination of `cond_dim` 313 | dimensions. 314 | :param cond_dim: The number of dimensions to preserve cross-sums for. 315 | ''' 316 | if cond_dim < 1: 317 | raise ValueError('invalid preservation dimension count') 318 | marginals = [] 319 | dim_is = list(range(true.ndim)) 320 | for sel_dim_is in itertools.combinations(dim_is, cond_dim): 321 | left_dim_is = [] 322 | sum_indexer = [] 323 | for dim_i in dim_is: 324 | if dim_i in sel_dim_is: 325 | sum_indexer.append(true.shape[dim_i]) 326 | else: 327 | sum_indexer.append(1) 328 | left_dim_is.append(dim_i) 329 | marginals.append(true.sum(axis=tuple(left_dim_is)).reshape(sum_indexer)) 330 | return ipf_multidim(marginals) 331 | 332 | 333 | def get_axis_values(dataframe: pd.DataFrame 334 | ) -> Dict[str, pd.Series]: 335 | '''Compute mappings of indices to categories for each dataframe column.''' 336 | maps = {} 337 | for col in dataframe: 338 | values = pd.Series(dataframe[col].unique()).sort_values().values 339 | maps[col] = pd.Series(values, index=np.arange(len(values))) 340 | return maps 341 | -------------------------------------------------------------------------------- /pysynth/similarity.py: -------------------------------------------------------------------------------- 1 | '''Measure the statistical similarity of synthesized data to the original.''' 2 | 3 | from typing import Any, List, Optional, Tuple, Collection, Dict, Iterable, Union, Callable 4 | 5 | import numpy as np 6 | import scipy.stats 7 | import pandas as pd 8 | import sklearn.ensemble 9 | import sklearn.linear_model 10 | import sklearn.naive_bayes 11 | import sklearn.neighbors 12 | import sklearn.svm 13 | import sklearn.tree 14 | 15 | from . import catdecat 16 | 17 | SUMMARY_STATS: List[str] = [ 18 | 'mean', 19 | 'std', 20 | 'min', 21 | 'q1', 22 | 'median', 23 | 'q3', 24 | 'max', 25 | 'skew', 26 | 'kurt' 27 | ] 28 | 29 | 30 | def summary_stats(series: pd.Series) -> pd.Series: 31 | '''Produce univariate summary statistics for a numerical series. 32 | 33 | Provides quartiles (q1, median and q3 respectively), mean, standard 34 | deviation (std), skewness (skew), kurtosis (kurt) and extremes (min, max). 
35 | Note that for very short series, the higher moments (std, skew, kurt) 36 | might come out as NaN. 37 | 38 | :param series: A numerical series to compute summary statistics for. 39 | ''' 40 | sumstat = series.describe().drop('count') 41 | # rename quartiles 42 | index = sumstat.index.tolist() 43 | index[index.index('25%'):index.index('75%')+1] = ['q1', 'median', 'q3'] 44 | sumstat.index = index 45 | # add what pandas describe does not provide 46 | for key in SUMMARY_STATS: 47 | if key not in index: 48 | sumstat[key] = getattr(series, key)() 49 | return sumstat 50 | 51 | 52 | DIFF_METHODS = { 53 | 'diff': lambda orig, synth: synth - orig, 54 | 'ape': lambda orig, synth: ((synth - orig) / orig).where(synth != orig, 0.), 55 | } 56 | 57 | 58 | def summary_stat_diff(orig: pd.DataFrame, 59 | synth: pd.DataFrame, 60 | method: str = 'diff', 61 | ) -> pd.DataFrame: 62 | '''Compute differences of summary statistics for the synthesized dataset. 63 | 64 | For all numerical columns of the synthesized dataset, compute its summary 65 | statistics and compare them with the original using the given method. 66 | 67 | :param orig: The original dataset. 68 | :param synth: The synthesized dataset. 69 | :param method: The method to use for comparing the statistics: 70 | 71 | - `'diff'` for absolute difference, 72 | - `'ape'` for absolute percentage difference. 73 | :returns: A dataframe with a row for each column of the synthetic 74 | dataframe, with columns for different summary statistics 75 | containing their differences. 76 | ''' 77 | method_fx = DIFF_METHODS[method] 78 | num_cols = [col for col in synth.columns 79 | if pd.api.types.is_numeric_dtype(synth[col])] 80 | diff_df = pd.DataFrame.from_records([ 81 | method_fx( 82 | summary_stats(synth[col]), 83 | summary_stats(orig[col]) 84 | ).rename(col) 85 | for col in num_cols 86 | ], index=num_cols) 87 | # add _diff to stat names in columns to be more descriptive 88 | return diff_df.rename(columns={ 89 | stat: stat + '_' + method for stat in diff_df.columns 90 | }) 91 | 92 | 93 | def aligned_freqs(orig: pd.Series, 94 | synth: pd.Series, 95 | bins: Optional[int] = 10, 96 | ) -> Tuple[Optional[pd.Series], Optional[pd.Series]]: 97 | '''Return relative frequencies of values in the original and synthesized series. 98 | 99 | The relative frequency series will be aligned so that all values from 100 | both columns are present in both outputs. 101 | 102 | :param orig: A column from the original dataframe. 103 | :param synth: The corresponding column from the synthesized dataframe. 104 | :param bins: Number of bins (quantiles) to which to discretize the 105 | columns if they are numeric. Numeric columns with less unique values 106 | than this number will not be discretized. The quantiles are measured 107 | on the original column. If this is None, Nones will be returned for 108 | both outputs if the columns are numeric. 109 | :returns: A tuple of relative frequency series (summing to 1) for the 110 | original and synthesized dataset respectively, or a tuple of two Nones, 111 | if the originals are numeric and number of bins is not set. 
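    For example, for two small categorical columns the outputs share one index::

        aligned_freqs(pd.Series(['a', 'a', 'b']), pd.Series(['a', 'c']))
        # -> roughly (a: 0.67, b: 0.33, c: 0.0) and (a: 0.5, b: 0.0, c: 0.5)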
112 | ''' 113 | if pd.api.types.is_numeric_dtype(synth): 114 | if bins is None: 115 | return None, None 116 | elif synth.nunique() > bins or orig.nunique() > bins: 117 | quantiles = ( 118 | [min(orig.min(), synth.min()) - 1] 119 | + catdecat.QuantileBinner(bins).get(orig) 120 | + [max(orig.max(), synth.max()) + 1] 121 | ) 122 | orig = pd.cut(orig, quantiles) 123 | synth = pd.cut(synth, quantiles) 124 | orig_counts = orig.value_counts(normalize=True) 125 | synth_counts = synth.value_counts(normalize=True) 126 | orig_counts, synth_counts = orig_counts.align(synth_counts) 127 | return orig_counts.fillna(0), synth_counts.fillna(0) 128 | 129 | 130 | def frequency_mismatch(orig: pd.DataFrame, 131 | synth: pd.DataFrame, 132 | bins: Optional[int] = 10, 133 | metrics: Optional[List[str]] = None, 134 | ) -> pd.DataFrame: 135 | '''Return mismatch metrics for the dataframe columns' value frequencies. 136 | 137 | This only looks at univariate value frequencies, not considering whether 138 | the values occur in conjunction with "correct" values from other columns. 139 | 140 | Computes the following metrics: 141 | 142 | - `rtae`: Relative Total Absolute Error (sum of absolute differences). 143 | Goes from 0 for perfect match to 2 for totally different values. 144 | - `overlap_coef`: Overlap coefficient (magnitude of set-wise frequency 145 | intersection). Goes from 1 for perfect match to 0 for totally 146 | different values. 147 | - `morisita_overlap`: Morisita's overlap index[#], a measure of frequency 148 | overlap. Goes from 0 for no overlap to 1 for identical proportions. 149 | - `rank_damerau`: Normalized Damerau-Levenshtein distance[#] of 150 | frequency-ordered category sets for both datasets; essentially, 151 | a number of adjustments (additions, deletions, swaps) to arrive 152 | from one to the other. Goes from 0 for matching category 153 | ranks to 1 for total mismatch. 154 | - `mae`: Mean Absolute Error (mean of absolute differences). The less, 155 | the better. 156 | - `rmse`: Root Mean Square Error. The less, the better. 157 | - `jaccard_dist`: Jaccard distance[#] (Intersection over Union) of the 158 | two frequency sets. `jaccard_dist = 1 - overlap_coef` 159 | - `simpson_diff`: Difference between Simpson diversity indices[#] for the 160 | synthetic and original frequencies. 161 | - `entropy_diff`: Difference between the Shannon entropy[#] for the 162 | synthetic and original frequencies, in nats. 163 | 164 | :param orig: The original dataframe. 165 | :param synth: The synthesized analog. 166 | :param bins: Number of bins to (quantiles) to which to discretize the 167 | columns if they are numeric, to be able to measure their frequencies 168 | as well. The quantiles are measured on the original column. If None, 169 | numeric columns will not be measured. 170 | :param metrics: Names of metrics to include. If None, all metrics are 171 | computed. 172 | :returns: A dataframe with a row for each column of the synthetic dataframe 173 | (except numeric columns if bins is None) with columns for different 174 | frequency mismatch statistics. 175 | 176 | [#] "Morisita's overlap index". Wikipedia. 177 | 178 | [#] "Damerau-Levenshtein distance". Wikipedia. 179 | 180 | [#] "Jaccard index". Wikipedia. 181 | 182 | [#] "Simpson index". Wikipedia. In: Diversity index. 183 | 184 | [#] "Shannon index". Wikipedia. In: Diversity index. 
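    Typical use is a one-liner (a sketch, assuming ``orig`` and ``synth`` are
    the original and synthesized dataframes)::

        frequency_mismatch(orig, synth, bins=10, metrics=['rtae', 'overlap_coef'])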
185 | 186 | ''' 187 | recs = [] 188 | index = [] 189 | for col in synth.columns: 190 | orig_freqs, synth_freqs = aligned_freqs(orig[col], synth[col], bins) 191 | if orig_freqs is not None and synth_freqs is not None: 192 | recs.append(freqdiff_metrics(orig_freqs, synth_freqs, metrics)) 193 | index.append(col) 194 | return pd.DataFrame.from_records(recs, index=index) 195 | 196 | 197 | def freqdiff_metrics(orig_freqs: pd.Series, 198 | synth_freqs: pd.Series, 199 | metrics: Optional[List[str]] = None, 200 | ) -> pd.Series: 201 | '''Compute frequency mismatch metrics for two value frequency series. 202 | 203 | :param orig_freqs: Frequencies of values (or their intervals) in the 204 | original dataframe column. 205 | :param synth_freqs: Frequencies of values (or their intervals) in the 206 | matching synthesized column. 207 | :param metrics: Names of metrics to include. If None, all metrics are 208 | computed. For a list of metrics, see :func:`frequency_mismatch`. 209 | :returns: A Series with metric values, with their names in the index. 210 | ''' 211 | diff = synth_freqs - orig_freqs 212 | simpson_orig = (orig_freqs ** 2).sum() 213 | simpson_synth = (synth_freqs ** 2).sum() 214 | overlap = orig_freqs.where(orig_freqs <= synth_freqs, synth_freqs) 215 | metric_series = pd.Series({ 216 | 'rtae': abs(diff).sum(), 217 | 'overlap_coef': overlap.sum(), 218 | 'rank_damerau': damerau_levenshtein( 219 | orig_freqs.sort_values().index.tolist(), 220 | synth_freqs.sort_values().index.tolist(), 221 | ) / len(orig_freqs.index), 222 | 'morisita_overlap': ( 223 | 2 * (orig_freqs * synth_freqs).sum() 224 | / (simpson_orig + simpson_synth) 225 | ), 226 | 'mae': abs(diff).mean(), 227 | 'rmse': (diff ** 2).mean() ** .5, 228 | 'jaccard_dist': 1 - overlap.sum(), 229 | 'simpson_diff': simpson_synth - simpson_orig, 230 | 'entropy_diff': ( 231 | (synth_freqs[synth_freqs>0] * np.log(synth_freqs[synth_freqs>0])).sum() 232 | - (orig_freqs[orig_freqs>0] * np.log(orig_freqs[orig_freqs>0])).sum() 233 | ) 234 | }) 235 | if metrics is None: 236 | return metric_series 237 | else: 238 | return metric_series[metrics] 239 | 240 | 241 | def damerau_levenshtein(seq1: Collection[Any], seq2: Collection[Any]) -> int: 242 | """Calculate the Damerau-Levenshtein distance between sequences. 243 | 244 | This distance is the number of additions, deletions, substitutions, 245 | and transpositions needed to transform the first sequence into the 246 | second. Although generally used with strings, any sequences of 247 | comparable objects will work. 248 | 249 | Transpositions are exchanges of *consecutive* characters; all other 250 | operations are self-explanatory. 251 | 252 | Based on code by Michael Homer, released under MIT License, retrieved 253 | from https://web.archive.org/web/20150909134357/http://mwh.geek.nz:80/2009/04/26/python-damerau-levenshtein-distance/ 254 | """ 255 | # codesnippet:D0DE4716-B6E6-4161-9219-2903BF8F547F 256 | # Conceptually, this is based on a len(seq1) + 1 * len(seq2) + 1 matrix. 257 | # However, only the current and two previous rows are needed at once, 258 | # so we only store those. 259 | oneago = None 260 | thisrow = list(range(1, len(seq2) + 1)) + [0] 261 | for x in range(len(seq1)): 262 | # Python lists wrap around for negative indices, so put the 263 | # leftmost column at the *end* of the list. This matches with 264 | # the zero-indexed strings and saves extra calculation. 
265 | twoago, oneago, thisrow = oneago, thisrow, [0] * len(seq2) + [x + 1] 266 | for y in range(len(seq2)): 267 | delcost = oneago[y] + 1 268 | addcost = thisrow[y - 1] + 1 269 | subcost = oneago[y - 1] + (seq1[x] != seq2[y]) 270 | thisrow[y] = min(delcost, addcost, subcost) 271 | # This block deals with transpositions 272 | is_transposition = ( 273 | x > 0 and y > 0 and seq1[x] == seq2[y - 1] 274 | and seq1[x-1] == seq2[y] and seq1[x] != seq2[y] 275 | ) 276 | if is_transposition: 277 | thisrow[y] = min(thisrow[y], twoago[y - 2] + 1) 278 | return thisrow[len(seq2) - 1] 279 | 280 | 281 | def correlation_diff(orig: pd.DataFrame, 282 | synth: pd.DataFrame, 283 | method: Union[str, Callable] = 'pearson', 284 | ) -> pd.DataFrame: 285 | '''Return the difference of correlation matrices of the two dataframes. 286 | 287 | :param orig: The original dataframe. 288 | :param synth: The synthesized analog. 289 | :param method: A method for Pandas `corr()` to specify the manner of 290 | correlation (Pearson, Kendall, Spearman 291 | or arbitrary through a callable). 292 | :returns: A Dataframe with synthesized column names in the index and 293 | columns, with numerical differences of correlation coefficients in the 294 | dataframes as values. Might contain NaNs where the coefficient in either 295 | of the dataframes is NaN, e.g. when the given column only contains a 296 | single value. 297 | ''' 298 | return ( 299 | synth.corr(method=method) 300 | - orig[synth.columns.tolist()].corr(method=method) 301 | ) 302 | 303 | 304 | def stat_tests(orig: pd.DataFrame, synth: pd.DataFrame) -> pd.DataFrame: 305 | '''Test equality of mean and variance of synthesized columns to originals. 306 | 307 | Performs a two-sample independent t-test (`t_`) for mean equality 308 | and Levene's test for variance equality with median center (`levene_`), 309 | omitting NaNs. 310 | Omits non-numeric columns. 311 | 312 | :param orig: The original dataframe. 313 | :param synth: The synthesized analog. 314 | :returns: A dataframe with a row for each numeric column of the synthesized 315 | dataset, with a test statistic (`_stat`) and p-value (`_pval`) column 316 | for each of the tests performed. 
317 | ''' 318 | recs = [] 319 | index = [] 320 | for col in synth.columns: 321 | if pd.api.types.is_numeric_dtype(synth[col]): 322 | t, tp = scipy.stats.ttest_ind( 323 | orig[col], synth[col], nan_policy='omit' 324 | ) 325 | if not isinstance(t, float): 326 | t, tp = np.nan, np.nan 327 | lev, levp = scipy.stats.levene( 328 | orig[col].dropna(), synth[col].dropna(), center='median' 329 | ) 330 | recs.append((t, tp, lev, levp)) 331 | index.append(col) 332 | return pd.DataFrame.from_records( 333 | recs, index=index, 334 | columns=['t_stat', 't_pval', 'levene_stat', 'levene_pval'] 335 | ) 336 | 337 | 338 | DEFAULT_DISCRIMINATORS: List[sklearn.base.ClassifierMixin] = [ 339 | sklearn.ensemble.GradientBoostingClassifier(n_estimators=10), 340 | sklearn.ensemble.RandomForestClassifier(n_estimators=10), 341 | # sklearn.linear_model.LogisticRegression(max_iter=250), 342 | # sklearn.linear_model.Perceptron(), 343 | # sklearn.linear_model.RidgeClassifier(), 344 | sklearn.naive_bayes.GaussianNB(), 345 | sklearn.neighbors.KNeighborsClassifier(), 346 | # sklearn.neighbors.RadiusNeighborsClassifier(), 347 | # sklearn.svm.LinearSVC(), 348 | # sklearn.svm.NuSVC(), 349 | # sklearn.svm.SVC(), 350 | sklearn.tree.DecisionTreeClassifier(), 351 | ] 352 | 353 | 354 | def discrimination(orig: pd.DataFrame, 355 | synth: pd.DataFrame, 356 | classifiers: Iterable[sklearn.base.ClassifierMixin] = DEFAULT_DISCRIMINATORS, 357 | metrics: Optional[List[str]] = None, 358 | test_size: float = .25, 359 | return_best: bool = False, 360 | ) -> Union[ 361 | pd.Series, 362 | Tuple[pd.Series, Optional[sklearn.base.ClassifierMixin]] 363 | ]: 364 | '''Calculate how well the synthesized rows can be discriminated from originals. 365 | 366 | Fits each of the provided classifiers to predict whether the given row is 367 | synthesized or original, measures their accuracy on a test sample and 368 | gives a detailed evaluation of the best-performing one. 369 | 370 | :param orig: The original dataframe. 371 | :param synth: The synthesized analog. 372 | :param classifiers: Unfitted classifiers to try the discrimination. The 373 | one with the best ROC AUC on the test sample is selected. 374 | :param metrics: Names of discrimination accuracy metrics to compute. 375 | If None, all of these metrics are computed using their scikit-learn 376 | implementations: 377 | 378 | - `auc`: ROC Area Under Curve (0.5 is no discrimination, 1 full 379 | discrimination). 380 | - `gini`: ROC Gini coefficient (0 is no discrimination, 1 full 381 | discrimination: `gini = 2 * auc - 1`). 382 | - `ap`: Average Precision (evaluates the precision-recall curve)[#]. 383 | - `matthews`: Matthews' four-square table correlation coefficient. 384 | - `f1`: F1-score. 385 | - `accuracy`: Ordinary accuracy (fraction of equally labeled rows). 386 | - `precision`: Classification precision. 387 | - `recall`: Classification recall. 388 | - `cohen_kappa`: Cohen's kappa score of annotator agreement. 389 | - `hamming`: Hamming loss. 390 | - `jaccard`: Jaccard score. 391 | :param test_size: Fraction of the input to use for evaluating discrimination 392 | performance (and not for discriminator training). The train/test split 393 | is stratified on original/synthetic origins. 394 | :param return_best: Return the best performing fitted discriminator 395 | along with the metrics. 396 | :returns: A series of discrimination classification performance metrics 397 | with their aforementioned names in the index. 
If return_best is True,
398 |         return a tuple with the metrics and the fitted discriminator (which may be None if no classifier beats chance).
399 | 
400 |     [#] "Average precision". Wikipedia. In: Information retrieval.
401 | 
402 |     '''
403 |     feats = _predictor_matrix(pd.concat([orig, synth]))
404 | 
405 |     target = np.hstack((
406 |         np.zeros(len(orig.index), dtype=bool),
407 |         np.ones(len(synth.index), dtype=bool)
408 |     ))
409 |     best_est, best_probs, test_tgts = _find_best_classifier(
410 |         feats, target, classifiers, test_size
411 |     )
412 |     metric_series = _compute_accuracy_metrics(test_tgts, best_probs, metrics)
413 |     if return_best:
414 |         return metric_series, best_est
415 |     else:
416 |         return metric_series
417 | 
418 | 
419 | DEFAULT_METRICS: Dict[str, Tuple[Union[str, Callable], bool]] = {
420 |     'auc': ('roc_auc_score', False),
421 |     'gini': (
422 |         (lambda trues, probs: 2 * sklearn.metrics.roc_auc_score(trues, probs) - 1),
423 |         False
424 |     ),
425 |     'ap': ('average_precision_score', False),
426 |     'matthews': ('matthews_corrcoef', True),
427 |     'f1': ('f1_score', True),
428 |     'accuracy': ('accuracy_score', True),
429 |     'precision': ('precision_score', True),
430 |     'recall': ('recall_score', True),
431 |     'cohen_kappa': ('cohen_kappa_score', True),
432 |     'hamming': ('hamming_loss', True),
433 |     'jaccard': ('jaccard_score', True),
434 | }
435 | 
436 | 
437 | def _find_best_classifier(feats: np.ndarray,
438 |                           target: np.ndarray,
439 |                           classifiers: Iterable[sklearn.base.ClassifierMixin] = DEFAULT_DISCRIMINATORS,
440 |                           test_size: float = .25,
441 |                           ) -> Tuple[
442 |                               sklearn.base.ClassifierMixin,
443 |                               np.ndarray, np.ndarray
444 |                           ]:
445 |     train_feats, test_feats, train_tgts, test_tgts = \
446 |         sklearn.model_selection.train_test_split(
447 |             feats, target, test_size=test_size, stratify=target
448 |         )
449 |     best_auc = 0.5
450 |     best_est = None
451 |     best_probs = np.full_like(test_tgts, .5, dtype=np.double)
452 |     for clf in classifiers:
453 |         clf.fit(train_feats, train_tgts)
454 |         probs = clf.predict_proba(test_feats)[:,1]
455 |         auc = sklearn.metrics.roc_auc_score(test_tgts, probs)
456 |         if auc > best_auc:
457 |             best_est = clf
458 |             best_auc = auc
459 |             best_probs = probs
460 |     return best_est, best_probs, test_tgts
461 | 
462 | 
463 | def _compute_accuracy_metrics(targets: np.ndarray,
464 |                               probs: np.ndarray,
465 |                               metrics: Optional[List[str]] = None,
466 |                               ) -> pd.Series:
467 |     preds = probs >= .5
468 |     metric_results = {}
469 |     for name, conf in DEFAULT_METRICS.items():
470 |         if metrics is None or name in metrics:
471 |             fx, do_threshold = conf
472 |             if isinstance(fx, str):
473 |                 fx = getattr(sklearn.metrics, fx)
474 |             metric_results[name] = fx(
475 |                 targets,
476 |                 (preds if do_threshold else probs)
477 |             )
478 |     return pd.Series(metric_results)
479 | 
480 | 
481 | def _predictor_matrix(dataframe: pd.DataFrame):
482 |     dataframe = pd.get_dummies(
483 |         dataframe,
484 |         dummy_na=True,
485 |     )
486 |     fillers = {
487 |         col: dataframe[col].median()
488 |         for col in dataframe.columns
489 |         if dataframe[col].hasnans
490 |     }
491 |     return dataframe.fillna(value=fillers).values
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | numpy
2 | pandas
3 | scipy
4 | scikit-learn>=0.22
--------------------------------------------------------------------------------
/requirements_test.txt:
--------------------------------------------------------------------------------
1 | pytest
2 | sphinx
3 | 
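
The similarity module above is the package's evaluation toolkit. As a quick orientation, here is a minimal usage sketch (editor-added, not part of the repository sources); it only uses calls that appear in the sources and tests — `pysynth.synthesize`, `summary_stat_diff`, `frequency_mismatch`, `discrimination` and the OpenML fetch from `tests/test_data.py` — and dataset id 31 is one the tests also pull:

    import sklearn.datasets
    import pysynth
    import pysynth.similarity as sim

    # fetch a small tabular dataset (as the tests do), synthesize it, and compare the two
    orig = sklearn.datasets.fetch_openml(data_id=31, target_column=None, as_frame=True)['data']
    synth = pysynth.synthesize(orig)

    print(sim.summary_stat_diff(orig, synth, method='diff'))   # per-column differences of summary stats
    print(sim.frequency_mismatch(orig, synth, bins=10, metrics=['rtae', 'mae']))
    metrics, best_clf = sim.discrimination(orig, synth, return_best=True)
    print(metrics)   # AUC near 0.5 / Gini near 0 means synthetic rows are hard to tell from originals
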
-------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | with open('README.md') as infile: 4 | long_description = infile.read() 5 | 6 | with open('requirements.txt') as infile: 7 | required = [line.strip() for line in infile] 8 | 9 | with open('VERSION') as infile: 10 | version = infile.read().strip() 11 | 12 | setuptools.setup( 13 | name='pysynth', 14 | version=version, 15 | description='Dataset synthesis for Python', 16 | long_description=long_description, 17 | long_description_content_type='text/markdown; charset=UTF-8', 18 | author='Jan Šimbera', 19 | author_email='simbera.jan@gmail.com', 20 | python_requires='>=3.7.0', 21 | url='https://github.com/simberaj/pysynth', 22 | packages=setuptools.find_packages(exclude=('tests', )), 23 | install_requires=required, 24 | extras_require={}, 25 | include_package_data=True, 26 | license='MIT', 27 | keywords='synthesis ipf data python', 28 | classifiers=[ 29 | 'Development Status :: 2 - Pre-Alpha', 30 | 'Environment :: Console', 31 | 'Intended Audience :: Developers', 32 | 'License :: OSI Approved :: MIT License', 33 | 'Natural Language :: English', 34 | 'Programming Language :: Python', 35 | 'Programming Language :: Python :: 3', 36 | ], 37 | zip_safe=True 38 | ) 39 | -------------------------------------------------------------------------------- /tests/test_catdecat.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import itertools 4 | 5 | import numpy as np 6 | import pandas as pd 7 | import scipy.stats 8 | import pytest 9 | 10 | sys.path.append(os.path.join(os.path.dirname(__file__), '..')) 11 | import pysynth.catdecat 12 | 13 | import test_data 14 | 15 | np.random.seed(1711) 16 | 17 | @pytest.mark.parametrize('binner_cls, bins', list(itertools.product( 18 | pysynth.catdecat.BINNERS.values(), [5, 10, 20], 19 | ))) 20 | def test_binners_formal(binner_cls, bins): 21 | binner = binner_cls(bins) 22 | cutvals = binner.get(pd.Series(np.random.rand(100))) 23 | assert isinstance(cutvals, list) 24 | assert len(cutvals) == bins - 1 25 | assert all(isinstance(cutval, float) for cutval in cutvals) 26 | 27 | @pytest.mark.parametrize('bins', [4, 8, 12]) 28 | def test_quantile_binner(bins): 29 | binner = pysynth.catdecat.QuantileBinner(bins) 30 | for i in range(10): 31 | vals = pd.Series(np.random.rand(100)) 32 | cuts = binner.get(vals) 33 | assert np.isclose( 34 | cuts, 35 | np.percentile(vals, (np.arange(bins - 1) + 1) / bins * 100) 36 | ).all() 37 | 38 | @pytest.mark.parametrize('bins', [4, 8, 12]) 39 | def test_equalrange_binner(bins): 40 | binner = pysynth.catdecat.EqualRangeBinner(bins) 41 | for i in range(10): 42 | vals = pd.Series(np.random.rand(100)) 43 | cuts = binner.get(vals) 44 | inner_widths = np.diff(cuts) 45 | assert np.isclose(inner_widths.min(), inner_widths.max()) 46 | assert np.isclose(inner_widths.mean(), cuts[0] - vals.min()) 47 | assert np.isclose(inner_widths.mean(), vals.max() - cuts[-1]) 48 | 49 | def test_apriori_binner(): 50 | for i in range(10): 51 | vals = pd.Series(np.random.rand(100)) 52 | cuts = np.sort(vals.sample(10).unique()).tolist() 53 | binner = pysynth.catdecat.AprioriBinner(cuts) 54 | assert binner.get(vals) == cuts 55 | 56 | 57 | @pytest.mark.parametrize('dist_cls', pysynth.catdecat.ContinuousDistributor.CODES.values()) 58 | def test_continuous_distributors(dist_cls): 59 | distributor = dist_cls(seed=42) 
60 | minval = 2 61 | maxval = 7 62 | for i in range(10): 63 | vals = np.random.rand(100) * (maxval - minval) + minval 64 | distributor.fit(vals) 65 | reconst = distributor.sample(100) 66 | assert minval <= reconst.min() <= reconst.max() <= maxval 67 | 68 | @pytest.mark.parametrize('dist_cls', pysynth.catdecat.DiscreteDistributor.CODES.values()) 69 | def test_discrete_distributors(dist_cls): 70 | distributor = dist_cls(seed=42) 71 | minval = 2 72 | maxval = 12 73 | for i in range(10): 74 | vals = (np.random.rand(100) * (maxval - minval) + minval).astype(int) 75 | uniques = np.unique(vals) 76 | distributor.fit(vals) 77 | reconst = distributor.sample(100) 78 | assert minval <= reconst.min() <= reconst.max() <= maxval 79 | assert np.isin(reconst, uniques).all() 80 | 81 | def test_restricted_sampler_ok(): 82 | minval = 1 83 | maxval = 3 84 | testdist = scipy.stats.norm(2, 1) 85 | sampler = pysynth.catdecat.restricted_sampler(testdist.rvs, minval, maxval) 86 | x = sampler(1000) 87 | assert (x >= minval).all() 88 | assert (x <= maxval).all() 89 | assert len(x) == 1000 90 | 91 | def test_restricted_sampler_fail(): 92 | minval = 1 93 | maxval = 3 94 | testgen = lambda n: np.full(n, 4) 95 | sampler = pysynth.catdecat.restricted_sampler(testgen, 1, 3) 96 | with pytest.raises(ValueError): 97 | x = sampler(1000) 98 | 99 | 100 | def test_mean_distributor(): 101 | dist = pysynth.catdecat.MeanDistributor() 102 | for i in range(10): 103 | vals = np.random.rand(100) 104 | val_mean = vals.mean() 105 | dist.fit(vals) 106 | assert (dist.sample(20) == np.array([val_mean] * 20)).all() 107 | 108 | 109 | SERIES_DISCRETIZERS = [ 110 | pysynth.catdecat.SeriesDiscretizer(seed=42), 111 | pysynth.catdecat.SeriesDiscretizer(binner='equalrange', continuous_distributor='mean', seed=42), 112 | ] 113 | 114 | @pytest.mark.parametrize('categ, na_frac', list(itertools.product( 115 | SERIES_DISCRETIZERS, [0, 0.2, 1] 116 | ))) 117 | def test_discretizer_numeric(categ, na_frac): 118 | size = 100 119 | minval = -3 120 | maxval = 10 121 | vals = pd.Series(np.random.rand(size) * 13 - 3) 122 | vals[np.random.rand(size) < na_frac] = np.nan 123 | cats = categ.fit_transform(vals) 124 | check_series_properly_discretized(vals, cats, categ.inverse_transform(cats)) 125 | 126 | @pytest.mark.parametrize('n_cats', [2, 20, 70]) 127 | def test_discretizer_category(n_cats): 128 | vals = pd.Series(np.random.choice([chr(48 + i) for i in range(n_cats)], 300)) 129 | c = pysynth.catdecat.SeriesDiscretizer(seed=42) 130 | with pytest.raises(TypeError): 131 | trans = c.fit_transform(vals) 132 | 133 | 134 | @pytest.mark.parametrize('n_vals', [2, 20, 70]) 135 | def test_discretizer_integer(n_vals): 136 | vals = pd.Series(np.random.randint(n_vals, size=300)) 137 | c = pysynth.catdecat.SeriesDiscretizer(seed=42) 138 | cats = c.fit_transform(vals) 139 | if n_vals < c.min_for_bin: 140 | assert (cats == vals).all() 141 | else: 142 | check_series_properly_discretized(vals, cats, c.inverse_transform(cats)) 143 | 144 | 145 | def check_df_properly_discretized(df, tr_df, reconst_df, max_nums=10): 146 | orig_cols = frozenset(df.columns) 147 | assert orig_cols == frozenset(tr_df.columns) 148 | assert orig_cols == frozenset(reconst_df.columns) 149 | for col in df.columns: 150 | check_series_properly_discretized( 151 | df[col], 152 | tr_df[col], 153 | reconst_df[col], 154 | max_nums=max_nums 155 | ) 156 | 157 | def check_series_properly_discretized(orig, tr, reconst, max_nums=10): 158 | orig_notna = orig.notna() 159 | tr_notna = tr.notna() 160 | reconst_notna = 
reconst.notna() 161 | assert (orig_notna == tr_notna).all() 162 | assert (orig_notna == reconst_notna).all() 163 | if pd.api.types.is_numeric_dtype(orig): 164 | if pd.api.types.is_categorical_dtype(tr): 165 | for val, interv, reconst in zip(orig[orig_notna], tr[tr_notna], reconst[reconst_notna]): 166 | assert val in interv 167 | assert reconst in interv 168 | else: 169 | assert orig.nunique() <= max_nums 170 | assert (orig[orig_notna] == tr[tr_notna]).all() 171 | else: 172 | assert (orig[orig_notna] == tr[tr_notna]).all() 173 | 174 | 175 | @pytest.mark.parametrize('openml_id', [31, 1461, 40536]) 176 | def test_df_discretizer(openml_id): 177 | disc = pysynth.catdecat.DataFrameDiscretizer(max_num_cats=300) 178 | df = test_data.get_openml(openml_id) 179 | tr_df = disc.fit_transform(df) 180 | tr2_df = disc.transform(df) 181 | pd.testing.assert_frame_equal(tr_df, tr2_df) 182 | reconst_df = disc.inverse_transform(tr_df) 183 | check_df_properly_discretized(df, tr_df, reconst_df, max_nums=10) 184 | 185 | -------------------------------------------------------------------------------- /tests/test_data.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import sklearn.datasets 3 | 4 | def get_openml(id): 5 | return sklearn.datasets.fetch_openml( 6 | data_id=id, 7 | target_column=None, 8 | as_frame=True, 9 | )['data'] 10 | 11 | def test_openml(): 12 | colnames = [ 13 | 'fathers_occupation', 14 | 'sons_occupation', 15 | 'family_structure', 16 | 'race', 17 | 'counts_for_sons_first_occupation', 18 | 'counts_for_sons_current_occupation' 19 | ] 20 | df = get_openml(541) 21 | assert df.columns.tolist() == colnames 22 | assert (df[colnames[:4]].dtypes == 'category').all() 23 | assert df[colnames[4:]].dtypes.apply(pd.api.types.is_numeric_dtype).all() 24 | assert len(df.index) == 1156 25 | 26 | def check_synthdf_equal(df, synth, nrows=None): 27 | assert frozenset(df.columns) == frozenset(synth.columns) 28 | assert len(synth.index) == (len(df.index) if nrows is None else nrows) 29 | for col in df.columns: 30 | assert df[col].dtype == synth[col].dtype 31 | if pd.api.types.is_categorical_dtype(df[col].dtype): 32 | assert frozenset(synth[col].cat.categories).issubset(df[col].cat.categories) 33 | elif pd.api.types.is_object_dtype(df[col].dtype): 34 | assert frozenset(synth[col]).issubset(frozenset(df[col])) 35 | elif pd.api.types.is_numeric_dtype(df[col].dtype): 36 | assert df[col].min() <= synth[col].mean() <= df[col].max() 37 | -------------------------------------------------------------------------------- /tests/test_init.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import tempfile 4 | import shutil 5 | 6 | import pandas as pd 7 | 8 | sys.path.append(os.path.join(os.path.dirname(__file__), '..')) 9 | import pysynth 10 | import test_data 11 | 12 | def test_synthesize(): 13 | df = test_data.get_openml(469) # analcatdata_dmft 14 | synth = pysynth.synthesize(df) 15 | test_data.check_synthdf_equal(df, synth) 16 | 17 | def test_main(): 18 | tmp_dir = None 19 | try: 20 | tmp_dir = tempfile.mkdtemp() 21 | in_path = os.path.join(tmp_dir, 'source.csv') 22 | out_path = os.path.join(tmp_dir, 'target.csv') 23 | test_data.get_openml(469).to_csv(in_path, sep=';', index=False) 24 | assert os.path.isfile(in_path) 25 | pysynth.main(in_path, out_path, '200') 26 | assert os.path.isfile(out_path) 27 | orig = pd.read_csv(in_path, sep=';') 28 | synth = pd.read_csv(out_path, sep=';') 29 | 
test_data.check_synthdf_equal(orig, synth, 200) 30 | finally: 31 | if tmp_dir is not None: 32 | shutil.rmtree(tmp_dir) -------------------------------------------------------------------------------- /tests/test_ipf.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import itertools 4 | 5 | import numpy as np 6 | import pandas as pd 7 | import pytest 8 | 9 | sys.path.append(os.path.join(os.path.dirname(__file__), '..')) 10 | import pysynth.ipf 11 | import test_data 12 | 13 | IPF_PRECISION = 1e-10 14 | 15 | np.random.seed(1711) 16 | 17 | SEED_GEN_PARAMS = [ 18 | ((4, 4), 0), 19 | ((8, 5), 0), 20 | ((5, 3, 3), 0), 21 | ((2, 8, 7, 4, 3), 0), 22 | ((4, 4), 0.1), 23 | ((8, 5), 0.2), 24 | ((5, 3, 3), 0.1), 25 | ((2, 8, 7, 4, 3), 0.05), 26 | ] 27 | 28 | def generate_seed_matrix(shape, zero_fraction): 29 | seed_matrix = np.random.rand(*shape) 30 | if zero_fraction > 0: 31 | seed_matrix[np.random.rand(*shape) < zero_fraction] = 0 32 | return seed_matrix 33 | 34 | @pytest.mark.parametrize('shape, zero_fraction', SEED_GEN_PARAMS) 35 | def test_ipf_correct(shape, zero_fraction): 36 | seed_matrix = generate_seed_matrix(shape, zero_fraction) 37 | marginals = [ 38 | np.random.rand(dim) for dim in shape 39 | ] 40 | for i, marginal in enumerate(marginals): 41 | margsum = marginal.sum() 42 | marginals[i] = np.array([val * 50 / margsum for val in marginal]) 43 | ipfed = pysynth.ipf.ipf(marginals, seed_matrix, precision=IPF_PRECISION) 44 | # check the shape and zeros are retained 45 | assert ipfed.shape == shape 46 | assert ((seed_matrix == 0) == (ipfed == 0)).all() 47 | if zero_fraction == 0: 48 | for i, marginal in enumerate(marginals): 49 | ipfed_sum = ipfed.sum(axis=tuple(j for j in range(ipfed.ndim) if j != i)) 50 | # check the marginal sums match 51 | assert (abs(ipfed_sum - marginal) < (IPF_PRECISION * ipfed.size / len(marginal) * 10)).all() 52 | 53 | def test_ipf_dim_mismatch(): 54 | with pytest.raises(ValueError): 55 | pysynth.ipf.ipf(list(np.ones((3,2))), np.random.rand(2,2)) 56 | 57 | def test_ipf_sum_mismatch(): 58 | with pytest.raises(ValueError): 59 | pysynth.ipf.ipf([np.ones(2), np.full(2, 2)], np.random.rand(2,2)) 60 | 61 | def test_ipf_shape_mismatch(): 62 | with pytest.raises(ValueError): 63 | pysynth.ipf.ipf([np.ones(2), np.full((2, 4), .25)], np.random.rand(2,2)) 64 | 65 | @pytest.mark.parametrize('openml_id', [31, 1461, 40536]) 66 | def test_get_axis_values(openml_id): 67 | df = test_data.get_openml(openml_id) 68 | df = df.drop( 69 | [col for col, dtype in df.dtypes.iteritems() if not pd.api.types.is_categorical_dtype(dtype)], 70 | axis=1 71 | ) 72 | maps = pysynth.ipf.get_axis_values(df) 73 | for col in maps: 74 | assert col in df.columns 75 | assert (maps[col].index == np.arange(len(maps[col].index))).all() 76 | assert frozenset(maps[col].values) == frozenset(df[col].unique()) 77 | 78 | ROUNDERS = [ 79 | pysynth.ipf.LargestRemainderRounder(), 80 | pysynth.ipf.RandomSamplingRounder(seed=1711), 81 | ] 82 | 83 | UNROUND_MATRICES = [ 84 | np.array([[[2.,.5],[.5,0]],[[1.2,1],[1,.8]],[[1,.2],[1,1.8]]]), 85 | np.random.rand(4,8,7) * 3, 86 | np.where(np.random.rand(3,7,4,2) < .2, 0, np.random.rand(3,7,4,2) * 2), 87 | ] 88 | 89 | @pytest.mark.parametrize('rder, mat', list(itertools.product( 90 | ROUNDERS, UNROUND_MATRICES 91 | ))) 92 | def test_rounders(rder, mat): 93 | result = rder.round(mat) 94 | assert np.issubdtype(result.dtype, np.integer) 95 | assert result.sum() == int(np.round(mat.sum())) 96 | assert result.min() >= 0 97 
| assert result[mat == 0].sum() == 0 98 | # for dim_i, dim in enumerate(mat.shape): 99 | # assert (result[:,dim_i] < dim).all() 100 | 101 | @pytest.mark.parametrize('mat', UNROUND_MATRICES) 102 | def test_lrem_rounder(mat): 103 | result = pysynth.ipf.LargestRemainderRounder().round(mat) 104 | assert abs(result - np.round(mat)).max() <= 1 105 | 106 | @pytest.mark.parametrize('mat', [ 107 | np.array([[[2,1],[0,0]],[[1,1],[1,3]],[[1,0],[1,2]]]), 108 | (np.random.rand(4,8,7) * 3).astype(int), 109 | (np.where(np.random.rand(3,7,4,2) < .2, 0, np.random.rand(3,7,4,2) * 2)).astype(int), 110 | ]) 111 | def test_unroll(mat): 112 | unrolled = pysynth.ipf.unroll(mat) 113 | assert unrolled.shape == (mat.sum(), mat.ndim) 114 | assert (unrolled >= 0).all() 115 | for dim_i, dim in enumerate(mat.shape): 116 | assert (unrolled[:,dim_i] < dim).all() 117 | unroll_df = pd.DataFrame(unrolled) 118 | for index, subdf in unroll_df.groupby(unroll_df.columns.tolist()): 119 | assert mat[index] == len(subdf.index) 120 | 121 | 122 | def test_map_axes(): 123 | n_cols = 6 124 | n_cats = 5 125 | indices = (np.random.rand(40, n_cols) * n_cats).astype(int) 126 | axis_values = { 127 | chr(97 + np.random.randint(26)): pd.Series( 128 | [chr(97 + k) for k in np.random.randint(26, size=n_cats)], 129 | index=np.arange(n_cats) 130 | ) for i in range(n_cols) 131 | } 132 | df = pysynth.ipf.map_axes(indices, axis_values) 133 | assert list(df.columns) == list(axis_values.keys()) 134 | assert len(df.index) == indices.shape[0] 135 | i = 0 136 | for col, mapping in axis_values.items(): 137 | for index, value in mapping.iteritems(): 138 | assert (df[col][indices[:,i] == index] == value).all() 139 | i += 1 140 | 141 | def test_calc_true_matrix(): 142 | ipfsynth = pysynth.ipf.IPFSynthesizer(cond_dim=2) 143 | n_rows = 400 144 | n_cols = 4 145 | n_cats = 5 146 | cat_indices = np.arange(n_cats) 147 | cat_objs = np.array([chr(97 + k) for k in np.arange(n_cats)]) 148 | map_to_objs = pd.Series(cat_objs, index=cat_indices) 149 | ind_df = pd.DataFrame( 150 | np.random.randint(n_cats, size=(n_rows, n_cols)), 151 | columns=[chr(97 + k) for k in np.arange(n_cols)] 152 | ) 153 | cat_df = ind_df.copy() 154 | for col in cat_df.columns: 155 | cat_df[col] = cat_df[col].map(map_to_objs) 156 | ipfsynth.axis_values = { 157 | col: map_to_objs for col in cat_df.columns 158 | } 159 | true_mat = ipfsynth.calc_true_matrix(cat_df) 160 | assert true_mat.shape == tuple([n_cats] * n_cols) 161 | assert true_mat.min() >= 0 162 | assert true_mat.sum() == n_rows 163 | assert np.isclose(true_mat, true_mat.astype(int)).all() 164 | for inds, subdf in ind_df.groupby(list(ind_df.columns)): 165 | assert true_mat[inds] == len(subdf.index) 166 | 167 | # @pytest.mark.parametrize('shape, zero_fraction', [((4, 3, 2), .1)]) 168 | @pytest.mark.parametrize('shape, zero_fraction', SEED_GEN_PARAMS) 169 | def test_obscure_seed(shape, zero_fraction): 170 | seed_matrix = (generate_seed_matrix(shape, zero_fraction) * 10).astype(int) 171 | n_dims = seed_matrix.ndim 172 | for cond_dim in range(1, min(n_dims + 1, 4)): 173 | obscured = pysynth.ipf.obscure_seed(seed_matrix, cond_dim) 174 | preserved_dims = [ 175 | tuple(sorted(frozenset(dims))) 176 | for dims in itertools.combinations_with_replacement( 177 | range(n_dims), cond_dim 178 | ) 179 | ] 180 | print(preserved_dims) 181 | for sel_dim_is in preserved_dims: 182 | other_dim_is = tuple(i for i in range(n_dims) if i not in sel_dim_is) 183 | assert np.isclose( 184 | obscured.sum(axis=other_dim_is), 185 | seed_matrix.sum(axis=other_dim_is) 186 
| ).all() 187 | 188 | # @pytest.mark.parametrize('openml_id', [11, 31, 1461, 1480]) 189 | @pytest.mark.parametrize('openml_id, rder', [(11, 'lrem'), (23, 'random')]) 190 | def test_synth(openml_id, rder): 191 | df = test_data.get_openml(openml_id) 192 | synth = pysynth.ipf.IPFSynthesizer(rounder=rder).fit_transform(df) 193 | test_data.check_synthdf_equal(df, synth) 194 | assert frozenset(df.columns) == frozenset(synth.columns) 195 | assert len(df.index) == len(synth.index) 196 | for col in df.columns: 197 | assert df[col].dtype == synth[col].dtype 198 | if pd.api.types.is_categorical_dtype(df[col].dtype): 199 | assert frozenset(synth[col].cat.categories).issubset(df[col].cat.categories) 200 | else: 201 | assert df[col].min() <= synth[col].mean() <= df[col].max() 202 | -------------------------------------------------------------------------------- /tests/test_similarity.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import itertools 4 | import warnings 5 | 6 | import scipy.stats 7 | import numpy as np 8 | import pandas as pd 9 | import pytest 10 | import sklearn.base 11 | import sklearn.exceptions 12 | 13 | sys.path.append(os.path.join(os.path.dirname(__file__), '..')) 14 | import pysynth.similarity 15 | import test_data 16 | 17 | 18 | def generate_artificial_df(n=200): 19 | return pd.DataFrame({ 20 | 'prob': np.random.rand(n), 21 | 'iq': scipy.stats.norm(loc=100, scale=15).rvs(n), 22 | 'cat' : np.random.choice(list('abc'), size=n), 23 | }) 24 | 25 | 26 | @pytest.fixture(scope='module') 27 | def testing_df_close_pairs(): 28 | np.random.seed(1711) 29 | pairs = [ 30 | (generate_artificial_df(1000), generate_artificial_df(300)), 31 | ] 32 | for openml_id in [31, 1461, 40536]: 33 | df = test_data.get_openml(openml_id) 34 | half = len(df.index) // 2 35 | pairs.append((df.iloc[:half], df.iloc[half:])) 36 | return pairs 37 | 38 | 39 | @pytest.mark.parametrize('distro_name, n_samples', itertools.product( 40 | ['norm', 'uniform'], 41 | [2, 30, 1000], 42 | )) 43 | def test_summary_stats_artificial(distro_name, n_samples): 44 | distro = getattr(scipy.stats, distro_name)() 45 | data = pd.Series(distro.rvs(n_samples)) 46 | sumstat = pysynth.similarity.summary_stats(data) 47 | for item in pysynth.similarity.SUMMARY_STATS: 48 | assert item in sumstat.index 49 | assert np.isclose(sumstat['mean'], data.mean()) 50 | assert np.isclose(sumstat['skew'], data.skew()) or ( 51 | np.isnan(sumstat['skew']) and np.isnan(data.skew()) 52 | ) 53 | 54 | def test_summary_stat_diff_artificial(): 55 | df1, df2 = generate_artificial_df(), generate_artificial_df() 56 | df_diff = pysynth.similarity.summary_stat_diff(df1, df2, 'diff') 57 | # check range for probs column 58 | assert (-1 < df_diff.loc['prob',:]).all() 59 | assert (df_diff.loc['prob',:] < 1).all() 60 | 61 | 62 | @pytest.mark.parametrize('method', ['diff', 'ape']) 63 | def test_summary_stat_diff_real(testing_df_close_pairs, method): 64 | for df1, df2 in testing_df_close_pairs: 65 | nodiff = pysynth.similarity.summary_stat_diff(df1, df2, method) 66 | # check all stats have their column 67 | assert frozenset(nodiff.columns) == frozenset([ 68 | col + '_' + method for col in pysynth.similarity.SUMMARY_STATS 69 | ]) 70 | # check all numeric columns have their row 71 | assert frozenset(nodiff.index) == frozenset([ 72 | col for col in df2 if pd.api.types.is_numeric_dtype(df2[col]) 73 | ]) 74 | if df1 is df2: 75 | assert np.isclose(nodiff, 0).all() 76 | 77 | 78 | @pytest.mark.parametrize('n_bins', [5, 
10, 15]) 79 | def test_aligned_freqs_normal(n_bins): 80 | np.random.seed(1711) 81 | df1, df2 = generate_artificial_df(1000), generate_artificial_df(300) 82 | for col in df1.columns: 83 | f1, f2 = pysynth.similarity.aligned_freqs(df1[col], df2[col], n_bins) 84 | f1_cats = frozenset(f1.index) 85 | assert not f1.hasnans 86 | assert not f2.hasnans 87 | assert f1_cats == frozenset(f2.index) 88 | assert np.isclose(f1.sum(), 1) 89 | assert (f1 >= 0).all() and (f1 <= 1).all() 90 | assert np.isclose(f2.sum(), 1) 91 | assert (f2 >= 0).all() and (f2 <= 1).all() 92 | # since the generation process is the same, the diffs should be low 93 | assert abs(f1 - f2).max() < .1 94 | if pd.api.types.is_numeric_dtype(df1[col]): 95 | assert len(f1_cats) == n_bins 96 | # the frequencies in f1 should be appx equal due to quantile binning 97 | assert (abs(f1[f1 > 0] - 1 / len(f1_cats)) < .01).all() 98 | else: 99 | assert len(f1_cats) == df1[col].nunique() 100 | 101 | 102 | def test_aligned_freqs_nobin(): 103 | x, y = pd.Series(np.random.rand(300)), pd.Series(np.random.rand(100)) 104 | f_x, f_y = pysynth.similarity.aligned_freqs(x, y, bins=None) 105 | assert f_x is None 106 | assert f_y is None 107 | 108 | 109 | @pytest.mark.parametrize('bins, metrics', itertools.product( 110 | [5, 10, None], [None, ['rtae', 'mae']] 111 | )) 112 | def test_frequency_mismatch_normal(testing_df_close_pairs, bins, metrics): 113 | for df1, df2 in testing_df_close_pairs: 114 | metric_df = pysynth.similarity.frequency_mismatch(df1, df2, bins, metrics) 115 | if metrics is None: 116 | metrics = list(freqdiff_metric_bounds.keys()) 117 | assert frozenset(metric_df.columns) == frozenset(metrics) 118 | if bins is None: 119 | assert metric_df.index.tolist() == [ 120 | col for col in df2 if not pd.api.types.is_numeric_dtype(df2[col]) 121 | ] 122 | else: 123 | assert metric_df.index.tolist() == df2.columns.tolist() 124 | assert all(not metric_df[col].hasnans for col in metric_df.columns) 125 | 126 | def random_freqs(n, zero_frac, index): 127 | if not (0 <= zero_frac < 1): raise ValueError 128 | freqs = np.random.rand(n) 129 | freqs[np.random.rand(n) < zero_frac] = 0 130 | return pd.Series(freqs / freqs.sum(), index=index) 131 | 132 | freqdiff_metric_bounds = { 133 | 'rtae': (0, 0, 2), 134 | 'overlap_coef': (1, 0, 1), 135 | 'rank_damerau': (0, 0, 1), 136 | 'morisita_overlap': (1, 0, 1), 137 | 'mae': (0, 0, 1), 138 | 'rmse': (0, 0, 1), 139 | 'jaccard_dist': (0, 0, 1), 140 | 'simpson_diff': (0, -1, 1), 141 | 'entropy_diff': (0, -np.inf, np.inf), 142 | } 143 | 144 | @pytest.mark.parametrize('n_cats, zero_frac, metrics', itertools.product( 145 | [2, 5, 15], [0, .2, .6], [None, ['rtae', 'mae']] 146 | )) 147 | def test_freqdiff_metrics(n_cats, zero_frac, metrics): 148 | np.random.seed(1711) 149 | if n_cats == 2 and zero_frac == .6: return # invalid case 150 | cats = ['c' + str(i) for i in range(n_cats)] 151 | probs1 = random_freqs(n_cats, zero_frac, cats) 152 | probs2 = random_freqs(n_cats, zero_frac, cats) 153 | metric_vals = pysynth.similarity.freqdiff_metrics(probs1, probs2, metrics) 154 | nodiff_vals = pysynth.similarity.freqdiff_metrics(probs1, probs1, metrics) 155 | if metrics is None: 156 | metrics = list(freqdiff_metric_bounds.keys()) 157 | assert frozenset(metric_vals.index) == frozenset(metrics) 158 | assert pd.api.types.is_numeric_dtype(metric_vals) 159 | for metric, value in metric_vals.items(): 160 | assert metric in freqdiff_metric_bounds 161 | nodiff, lo, hi = freqdiff_metric_bounds[metric] 162 | assert lo <= value <= hi 163 | assert metric 
in nodiff_vals.index 164 | assert np.isclose(nodiff_vals[metric], nodiff) 165 | 166 | 167 | damlev_test_cases = [ 168 | ([8, 3, 7], [8, 3, 7], 0), 169 | (['a', 'b', 'c'], ['a', 'b', 'c'], 0), 170 | (list(range(100)), list(range(100)), 0), 171 | (['c'], ['c'], 0), 172 | ([], [], 0), 173 | ([6, 4, 2], [4, 2], 1), 174 | ([6, 4, 2], [6, 4], 1), 175 | ([3, 8, 1], [], 3), 176 | ([3, 8, 1], [9, 1], 2), 177 | ([3, 8, 1], [8, 3, 1], 1), 178 | ([3, 8, 1], [1, 8, 3], 2), 179 | ([3, 8, 1], [3, 9, 1], 1), 180 | ] 181 | 182 | def test_damerau_levenshtein(): 183 | for seq1, seq2, dist in damlev_test_cases: 184 | assert pysynth.similarity.damerau_levenshtein(seq1, seq2) == dist 185 | assert pysynth.similarity.damerau_levenshtein(seq2, seq1) == dist 186 | 187 | 188 | @pytest.mark.parametrize('method', ['pearson', 'kendall', 'spearman']) 189 | def test_correlation_diff(testing_df_close_pairs, method): 190 | for df1, df2 in testing_df_close_pairs: 191 | corrdiff = pysynth.similarity.correlation_diff(df1, df2, method) 192 | ok_subset = [] 193 | for col in df2.columns: 194 | if pd.api.types.is_numeric_dtype(df2[col]): 195 | assert col in corrdiff.index 196 | assert col in corrdiff.columns 197 | if not np.isnan(corrdiff[col]).all(): 198 | ok_subset.append(col) 199 | selfcorr_diffs = np.diag(corrdiff.loc[ok_subset,ok_subset].values) 200 | assert (selfcorr_diffs == 0).all() 201 | assert (np.isnan(corrdiff) | (corrdiff >= -1)).all(axis=None) 202 | assert (np.isnan(corrdiff) | (corrdiff <= 1)).all(axis=None) 203 | 204 | 205 | def test_stat_tests(testing_df_close_pairs): 206 | for df1, df2 in testing_df_close_pairs: 207 | with warnings.catch_warnings(): 208 | warnings.simplefilter('ignore', category=RuntimeWarning) 209 | stat_df = pysynth.similarity.stat_tests(df1, df2) 210 | assert stat_df.columns.tolist() == [ 211 | 't_stat', 't_pval', 'levene_stat', 'levene_pval' 212 | ] 213 | for col in df2.columns: 214 | if pd.api.types.is_numeric_dtype(df2[col]): 215 | assert col in stat_df.index 216 | for stat in stat_df.columns: 217 | if stat.endswith('pval'): 218 | assert (np.isnan(stat_df[stat]) | (stat_df[stat] >= 0)).all() 219 | assert (np.isnan(stat_df[stat]) | (stat_df[stat] <= 1)).all() 220 | 221 | 222 | @pytest.mark.parametrize('openml_id', [31, 1461, 40536]) 223 | def test_predictor_matrix(openml_id): 224 | df = test_data.get_openml(openml_id) 225 | preds = pysynth.similarity._predictor_matrix(df) 226 | assert isinstance(preds, np.ndarray) 227 | assert np.issubdtype(preds.dtype, np.number) 228 | assert preds.ndim == 2 229 | assert preds.shape[0] == len(df.index) 230 | assert preds.shape[1] >= len(df.columns) 231 | 232 | 233 | @pytest.mark.parametrize('metrics', [None, ['gini', 'f1', 'precision']]) 234 | def test_compute_accuracy_metrics(metrics): 235 | np.random.seed(1711) 236 | n = 200 237 | sources = np.random.rand(n) 238 | targets = sources < .5 239 | prob_variants = [ 240 | (targets.astype(float), False), 241 | (sources, False), 242 | (1 - sources, False), 243 | (np.clip(sources + np.random.rand(n) * .2 - .1, 0, 1), False), 244 | (np.clip(sources + np.random.rand(n) * .4 - .2, 0, 1), False), 245 | (np.zeros(n), True), 246 | (np.ones(n), True), 247 | ] 248 | check_metrics = metrics 249 | if check_metrics is None: 250 | check_metrics = list(pysynth.similarity.DEFAULT_METRICS.keys()) 251 | for probs, is_edge in prob_variants: 252 | if is_edge: 253 | with pytest.warns(None): 254 | metric_vals = pysynth.similarity._compute_accuracy_metrics(targets, probs, metrics) 255 | else: 256 | metric_vals = 
pysynth.similarity._compute_accuracy_metrics(targets, probs, metrics) 257 | check_metric_values(metric_vals, check_metrics) 258 | 259 | 260 | def check_metric_values(metrics, names=None): 261 | if names is not None: 262 | assert metrics.index.tolist() == names 263 | assert not metrics.hasnans 264 | assert (np.round(metrics, 8) <= 1).all() 265 | assert (np.round(metrics, 8) >= -1).all() 266 | 267 | 268 | @pytest.mark.parametrize('test_size', [.1, .25, .5]) 269 | def test_discrimination(testing_df_close_pairs, test_size): 270 | for df1, df2 in testing_df_close_pairs: 271 | with warnings.catch_warnings(): 272 | warnings.simplefilter('ignore', category=RuntimeWarning) 273 | warnings.simplefilter('ignore', category=sklearn.exceptions.UndefinedMetricWarning) 274 | metrics, clf = pysynth.similarity.discrimination( 275 | df1, df2, test_size=test_size, return_best=True 276 | ) 277 | check_metric_values(metrics) 278 | if clf is None: 279 | assert metrics['gini'] <= 0 280 | else: 281 | assert metrics['gini'] > 0 282 | assert sklearn.base.is_classifier(clf) 283 | --------------------------------------------------------------------------------
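
The IPF tests above pin down the calling convention of the synthesis core. The following minimal sketch mirrors that pattern (editor-added, not part of the repository; the numbers are made up, and only `pysynth.ipf.ipf` and `IPFSynthesizer` as exercised by `tests/test_ipf.py` are used):

    import numpy as np
    import pysynth.ipf

    # one marginal per axis of the seed matrix; all marginals must sum to the same total,
    # otherwise pysynth.ipf.ipf raises ValueError (see test_ipf_sum_mismatch)
    seed = np.random.rand(3, 4)
    marginals = [np.array([10., 20., 30.]), np.array([15., 15., 15., 15.])]

    fitted = pysynth.ipf.ipf(marginals, seed, precision=1e-9)
    print(fitted.sum(axis=1))   # approximately [10, 20, 30]
    print(fitted.sum(axis=0))   # approximately [15, 15, 15, 15]

    # the dataframe-level wrapper used in test_synth:
    # synth_df = pysynth.ipf.IPFSynthesizer(rounder='lrem').fit_transform(orig_df)
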