├── .github └── workflows │ ├── pythonpackage.yml │ └── pythonpublish.yml ├── .gitignore ├── LICENSE.txt ├── README.md ├── VERSION ├── docs ├── Makefile ├── api.rst ├── conf.py ├── index.rst └── make.bat ├── pysynth ├── __init__.py ├── __main__.py ├── catdecat.py ├── ipf.py └── similarity.py ├── requirements.txt ├── requirements_test.txt ├── setup.py └── tests ├── test_catdecat.py ├── test_data.py ├── test_init.py ├── test_ipf.py └── test_similarity.py /.github/workflows/pythonpackage.yml: -------------------------------------------------------------------------------- 1 | name: Python package 2 | 3 | on: [push] 4 | 5 | jobs: 6 | build: 7 | 8 | runs-on: ubuntu-latest 9 | strategy: 10 | max-parallel: 4 11 | matrix: 12 | python-version: [3.7] 13 | 14 | steps: 15 | - uses: actions/checkout@v1 16 | - name: Set up Python ${{ matrix.python-version }} 17 | uses: actions/setup-python@v1 18 | with: 19 | python-version: ${{ matrix.python-version }} 20 | - name: Install dependencies 21 | run: | 22 | python -m pip install --upgrade pip 23 | pip install -r requirements.txt 24 | - name: Lint with flake8 25 | run: | 26 | pip install flake8 27 | # stop the build if there are Python syntax errors or undefined names 28 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 29 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 30 | flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 31 | - name: Test with pytest 32 | run: | 33 | pip install pytest 34 | pytest tests 35 | -------------------------------------------------------------------------------- /.github/workflows/pythonpublish.yml: -------------------------------------------------------------------------------- 1 | name: Upload Python Package to PyPI 2 | 3 | on: 4 | release: 5 | types: [created] 6 | 7 | jobs: 8 | deploy: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - uses: actions/checkout@master 12 | - name: Set up Python 13 | uses: actions/setup-python@v1 14 | with: 15 | python-version: '3.7' 16 | - name: Install dependencies 17 | run: | 18 | python -m pip install --upgrade pip 19 | pip install setuptools wheel twine 20 | python setup.py sdist bdist_wheel 21 | - name: Build and publish to Test PyPI 22 | uses: pypa/gh-action-pypi-publish@master 23 | with: 24 | password: ${{ secrets.test_pypi_apitoken }} 25 | repository_url: https://test.pypi.org/legacy/ 26 | - name: Build and publish 27 | uses: pypa/gh-action-pypi-publish@master 28 | with: 29 | password: ${{ secrets.pypi_apitoken }} 30 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright 2019, Jan Šimbera 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | SOFTWARE. 20 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # PySynth: Dataset Synthesis for Python 2 | 3 | PySynth is a package to create synthetic datasets - that is, datasets that look 4 | just like the original in terms of statistical properties, variable values, 5 | distributions and correlations, but do not have exactly the same contents 6 | so are safe against data disclosure. An alternative to R's 7 | [Synthpop](https://www.r-bloggers.com/generating-synthetic-data-sets-with-synthpop-in-r/) 8 | with a more permissive license. 
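Once installed (see below), a minimal round trip might look like this (a rough
sketch; see the Usage section for details):

    import pandas as pd
    import pysynth

    orig = pd.read_csv('source.csv')
    synth = pysynth.synthesize(orig, n_rows=1000)  # uses the IPF synthesizer by default
    synth.to_csv('synthesized.csv', index=False)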
9 | 
10 | ## Installation
11 | You can get PySynth from PyPI by using the obvious
12 | 
13 |     pip install pysynth
14 | 
15 | ## Usage
16 | You can perform the synthesis with basic settings directly on a CSV file:
17 | 
18 |     python -m pysynth source.csv synthesized.csv
19 | 
20 | This produces a `synthesized.csv` file that will look a lot like the original
21 | (variable names, values, distributions, correlations) but will (most likely)
22 | not be the same.
23 | 
24 | For better control, it is best to use the synthesizer objects. They follow the
25 | scikit-learn interface for Pandas dataframes so you `fit()` them on the
26 | original and then `sample(n)` to get a synthetic dataframe of `n` rows.
27 | 
28 | So far, only a synthesizer based on iterative proportional fitting
29 | (`pysynth.ipf.IPFSynthesizer`) is available. This synthesis bins continuous
30 | variables to categories and reconstructs them using fitted univariate
31 | distributions. Missing values (`NaN`) are preserved.
32 | 
33 | Synthesis quality measurement modules are yet to be added.
34 | 
35 | ## Contributors
36 | Feedback, additions, suggestions, issues and pull requests are welcome and much
37 | appreciated on [GitHub](https://github.com/simberaj/pysynth).
38 | 
39 | How to add features:
40 | 
41 | 1. Fork it (https://github.com/simberaj/pysynth/fork)
42 | 2. Create your feature branch (`git checkout -b feature/feature-name`)
43 | 3. Commit your changes (`git commit -am "feature-name added"`)
44 | 4. Push to the branch (`git push origin feature/feature-name`)
45 | 5. Create a new pull request
46 | 
47 | Development requires `pytest` for testing and `sphinx` to generate
48 | documentation. Tests can be run using a simple
49 | 
50 |     pytest tests
51 | 
52 | ### Intended development directions
53 | - Synthesis quality measurement in terms of anonymization/similarity
54 | - Model-based synthesis along the lines of R's Synthpop
55 | 
56 | ## License and author info
57 | PySynth is developed by Jan Šimbera.
58 | 
59 | PySynth is available under the MIT license. See `LICENSE.txt` for more details.
--------------------------------------------------------------------------------
/VERSION:
--------------------------------------------------------------------------------
1 | 0.0.4
2 | 
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 | 
4 | # You can set these variables from the command line, and also
5 | # from the environment for the first two.
6 | SPHINXOPTS ?=
7 | SPHINXBUILD ?= sphinx-build
8 | SOURCEDIR = .
9 | BUILDDIR = _build
10 | 
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | 	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 | 
15 | .PHONY: help Makefile
16 | 
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | 	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 | 
--------------------------------------------------------------------------------
/docs/api.rst:
--------------------------------------------------------------------------------
1 | PySynth API
2 | ==========================
3 | 
4 | Entry points
5 | --------------
6 | 
7 | .. automodule:: pysynth
8 |    :members:
9 | 
10 | 
11 | IPF synthesis
12 | --------------
13 | 
14 | .. 
automodule:: pysynth.ipf 15 | :members: 16 | 17 | 18 | Categorization and continuous variable reconstruction 19 | ----------------------------------------------------- 20 | 21 | .. automodule:: pysynth.catdecat 22 | :members: 23 | :exclude-members: Binner, Distributor 24 | 25 | 26 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | 4 | # import recommonmark 5 | # from recommonmark.transform import AutoStructify 6 | 7 | # to allow autodoc to discover the documented modules 8 | sys.path.append(os.path.join(os.path.dirname(__file__), '..')) 9 | 10 | project = 'pysynth' 11 | copyright = '2019, Jan Šimbera' 12 | author = 'Jan Šimbera' 13 | 14 | extensions = [ 15 | 'sphinx.ext.autodoc', 16 | 'recommonmark', 17 | ] 18 | 19 | source_suffix = { 20 | '.rst': 'restructuredtext', 21 | '.md': 'markdown', 22 | } 23 | 24 | templates_path = ['_templates'] 25 | 26 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 27 | 28 | html_theme = 'sphinxdoc' 29 | 30 | html_static_path = ['_static'] 31 | 32 | # At the bottom of conf.py 33 | # def setup(app): 34 | # app.add_config_value('recommonmark_config', { 35 | # # 'url_resolver': (lambda url: github_doc_root + url), 36 | # 'auto_toc_tree_section': 'Contents', 37 | # }, True) 38 | # app.add_transform(AutoStructify) 39 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | PySynth: Dataset Synthesis for Python 2 | ===================================== 3 | 4 | PySynth is a package to create synthetic datasets - that is, datasets that look 5 | just like the original in terms of statistical properties, variable values, 6 | distributions and correlations, but do not have exactly the same contents 7 | so are safe against data disclosure. 8 | 9 | 10 | .. toctree:: 11 | :maxdepth: 2 12 | :caption: Contents: 13 | 14 | api 15 | 16 | 17 | Indices and tables 18 | ================== 19 | 20 | * :ref:`genindex` 21 | * :ref:`modindex` 22 | * :ref:`search` 23 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 
23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /pysynth/__init__.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional 2 | 3 | import pandas as pd 4 | 5 | from . import ipf 6 | 7 | SYNTHESIZERS = { 8 | 'ipf': ipf.IPFSynthesizer, 9 | } 10 | 11 | DEFAULT_METHOD = 'ipf' 12 | 13 | 14 | def synthesize(dataframe: pd.DataFrame, 15 | n_rows: Optional[int] = None, 16 | method: str = DEFAULT_METHOD, 17 | ignore_cols: List[str] = [], 18 | **kwargs) -> pd.DataFrame: 19 | '''Synthesize an analog to a given dataframe. 20 | 21 | Optional keyword arguments are passed to the selected synthesizer. 22 | 23 | :param dataframe: Data to be synthesized. 24 | :param n_rows: Number of output rows. If omitted, the same 25 | length as the input dataframe will be used. 26 | :param method: Method to use for synthesis. So far, only the `ipf` method 27 | using :class:`ipf.IPFSynthesizer` is available. 28 | :param ignore_cols: Columns not to be synthesized in the output (such as 29 | personal identifiers). 30 | ''' 31 | synther = SYNTHESIZERS[method](ignore_cols=ignore_cols, **kwargs) 32 | synther.fit(dataframe) 33 | return synther.sample(n_rows) 34 | 35 | 36 | def main(in_file: str, 37 | out_file: str, 38 | n_rows: str = None, 39 | method: str = DEFAULT_METHOD 40 | ) -> None: 41 | '''Synthesize an analog to a given CSV file. 42 | 43 | :param in_file: A CSV file with data to serve as basis for synthesis. 44 | :param out_file: A path to output the synthesized CSV. Will be 45 | semicolon-delimited. 46 | :param n_rows: Number of rows for the output file. If omitted, the same 47 | length as the input file will be used. 48 | :param method: Synthesis method to be used (see :func:`synthesize`). 49 | ''' 50 | if n_rows is not None: 51 | n_rows = int(n_rows) 52 | orig_df = pd.read_csv(in_file, sep=None, engine='python') 53 | synth_df = synthesize(orig_df, n_rows=n_rows, method=method) 54 | synth_df.to_csv(out_file, sep=';', index=False) 55 | -------------------------------------------------------------------------------- /pysynth/__main__.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import pysynth 4 | 5 | if __name__ == '__main__': 6 | pysynth.main(*sys.argv[1:]) 7 | -------------------------------------------------------------------------------- /pysynth/catdecat.py: -------------------------------------------------------------------------------- 1 | '''Bin continuous variables to categorical and reconstruct them back. 2 | 3 | An auxiliary module that enables categorical-only synthesizers to work with 4 | continuous variables by binning them to categories for the synthesis while 5 | remembering the value distributions within each category, and then converting 6 | the synthesized categories back to continuous values using those distributions. 7 | 8 | The main work is done by the :class:`Categorizer` that does this trick for a 9 | single variable (pandas Series). 
It might be further configured by using an 10 | appropriate *binner* such as :class:`QuantileBinner` to choose the numeric 11 | bounds for the categories 12 | and an appropriate *distributor* such as :class:`FittingDistributor` 13 | to remember and regenerate the intra-category value distribution. 14 | ''' 15 | 16 | from __future__ import annotations 17 | from typing import Union, Optional, List, Dict, Callable 18 | 19 | import numpy as np 20 | import pandas as pd 21 | import scipy.stats 22 | import sklearn.model_selection 23 | import sklearn.neighbors 24 | 25 | 26 | class Binner: 27 | '''Interface for numeric variable interval boundary determiners.''' 28 | def get(self, data: pd.Series) -> List[float]: 29 | '''Return a list of right-inclusive cut values, without endpoints.''' 30 | raise NotImplementedError 31 | 32 | 33 | class QuantileBinner(Binner): 34 | '''A binner that gives quantile cuts. 35 | 36 | :param bins: Number of quantiles to bin to. 37 | ''' 38 | def __init__(self, bins: int): 39 | self.bins = bins 40 | 41 | def get(self, data: pd.Series) -> List[float]: 42 | return data.quantile( 43 | (np.arange(self.bins - 1) + 1) / self.bins 44 | ).drop_duplicates().tolist() 45 | 46 | 47 | class EqualRangeBinner(Binner): 48 | '''A binner that gives equal-range cuts. 49 | 50 | :param bins: Number of bins to bin to. 51 | ''' 52 | def __init__(self, bins: int): 53 | self.bins = bins 54 | 55 | def get(self, data: pd.Series) -> List[float]: 56 | return np.linspace(data.min(), data.max(), self.bins + 1)[1:-1].tolist() 57 | 58 | 59 | class AprioriBinner(Binner): 60 | '''A dummy binner that returns cut values it was initialized with.''' 61 | def __init__(self, bins: List[float]): 62 | self.bins = bins 63 | 64 | def get(self, data: pd.Series) -> List[float]: 65 | return self.bins 66 | 67 | 68 | BINNERS = { 69 | 'quantile': QuantileBinner, 70 | 'equalrange': EqualRangeBinner, 71 | } 72 | 73 | 74 | class Distributor: 75 | '''Interface for numeric variable reconstructors. 76 | 77 | Fits itself on values for a single interval, and reproduces the 78 | distribution for a given number of output values by random sampling. 79 | ''' 80 | def copy(self) -> Distributor: 81 | raise NotImplementedError 82 | 83 | def fit(self, values: np.ndarray) -> None: 84 | '''Fit a distribution on the values for a given interval.''' 85 | raise NotImplementedError 86 | 87 | def sample(self, n: int) -> np.ndarray: 88 | '''Generate a given count of random values from the fitted distribution.''' 89 | raise NotImplementedError 90 | 91 | @classmethod 92 | def create(cls, code: str, *args, **kwargs): 93 | return cls.CODES[code](*args, **kwargs) 94 | 95 | 96 | class SelectingDistributor: 97 | '''Randomly sample from a value set according to value frequencies. 98 | 99 | Useful for variables with a small number of unique values. 
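    For illustration, a rough sketch of the intended use (sampling is random,
    so the output shown is only one possibility)::

        d = SelectingDistributor()
        d.fit(np.array([1, 1, 1, 2]))   # observed frequencies: 1 -> 0.75, 2 -> 0.25
        d.sample(4)                     # e.g. array([1, 2, 1, 1])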
100 | ''' 101 | def __init__(self, seed: Optional[int] = None): 102 | self.seed = seed 103 | 104 | def copy(self) -> SelectingDistributor: 105 | return SelectingDistributor(seed=self.seed) 106 | 107 | def fit(self, values: np.ndarray) -> SelectingDistributor: 108 | valcounts = pd.Series(values).value_counts() 109 | self.targets = valcounts.index.values 110 | self.probs = valcounts.values.astype(float) 111 | self.probs /= self.probs.sum() 112 | 113 | def sample(self, n: int) -> np.ndarray: 114 | return np.random.choice(self.targets, size=n, p=self.probs) 115 | 116 | 117 | class DiscreteDistributor(Distributor): 118 | CODES = { 119 | 'select': SelectingDistributor, 120 | } 121 | 122 | 123 | class MeanDistributor: 124 | '''Reproduce the values as a constant value of their mean.''' 125 | def __init__(self, seed=None): 126 | pass 127 | 128 | def copy(self) -> MeanDistributor: 129 | return MeanDistributor() 130 | 131 | def fit(self, values: np.ndarray) -> MeanDistributor: 132 | self.mean = values.mean() 133 | return self 134 | 135 | def sample(self, n: int) -> np.ndarray: 136 | return np.full(n, self.mean) 137 | 138 | 139 | class StatisticalDistributor: 140 | '''Reproduce the values from a univariate distribution fitted to the originals. 141 | 142 | Find the continuous distribution from a provided list that 143 | approximates the distribution of the input values the best according to 144 | the Kolmogorov-Smirnov two-sample statistic, fit its parameters and sample 145 | from it. Values outside the range of fitting data are discarded and 146 | re-sampled. 147 | 148 | :param distributions: `scipy.stats`-like continuous distributions. Need to 149 | support a class method `fit()` that produces all required constructor 150 | arguments as a tuple, and a `rvs(int)` method to generate random 151 | samples. Defaults to `DEFAULT_DISTRIBUTIONS`. The distributions should 152 | be approximately truncated, otherwise convergence is not guaranteed. 153 | :param min_samples: Minimum number of generated samples to use when 154 | evaluating the KS fit statistic. 155 | :param seed: Random generator seed, applied both before fitting and before 156 | each generator run. 
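    For illustration, a rough usage sketch (the selected distribution and the
    sampled values depend on the random input, so results will vary)::

        d = StatisticalDistributor(seed=42)
        d.fit(np.random.uniform(10, 20, size=500))
        d.sample(3)   # three values, all within the [10, 20] range of the input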
157 | ''' 158 | DEFAULT_DISTRIBUTIONS: List[scipy.stats._distn_infrastructure.rv_continuous] = [ 159 | scipy.stats.uniform, 160 | scipy.stats.truncnorm, 161 | scipy.stats.truncexpon, 162 | scipy.stats.triang, 163 | ] 164 | 165 | def __init__(self, 166 | distributions: List[scipy.stats._distn_infrastructure.rv_continuous] = DEFAULT_DISTRIBUTIONS, 167 | min_samples: int = 100, 168 | seed: Optional[int] = None, 169 | ): 170 | self.distributions = distributions 171 | self.min_samples = min_samples 172 | self.seed = seed 173 | 174 | def copy(self) -> StatisticalDistributor: 175 | return StatisticalDistributor( 176 | self.distributions, 177 | self.min_samples, 178 | self.seed, 179 | ) 180 | 181 | def fit(self, values: np.ndarray) -> StatisticalDistributor: 182 | self.minval = values.min() 183 | self.maxval = values.max() 184 | self.valrange = self.maxval - self.minval 185 | if self.valrange == 0: 186 | # does not matter what goes here, will be multiplied by zero anyway 187 | best_distro = scipy.stats.norm(0, 1) 188 | else: 189 | best_distro = None 190 | normalized = (values - self.minval).astype(float) / self.valrange 191 | best_fit = 1 192 | test_size = max(len(normalized), self.min_samples) 193 | for distro in self.distributions: 194 | distro_obj = self._fit_distribution(distro, normalized) 195 | if distro_obj is not None: 196 | fit_est = scipy.stats.ks_2samp( 197 | normalized, 198 | distro_obj.rvs(test_size) 199 | )[0] 200 | if fit_est < best_fit: 201 | best_distro = distro_obj 202 | best_fit = fit_est 203 | if best_distro is None: 204 | raise ValueError('no distribution could be estimated') 205 | self.distribution = best_distro 206 | self.generator = restricted_sampler( 207 | lambda n: self.minval + self.valrange * self.distribution.rvs(n), 208 | self.minval, 209 | self.maxval, 210 | ) 211 | return self 212 | 213 | def _fit_distribution(self, 214 | distribution: scipy.stats._distn_infrastructure.rv_continuous, 215 | values: np.ndarray, 216 | ) -> Optional[scipy.stats._distn_infrastructure.rv_frozen]: 217 | try: 218 | old_setting = np.seterr(all='raise') 219 | args = distribution.fit(values) 220 | except FloatingPointError: 221 | return None 222 | finally: 223 | np.seterr(**old_setting) 224 | if np.isnan(args).any(): # invalid distribution estimated 225 | return None 226 | else: 227 | return distribution(*args) 228 | 229 | def sample(self, n: int) -> np.ndarray: 230 | np.random.seed(self.seed) 231 | return self.generator(n) 232 | 233 | 234 | class KDEDistributor: 235 | '''Reproduce the values from a kernel density estimate fitted to the originals. 236 | 237 | Find the continuous distribution from a provided list that 238 | approximates the distribution of the input values the best according to 239 | the Kolmogorov-Smirnov two-sample statistic, fit its parameters and sample 240 | from it. Values outside the range of fitting data are discarded and 241 | re-sampled. 242 | 243 | Warning, this is apparently highly computationally demanding for large 244 | datasets. 245 | 246 | :param n_bandwidths: Number of tries for the KDE bandwidth estimation, 247 | in a logarithmic range between .1 and 10. The best fitting output is 248 | kept using grid search. 249 | :param seed: Random generator seed, applied both before fitting and before 250 | each generator run. 
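    For illustration, a rough usage sketch (a ``sklearn.neighbors.KernelDensity``
    model is grid-searched over the bandwidths and then sampled, so results
    will vary)::

        d = KDEDistributor(n_bandwidths=5, seed=0)
        d.fit(np.random.normal(50., 5., size=200))
        d.sample(10)   # ten values within the range of the fitted data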
251 | ''' 252 | def __init__(self, 253 | n_bandwidths: int = 10, 254 | seed: Optional[int] = None, 255 | ): 256 | self.n_bandwidths = n_bandwidths 257 | self.seed = seed 258 | 259 | def copy(self) -> KDEDistributor: 260 | return KDEDistributor( 261 | self.min_unique_continuous, 262 | self.max_iter, 263 | self.n_bandwidths, 264 | self.seed, 265 | ) 266 | 267 | def fit(self, values: np.ndarray) -> KDEDistributor: 268 | np.random.seed(self.seed) 269 | bandwidths = 10 ** np.linspace(-1, 1, self.n_bandwidths) 270 | grid = sklearn.model_selection.GridSearchCV( 271 | sklearn.neighbors.KernelDensity(), 272 | {'bandwidth': bandwidths} 273 | ) 274 | grid.fit(values.reshape(-1, 1)) 275 | self.kde = grid.best_estimator_ 276 | self.generator = restricted_sampler( 277 | self.kde.sample, 278 | values.min(), 279 | values.max(), 280 | ) 281 | 282 | def sample(self, n: int) -> np.ndarray: 283 | np.random.seed(self.seed) 284 | return self.generator(n) 285 | 286 | 287 | def restricted_sampler(generator: Callable[int], 288 | minval: float, 289 | maxval: float, 290 | max_iter: int = 10, 291 | ) -> Callable[int]: 292 | '''Restrict a value generator to a specified range of values. 293 | 294 | :param generator: A function generating random values in specified counts. 295 | :param minval: Minimum value to generate. 296 | :param maxval: Maximum value to generate. 297 | :param max_iter: Maximum number of iterations. If unable to generate enough 298 | values within range by generating this times more values from the 299 | underlying generator, fail. 300 | :raises ValueError: If max_iter is exceeded. 301 | ''' 302 | def sampler(n): 303 | g = 0 304 | i = 0 305 | results = [] 306 | while g < n: 307 | if i == max_iter: 308 | raise ValueError('faulty generator, could not get values in range') 309 | vals = generator(n) 310 | sel_vals = vals[(minval <= vals) & (vals <= maxval)] 311 | results.append(sel_vals) 312 | g += len(sel_vals) 313 | i += 1 314 | return np.hstack(results)[:n] 315 | return sampler 316 | 317 | 318 | class ContinuousDistributor(Distributor): 319 | CODES = { 320 | 'mean': MeanDistributor, 321 | 'statdist': StatisticalDistributor, 322 | 'kde': KDEDistributor, 323 | } 324 | 325 | 326 | class SeriesDiscretizer: 327 | '''Discretize a continuous series to categorical. 328 | 329 | Able to reconstruct variables to their continuous form by estimating 330 | distributions within bins. 331 | 332 | :param binner: Method to use to determine interval boundaries for 333 | discretization. Use a :class:`Binner` instance or one of the following 334 | strings: 335 | 336 | - `'quantile'` for binning into quantiles (:class:`QuantileBinner`), 337 | - `'equalrange'` for binning into equally sized bins 338 | (:class:`EqualRangeBinner`). 339 | 340 | :param bins: Number of intervals to which to bin non-categorical variables, 341 | or boundaries of the intervals as a list. If a list is given, it 342 | overrides the *binner* argument and uses :class:`AprioriBinner`. In 343 | that case, do not specify the minimum or maximum in the list, just the 344 | intermediate cuts. 345 | :param min_unique_continuous: Minimum number of unique values in the input 346 | to regard a distribution as continuous and not discrete. 347 | :param discrete_distributor: Method to use to reconstruct numeric values 348 | for a given category if there is less unique values than 349 | `min_unique_continuous`. Use a Distributor instance 350 | or one of the following strings: 351 | 352 | - `'select'` for :class:`SelectingDistributor` (weighted random sampling). 
353 | 354 | :param continuous_distributor: Like `discrete_distributor`, but for cases 355 | when there is many unique values. Use a Distributor instance 356 | or one of the following strings: 357 | 358 | - `'mean'` for :class:`MeanDistributor` (constant mean value), 359 | - `'statdist'` for :class:`StatisticalDistributor` (simple estimated distribution), 360 | - `'kde'` for :class:`KDEDistributor` (KDE-estimated distribution). 361 | 362 | :param seed: Seed for the variable reconstruction. 363 | ''' 364 | def __init__(self, 365 | binner: Union[str, Binner] = 'quantile', 366 | bins: Union[int, List[float]] = 10, 367 | min_for_bin: Optional[int] = 10, 368 | min_unique_continuous: int = 10, 369 | discrete_distributor: Union[str, Distributor] = 'select', 370 | continuous_distributor: Union[str, Distributor] = 'statdist', 371 | seed: Optional[int] = None, 372 | ): 373 | if isinstance(binner, str): 374 | if isinstance(bins, int): 375 | self.binner = BINNERS[binner](bins) 376 | else: 377 | self.binner = AprioriBinner(bins) 378 | else: 379 | self.binner = binner 380 | self.min_for_bin = min_for_bin 381 | self.min_unique_continuous = min_unique_continuous 382 | self.discrete_distributor = DiscreteDistributor.create( 383 | discrete_distributor, seed=seed 384 | ) if isinstance(discrete_distributor, str) else discrete_distributor 385 | self.continuous_distributor = ContinuousDistributor.create( 386 | continuous_distributor, seed=seed 387 | ) if isinstance(continuous_distributor, str) else continuous_distributor 388 | self.active = False 389 | 390 | def copy(self) -> SeriesDiscretizer: 391 | return SeriesDiscretizer( 392 | binner=self.binner, 393 | min_for_bin=self.min_for_bin, 394 | discrete_distributor=self.discrete_distributor, 395 | continuous_distributor=self.continuous_distributor, 396 | ) 397 | 398 | def fit(self, series: pd.Series) -> SeriesDiscretizer: 399 | '''Fit the discretizer on a given series. 400 | 401 | Get cut values from the underlying binner, fit distributors for the bins 402 | and prepare the mapping. 403 | 404 | :raises TypeError: If the series is not numeric. 
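        A minimal sketch of a fit-and-reconstruct round trip (the input is
        random, so the reconstructed values will vary)::

            disc = SeriesDiscretizer(bins=4, seed=0)
            cats = disc.fit_transform(pd.Series(np.random.rand(100)))  # categorical bins
            nums = disc.inverse_transform(cats)                        # continuous again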
405 | ''' 406 | if not pd.api.types.is_numeric_dtype(series): 407 | raise TypeError(f'cannot discretize a non-numeric series of dtype {series.dtype}') 408 | n_unique = series.nunique() 409 | if self.min_for_bin is None or n_unique >= self.min_for_bin: 410 | cuts = self._get_cuts(series) 411 | if n_unique >= len(frozenset(cuts)): 412 | transformed = pd.cut(series, cuts, include_lowest=True) 413 | self.active = True 414 | self.index = transformed.cat.categories 415 | self.distributors = self._fit_distributors(series, transformed) 416 | self.dtype = series.dtype 417 | return self 418 | 419 | def _get_cuts(self, series: pd.Series) -> List[float]: 420 | cuts = self.binner.get(series) 421 | minval = series.min() 422 | if cuts[0] != minval: 423 | cuts.insert(0, minval) 424 | cuts.append(series.max() + 1) 425 | return cuts 426 | 427 | def _fit_distributors(self, 428 | original: pd.Series, 429 | transformed: pd.Series, 430 | ) -> List[Distributor]: 431 | distributors = [] 432 | for cat, bin_vals in original.groupby(transformed): 433 | if len(bin_vals.index) > 0: 434 | n_unique = bin_vals.nunique() 435 | if n_unique < self.min_unique_continuous: 436 | d = self.discrete_distributor.copy() 437 | else: 438 | d = self.continuous_distributor.copy() 439 | d.fit(bin_vals.values) 440 | else: 441 | # no values in bin, return a mean-producing distributor 442 | # at the center of the interval 443 | d = MeanDistributor(seed=self.continuous_distributor.seed) 444 | d.fit(np.array([cat.left, cat.right])) 445 | distributors.append(d) 446 | return distributors 447 | 448 | def transform(self, series: pd.Series) -> pd.Series: 449 | '''Discretize the series to a dtype of :class:`pd.Categorical`.''' 450 | if self.active: 451 | return pd.cut(series, self.index) 452 | else: 453 | return series 454 | 455 | def fit_transform(self, series: pd.Series) -> pd.Series: 456 | self.fit(series) 457 | return self.transform(series) 458 | 459 | def inverse_transform(self, series: pd.Series) -> pd.Series: 460 | '''De-discretize the series to a continuous one. 461 | 462 | For each bin, use the fitted distributor to produce continuous values 463 | to fill the series. 464 | ''' 465 | if self.active: 466 | reconstructed = pd.Series(0, dtype=self.dtype, index=series.index) 467 | for category, distributor in zip(self.index, self.distributors): 468 | locator = (series == category) 469 | reconstructed[locator] = distributor.sample(locator.sum()) 470 | na_loc = series.isna() 471 | if na_loc.any(): 472 | reconstructed[na_loc] = np.nan 473 | return reconstructed.astype(self.dtype) 474 | else: 475 | return series 476 | 477 | 478 | class DataFrameDiscretizer: 479 | '''Discretize all continuous columns in a dataframe to categorical. 480 | 481 | Categorical variables are left untouched. 482 | 483 | :param series_discretizer: A discretizer with setup to use for individual 484 | series. If this is None, any remaining constructor parameters are 485 | passed to the constructor of :class:`SeriesDiscretizer`. 486 | If this is a dictionary, the discretizers are applied to the columns 487 | denoted by the dictionary keys and the remaining columns are not 488 | discretized. 489 | :param max_num_cats: Maximum number of categories to accept. High numbers 490 | of categories make categorical synthesizers unstable. If any of the 491 | variables has more distinct values than this number after 492 | categorization, a ValueError is raised. 
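    For instance, a rough sketch (assuming ``df`` is a dataframe mixing numeric
    and categorical columns)::

        disc = DataFrameDiscretizer(max_num_cats=30)
        binned = disc.fit_transform(df)            # numeric columns become categorical
        restored = disc.inverse_transform(binned)  # and are reconstructed back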
493 | ''' 494 | def __init__(self, 495 | series_discretizer: Union[ 496 | None, SeriesDiscretizer, Dict[str, SeriesDiscretizer] 497 | ] = None, 498 | max_num_cats: Optional[int] = 50, 499 | **kwargs 500 | ): 501 | if isinstance(series_discretizer, dict): 502 | self.discretizers = series_discretizer 503 | self.pattern = None 504 | else: 505 | self.discretizers = None 506 | if series_discretizer is None: 507 | self.pattern = SeriesDiscretizer(**kwargs) 508 | else: 509 | self.pattern = series_discretizer 510 | self.max_num_cats = max_num_cats 511 | 512 | def fit(self, dataframe: pd.DataFrame) -> DataFrameDiscretizer: 513 | '''Fit series discretizers for all non-categorical columns of the dataframe. 514 | 515 | If per-column discretizers were specified, other columns are ignored. 516 | 517 | :raises TypeError: If any column with an explicitly specified 518 | per-column discretizer in a constructor dict is not numeric. 519 | ''' 520 | self.fit_transform(dataframe) 521 | return self 522 | 523 | def fit_transform(self, dataframe: pd.DataFrame) -> pd.DataFrame: 524 | if self.discretizers is None: 525 | self.discretizers = { 526 | col: self.pattern.copy() 527 | for col in dataframe.columns 528 | } 529 | transformed = dataframe.copy() 530 | for col in dataframe.columns: 531 | if col in self.discretizers: 532 | if pd.api.types.is_numeric_dtype(dataframe[col]): 533 | transformed[col] = self.discretizers[col].fit_transform( 534 | dataframe[col] 535 | ) 536 | else: 537 | if self.pattern is None: 538 | raise TypeError(f'column {col} is not numeric but explicit discretizer provided') 539 | else: 540 | del self.discretizers[col] 541 | if self.max_num_cats is not None: 542 | n_after = transformed[col].nunique() 543 | if n_after > self.max_num_cats: 544 | raise ValueError(f'too many categories for {col} ({n_after})') 545 | return transformed 546 | 547 | def transform(self, dataframe: pd.DataFrame) -> pd.DataFrame: 548 | '''Discretize all non-categorical columns.''' 549 | dataframe = dataframe.copy() 550 | for col in self.discretizers: 551 | dataframe[col] = self.discretizers[col].transform(dataframe[col]) 552 | return dataframe 553 | 554 | def inverse_transform(self, dataframe: pd.DataFrame) -> pd.DataFrame: 555 | '''Return all formerly non-categorical columns to continuous.''' 556 | dataframe = dataframe.copy() 557 | for col in self.discretizers: 558 | dataframe[col] = self.discretizers[col].inverse_transform( 559 | dataframe[col] 560 | ) 561 | return dataframe 562 | -------------------------------------------------------------------------------- /pysynth/ipf.py: -------------------------------------------------------------------------------- 1 | from typing import Union, Optional, List, Dict, Tuple 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import itertools 6 | 7 | from . import catdecat 8 | 9 | 10 | class MatrixRounder: 11 | def round(self, matrix: np.ndarray) -> np.ndarray: 12 | '''Round a matrix to integers, preserving its grand total.''' 13 | raise NotImplementedError 14 | 15 | 16 | class LargestRemainderRounder(MatrixRounder): 17 | '''Round a matrix to integers using the largest-remainder method. 18 | 19 | The largest-remainder method (Hare quota) is deterministic and allocates 20 | roundings to the largest remainders. Ties are broken by selecting the cells 21 | with largest indices. 22 | 23 | :param seed: Meaningless, this method is deterministic. 
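    A tiny worked example (the grand total of 5.0 is preserved)::

        r = LargestRemainderRounder()
        r.round(np.array([[1.4, 2.3],
                          [0.2, 1.1]]))
        # floors sum to 4, so the one remaining unit goes to the largest
        # remainder (0.4), giving array([[2, 2],
        #                                [0, 1]])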
24 | ''' 25 | def __init__(self, seed: Optional[int] = None): 26 | pass # this is a deterministic rounder 27 | 28 | def round(self, matrix: np.ndarray) -> np.ndarray: 29 | # round down to integers, those are sure hits 30 | rounded = matrix.astype(int) 31 | # compute remainders to be distributed 32 | remainders = matrix - rounded 33 | sum_remaining = int(np.round(remainders.sum())) 34 | # locate sum_remaining largest remainders 35 | ind_add = np.argsort( 36 | remainders, axis=None, kind='stable' 37 | )[::-1][:sum_remaining] 38 | rounded[np.unravel_index(ind_add, matrix.shape)] += 1 39 | return rounded 40 | 41 | 42 | class RandomSamplingRounder(MatrixRounder): 43 | '''Round a matrix to integers using random sampling. 44 | 45 | Randomly sample from matrix cells, using their values as probabilities, 46 | until the sum is matched. 47 | 48 | :param seed: Seed for the random sampler. 49 | ''' 50 | def __init__(self, seed: Optional[int] = None): 51 | self.seed = seed 52 | 53 | def round(self, matrix: np.ndarray) -> np.ndarray: 54 | matrix_sum = matrix.sum() 55 | final_total = int(np.round(matrix_sum)) 56 | probs = (matrix / matrix_sum).flatten() 57 | # print('PROBS', probs.sum()) 58 | np.random.seed(self.seed) 59 | # randomly select cells to be included 60 | bucket_is = np.random.choice(len(probs), size=final_total, p=probs) 61 | # count the cells 62 | cell_counts = np.bincount(bucket_is) 63 | return np.hstack(( 64 | cell_counts, 65 | np.zeros(matrix.size - len(cell_counts), dtype=cell_counts.dtype) 66 | )).reshape(*matrix.shape) 67 | 68 | 69 | ROUNDERS = { 70 | 'lrem': LargestRemainderRounder, 71 | 'random': RandomSamplingRounder, 72 | } 73 | 74 | 75 | class IPFSynthesizer: 76 | '''Synthesize a dataframe using iterative proportional fitting. 77 | 78 | Creates a dataframe that has similar statistical properties to the original 79 | but does not replicate its rows directly. Preserves univariate 80 | distributions and covariate distributions to a chosen degree. 81 | Non-categorical variables are converted to categorical for synthesis 82 | and then reconstructed using estimated distributions. 83 | 84 | :param cond_dim: Degree to which to match covariate distributions. 85 | By default, covariates to degree two (two variables' cross-tables) 86 | will be preserved. If you set this higher than the number of columns in 87 | the dataframe, the dataframe will be replicated exactly (except for 88 | the categorization and decategorization of non-categorical variables). 89 | :param discretizer: A :class:`catdecat.DataFrameDiscretizer` instance to 90 | convert numeric variables to and from categorical ones. 91 | Can be specified as a single instance or per variable in a dictionary. 92 | If not given, a single instance with default setup will be created. 93 | :param rounder: Method to use to round the IPF matrix to integer counts to 94 | enable row generation. Use a MatrixRounder instance or one of the 95 | following strings: 96 | 97 | - `'lrem'` uses the deterministic largest remainder method (see 98 | :class:`LargestRemainderRounder` for details) which is more suited 99 | to small datasets. 100 | - `'random'` uses the non-deterministic random generation method (see 101 | :class:`RandomSamplingRounder` for details), more suited to larger 102 | datasets. 103 | 104 | :param ignore_cols: Columns from the input dataframe to not synthesize 105 | (identifiers etc.); will be omitted from the output. 106 | :param seed: Random generator seed for the discretizer and unroller. 
107 | (If a custom discretizer is specified, its seed is not overwritten by 108 | this setting.) 109 | ''' 110 | def __init__(self, 111 | cond_dim: int = 2, 112 | discretizer: Optional[catdecat.DataFrameDiscretizer] = None, 113 | rounder: Union[str, MatrixRounder] = 'lrem', 114 | ignore_cols: List[str] = [], 115 | seed: Optional[int] = None, 116 | ): 117 | if cond_dim < 1: 118 | raise ValueError('cannot preserve less than one-dimensional sums') 119 | self.cond_dim = cond_dim 120 | self.rounder = ( 121 | ROUNDERS[rounder](seed=seed) if isinstance(rounder, str) 122 | else rounder 123 | ) 124 | self.discretizer = ( 125 | discretizer if discretizer is not None 126 | else catdecat.DataFrameDiscretizer(seed=seed) 127 | ) 128 | self.ignore_cols = ignore_cols 129 | 130 | def fit(self, dataframe: pd.DataFrame) -> None: 131 | '''Prepare the synthesis according to the provided dataframe. 132 | 133 | :param dataframe: Dataframe to synthesize. Every column is replicated; 134 | if there are any identifier columns that should not be replicated, 135 | remove them beforehand. 136 | ''' 137 | discrete = self.discretizer.fit_transform( 138 | dataframe.drop(self.ignore_cols, axis=1) 139 | ) 140 | # marginals, axis_values = get_marginals(discrete) 141 | self.axis_values = get_axis_values(discrete) 142 | self.synthed_matrix = obscure_seed( 143 | self.calc_true_matrix(discrete), self.cond_dim 144 | ) 145 | self.original_n_rows = dataframe.shape[0] 146 | 147 | def sample(self, n: Optional[int] = None) -> pd.DataFrame: 148 | '''Generate a synthetic dataframe with a given number of rows. 149 | 150 | :param n: Number of rows for the output dataframe. If not given, 151 | it will match the fitting dataframe. 152 | ''' 153 | matrix = self.synthed_matrix 154 | if n is not None: 155 | matrix *= (n / self.original_n_rows) 156 | return self.discretizer.inverse_transform( 157 | map_axes(unroll(self.rounder.round(matrix)), self.axis_values) 158 | ) 159 | 160 | def fit_transform(self, dataframe: pd.DataFrame) -> pd.DataFrame: 161 | '''Fit the synthesizer and synthesize an equal-size dataframe.''' 162 | self.fit(dataframe) 163 | return self.sample() 164 | 165 | def calc_true_matrix(self, dataframe: pd.DataFrame) -> np.ndarray: 166 | '''Calculate a IPF matrix reflecting true observation frequencies.''' 167 | for col, mapper in self.axis_values.items(): 168 | dataframe[col] = dataframe[col].map( 169 | pd.Series(mapper.index, index=mapper.values) 170 | ) 171 | true_seed = np.zeros(tuple(len(mapper) for mapper in self.axis_values.values())) 172 | for indices in dataframe.itertuples(index=False, name=None): 173 | true_seed[indices] += 1 174 | return true_seed 175 | 176 | 177 | def ipf(marginals: List[np.ndarray], 178 | seed_matrix: Optional[np.ndarray] = None, 179 | precision: float = 1e-9 180 | ) -> np.ndarray: 181 | '''Perform iterative proportional fitting (IPF) on 1D marginal sums. 182 | 183 | Reformats the marginals to a generic n-D format and then delegates to 184 | :func:`ipf_multidim`. 185 | 186 | :param marginals: Marginal sums for the IPF dimensions. The marginal sums 187 | of the output matrix will match these. The list should contain 188 | one-dimensional arrays that sum to the same number. 189 | :param seed_matrix: Seed matrix, shows a-priori conditional probabilities 190 | across dimensions. 191 | :param precision: Terminate IPF when the largest difference of an 192 | individual cell value between two iterations drops below this 193 | threshold. 
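    As a small worked illustration, fitting a 2x2 table to row sums (30, 70)
    and column sums (40, 60) from a uniform seed converges to the independence
    solution::

        ipf([np.array([30., 70.]), np.array([40., 60.])])
        # -> approximately array([[12., 18.],
        #                         [28., 42.]])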
194 | ''' 195 | n_dim = len(marginals) 196 | if seed_matrix is not None and len(seed_matrix.shape) != n_dim: 197 | raise ValueError('marginal dimensions do not match IPF seed') 198 | return ipf_multidim( 199 | [ 200 | marginal.reshape([ 201 | -1 if i == dim_i else 1 for i in range(n_dim) 202 | ]) 203 | for dim_i, marginal in enumerate(marginals) 204 | ], 205 | seed_matrix, 206 | precision=precision 207 | ) 208 | 209 | 210 | def ipf_multidim(marginals: List[np.ndarray], 211 | seed_matrix: Optional[np.ndarray] = None, 212 | precision: float = 1e-9 213 | ) -> np.ndarray: 214 | '''Perform iterative proportional fitting (IPF) on arbitrary marginal sums. 215 | 216 | :param marginals: Marginal sums for the final matrix. The list should 217 | contain arrays with equal sums. Their dimensions should correspond to 218 | the seed matrix or be 1 - at dimensions for which the given marginal 219 | sum is summed (contracted). 220 | :param seed_matrix: Seed matrix, shows a-priori conditional probabilities 221 | across dimensions. If not given, the matrix shape will be computed from 222 | the marginals and it will be initialized by ones. 223 | :param precision: Terminate IPF when the largest difference of an 224 | individual cell value between two iterations drops below this 225 | threshold. 226 | ''' 227 | if seed_matrix is None: 228 | shape = tuple( 229 | max(marg.shape[i] for marg in marginals) 230 | for i in range(min(marg.ndim for marg in marginals)) 231 | ) 232 | matrix = np.ones(shape) 233 | else: 234 | matrix = seed_matrix.astype(float) 235 | shape = matrix.shape 236 | ipf_check_marginals(marginals, shape) 237 | other_dims = [ 238 | tuple( 239 | dim_i for dim_i in range(len(shape)) 240 | if marginal.shape[dim_i] == 1 241 | ) 242 | for marginal in marginals 243 | ] 244 | diff = precision + 1 245 | while diff > precision: 246 | previous = matrix 247 | for marginal, other_dimtup in zip(marginals, other_dims): 248 | dim_sums = matrix.sum(axis=other_dimtup).reshape(marginal.shape) 249 | matrix = matrix / np.where(dim_sums == 0, 1, dim_sums) * marginal 250 | diff = abs(matrix - previous).max() 251 | return matrix 252 | 253 | 254 | def ipf_check_marginals(marginals: List[np.ndarray], shape: Tuple[int]) -> None: 255 | '''Checks whether the marginal sums are valid for IPF of given shape. 256 | 257 | Used internally by :func:`ipf_multidim` so uses the format of marginals 258 | required by that function. 259 | 260 | :param marginals: List of marginal sum arrays to be checked. 261 | :param shape: Shape of the resulting matrix. 262 | ''' 263 | total = marginals[0].sum() 264 | for i, marginal in enumerate(marginals): 265 | if i != 0 and not np.isclose(marginal.sum(), total): 266 | raise ValueError('marginal sum totals do not match') 267 | if marginal.ndim != len(shape): 268 | raise ValueError('marginal dimensions do not match seed') 269 | for j, mshape in enumerate(marginal.shape): 270 | if mshape != 1 and mshape != shape[j]: 271 | raise ValueError('marginal shape does not match seed') 272 | 273 | 274 | def unroll(matrix: np.ndarray) -> np.ndarray: 275 | '''Convert a matrix of cell counts to a matrix of cell indices with those counts. 276 | 277 | :param matrix: A matrix of non-negative integers denoting counts of 278 | observations. Each cell will generate this many rows with its positional 279 | indices. 
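    For example, a count of 2 in cell (0, 0) and 1 in cell (1, 1) unrolls into
    three index rows::

        unroll(np.array([[2, 0],
                         [0, 1]]))
        # -> array([[0, 0],
        #           [0, 0],
        #           [1, 1]])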
280 | ''' 281 | cumcounts = np.cumsum(matrix) 282 | inds = np.zeros(cumcounts[-1], dtype=int) 283 | np.add.at(inds, cumcounts[:np.searchsorted(cumcounts, cumcounts[-1])], 1) 284 | return np.stack(np.unravel_index( 285 | np.cumsum(inds), matrix.shape 286 | )).transpose() 287 | 288 | 289 | def map_axes(indices: np.ndarray, 290 | axis_values: Dict[str, pd.Series], 291 | ) -> pd.DataFrame: 292 | '''Convert a category index array to a dataframe with categories. 293 | 294 | :param indices: A 2-D integer array. 295 | :param axis_values: A dictionary with length matching the column count of 296 | `indices`. Its keys are names of the columns to be assigned to the 297 | dataframe, while values map the category indices from the given column 298 | of the integer array to the expected dataframe values. 299 | ''' 300 | dataframe = pd.DataFrame(indices, columns=list(axis_values.keys())) 301 | for col, mapper in axis_values.items(): 302 | dataframe[col] = dataframe[col].map(mapper) 303 | return dataframe 304 | 305 | 306 | def obscure_seed(true: np.ndarray, 307 | cond_dim: int = 2 308 | ) -> np.ndarray: 309 | '''Produce a matrix preserving some cross-sums of the original. 310 | 311 | :param true: The matrix to be obscured. The output matrix will match 312 | sums of its cells as aggregated to each combination of `cond_dim` 313 | dimensions. 314 | :param cond_dim: The number of dimensions to preserve cross-sums for. 315 | ''' 316 | if cond_dim < 1: 317 | raise ValueError('invalid preservation dimension count') 318 | marginals = [] 319 | dim_is = list(range(true.ndim)) 320 | for sel_dim_is in itertools.combinations(dim_is, cond_dim): 321 | left_dim_is = [] 322 | sum_indexer = [] 323 | for dim_i in dim_is: 324 | if dim_i in sel_dim_is: 325 | sum_indexer.append(true.shape[dim_i]) 326 | else: 327 | sum_indexer.append(1) 328 | left_dim_is.append(dim_i) 329 | marginals.append(true.sum(axis=tuple(left_dim_is)).reshape(sum_indexer)) 330 | return ipf_multidim(marginals) 331 | 332 | 333 | def get_axis_values(dataframe: pd.DataFrame 334 | ) -> Dict[str, pd.Series]: 335 | '''Compute mappings of indices to categories for each dataframe column.''' 336 | maps = {} 337 | for col in dataframe: 338 | values = pd.Series(dataframe[col].unique()).sort_values().values 339 | maps[col] = pd.Series(values, index=np.arange(len(values))) 340 | return maps 341 | -------------------------------------------------------------------------------- /pysynth/similarity.py: -------------------------------------------------------------------------------- 1 | '''Measure the statistical similarity of synthesized data to the original.''' 2 | 3 | from typing import Any, List, Optional, Tuple, Collection, Dict, Iterable, Union, Callable 4 | 5 | import numpy as np 6 | import scipy.stats 7 | import pandas as pd 8 | import sklearn.ensemble 9 | import sklearn.linear_model 10 | import sklearn.naive_bayes 11 | import sklearn.neighbors 12 | import sklearn.svm 13 | import sklearn.tree 14 | 15 | from . import catdecat 16 | 17 | SUMMARY_STATS: List[str] = [ 18 | 'mean', 19 | 'std', 20 | 'min', 21 | 'q1', 22 | 'median', 23 | 'q3', 24 | 'max', 25 | 'skew', 26 | 'kurt' 27 | ] 28 | 29 | 30 | def summary_stats(series: pd.Series) -> pd.Series: 31 | '''Produce univariate summary statistics for a numerical series. 32 | 33 | Provides quartiles (q1, median and q3 respectively), mean, standard 34 | deviation (std), skewness (skew), kurtosis (kurt) and extremes (min, max). 
35 | Note that for very short series, the higher moments (std, skew, kurt) 36 | might come out as NaN. 37 | 38 | :param series: A numerical series to compute summary statistics for. 39 | ''' 40 | sumstat = series.describe().drop('count') 41 | # rename quartiles 42 | index = sumstat.index.tolist() 43 | index[index.index('25%'):index.index('75%')+1] = ['q1', 'median', 'q3'] 44 | sumstat.index = index 45 | # add what pandas describe does not provide 46 | for key in SUMMARY_STATS: 47 | if key not in index: 48 | sumstat[key] = getattr(series, key)() 49 | return sumstat 50 | 51 | 52 | DIFF_METHODS = { 53 | 'diff': lambda orig, synth: synth - orig, 54 | 'ape': lambda orig, synth: ((synth - orig) / orig).where(synth != orig, 0.), 55 | } 56 | 57 | 58 | def summary_stat_diff(orig: pd.DataFrame, 59 | synth: pd.DataFrame, 60 | method: str = 'diff', 61 | ) -> pd.DataFrame: 62 | '''Compute differences of summary statistics for the synthesized dataset. 63 | 64 | For all numerical columns of the synthesized dataset, compute its summary 65 | statistics and compare them with the original using the given method. 66 | 67 | :param orig: The original dataset. 68 | :param synth: The synthesized dataset. 69 | :param method: The method to use for comparing the statistics: 70 | 71 | - `'diff'` for absolute difference, 72 | - `'ape'` for absolute percentage difference. 73 | :returns: A dataframe with a row for each column of the synthetic 74 | dataframe, with columns for different summary statistics 75 | containing their differences. 76 | ''' 77 | method_fx = DIFF_METHODS[method] 78 | num_cols = [col for col in synth.columns 79 | if pd.api.types.is_numeric_dtype(synth[col])] 80 | diff_df = pd.DataFrame.from_records([ 81 | method_fx( 82 | summary_stats(synth[col]), 83 | summary_stats(orig[col]) 84 | ).rename(col) 85 | for col in num_cols 86 | ], index=num_cols) 87 | # add _diff to stat names in columns to be more descriptive 88 | return diff_df.rename(columns={ 89 | stat: stat + '_' + method for stat in diff_df.columns 90 | }) 91 | 92 | 93 | def aligned_freqs(orig: pd.Series, 94 | synth: pd.Series, 95 | bins: Optional[int] = 10, 96 | ) -> Tuple[Optional[pd.Series], Optional[pd.Series]]: 97 | '''Return relative frequencies of values in the original and synthesized series. 98 | 99 | The relative frequency series will be aligned so that all values from 100 | both columns are present in both outputs. 101 | 102 | :param orig: A column from the original dataframe. 103 | :param synth: The corresponding column from the synthesized dataframe. 104 | :param bins: Number of bins (quantiles) to which to discretize the 105 | columns if they are numeric. Numeric columns with less unique values 106 | than this number will not be discretized. The quantiles are measured 107 | on the original column. If this is None, Nones will be returned for 108 | both outputs if the columns are numeric. 109 | :returns: A tuple of relative frequency series (summing to 1) for the 110 | original and synthesized dataset respectively, or a tuple of two Nones, 111 | if the originals are numeric and number of bins is not set. 
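    For example, for two small categorical columns the outputs share one index::

        aligned_freqs(pd.Series(['a', 'a', 'b']), pd.Series(['a', 'c']))
        # -> roughly (a: 0.67, b: 0.33, c: 0.0) and (a: 0.5, b: 0.0, c: 0.5)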
112 | ''' 113 | if pd.api.types.is_numeric_dtype(synth): 114 | if bins is None: 115 | return None, None 116 | elif synth.nunique() > bins or orig.nunique() > bins: 117 | quantiles = ( 118 | [min(orig.min(), synth.min()) - 1] 119 | + catdecat.QuantileBinner(bins).get(orig) 120 | + [max(orig.max(), synth.max()) + 1] 121 | ) 122 | orig = pd.cut(orig, quantiles) 123 | synth = pd.cut(synth, quantiles) 124 | orig_counts = orig.value_counts(normalize=True) 125 | synth_counts = synth.value_counts(normalize=True) 126 | orig_counts, synth_counts = orig_counts.align(synth_counts) 127 | return orig_counts.fillna(0), synth_counts.fillna(0) 128 | 129 | 130 | def frequency_mismatch(orig: pd.DataFrame, 131 | synth: pd.DataFrame, 132 | bins: Optional[int] = 10, 133 | metrics: Optional[List[str]] = None, 134 | ) -> pd.DataFrame: 135 | '''Return mismatch metrics for the dataframe columns' value frequencies. 136 | 137 | This only looks at univariate value frequencies, not considering whether 138 | the values occur in conjunction with "correct" values from other columns. 139 | 140 | Computes the following metrics: 141 | 142 | - `rtae`: Relative Total Absolute Error (sum of absolute differences). 143 | Goes from 0 for perfect match to 2 for totally different values. 144 | - `overlap_coef`: Overlap coefficient (magnitude of set-wise frequency 145 | intersection). Goes from 1 for perfect match to 0 for totally 146 | different values. 147 | - `morisita_overlap`: Morisita's overlap index[#], a measure of frequency 148 | overlap. Goes from 0 for no overlap to 1 for identical proportions. 149 | - `rank_damerau`: Normalized Damerau-Levenshtein distance[#] of 150 | frequency-ordered category sets for both datasets; essentially, 151 | a number of adjustments (additions, deletions, swaps) to arrive 152 | from one to the other. Goes from 0 for matching category 153 | ranks to 1 for total mismatch. 154 | - `mae`: Mean Absolute Error (mean of absolute differences). The less, 155 | the better. 156 | - `rmse`: Root Mean Square Error. The less, the better. 157 | - `jaccard_dist`: Jaccard distance[#] (Intersection over Union) of the 158 | two frequency sets. `jaccard_dist = 1 - overlap_coef` 159 | - `simpson_diff`: Difference between Simpson diversity indices[#] for the 160 | synthetic and original frequencies. 161 | - `entropy_diff`: Difference between the Shannon entropy[#] for the 162 | synthetic and original frequencies, in nats. 163 | 164 | :param orig: The original dataframe. 165 | :param synth: The synthesized analog. 166 | :param bins: Number of bins to (quantiles) to which to discretize the 167 | columns if they are numeric, to be able to measure their frequencies 168 | as well. The quantiles are measured on the original column. If None, 169 | numeric columns will not be measured. 170 | :param metrics: Names of metrics to include. If None, all metrics are 171 | computed. 172 | :returns: A dataframe with a row for each column of the synthetic dataframe 173 | (except numeric columns if bins is None) with columns for different 174 | frequency mismatch statistics. 175 | 176 | [#] "Morisita's overlap index". Wikipedia. 177 | 178 | [#] "Damerau-Levenshtein distance". Wikipedia. 179 | 180 | [#] "Jaccard index". Wikipedia. 181 | 182 | [#] "Simpson index". Wikipedia. In: Diversity index. 183 | 184 | [#] "Shannon index". Wikipedia. In: Diversity index. 
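    Typical use is a one-liner (a sketch, assuming ``orig`` and ``synth`` are
    the original and synthesized dataframes)::

        frequency_mismatch(orig, synth, bins=10, metrics=['rtae', 'overlap_coef'])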
185 | 186 | ''' 187 | recs = [] 188 | index = [] 189 | for col in synth.columns: 190 | orig_freqs, synth_freqs = aligned_freqs(orig[col], synth[col], bins) 191 | if orig_freqs is not None and synth_freqs is not None: 192 | recs.append(freqdiff_metrics(orig_freqs, synth_freqs, metrics)) 193 | index.append(col) 194 | return pd.DataFrame.from_records(recs, index=index) 195 | 196 | 197 | def freqdiff_metrics(orig_freqs: pd.Series, 198 | synth_freqs: pd.Series, 199 | metrics: Optional[List[str]] = None, 200 | ) -> pd.Series: 201 | '''Compute frequency mismatch metrics for two value frequency series. 202 | 203 | :param orig_freqs: Frequencies of values (or their intervals) in the 204 | original dataframe column. 205 | :param synth_freqs: Frequencies of values (or their intervals) in the 206 | matching synthesized column. 207 | :param metrics: Names of metrics to include. If None, all metrics are 208 | computed. For a list of metrics, see :func:`frequency_mismatch`. 209 | :returns: A Series with metric values, with their names in the index. 210 | ''' 211 | diff = synth_freqs - orig_freqs 212 | simpson_orig = (orig_freqs ** 2).sum() 213 | simpson_synth = (synth_freqs ** 2).sum() 214 | overlap = orig_freqs.where(orig_freqs <= synth_freqs, synth_freqs) 215 | metric_series = pd.Series({ 216 | 'rtae': abs(diff).sum(), 217 | 'overlap_coef': overlap.sum(), 218 | 'rank_damerau': damerau_levenshtein( 219 | orig_freqs.sort_values().index.tolist(), 220 | synth_freqs.sort_values().index.tolist(), 221 | ) / len(orig_freqs.index), 222 | 'morisita_overlap': ( 223 | 2 * (orig_freqs * synth_freqs).sum() 224 | / (simpson_orig + simpson_synth) 225 | ), 226 | 'mae': abs(diff).mean(), 227 | 'rmse': (diff ** 2).mean() ** .5, 228 | 'jaccard_dist': 1 - overlap.sum(), 229 | 'simpson_diff': simpson_synth - simpson_orig, 230 | 'entropy_diff': ( 231 | (synth_freqs[synth_freqs>0] * np.log(synth_freqs[synth_freqs>0])).sum() 232 | - (orig_freqs[orig_freqs>0] * np.log(orig_freqs[orig_freqs>0])).sum() 233 | ) 234 | }) 235 | if metrics is None: 236 | return metric_series 237 | else: 238 | return metric_series[metrics] 239 | 240 | 241 | def damerau_levenshtein(seq1: Collection[Any], seq2: Collection[Any]) -> int: 242 | """Calculate the Damerau-Levenshtein distance between sequences. 243 | 244 | This distance is the number of additions, deletions, substitutions, 245 | and transpositions needed to transform the first sequence into the 246 | second. Although generally used with strings, any sequences of 247 | comparable objects will work. 248 | 249 | Transpositions are exchanges of *consecutive* characters; all other 250 | operations are self-explanatory. 251 | 252 | Based on code by Michael Homer, released under MIT License, retrieved 253 | from https://web.archive.org/web/20150909134357/http://mwh.geek.nz:80/2009/04/26/python-damerau-levenshtein-distance/ 254 | """ 255 | # codesnippet:D0DE4716-B6E6-4161-9219-2903BF8F547F 256 | # Conceptually, this is based on a len(seq1) + 1 * len(seq2) + 1 matrix. 257 | # However, only the current and two previous rows are needed at once, 258 | # so we only store those. 259 | oneago = None 260 | thisrow = list(range(1, len(seq2) + 1)) + [0] 261 | for x in range(len(seq1)): 262 | # Python lists wrap around for negative indices, so put the 263 | # leftmost column at the *end* of the list. This matches with 264 | # the zero-indexed strings and saves extra calculation. 
265 | twoago, oneago, thisrow = oneago, thisrow, [0] * len(seq2) + [x + 1] 266 | for y in range(len(seq2)): 267 | delcost = oneago[y] + 1 268 | addcost = thisrow[y - 1] + 1 269 | subcost = oneago[y - 1] + (seq1[x] != seq2[y]) 270 | thisrow[y] = min(delcost, addcost, subcost) 271 | # This block deals with transpositions 272 | is_transposition = ( 273 | x > 0 and y > 0 and seq1[x] == seq2[y - 1] 274 | and seq1[x-1] == seq2[y] and seq1[x] != seq2[y] 275 | ) 276 | if is_transposition: 277 | thisrow[y] = min(thisrow[y], twoago[y - 2] + 1) 278 | return thisrow[len(seq2) - 1] 279 | 280 | 281 | def correlation_diff(orig: pd.DataFrame, 282 | synth: pd.DataFrame, 283 | method: Union[str, Callable] = 'pearson', 284 | ) -> pd.DataFrame: 285 | '''Return the difference of correlation matrices of the two dataframes. 286 | 287 | :param orig: The original dataframe. 288 | :param synth: The synthesized analog. 289 | :param method: A method for Pandas `corr()` to specify the manner of 290 | correlation (Pearson, Kendall, Spearman 291 | or arbitrary through a callable). 292 | :returns: A Dataframe with synthesized column names in the index and 293 | columns, with numerical differences of correlation coefficients in the 294 | dataframes as values. Might contain NaNs where the coefficient in either 295 | of the dataframes is NaN, e.g. when the given column only contains a 296 | single value. 297 | ''' 298 | return ( 299 | synth.corr(method=method) 300 | - orig[synth.columns.tolist()].corr(method=method) 301 | ) 302 | 303 | 304 | def stat_tests(orig: pd.DataFrame, synth: pd.DataFrame) -> pd.DataFrame: 305 | '''Test equality of mean and variance of synthesized columns to originals. 306 | 307 | Performs a two-sample independent t-test (`t_`) for mean equality 308 | and Levene's test for variance equality with median center (`levene_`), 309 | omitting NaNs. 310 | Omits non-numeric columns. 311 | 312 | :param orig: The original dataframe. 313 | :param synth: The synthesized analog. 314 | :returns: A dataframe with a row for each numeric column of the synthesized 315 | dataset, with a test statistic (`_stat`) and p-value (`_pval`) column 316 | for each of the tests performed. 
317 | ''' 318 | recs = [] 319 | index = [] 320 | for col in synth.columns: 321 | if pd.api.types.is_numeric_dtype(synth[col]): 322 | t, tp = scipy.stats.ttest_ind( 323 | orig[col], synth[col], nan_policy='omit' 324 | ) 325 | if not isinstance(t, float): 326 | t, tp = np.nan, np.nan 327 | lev, levp = scipy.stats.levene( 328 | orig[col].dropna(), synth[col].dropna(), center='median' 329 | ) 330 | recs.append((t, tp, lev, levp)) 331 | index.append(col) 332 | return pd.DataFrame.from_records( 333 | recs, index=index, 334 | columns=['t_stat', 't_pval', 'levene_stat', 'levene_pval'] 335 | ) 336 | 337 | 338 | DEFAULT_DISCRIMINATORS: List[sklearn.base.ClassifierMixin] = [ 339 | sklearn.ensemble.GradientBoostingClassifier(n_estimators=10), 340 | sklearn.ensemble.RandomForestClassifier(n_estimators=10), 341 | # sklearn.linear_model.LogisticRegression(max_iter=250), 342 | # sklearn.linear_model.Perceptron(), 343 | # sklearn.linear_model.RidgeClassifier(), 344 | sklearn.naive_bayes.GaussianNB(), 345 | sklearn.neighbors.KNeighborsClassifier(), 346 | # sklearn.neighbors.RadiusNeighborsClassifier(), 347 | # sklearn.svm.LinearSVC(), 348 | # sklearn.svm.NuSVC(), 349 | # sklearn.svm.SVC(), 350 | sklearn.tree.DecisionTreeClassifier(), 351 | ] 352 | 353 | 354 | def discrimination(orig: pd.DataFrame, 355 | synth: pd.DataFrame, 356 | classifiers: Iterable[sklearn.base.ClassifierMixin] = DEFAULT_DISCRIMINATORS, 357 | metrics: Optional[List[str]] = None, 358 | test_size: float = .25, 359 | return_best: bool = False, 360 | ) -> Union[ 361 | pd.Series, 362 | Tuple[pd.Series, Optional[sklearn.base.ClassifierMixin]] 363 | ]: 364 | '''Calculate how well the synthesized rows can be discriminated from originals. 365 | 366 | Fits each of the provided classifiers to predict whether the given row is 367 | synthesized or original, measures their accuracy on a test sample and 368 | gives a detailed evaluation of the best-performing one. 369 | 370 | :param orig: The original dataframe. 371 | :param synth: The synthesized analog. 372 | :param classifiers: Unfitted classifiers to try the discrimination. The 373 | one with the best ROC AUC on the test sample is selected. 374 | :param metrics: Names of discrimination accuracy metrics to compute. 375 | If None, all of these metrics are computed using their scikit-learn 376 | implementations: 377 | 378 | - `auc`: ROC Area Under Curve (0.5 is no discrimination, 1 full 379 | discrimination). 380 | - `gini`: ROC Gini coefficient (0 is no discrimination, 1 full 381 | discrimination: `gini = 2 * auc - 1`). 382 | - `ap`: Average Precision (evaluates the precision-recall curve)[#]. 383 | - `matthews`: Matthews' four-square table correlation coefficient. 384 | - `f1`: F1-score. 385 | - `accuracy`: Ordinary accuracy (fraction of equally labeled rows). 386 | - `precision`: Classification precision. 387 | - `recall`: Classification recall. 388 | - `cohen_kappa`: Cohen's kappa score of annotator agreement. 389 | - `hamming`: Hamming loss. 390 | - `jaccard`: Jaccard score. 391 | :param test_size: Fraction of the input to use for evaluating discrimination 392 | performance (and not for discriminator training). The train/test split 393 | is stratified on original/synthetic origins. 394 | :param return_best: Return the best performing fitted discriminator 395 | along with the metrics. 396 | :returns: A series of discrimination classification performance metrics 397 | with their aforementioned names in the index. 
If return_best is True,
398 |         return a tuple with the metrics and the fitted discriminator (which may be None if no classifier beats chance).
399 | 
400 |     [#] "Average precision". Wikipedia. In: Information retrieval.
401 | 
402 |     '''
403 |     feats = _predictor_matrix(pd.concat([orig, synth]))
404 | 
405 |     target = np.hstack((
406 |         np.zeros(len(orig.index), dtype=bool),
407 |         np.ones(len(synth.index), dtype=bool)
408 |     ))
409 |     best_est, best_probs, test_tgts = _find_best_classifier(
410 |         feats, target, classifiers, test_size
411 |     )
412 |     metric_series = _compute_accuracy_metrics(test_tgts, best_probs, metrics)
413 |     if return_best:
414 |         return metric_series, best_est
415 |     else:
416 |         return metric_series
417 | 
418 | 
419 | DEFAULT_METRICS: Dict[str, Tuple[Union[str, Callable], bool]] = {
420 |     'auc': ('roc_auc_score', False),
421 |     'gini': (
422 |         (lambda trues, probs: 2 * sklearn.metrics.roc_auc_score(trues, probs) - 1),
423 |         False
424 |     ),
425 |     'ap': ('average_precision_score', False),
426 |     'matthews': ('matthews_corrcoef', True),
427 |     'f1': ('f1_score', True),
428 |     'accuracy': ('accuracy_score', True),
429 |     'precision': ('precision_score', True),
430 |     'recall': ('recall_score', True),
431 |     'cohen_kappa': ('cohen_kappa_score', True),
432 |     'hamming': ('hamming_loss', True),
433 |     'jaccard': ('jaccard_score', True),
434 | }
435 | 
436 | 
437 | def _find_best_classifier(feats: np.ndarray,
438 |                           target: np.ndarray,
439 |                           classifiers: Iterable[sklearn.base.ClassifierMixin] = DEFAULT_DISCRIMINATORS,
440 |                           test_size: float = .25,
441 |                           ) -> Tuple[
442 |                               sklearn.base.ClassifierMixin,
443 |                               np.ndarray, np.ndarray
444 |                           ]:
445 |     train_feats, test_feats, train_tgts, test_tgts = \
446 |         sklearn.model_selection.train_test_split(
447 |             feats, target, test_size=test_size, stratify=target
448 |         )
449 |     best_auc = 0.5
450 |     best_est = None
451 |     best_probs = np.full_like(test_tgts, .5, dtype=np.double)
452 |     for clf in classifiers:
453 |         clf.fit(train_feats, train_tgts)
454 |         probs = clf.predict_proba(test_feats)[:,1]
455 |         auc = sklearn.metrics.roc_auc_score(test_tgts, probs)
456 |         if auc > best_auc:
457 |             best_est = clf
458 |             best_auc = auc
459 |             best_probs = probs
460 |     return best_est, best_probs, test_tgts
461 | 
462 | 
463 | def _compute_accuracy_metrics(targets: np.ndarray,
464 |                               probs: np.ndarray,
465 |                               metrics: Optional[List[str]] = None,
466 |                               ) -> pd.Series:
467 |     preds = probs >= .5
468 |     metric_results = {}
469 |     for name, conf in DEFAULT_METRICS.items():
470 |         if metrics is None or name in metrics:
471 |             fx, do_threshold = conf
472 |             if isinstance(fx, str):
473 |                 fx = getattr(sklearn.metrics, fx)
474 |             metric_results[name] = fx(
475 |                 targets,
476 |                 (preds if do_threshold else probs)
477 |             )
478 |     return pd.Series(metric_results)
479 | 
480 | 
481 | def _predictor_matrix(dataframe: pd.DataFrame):
482 |     dataframe = pd.get_dummies(
483 |         dataframe,
484 |         dummy_na=True,
485 |     )
486 |     fillers = {
487 |         col: dataframe[col].median()
488 |         for col in dataframe.columns
489 |         if dataframe[col].hasnans
490 |     }
491 |     return dataframe.fillna(value=fillers).values
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | numpy
2 | pandas
3 | scipy
4 | scikit-learn>=0.22
--------------------------------------------------------------------------------
/requirements_test.txt:
--------------------------------------------------------------------------------
1 | pytest
2 | sphinx
3 | 
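
The similarity module above is the package's evaluation toolkit. As a quick orientation, here is a minimal usage sketch (editor-added, not part of the repository sources); it only uses calls that appear in the sources and tests — `pysynth.synthesize`, `summary_stat_diff`, `frequency_mismatch`, `discrimination` and the OpenML fetch from `tests/test_data.py` — and dataset id 31 is one the tests also pull:

    import sklearn.datasets
    import pysynth
    import pysynth.similarity as sim

    # fetch a small tabular dataset (as the tests do), synthesize it, and compare the two
    orig = sklearn.datasets.fetch_openml(data_id=31, target_column=None, as_frame=True)['data']
    synth = pysynth.synthesize(orig)

    print(sim.summary_stat_diff(orig, synth, method='diff'))   # per-column differences of summary stats
    print(sim.frequency_mismatch(orig, synth, bins=10, metrics=['rtae', 'mae']))
    metrics, best_clf = sim.discrimination(orig, synth, return_best=True)
    print(metrics)   # AUC near 0.5 / Gini near 0 means synthetic rows are hard to tell from originals
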
-------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | with open('README.md') as infile: 4 | long_description = infile.read() 5 | 6 | with open('requirements.txt') as infile: 7 | required = [line.strip() for line in infile] 8 | 9 | with open('VERSION') as infile: 10 | version = infile.read().strip() 11 | 12 | setuptools.setup( 13 | name='pysynth', 14 | version=version, 15 | description='Dataset synthesis for Python', 16 | long_description=long_description, 17 | long_description_content_type='text/markdown; charset=UTF-8', 18 | author='Jan Šimbera', 19 | author_email='simbera.jan@gmail.com', 20 | python_requires='>=3.7.0', 21 | url='https://github.com/simberaj/pysynth', 22 | packages=setuptools.find_packages(exclude=('tests', )), 23 | install_requires=required, 24 | extras_require={}, 25 | include_package_data=True, 26 | license='MIT', 27 | keywords='synthesis ipf data python', 28 | classifiers=[ 29 | 'Development Status :: 2 - Pre-Alpha', 30 | 'Environment :: Console', 31 | 'Intended Audience :: Developers', 32 | 'License :: OSI Approved :: MIT License', 33 | 'Natural Language :: English', 34 | 'Programming Language :: Python', 35 | 'Programming Language :: Python :: 3', 36 | ], 37 | zip_safe=True 38 | ) 39 | -------------------------------------------------------------------------------- /tests/test_catdecat.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import itertools 4 | 5 | import numpy as np 6 | import pandas as pd 7 | import scipy.stats 8 | import pytest 9 | 10 | sys.path.append(os.path.join(os.path.dirname(__file__), '..')) 11 | import pysynth.catdecat 12 | 13 | import test_data 14 | 15 | np.random.seed(1711) 16 | 17 | @pytest.mark.parametrize('binner_cls, bins', list(itertools.product( 18 | pysynth.catdecat.BINNERS.values(), [5, 10, 20], 19 | ))) 20 | def test_binners_formal(binner_cls, bins): 21 | binner = binner_cls(bins) 22 | cutvals = binner.get(pd.Series(np.random.rand(100))) 23 | assert isinstance(cutvals, list) 24 | assert len(cutvals) == bins - 1 25 | assert all(isinstance(cutval, float) for cutval in cutvals) 26 | 27 | @pytest.mark.parametrize('bins', [4, 8, 12]) 28 | def test_quantile_binner(bins): 29 | binner = pysynth.catdecat.QuantileBinner(bins) 30 | for i in range(10): 31 | vals = pd.Series(np.random.rand(100)) 32 | cuts = binner.get(vals) 33 | assert np.isclose( 34 | cuts, 35 | np.percentile(vals, (np.arange(bins - 1) + 1) / bins * 100) 36 | ).all() 37 | 38 | @pytest.mark.parametrize('bins', [4, 8, 12]) 39 | def test_equalrange_binner(bins): 40 | binner = pysynth.catdecat.EqualRangeBinner(bins) 41 | for i in range(10): 42 | vals = pd.Series(np.random.rand(100)) 43 | cuts = binner.get(vals) 44 | inner_widths = np.diff(cuts) 45 | assert np.isclose(inner_widths.min(), inner_widths.max()) 46 | assert np.isclose(inner_widths.mean(), cuts[0] - vals.min()) 47 | assert np.isclose(inner_widths.mean(), vals.max() - cuts[-1]) 48 | 49 | def test_apriori_binner(): 50 | for i in range(10): 51 | vals = pd.Series(np.random.rand(100)) 52 | cuts = np.sort(vals.sample(10).unique()).tolist() 53 | binner = pysynth.catdecat.AprioriBinner(cuts) 54 | assert binner.get(vals) == cuts 55 | 56 | 57 | @pytest.mark.parametrize('dist_cls', pysynth.catdecat.ContinuousDistributor.CODES.values()) 58 | def test_continuous_distributors(dist_cls): 59 | distributor = dist_cls(seed=42) 
60 | minval = 2 61 | maxval = 7 62 | for i in range(10): 63 | vals = np.random.rand(100) * (maxval - minval) + minval 64 | distributor.fit(vals) 65 | reconst = distributor.sample(100) 66 | assert minval <= reconst.min() <= reconst.max() <= maxval 67 | 68 | @pytest.mark.parametrize('dist_cls', pysynth.catdecat.DiscreteDistributor.CODES.values()) 69 | def test_discrete_distributors(dist_cls): 70 | distributor = dist_cls(seed=42) 71 | minval = 2 72 | maxval = 12 73 | for i in range(10): 74 | vals = (np.random.rand(100) * (maxval - minval) + minval).astype(int) 75 | uniques = np.unique(vals) 76 | distributor.fit(vals) 77 | reconst = distributor.sample(100) 78 | assert minval <= reconst.min() <= reconst.max() <= maxval 79 | assert np.isin(reconst, uniques).all() 80 | 81 | def test_restricted_sampler_ok(): 82 | minval = 1 83 | maxval = 3 84 | testdist = scipy.stats.norm(2, 1) 85 | sampler = pysynth.catdecat.restricted_sampler(testdist.rvs, minval, maxval) 86 | x = sampler(1000) 87 | assert (x >= minval).all() 88 | assert (x <= maxval).all() 89 | assert len(x) == 1000 90 | 91 | def test_restricted_sampler_fail(): 92 | minval = 1 93 | maxval = 3 94 | testgen = lambda n: np.full(n, 4) 95 | sampler = pysynth.catdecat.restricted_sampler(testgen, 1, 3) 96 | with pytest.raises(ValueError): 97 | x = sampler(1000) 98 | 99 | 100 | def test_mean_distributor(): 101 | dist = pysynth.catdecat.MeanDistributor() 102 | for i in range(10): 103 | vals = np.random.rand(100) 104 | val_mean = vals.mean() 105 | dist.fit(vals) 106 | assert (dist.sample(20) == np.array([val_mean] * 20)).all() 107 | 108 | 109 | SERIES_DISCRETIZERS = [ 110 | pysynth.catdecat.SeriesDiscretizer(seed=42), 111 | pysynth.catdecat.SeriesDiscretizer(binner='equalrange', continuous_distributor='mean', seed=42), 112 | ] 113 | 114 | @pytest.mark.parametrize('categ, na_frac', list(itertools.product( 115 | SERIES_DISCRETIZERS, [0, 0.2, 1] 116 | ))) 117 | def test_discretizer_numeric(categ, na_frac): 118 | size = 100 119 | minval = -3 120 | maxval = 10 121 | vals = pd.Series(np.random.rand(size) * 13 - 3) 122 | vals[np.random.rand(size) < na_frac] = np.nan 123 | cats = categ.fit_transform(vals) 124 | check_series_properly_discretized(vals, cats, categ.inverse_transform(cats)) 125 | 126 | @pytest.mark.parametrize('n_cats', [2, 20, 70]) 127 | def test_discretizer_category(n_cats): 128 | vals = pd.Series(np.random.choice([chr(48 + i) for i in range(n_cats)], 300)) 129 | c = pysynth.catdecat.SeriesDiscretizer(seed=42) 130 | with pytest.raises(TypeError): 131 | trans = c.fit_transform(vals) 132 | 133 | 134 | @pytest.mark.parametrize('n_vals', [2, 20, 70]) 135 | def test_discretizer_integer(n_vals): 136 | vals = pd.Series(np.random.randint(n_vals, size=300)) 137 | c = pysynth.catdecat.SeriesDiscretizer(seed=42) 138 | cats = c.fit_transform(vals) 139 | if n_vals < c.min_for_bin: 140 | assert (cats == vals).all() 141 | else: 142 | check_series_properly_discretized(vals, cats, c.inverse_transform(cats)) 143 | 144 | 145 | def check_df_properly_discretized(df, tr_df, reconst_df, max_nums=10): 146 | orig_cols = frozenset(df.columns) 147 | assert orig_cols == frozenset(tr_df.columns) 148 | assert orig_cols == frozenset(reconst_df.columns) 149 | for col in df.columns: 150 | check_series_properly_discretized( 151 | df[col], 152 | tr_df[col], 153 | reconst_df[col], 154 | max_nums=max_nums 155 | ) 156 | 157 | def check_series_properly_discretized(orig, tr, reconst, max_nums=10): 158 | orig_notna = orig.notna() 159 | tr_notna = tr.notna() 160 | reconst_notna = 
reconst.notna() 161 | assert (orig_notna == tr_notna).all() 162 | assert (orig_notna == reconst_notna).all() 163 | if pd.api.types.is_numeric_dtype(orig): 164 | if pd.api.types.is_categorical_dtype(tr): 165 | for val, interv, reconst in zip(orig[orig_notna], tr[tr_notna], reconst[reconst_notna]): 166 | assert val in interv 167 | assert reconst in interv 168 | else: 169 | assert orig.nunique() <= max_nums 170 | assert (orig[orig_notna] == tr[tr_notna]).all() 171 | else: 172 | assert (orig[orig_notna] == tr[tr_notna]).all() 173 | 174 | 175 | @pytest.mark.parametrize('openml_id', [31, 1461, 40536]) 176 | def test_df_discretizer(openml_id): 177 | disc = pysynth.catdecat.DataFrameDiscretizer(max_num_cats=300) 178 | df = test_data.get_openml(openml_id) 179 | tr_df = disc.fit_transform(df) 180 | tr2_df = disc.transform(df) 181 | pd.testing.assert_frame_equal(tr_df, tr2_df) 182 | reconst_df = disc.inverse_transform(tr_df) 183 | check_df_properly_discretized(df, tr_df, reconst_df, max_nums=10) 184 | 185 | -------------------------------------------------------------------------------- /tests/test_data.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import sklearn.datasets 3 | 4 | def get_openml(id): 5 | return sklearn.datasets.fetch_openml( 6 | data_id=id, 7 | target_column=None, 8 | as_frame=True, 9 | )['data'] 10 | 11 | def test_openml(): 12 | colnames = [ 13 | 'fathers_occupation', 14 | 'sons_occupation', 15 | 'family_structure', 16 | 'race', 17 | 'counts_for_sons_first_occupation', 18 | 'counts_for_sons_current_occupation' 19 | ] 20 | df = get_openml(541) 21 | assert df.columns.tolist() == colnames 22 | assert (df[colnames[:4]].dtypes == 'category').all() 23 | assert df[colnames[4:]].dtypes.apply(pd.api.types.is_numeric_dtype).all() 24 | assert len(df.index) == 1156 25 | 26 | def check_synthdf_equal(df, synth, nrows=None): 27 | assert frozenset(df.columns) == frozenset(synth.columns) 28 | assert len(synth.index) == (len(df.index) if nrows is None else nrows) 29 | for col in df.columns: 30 | assert df[col].dtype == synth[col].dtype 31 | if pd.api.types.is_categorical_dtype(df[col].dtype): 32 | assert frozenset(synth[col].cat.categories).issubset(df[col].cat.categories) 33 | elif pd.api.types.is_object_dtype(df[col].dtype): 34 | assert frozenset(synth[col]).issubset(frozenset(df[col])) 35 | elif pd.api.types.is_numeric_dtype(df[col].dtype): 36 | assert df[col].min() <= synth[col].mean() <= df[col].max() 37 | -------------------------------------------------------------------------------- /tests/test_init.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import tempfile 4 | import shutil 5 | 6 | import pandas as pd 7 | 8 | sys.path.append(os.path.join(os.path.dirname(__file__), '..')) 9 | import pysynth 10 | import test_data 11 | 12 | def test_synthesize(): 13 | df = test_data.get_openml(469) # analcatdata_dmft 14 | synth = pysynth.synthesize(df) 15 | test_data.check_synthdf_equal(df, synth) 16 | 17 | def test_main(): 18 | tmp_dir = None 19 | try: 20 | tmp_dir = tempfile.mkdtemp() 21 | in_path = os.path.join(tmp_dir, 'source.csv') 22 | out_path = os.path.join(tmp_dir, 'target.csv') 23 | test_data.get_openml(469).to_csv(in_path, sep=';', index=False) 24 | assert os.path.isfile(in_path) 25 | pysynth.main(in_path, out_path, '200') 26 | assert os.path.isfile(out_path) 27 | orig = pd.read_csv(in_path, sep=';') 28 | synth = pd.read_csv(out_path, sep=';') 29 | 
test_data.check_synthdf_equal(orig, synth, 200) 30 | finally: 31 | if tmp_dir is not None: 32 | shutil.rmtree(tmp_dir) -------------------------------------------------------------------------------- /tests/test_ipf.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import itertools 4 | 5 | import numpy as np 6 | import pandas as pd 7 | import pytest 8 | 9 | sys.path.append(os.path.join(os.path.dirname(__file__), '..')) 10 | import pysynth.ipf 11 | import test_data 12 | 13 | IPF_PRECISION = 1e-10 14 | 15 | np.random.seed(1711) 16 | 17 | SEED_GEN_PARAMS = [ 18 | ((4, 4), 0), 19 | ((8, 5), 0), 20 | ((5, 3, 3), 0), 21 | ((2, 8, 7, 4, 3), 0), 22 | ((4, 4), 0.1), 23 | ((8, 5), 0.2), 24 | ((5, 3, 3), 0.1), 25 | ((2, 8, 7, 4, 3), 0.05), 26 | ] 27 | 28 | def generate_seed_matrix(shape, zero_fraction): 29 | seed_matrix = np.random.rand(*shape) 30 | if zero_fraction > 0: 31 | seed_matrix[np.random.rand(*shape) < zero_fraction] = 0 32 | return seed_matrix 33 | 34 | @pytest.mark.parametrize('shape, zero_fraction', SEED_GEN_PARAMS) 35 | def test_ipf_correct(shape, zero_fraction): 36 | seed_matrix = generate_seed_matrix(shape, zero_fraction) 37 | marginals = [ 38 | np.random.rand(dim) for dim in shape 39 | ] 40 | for i, marginal in enumerate(marginals): 41 | margsum = marginal.sum() 42 | marginals[i] = np.array([val * 50 / margsum for val in marginal]) 43 | ipfed = pysynth.ipf.ipf(marginals, seed_matrix, precision=IPF_PRECISION) 44 | # check the shape and zeros are retained 45 | assert ipfed.shape == shape 46 | assert ((seed_matrix == 0) == (ipfed == 0)).all() 47 | if zero_fraction == 0: 48 | for i, marginal in enumerate(marginals): 49 | ipfed_sum = ipfed.sum(axis=tuple(j for j in range(ipfed.ndim) if j != i)) 50 | # check the marginal sums match 51 | assert (abs(ipfed_sum - marginal) < (IPF_PRECISION * ipfed.size / len(marginal) * 10)).all() 52 | 53 | def test_ipf_dim_mismatch(): 54 | with pytest.raises(ValueError): 55 | pysynth.ipf.ipf(list(np.ones((3,2))), np.random.rand(2,2)) 56 | 57 | def test_ipf_sum_mismatch(): 58 | with pytest.raises(ValueError): 59 | pysynth.ipf.ipf([np.ones(2), np.full(2, 2)], np.random.rand(2,2)) 60 | 61 | def test_ipf_shape_mismatch(): 62 | with pytest.raises(ValueError): 63 | pysynth.ipf.ipf([np.ones(2), np.full((2, 4), .25)], np.random.rand(2,2)) 64 | 65 | @pytest.mark.parametrize('openml_id', [31, 1461, 40536]) 66 | def test_get_axis_values(openml_id): 67 | df = test_data.get_openml(openml_id) 68 | df = df.drop( 69 | [col for col, dtype in df.dtypes.iteritems() if not pd.api.types.is_categorical_dtype(dtype)], 70 | axis=1 71 | ) 72 | maps = pysynth.ipf.get_axis_values(df) 73 | for col in maps: 74 | assert col in df.columns 75 | assert (maps[col].index == np.arange(len(maps[col].index))).all() 76 | assert frozenset(maps[col].values) == frozenset(df[col].unique()) 77 | 78 | ROUNDERS = [ 79 | pysynth.ipf.LargestRemainderRounder(), 80 | pysynth.ipf.RandomSamplingRounder(seed=1711), 81 | ] 82 | 83 | UNROUND_MATRICES = [ 84 | np.array([[[2.,.5],[.5,0]],[[1.2,1],[1,.8]],[[1,.2],[1,1.8]]]), 85 | np.random.rand(4,8,7) * 3, 86 | np.where(np.random.rand(3,7,4,2) < .2, 0, np.random.rand(3,7,4,2) * 2), 87 | ] 88 | 89 | @pytest.mark.parametrize('rder, mat', list(itertools.product( 90 | ROUNDERS, UNROUND_MATRICES 91 | ))) 92 | def test_rounders(rder, mat): 93 | result = rder.round(mat) 94 | assert np.issubdtype(result.dtype, np.integer) 95 | assert result.sum() == int(np.round(mat.sum())) 96 | assert result.min() >= 0 97 
| assert result[mat == 0].sum() == 0 98 | # for dim_i, dim in enumerate(mat.shape): 99 | # assert (result[:,dim_i] < dim).all() 100 | 101 | @pytest.mark.parametrize('mat', UNROUND_MATRICES) 102 | def test_lrem_rounder(mat): 103 | result = pysynth.ipf.LargestRemainderRounder().round(mat) 104 | assert abs(result - np.round(mat)).max() <= 1 105 | 106 | @pytest.mark.parametrize('mat', [ 107 | np.array([[[2,1],[0,0]],[[1,1],[1,3]],[[1,0],[1,2]]]), 108 | (np.random.rand(4,8,7) * 3).astype(int), 109 | (np.where(np.random.rand(3,7,4,2) < .2, 0, np.random.rand(3,7,4,2) * 2)).astype(int), 110 | ]) 111 | def test_unroll(mat): 112 | unrolled = pysynth.ipf.unroll(mat) 113 | assert unrolled.shape == (mat.sum(), mat.ndim) 114 | assert (unrolled >= 0).all() 115 | for dim_i, dim in enumerate(mat.shape): 116 | assert (unrolled[:,dim_i] < dim).all() 117 | unroll_df = pd.DataFrame(unrolled) 118 | for index, subdf in unroll_df.groupby(unroll_df.columns.tolist()): 119 | assert mat[index] == len(subdf.index) 120 | 121 | 122 | def test_map_axes(): 123 | n_cols = 6 124 | n_cats = 5 125 | indices = (np.random.rand(40, n_cols) * n_cats).astype(int) 126 | axis_values = { 127 | chr(97 + np.random.randint(26)): pd.Series( 128 | [chr(97 + k) for k in np.random.randint(26, size=n_cats)], 129 | index=np.arange(n_cats) 130 | ) for i in range(n_cols) 131 | } 132 | df = pysynth.ipf.map_axes(indices, axis_values) 133 | assert list(df.columns) == list(axis_values.keys()) 134 | assert len(df.index) == indices.shape[0] 135 | i = 0 136 | for col, mapping in axis_values.items(): 137 | for index, value in mapping.iteritems(): 138 | assert (df[col][indices[:,i] == index] == value).all() 139 | i += 1 140 | 141 | def test_calc_true_matrix(): 142 | ipfsynth = pysynth.ipf.IPFSynthesizer(cond_dim=2) 143 | n_rows = 400 144 | n_cols = 4 145 | n_cats = 5 146 | cat_indices = np.arange(n_cats) 147 | cat_objs = np.array([chr(97 + k) for k in np.arange(n_cats)]) 148 | map_to_objs = pd.Series(cat_objs, index=cat_indices) 149 | ind_df = pd.DataFrame( 150 | np.random.randint(n_cats, size=(n_rows, n_cols)), 151 | columns=[chr(97 + k) for k in np.arange(n_cols)] 152 | ) 153 | cat_df = ind_df.copy() 154 | for col in cat_df.columns: 155 | cat_df[col] = cat_df[col].map(map_to_objs) 156 | ipfsynth.axis_values = { 157 | col: map_to_objs for col in cat_df.columns 158 | } 159 | true_mat = ipfsynth.calc_true_matrix(cat_df) 160 | assert true_mat.shape == tuple([n_cats] * n_cols) 161 | assert true_mat.min() >= 0 162 | assert true_mat.sum() == n_rows 163 | assert np.isclose(true_mat, true_mat.astype(int)).all() 164 | for inds, subdf in ind_df.groupby(list(ind_df.columns)): 165 | assert true_mat[inds] == len(subdf.index) 166 | 167 | # @pytest.mark.parametrize('shape, zero_fraction', [((4, 3, 2), .1)]) 168 | @pytest.mark.parametrize('shape, zero_fraction', SEED_GEN_PARAMS) 169 | def test_obscure_seed(shape, zero_fraction): 170 | seed_matrix = (generate_seed_matrix(shape, zero_fraction) * 10).astype(int) 171 | n_dims = seed_matrix.ndim 172 | for cond_dim in range(1, min(n_dims + 1, 4)): 173 | obscured = pysynth.ipf.obscure_seed(seed_matrix, cond_dim) 174 | preserved_dims = [ 175 | tuple(sorted(frozenset(dims))) 176 | for dims in itertools.combinations_with_replacement( 177 | range(n_dims), cond_dim 178 | ) 179 | ] 180 | print(preserved_dims) 181 | for sel_dim_is in preserved_dims: 182 | other_dim_is = tuple(i for i in range(n_dims) if i not in sel_dim_is) 183 | assert np.isclose( 184 | obscured.sum(axis=other_dim_is), 185 | seed_matrix.sum(axis=other_dim_is) 186 
| ).all() 187 | 188 | # @pytest.mark.parametrize('openml_id', [11, 31, 1461, 1480]) 189 | @pytest.mark.parametrize('openml_id, rder', [(11, 'lrem'), (23, 'random')]) 190 | def test_synth(openml_id, rder): 191 | df = test_data.get_openml(openml_id) 192 | synth = pysynth.ipf.IPFSynthesizer(rounder=rder).fit_transform(df) 193 | test_data.check_synthdf_equal(df, synth) 194 | assert frozenset(df.columns) == frozenset(synth.columns) 195 | assert len(df.index) == len(synth.index) 196 | for col in df.columns: 197 | assert df[col].dtype == synth[col].dtype 198 | if pd.api.types.is_categorical_dtype(df[col].dtype): 199 | assert frozenset(synth[col].cat.categories).issubset(df[col].cat.categories) 200 | else: 201 | assert df[col].min() <= synth[col].mean() <= df[col].max() 202 | -------------------------------------------------------------------------------- /tests/test_similarity.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import itertools 4 | import warnings 5 | 6 | import scipy.stats 7 | import numpy as np 8 | import pandas as pd 9 | import pytest 10 | import sklearn.base 11 | import sklearn.exceptions 12 | 13 | sys.path.append(os.path.join(os.path.dirname(__file__), '..')) 14 | import pysynth.similarity 15 | import test_data 16 | 17 | 18 | def generate_artificial_df(n=200): 19 | return pd.DataFrame({ 20 | 'prob': np.random.rand(n), 21 | 'iq': scipy.stats.norm(loc=100, scale=15).rvs(n), 22 | 'cat' : np.random.choice(list('abc'), size=n), 23 | }) 24 | 25 | 26 | @pytest.fixture(scope='module') 27 | def testing_df_close_pairs(): 28 | np.random.seed(1711) 29 | pairs = [ 30 | (generate_artificial_df(1000), generate_artificial_df(300)), 31 | ] 32 | for openml_id in [31, 1461, 40536]: 33 | df = test_data.get_openml(openml_id) 34 | half = len(df.index) // 2 35 | pairs.append((df.iloc[:half], df.iloc[half:])) 36 | return pairs 37 | 38 | 39 | @pytest.mark.parametrize('distro_name, n_samples', itertools.product( 40 | ['norm', 'uniform'], 41 | [2, 30, 1000], 42 | )) 43 | def test_summary_stats_artificial(distro_name, n_samples): 44 | distro = getattr(scipy.stats, distro_name)() 45 | data = pd.Series(distro.rvs(n_samples)) 46 | sumstat = pysynth.similarity.summary_stats(data) 47 | for item in pysynth.similarity.SUMMARY_STATS: 48 | assert item in sumstat.index 49 | assert np.isclose(sumstat['mean'], data.mean()) 50 | assert np.isclose(sumstat['skew'], data.skew()) or ( 51 | np.isnan(sumstat['skew']) and np.isnan(data.skew()) 52 | ) 53 | 54 | def test_summary_stat_diff_artificial(): 55 | df1, df2 = generate_artificial_df(), generate_artificial_df() 56 | df_diff = pysynth.similarity.summary_stat_diff(df1, df2, 'diff') 57 | # check range for probs column 58 | assert (-1 < df_diff.loc['prob',:]).all() 59 | assert (df_diff.loc['prob',:] < 1).all() 60 | 61 | 62 | @pytest.mark.parametrize('method', ['diff', 'ape']) 63 | def test_summary_stat_diff_real(testing_df_close_pairs, method): 64 | for df1, df2 in testing_df_close_pairs: 65 | nodiff = pysynth.similarity.summary_stat_diff(df1, df2, method) 66 | # check all stats have their column 67 | assert frozenset(nodiff.columns) == frozenset([ 68 | col + '_' + method for col in pysynth.similarity.SUMMARY_STATS 69 | ]) 70 | # check all numeric columns have their row 71 | assert frozenset(nodiff.index) == frozenset([ 72 | col for col in df2 if pd.api.types.is_numeric_dtype(df2[col]) 73 | ]) 74 | if df1 is df2: 75 | assert np.isclose(nodiff, 0).all() 76 | 77 | 78 | @pytest.mark.parametrize('n_bins', [5, 
10, 15]) 79 | def test_aligned_freqs_normal(n_bins): 80 | np.random.seed(1711) 81 | df1, df2 = generate_artificial_df(1000), generate_artificial_df(300) 82 | for col in df1.columns: 83 | f1, f2 = pysynth.similarity.aligned_freqs(df1[col], df2[col], n_bins) 84 | f1_cats = frozenset(f1.index) 85 | assert not f1.hasnans 86 | assert not f2.hasnans 87 | assert f1_cats == frozenset(f2.index) 88 | assert np.isclose(f1.sum(), 1) 89 | assert (f1 >= 0).all() and (f1 <= 1).all() 90 | assert np.isclose(f2.sum(), 1) 91 | assert (f2 >= 0).all() and (f2 <= 1).all() 92 | # since the generation process is the same, the diffs should be low 93 | assert abs(f1 - f2).max() < .1 94 | if pd.api.types.is_numeric_dtype(df1[col]): 95 | assert len(f1_cats) == n_bins 96 | # the frequencies in f1 should be appx equal due to quantile binning 97 | assert (abs(f1[f1 > 0] - 1 / len(f1_cats)) < .01).all() 98 | else: 99 | assert len(f1_cats) == df1[col].nunique() 100 | 101 | 102 | def test_aligned_freqs_nobin(): 103 | x, y = pd.Series(np.random.rand(300)), pd.Series(np.random.rand(100)) 104 | f_x, f_y = pysynth.similarity.aligned_freqs(x, y, bins=None) 105 | assert f_x is None 106 | assert f_y is None 107 | 108 | 109 | @pytest.mark.parametrize('bins, metrics', itertools.product( 110 | [5, 10, None], [None, ['rtae', 'mae']] 111 | )) 112 | def test_frequency_mismatch_normal(testing_df_close_pairs, bins, metrics): 113 | for df1, df2 in testing_df_close_pairs: 114 | metric_df = pysynth.similarity.frequency_mismatch(df1, df2, bins, metrics) 115 | if metrics is None: 116 | metrics = list(freqdiff_metric_bounds.keys()) 117 | assert frozenset(metric_df.columns) == frozenset(metrics) 118 | if bins is None: 119 | assert metric_df.index.tolist() == [ 120 | col for col in df2 if not pd.api.types.is_numeric_dtype(df2[col]) 121 | ] 122 | else: 123 | assert metric_df.index.tolist() == df2.columns.tolist() 124 | assert all(not metric_df[col].hasnans for col in metric_df.columns) 125 | 126 | def random_freqs(n, zero_frac, index): 127 | if not (0 <= zero_frac < 1): raise ValueError 128 | freqs = np.random.rand(n) 129 | freqs[np.random.rand(n) < zero_frac] = 0 130 | return pd.Series(freqs / freqs.sum(), index=index) 131 | 132 | freqdiff_metric_bounds = { 133 | 'rtae': (0, 0, 2), 134 | 'overlap_coef': (1, 0, 1), 135 | 'rank_damerau': (0, 0, 1), 136 | 'morisita_overlap': (1, 0, 1), 137 | 'mae': (0, 0, 1), 138 | 'rmse': (0, 0, 1), 139 | 'jaccard_dist': (0, 0, 1), 140 | 'simpson_diff': (0, -1, 1), 141 | 'entropy_diff': (0, -np.inf, np.inf), 142 | } 143 | 144 | @pytest.mark.parametrize('n_cats, zero_frac, metrics', itertools.product( 145 | [2, 5, 15], [0, .2, .6], [None, ['rtae', 'mae']] 146 | )) 147 | def test_freqdiff_metrics(n_cats, zero_frac, metrics): 148 | np.random.seed(1711) 149 | if n_cats == 2 and zero_frac == .6: return # invalid case 150 | cats = ['c' + str(i) for i in range(n_cats)] 151 | probs1 = random_freqs(n_cats, zero_frac, cats) 152 | probs2 = random_freqs(n_cats, zero_frac, cats) 153 | metric_vals = pysynth.similarity.freqdiff_metrics(probs1, probs2, metrics) 154 | nodiff_vals = pysynth.similarity.freqdiff_metrics(probs1, probs1, metrics) 155 | if metrics is None: 156 | metrics = list(freqdiff_metric_bounds.keys()) 157 | assert frozenset(metric_vals.index) == frozenset(metrics) 158 | assert pd.api.types.is_numeric_dtype(metric_vals) 159 | for metric, value in metric_vals.items(): 160 | assert metric in freqdiff_metric_bounds 161 | nodiff, lo, hi = freqdiff_metric_bounds[metric] 162 | assert lo <= value <= hi 163 | assert metric 
in nodiff_vals.index 164 | assert np.isclose(nodiff_vals[metric], nodiff) 165 | 166 | 167 | damlev_test_cases = [ 168 | ([8, 3, 7], [8, 3, 7], 0), 169 | (['a', 'b', 'c'], ['a', 'b', 'c'], 0), 170 | (list(range(100)), list(range(100)), 0), 171 | (['c'], ['c'], 0), 172 | ([], [], 0), 173 | ([6, 4, 2], [4, 2], 1), 174 | ([6, 4, 2], [6, 4], 1), 175 | ([3, 8, 1], [], 3), 176 | ([3, 8, 1], [9, 1], 2), 177 | ([3, 8, 1], [8, 3, 1], 1), 178 | ([3, 8, 1], [1, 8, 3], 2), 179 | ([3, 8, 1], [3, 9, 1], 1), 180 | ] 181 | 182 | def test_damerau_levenshtein(): 183 | for seq1, seq2, dist in damlev_test_cases: 184 | assert pysynth.similarity.damerau_levenshtein(seq1, seq2) == dist 185 | assert pysynth.similarity.damerau_levenshtein(seq2, seq1) == dist 186 | 187 | 188 | @pytest.mark.parametrize('method', ['pearson', 'kendall', 'spearman']) 189 | def test_correlation_diff(testing_df_close_pairs, method): 190 | for df1, df2 in testing_df_close_pairs: 191 | corrdiff = pysynth.similarity.correlation_diff(df1, df2, method) 192 | ok_subset = [] 193 | for col in df2.columns: 194 | if pd.api.types.is_numeric_dtype(df2[col]): 195 | assert col in corrdiff.index 196 | assert col in corrdiff.columns 197 | if not np.isnan(corrdiff[col]).all(): 198 | ok_subset.append(col) 199 | selfcorr_diffs = np.diag(corrdiff.loc[ok_subset,ok_subset].values) 200 | assert (selfcorr_diffs == 0).all() 201 | assert (np.isnan(corrdiff) | (corrdiff >= -1)).all(axis=None) 202 | assert (np.isnan(corrdiff) | (corrdiff <= 1)).all(axis=None) 203 | 204 | 205 | def test_stat_tests(testing_df_close_pairs): 206 | for df1, df2 in testing_df_close_pairs: 207 | with warnings.catch_warnings(): 208 | warnings.simplefilter('ignore', category=RuntimeWarning) 209 | stat_df = pysynth.similarity.stat_tests(df1, df2) 210 | assert stat_df.columns.tolist() == [ 211 | 't_stat', 't_pval', 'levene_stat', 'levene_pval' 212 | ] 213 | for col in df2.columns: 214 | if pd.api.types.is_numeric_dtype(df2[col]): 215 | assert col in stat_df.index 216 | for stat in stat_df.columns: 217 | if stat.endswith('pval'): 218 | assert (np.isnan(stat_df[stat]) | (stat_df[stat] >= 0)).all() 219 | assert (np.isnan(stat_df[stat]) | (stat_df[stat] <= 1)).all() 220 | 221 | 222 | @pytest.mark.parametrize('openml_id', [31, 1461, 40536]) 223 | def test_predictor_matrix(openml_id): 224 | df = test_data.get_openml(openml_id) 225 | preds = pysynth.similarity._predictor_matrix(df) 226 | assert isinstance(preds, np.ndarray) 227 | assert np.issubdtype(preds.dtype, np.number) 228 | assert preds.ndim == 2 229 | assert preds.shape[0] == len(df.index) 230 | assert preds.shape[1] >= len(df.columns) 231 | 232 | 233 | @pytest.mark.parametrize('metrics', [None, ['gini', 'f1', 'precision']]) 234 | def test_compute_accuracy_metrics(metrics): 235 | np.random.seed(1711) 236 | n = 200 237 | sources = np.random.rand(n) 238 | targets = sources < .5 239 | prob_variants = [ 240 | (targets.astype(float), False), 241 | (sources, False), 242 | (1 - sources, False), 243 | (np.clip(sources + np.random.rand(n) * .2 - .1, 0, 1), False), 244 | (np.clip(sources + np.random.rand(n) * .4 - .2, 0, 1), False), 245 | (np.zeros(n), True), 246 | (np.ones(n), True), 247 | ] 248 | check_metrics = metrics 249 | if check_metrics is None: 250 | check_metrics = list(pysynth.similarity.DEFAULT_METRICS.keys()) 251 | for probs, is_edge in prob_variants: 252 | if is_edge: 253 | with pytest.warns(None): 254 | metric_vals = pysynth.similarity._compute_accuracy_metrics(targets, probs, metrics) 255 | else: 256 | metric_vals = 
pysynth.similarity._compute_accuracy_metrics(targets, probs, metrics) 257 | check_metric_values(metric_vals, check_metrics) 258 | 259 | 260 | def check_metric_values(metrics, names=None): 261 | if names is not None: 262 | assert metrics.index.tolist() == names 263 | assert not metrics.hasnans 264 | assert (np.round(metrics, 8) <= 1).all() 265 | assert (np.round(metrics, 8) >= -1).all() 266 | 267 | 268 | @pytest.mark.parametrize('test_size', [.1, .25, .5]) 269 | def test_discrimination(testing_df_close_pairs, test_size): 270 | for df1, df2 in testing_df_close_pairs: 271 | with warnings.catch_warnings(): 272 | warnings.simplefilter('ignore', category=RuntimeWarning) 273 | warnings.simplefilter('ignore', category=sklearn.exceptions.UndefinedMetricWarning) 274 | metrics, clf = pysynth.similarity.discrimination( 275 | df1, df2, test_size=test_size, return_best=True 276 | ) 277 | check_metric_values(metrics) 278 | if clf is None: 279 | assert metrics['gini'] <= 0 280 | else: 281 | assert metrics['gini'] > 0 282 | assert sklearn.base.is_classifier(clf) 283 | --------------------------------------------------------------------------------
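
The IPF tests above pin down the calling convention of the synthesis core. The following minimal sketch mirrors that pattern (editor-added, not part of the repository; the numbers are made up, and only `pysynth.ipf.ipf` and `IPFSynthesizer` as exercised by `tests/test_ipf.py` are used):

    import numpy as np
    import pysynth.ipf

    # one marginal per axis of the seed matrix; all marginals must sum to the same total,
    # otherwise pysynth.ipf.ipf raises ValueError (see test_ipf_sum_mismatch)
    seed = np.random.rand(3, 4)
    marginals = [np.array([10., 20., 30.]), np.array([15., 15., 15., 15.])]

    fitted = pysynth.ipf.ipf(marginals, seed, precision=1e-9)
    print(fitted.sum(axis=1))   # approximately [10, 20, 30]
    print(fitted.sum(axis=0))   # approximately [15, 15, 15, 15]

    # the dataframe-level wrapper used in test_synth:
    # synth_df = pysynth.ipf.IPFSynthesizer(rounder='lrem').fit_transform(orig_df)
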