├── .gitignore ├── LICENSE ├── LICENSE.MIT ├── README.md ├── attack_models ├── __init__.py ├── attack_model.py ├── mia_classifier.py └── reconstruction.py ├── data ├── germancredit.json ├── texas.csv └── texas.json ├── docker ├── Dockerfile └── requirements.txt ├── executables ├── __init__.py ├── generate_metadata_file.py └── generate_synthetic_dataset.py ├── feature_sets ├── __init__.py ├── bayes.py ├── feature_set.py ├── independent_histograms.py └── model_agnostic.py ├── generative_models ├── __init__.py ├── ctgan.py ├── data_synthesiser.py ├── data_synthesiser_utils │ ├── __init__.py │ ├── datatypes │ │ ├── AbstractAttribute.py │ │ ├── FloatAttribute.py │ │ ├── IntegerAttribute.py │ │ ├── StringAttribute.py │ │ ├── __init__.py │ │ ├── constants.py │ │ └── utils │ │ │ ├── DataType.py │ │ │ └── __init__.py │ └── utils.py ├── generative_model.py ├── gmm.py └── pate_gan.py ├── inference_cli.py ├── linkage_cli.py ├── notebooks └── Analyse Results.ipynb ├── predictive_models ├── __init__.py └── predictive_model.py ├── requirements.txt ├── sanitisation_techniques └── sanitiser.py ├── tests ├── __init__.py ├── germancredit_test.csv ├── germancredit_test.json ├── inference │ └── runconfig.json ├── linkage │ └── runconfig.json ├── test_attacks.py ├── test_gms.py ├── test_sanitisation.py └── utility │ └── runconfig.json ├── utility_cli.py └── utils ├── __init__.py ├── analyse_results.py ├── constants.py ├── datagen.py ├── evaluation_framework.py ├── logging.py ├── plot_setup.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | venv* 2 | outputs/* 3 | .idea/* 4 | spring_synthetic_data.iml 5 | .DS_Store 6 | __pycache__/* 7 | */__pycache__/* 8 | */__pycache__ 9 | __pycache__ 10 | paper/*.aux 11 | paper/*.synctex.gz 12 | paper/*.pdf 13 | paper/*.log 14 | paper/*.bbl 15 | paper/*.blg 16 | paper/*.out 17 | syn_data_files/ 18 | notebooks_local/ 19 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD-3-Clause License 2 | 3 | Copyright 2021 Theresa Stadler (EPFL SPRING Lab), Bristena Oprisanu (UCL) 4 | 5 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 6 | 7 | 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 8 | 9 | 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 10 | 11 | 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 12 | 13 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
14 | 
--------------------------------------------------------------------------------
/LICENSE.MIT:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright <2018>
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
23 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Privacy evaluation framework for synthetic data publishing
2 | A practical framework to evaluate the privacy-utility tradeoff of synthetic data publishing.
3 | 
4 | Based on "Synthetic Data - Anonymisation Groundhog Day" by Theresa Stadler, Bristena Oprisanu, and Carmela Troncoso, [arXiv](https://arxiv.org/abs/2011.07018), 2020.
5 | 
6 | # Attack models
7 | The module `attack_models` currently includes:
8 | 
9 | A privacy adversary to test for privacy gain with respect to linkage attacks, modelled as a membership inference attack `MIAttackClassifier`.
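A minimal sketch of how such an attack might be assembled from the classes in this repository is shown below (the dataset path, the choice of generative model, and the sample sizes are illustrative only; class and function names are taken from `attack_models/mia_classifier.py`, `feature_sets/model_agnostic.py`, `generative_models/data_synthesiser.py` and `utils/datagen.py`):

```python
from pathlib import Path
from pandas import DataFrame

from attack_models.mia_classifier import (MIAttackClassifierRandomForest,
                                          generate_mia_shadow_data)
from feature_sets.model_agnostic import NaiveFeatureSet
from generative_models.data_synthesiser import IndependentHistogram
from utils.datagen import load_local_data_as_df

# Load a local dataset by its stem (here the Texas hospital data shipped with the repo)
raw, metadata = load_local_data_as_df(Path('data/texas'))
target = raw.sample(1)  # record whose membership the adversary tries to infer

# Train shadow models with and without the target to obtain labelled synthetic datasets
gm = IndependentHistogram(metadata)
synA, labels = generate_mia_shadow_data(gm, target, raw, sizeRaw=1000,
                                        sizeSyn=1000, numModels=10, numCopies=5)

# Fit the distinguisher on the labelled shadow data and produce membership guesses
attack = MIAttackClassifierRandomForest(metadata, FeatureSet=NaiveFeatureSet(DataFrame))
attack.train(synA, labels)
guesses = attack.attack(synA)  # in practice, applied to the published synthetic datasets under evaluation
```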
10 | 
11 | A simple attribute inference attack `AttributeInferenceAttack` that aims to infer a target's sensitive value given partial knowledge about the target record.
12 | 
13 | # Generative models
14 | The module `generative_models` currently includes:
15 | - `IndependentHistogram`: An independent histogram model adapted from [Data Responsibly's DataSynthesizer](https://github.com/DataResponsibly/DataSynthesizer)
16 | - `BayesianNet`: A generative model based on a Bayesian Network adapted from [Data Responsibly's DataSynthesizer](https://github.com/DataResponsibly/DataSynthesizer)
17 | - `PrivBayes`: A differentially private version of the BayesianNet model adapted from [Data Responsibly's DataSynthesizer](https://github.com/DataResponsibly/DataSynthesizer)
18 | - `CTGAN`: A conditional tabular generative adversarial network that integrates the CTGAN model from [CTGAN](https://github.com/sdv-dev/CTGAN)
19 | - `PATE-GAN`: A differentially private generative adversarial network adapted from its original implementation by the [MLforHealth Lab](https://bitbucket.org/mvdschaar/mlforhealthlabpub/src/82d7f91d46db54d256ff4fc920d513499ddd2ab8/alg/pategan/)
20 | 
21 | # Setup
22 | 
23 | ## Docker Distribution
24 | 
25 | For your convenience, Synthetic Data is also distributed as a ready-to-use Docker image containing Python 3.9 and CUDA 11.4.2, along with all required dependencies, including Jupyter Notebook to visualise and analyse the results.
26 | 
27 | **Note:** This distribution includes CUDA binaries. Before downloading the image, make sure you have read [its EULA](https://docs.nvidia.com/cuda/eula/index.html) and agree to its terms.
28 | 
29 | Pull the image and run a container (binding a volume where you want to save the data):
30 | 
31 | ```
32 | docker pull springepfl/synthetic-data:latest
33 | docker run -it --rm -v "$(pwd)/output:/output" -p 8888:8888 springepfl/synthetic-data
34 | ```
35 | 
36 | The Synthetic Data directory is placed at the root of the container's filesystem:
37 | ```
38 | cd /synthetic_data_release
39 | ```
40 | 
41 | You should now be able to run the examples without any problems and to visualise the results with Jupyter by running
42 | ```
43 | jupyter notebook --allow-root --ip=0.0.0.0
44 | ```
45 | 
46 | and opening the notebook in your favourite web browser at the URL `http://127.0.0.1:8888/?token=`.
47 | 
48 | 
49 | ## Direct Installation
50 | 
51 | ### Requirements
52 | The framework and its building blocks have been developed and tested under Python 3.9.
53 | 
54 | We recommend creating a virtual environment to install all dependencies and run the code:
55 | ```
56 | python3 -m venv pyvenv3
57 | source pyvenv3/bin/activate
58 | pip install numpy==1.19.5 && pip install -r requirements.txt
59 | ```
60 | 
61 | Note: Some users have encountered problems because the NumPy API changed between versions; to ensure all dependencies are compiled against the same NumPy version, it needs to be installed first.
62 | 
63 | ### Dependencies
64 | The `CTGAN` model depends on a fork of the original model training code, which can be found at
65 | [CTGAN-SPRING](https://github.com/spring-epfl/CTGAN.git).
66 | 
67 | To install the correct version, clone the repository above and run
68 | ```
69 | cd CTGAN
70 | make install
71 | ```
72 | 
73 | Add the path to this directory to your `PYTHONPATH`. You can also add this line
74 | in your shell configuration file (e.g., `~/.bashrc`) to load it automatically.
75 | ```bash
76 | # Execute this in the CTGAN folder, otherwise replace `pwd` with the actual path
77 | export PYTHONPATH=$PYTHONPATH:`pwd`
78 | ```
79 | 
80 | To test your installation, try running
81 | ```
82 | import ctgan
83 | ```
84 | from within your virtualenv's `python` interpreter.
85 | 
86 | # Example runs
87 | To run a privacy evaluation with respect to the privacy concern of linkability, you can run
88 | 
89 | ```
90 | python3 linkage_cli.py -D data/texas -RC tests/linkage/runconfig.json -O tests/linkage
91 | ```
92 | 
93 | The results file produced after successfully running the script will be written to `tests/linkage` and can be parsed with the function `load_results_linkage` provided in `utils/analyse_results.py`.
94 | A Jupyter notebook to visualise and analyse the results is included at `notebooks/Analyse Results.ipynb`.
95 | 
96 | 
97 | To run a privacy evaluation with respect to the privacy concern of inference, you can run
98 | 
99 | ```
100 | python3 inference_cli.py -D data/texas -RC tests/inference/runconfig.json -O tests/inference
101 | ```
102 | 
103 | The results file produced after successfully running the script can be parsed with the function `load_results_inference` provided in `utils/analyse_results.py`.
104 | A Jupyter notebook to visualise and analyse the results is included at `notebooks/Analyse Results.ipynb`.
105 | 
106 | 
107 | To run a utility evaluation that uses a simple classification task as the utility function, run
108 | 
109 | ```
110 | python3 utility_cli.py -D data/texas -RC tests/utility/runconfig.json -O tests/utility
111 | ```
112 | 
113 | The results file produced after successfully running the script can be parsed with the function `load_results_utility` provided in `utils/analyse_results.py`.
114 | 
115 | 
--------------------------------------------------------------------------------
/attack_models/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/spring-epfl/synthetic_data_release/eba9a43c75a64110b63d79e58c83859634b7339c/attack_models/__init__.py
--------------------------------------------------------------------------------
/attack_models/attack_model.py:
--------------------------------------------------------------------------------
1 | """Parent class for all privacy attacks"""
2 | 
3 | class PrivacyAttack(object):
4 | 
5 |     def train(self, *args):
6 |         """Train privacy adversary"""
7 |         raise NotImplementedError('Method needs to be overwritten by a subclass.')
8 | 
9 |     def attack(self, *args):
10 |         """Make a guess about target's secret"""
11 |         raise NotImplementedError('Method needs to be overwritten by a subclass.')
--------------------------------------------------------------------------------
/attack_models/mia_classifier.py:
--------------------------------------------------------------------------------
1 | """Parent class for launching a membership inference attack on the output of a generative model"""
2 | from pandas import DataFrame
3 | from pandas.api.types import CategoricalDtype
4 | from numpy import ndarray, concatenate, stack, array, round, zeros, arange
5 | 
6 | from sklearn.svm import SVC
7 | from sklearn.linear_model import LogisticRegression
8 | from sklearn.ensemble import RandomForestClassifier
9 | from sklearn.neighbors import KNeighborsClassifier
10 | from sklearn.neural_network import MLPClassifier
11 | from sklearn.model_selection import ShuffleSplit
12 | 
13 | from utils.datagen import convert_df_to_array
14 | from utils.utils import CustomProcess
15 | from
utils.constants import * 16 | 17 | from attack_models.attack_model import PrivacyAttack 18 | 19 | from warnings import simplefilter 20 | simplefilter('ignore', category=FutureWarning) 21 | simplefilter('ignore', category=DeprecationWarning) 22 | 23 | import multiprocessing as mp 24 | 25 | class MIAttackClassifier(PrivacyAttack): 26 | """"Parent class for membership inference attack on the output of a generative model using sklearn classifier""" 27 | def __init__(self, Distinguisher, metadata, FeatureSet=None, quids=None): 28 | 29 | self.Distinguisher = Distinguisher 30 | self.FeatureSet = FeatureSet 31 | 32 | self.metadata, self.categoricalAttributes, self.numericalAttributes = self._read_meta(metadata, quids) 33 | 34 | self.trained = False 35 | 36 | self.__name__ = f'{self.Distinguisher.__class__.__name__}{self.FeatureSet.__class__.__name__}' 37 | 38 | def train(self, synA, labels): 39 | """Train a membership inference attack on a labelled training set""" 40 | 41 | if self.FeatureSet is not None: 42 | synA = stack([self.FeatureSet.extract(s) for s in synA]) 43 | else: 44 | synA = stack([self._df_to_array(s).flatten() for s in synA]) 45 | 46 | if not isinstance(labels, ndarray): 47 | labels = array(labels) 48 | 49 | self.Distinguisher.fit(synA, labels) 50 | 51 | self.trained = True 52 | 53 | def attack(self, datasets, attemptLinkage=False, target=None): 54 | """ 55 | Make a guess about the target's membership in the training data of the 56 | generative model that produced the synthetic input data 57 | 58 | :param datasets: list: A list of synthetic or sanitised datasets 59 | :return: guess: list: A guess about the target's membership for each of the synthetic input datasets 60 | """ 61 | assert self.trained, 'Attack must first be trained.' 62 | 63 | if attemptLinkage: 64 | assert target is not None, 'Attacker needs target record to attempt linkage' 65 | 66 | guesses = [] 67 | for df in datasets: 68 | if attemptLinkage: 69 | try: 70 | k = df.groupby(self.categoricalAttributes).size()[target[self.categoricalAttributes].values] 71 | if all(k == 1): 72 | guess = LABEL_IN 73 | else: 74 | guess = self._make_guess(df) 75 | except: 76 | guess = self._make_guess(df) 77 | else: 78 | guess = self._make_guess(df) 79 | 80 | guesses.append(guess) 81 | 82 | return guesses 83 | 84 | def _make_guess(self, df): 85 | if self.FeatureSet is not None: 86 | f = self.FeatureSet.extract(df).reshape(1, -1) 87 | else: 88 | f = self._df_to_array(df).reshape(1, -1) 89 | 90 | return round(self.Distinguisher.predict(f), 0).astype(int)[0] 91 | 92 | 93 | def get_confidence(self, synT, secret): 94 | """Calculate probability that attacker correctly predicts whether target was present in model's training data""" 95 | assert self.trained, 'Attack must first be trained.' 
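        # Represent each input dataset either by its extracted feature vector or by its
        # flattened raw encoding, then score it with the trained distinguisher below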
96 | if self.FeatureSet is not None: 97 | synT = stack([self.FeatureSet.extract(s) for s in synT]) 98 | else: 99 | if isinstance(synT[0], DataFrame): 100 | synT = stack([convert_df_to_array(s, self.metadata).flatten() for s in synT]) 101 | else: 102 | synT = stack([s.flatten() for s in synT]) 103 | 104 | probs = self.Distinguisher.predict_proba(synT) 105 | 106 | return [p[s] for p,s in zip(probs, secret)] 107 | 108 | def _read_meta(self, metadata, quids): 109 | if quids is None: 110 | quids = [] 111 | 112 | meta_dict = {} 113 | categoricalAttributes = [] 114 | numericalAttributes = [] 115 | 116 | for cdict in metadata['columns']: 117 | attr_name = cdict['name'] 118 | data_type = cdict['type'] 119 | 120 | if data_type == FLOAT or data_type == INTEGER: 121 | if attr_name in quids: 122 | cat_bins = cdict['bins'] 123 | cat_labels = [f'({cat_bins[i]},{cat_bins[i+1]}]' for i in range(len(cat_bins)-1)] 124 | 125 | meta_dict[attr_name] = { 126 | 'type': CATEGORICAL, 127 | 'categories': cat_labels, 128 | 'size': len(cat_labels) 129 | } 130 | 131 | categoricalAttributes.append(attr_name) 132 | 133 | else: 134 | meta_dict[attr_name] = { 135 | 'type': data_type, 136 | 'min': cdict['min'], 137 | 'max': cdict['max'] 138 | } 139 | 140 | numericalAttributes.append(attr_name) 141 | 142 | elif data_type == CATEGORICAL or data_type == ORDINAL: 143 | meta_dict[attr_name] = { 144 | 'type': data_type, 145 | 'categories': cdict['i2s'], 146 | 'size': len(cdict['i2s']) 147 | } 148 | 149 | categoricalAttributes.append(attr_name) 150 | 151 | else: 152 | raise ValueError(f'Unknown data type {data_type} for attribute {attr_name}') 153 | 154 | return meta_dict, categoricalAttributes, numericalAttributes 155 | 156 | def _df_to_array(self, data): 157 | dfAsArray = [] 158 | for col, cdict in self.metadata.items(): 159 | if col in list(data): 160 | colData = data[col].copy() 161 | coltype = cdict['type'] 162 | 163 | if coltype in STRINGS: 164 | if len(colData) > len(colData.dropna()): 165 | colData = colData.fillna(FILLNA_VALUE_CAT) 166 | if FILLNA_VALUE_CAT not in cdict['categories']: 167 | col['categories'].append(FILLNA_VALUE_CAT) 168 | col['size'] += 1 169 | 170 | if coltype == ORDINAL: 171 | cat = CategoricalDtype(categories=cdict['categories'], ordered=True) 172 | colData = colData.astype(cat) 173 | colArray = colData.cat.codes.values.reshape(-1, 1) 174 | 175 | else: 176 | colArray = self._one_hot(colData.values, cdict['categories']) 177 | 178 | elif coltype in NUMERICAL: 179 | colArray = colData.values.reshape(-1, 1) 180 | 181 | else: 182 | raise ValueError(f'Unknown type {coltype} for col {col}') 183 | 184 | dfAsArray.append(colArray) 185 | 186 | return concatenate(dfAsArray, axis=1) 187 | 188 | def _one_hot(self, col_data, categories): 189 | col_data_onehot = zeros((len(col_data), len(categories))) 190 | cidx = [categories.index(c) for c in col_data] 191 | col_data_onehot[arange(len(col_data)), cidx] = 1 192 | 193 | return col_data_onehot 194 | 195 | 196 | class MIAttackClassifierLinearSVC(MIAttackClassifier): 197 | 198 | def __init__(self, metadata, FeatureSet=None): 199 | super().__init__(SVC(kernel='linear', probability=True), metadata, FeatureSet) 200 | 201 | 202 | class MIAttackClassifierSVC(MIAttackClassifier): 203 | 204 | def __init__(self, metadata, FeatureSet=None): 205 | super().__init__(SVC(probability=True), metadata, FeatureSet) 206 | 207 | 208 | class MIAttackClassifierLogReg(MIAttackClassifier): 209 | 210 | def __init__(self, metadata, FeatureSet=None): 211 | super().__init__(LogisticRegression(), 
metadata, FeatureSet) 212 | 213 | 214 | class MIAttackClassifierRandomForest(MIAttackClassifier): 215 | 216 | def __init__(self, metadata, FeatureSet=None, quids=None): 217 | super().__init__(RandomForestClassifier(), metadata=metadata, FeatureSet=FeatureSet, quids=quids) 218 | 219 | 220 | class MIAttackClassifierKNN(MIAttackClassifier): 221 | 222 | def __init__(self, metadata, FeatureSet=None, quids=None): 223 | super().__init__(KNeighborsClassifier(n_neighbors=5), metadata=metadata, FeatureSet=FeatureSet, quids=quids) 224 | 225 | 226 | class MIAttackClassifierMLP(MIAttackClassifier): 227 | 228 | def __init__(self, metadata, FeatureSet=None, quids=None): 229 | super().__init__(MLPClassifier((200,), solver='lbfgs'), metadata=metadata, FeatureSet=FeatureSet, quids=quids) 230 | 231 | 232 | def generate_mia_shadow_data(GenModel, target, rawA, sizeRaw, sizeSyn, numModels, numCopies): 233 | assert isinstance(rawA, GenModel.datatype), f"GM expects datatype {GenModel.datatype} but got {type(rawA)}" 234 | assert isinstance(target, type(rawA)), f"Mismatch of datatypes between target record and raw data" 235 | 236 | kf = ShuffleSplit(n_splits=numModels, train_size=sizeRaw) 237 | 238 | if GenModel.multiprocess: 239 | 240 | manager = mp.Manager() 241 | synA = manager.list() 242 | labelsA = manager.list() 243 | jobs = [] 244 | tasks = [(rawA, train_index, GenModel, target, sizeSyn, numCopies, synA, labelsA) for train_index, _ in kf.split(rawA)] 245 | 246 | for task in tasks: 247 | p = CustomProcess(target=worker_train_shadow, args=task) 248 | jobs.append(p) 249 | p.start() 250 | 251 | for p in jobs: 252 | p.join() 253 | 254 | else: 255 | synA, labelsA = [], [] 256 | for train_index, _ in kf.split(rawA): 257 | worker_train_shadow(rawA, train_index, GenModel, target, sizeSyn, numCopies, synA, labelsA) 258 | 259 | return synA, labelsA 260 | 261 | 262 | def worker_train_shadow(rawA, train_index, GenModel, target, sizeSyn, numCopies, synA, labelsA): 263 | # Fit GM to data without target's data 264 | if isinstance(rawA, DataFrame): 265 | rawAout = rawA.iloc[train_index] 266 | else: 267 | rawAout = rawA[train_index, :] 268 | GenModel.fit(rawAout) 269 | 270 | # Generate synthetic sample for data without target 271 | synOut = [GenModel.generate_samples(sizeSyn) for _ in range(numCopies)] 272 | labelsOut = [LABEL_OUT for _ in range(numCopies)] 273 | 274 | # Insert targets into training data 275 | if isinstance(rawA, DataFrame): 276 | rawAin = rawAout.append(target) 277 | else: 278 | if len(target.shape) == 1: 279 | target = target.reshape(1, len(target)) 280 | rawAin = concatenate([rawAout, target]) 281 | 282 | # Fit generative model to data including target 283 | GenModel.fit(rawAin) 284 | 285 | # Generate synthetic sample for data including target 286 | synIn = [GenModel.generate_samples(sizeSyn) for _ in range(numCopies)] 287 | labelsIn = [LABEL_IN for _ in range(numCopies)] 288 | 289 | syn = synOut + synIn 290 | labels = labelsOut + labelsIn 291 | 292 | synA.extend(syn) 293 | labelsA.extend(labels) 294 | 295 | 296 | def generate_mia_anon_data(Sanitiser, target, rawA, sizeRaw, numSamples): 297 | assert isinstance(rawA, Sanitiser.datatype), f"GM expects datatype {Sanitiser.datatype} but got {type(rawA)}" 298 | assert isinstance(target, type(rawA)), f"Mismatch of datatypes between target record and raw data" 299 | 300 | kf = ShuffleSplit(n_splits=numSamples, train_size=sizeRaw) 301 | 302 | sanA, labelsA = [], [] 303 | for train_index, _ in kf.split(rawA): 304 | worker_sanitise_data(rawA, train_index, Sanitiser, 
target, sanA, labelsA) 305 | 306 | return sanA, labelsA 307 | 308 | 309 | def worker_sanitise_data(rawA, train_index, Sanitiser, target, sanA, labelsA): 310 | # Fit GM to data without target's data 311 | if isinstance(rawA, DataFrame): 312 | rawAout = rawA.iloc[train_index] 313 | else: 314 | rawAout = rawA[train_index, :] 315 | sanOut = Sanitiser.sanitise(rawAout) 316 | sanA.append(sanOut) 317 | labelsA.append(LABEL_OUT) 318 | 319 | # Insert targets into training data 320 | if isinstance(rawA, DataFrame): 321 | rawAin = rawAout.append(target) 322 | else: 323 | if len(target.shape) == 1: 324 | target = target.reshape(1, len(target)) 325 | rawAin = concatenate([rawAout, target]) 326 | 327 | # Fit generative model to data including target 328 | sanIn = Sanitiser.sanitise(rawAin) 329 | sanA.append(sanIn) 330 | labelsA.append(LABEL_IN) 331 | 332 | 333 | 334 | -------------------------------------------------------------------------------- /attack_models/reconstruction.py: -------------------------------------------------------------------------------- 1 | from os import path 2 | from pandas.api.types import CategoricalDtype 3 | from numpy import mean, concatenate, ones, sqrt, zeros, arange 4 | from scipy.stats import norm 5 | from sklearn.impute import SimpleImputer 6 | from sklearn.linear_model import LinearRegression 7 | from sklearn.ensemble import RandomForestClassifier 8 | 9 | from attack_models.attack_model import PrivacyAttack 10 | from utils.constants import * 11 | from utils.logging import LOGGER 12 | 13 | 14 | class AttributeInferenceAttack(PrivacyAttack): 15 | """A privacy attack that aims to reconstruct a sensitive attribute c given a partial target record T""" 16 | 17 | def __init__(self, PredictionModel, sensitiveAttribute, metadata, quids=None): 18 | """ 19 | Parent class for simple regression attribute inference attack 20 | 21 | :param PredictionModel: object: sklearn-type prediction model 22 | :param sensitiveAttribute: string: name of a column in a DataFrame that is considered the unknown, sensitive attribute 23 | :param metadata: dict: schema for the data to be attacked 24 | :param backgroundKnowledge: pd.DataFrame: adversary's background knowledge dataset 25 | """ 26 | 27 | self.PredictionModel = PredictionModel 28 | self.sensitiveAttribute = sensitiveAttribute 29 | 30 | self.metadata, self.knownAttributes, self.categoricalAttributes, self.nfeatures = self._read_meta(metadata, quids) 31 | 32 | self.ImputerCat = SimpleImputer(strategy='most_frequent') 33 | self.ImputerNum = SimpleImputer(strategy='median') 34 | 35 | self.trained = False 36 | 37 | self.__name__ = f'{self.PredictionModel.__class__.__name__}' 38 | 39 | def attack(self, targetAux, attemptLinkage=False, data=None): 40 | """Makes a guess about the target's secret attribute""" 41 | assert self.trained, 'Attack must first be trained on some data before can predict sensitive target value' 42 | 43 | if attemptLinkage: 44 | assert data is not None, "Need a dataset for linkage attack." 
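            # Linkage shortcut: if the target's combination of categorical attributes is unique
            # in the provided dataset, the sensitive value can be read directly from the matching
            # record; otherwise fall back to the trained prediction model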
45 | try: 46 | groups = data.groupby(self.categoricalAttributes) 47 | targetCats = targetAux[self.categoricalAttributes].values 48 | groupSize = groups.size()[targetCats] 49 | if all(groupSize == 1): 50 | guess = groups.get_group(tuple(targetCats[0]))[self.sensitiveAttribute].values[0] 51 | else: 52 | guess = self._make_guess(targetAux) 53 | except: 54 | guess = self._make_guess(targetAux) 55 | else: 56 | guess = self._make_guess(targetAux) 57 | 58 | return guess 59 | 60 | def _make_guess(self, targetAux): 61 | raise NotImplementedError('Method must be overriden by a subclass') 62 | 63 | def _read_meta(self, metadata, quids): 64 | if quids is None: 65 | quids = [] 66 | 67 | meta_dict = {} 68 | knownAttributes = [] 69 | categoricalAttributes = [] 70 | nfeatures = 0 71 | 72 | for cdict in metadata['columns']: 73 | attr_name = cdict['name'] 74 | data_type = cdict['type'] 75 | 76 | if data_type == FLOAT or data_type == INTEGER: 77 | if attr_name in quids: 78 | cat_bins = cdict['bins'] 79 | cat_labels = [f'({cat_bins[i]},{cat_bins[i+1]}]' for i in range(len(cat_bins)-1)] 80 | 81 | meta_dict[attr_name] = { 82 | 'type': CATEGORICAL, 83 | 'categories': cat_labels, 84 | 'size': len(cat_labels) 85 | } 86 | 87 | nfeatures += len(cat_labels) 88 | 89 | if attr_name != self.sensitiveAttribute: 90 | categoricalAttributes.append(attr_name) 91 | 92 | else: 93 | meta_dict[attr_name] = { 94 | 'type': data_type, 95 | 'min': cdict['min'], 96 | 'max': cdict['max'] 97 | } 98 | 99 | nfeatures += 1 100 | 101 | elif data_type == CATEGORICAL or data_type == ORDINAL: 102 | meta_dict[attr_name] = { 103 | 'type': data_type, 104 | 'categories': cdict['i2s'], 105 | 'size': len(cdict['i2s']) 106 | } 107 | 108 | nfeatures += len(cdict['i2s']) 109 | 110 | if attr_name != self.sensitiveAttribute: 111 | categoricalAttributes.append(attr_name) 112 | 113 | else: 114 | raise ValueError(f'Unknown data type {data_type} for attribute {attr_name}') 115 | 116 | if attr_name != self.sensitiveAttribute: 117 | knownAttributes.append(attr_name) 118 | 119 | return meta_dict, knownAttributes, categoricalAttributes, nfeatures 120 | 121 | def _encode_data(self, data): 122 | dfcopy = data.copy() 123 | for col, cdict in self.metadata.items(): 124 | if col in list(dfcopy): 125 | col_data = dfcopy[col] 126 | if cdict['type'] in [CATEGORICAL, ORDINAL]: 127 | if len(col_data) > len(col_data.dropna()): 128 | col_data = col_data.fillna(FILLNA_VALUE_CAT) 129 | if FILLNA_VALUE_CAT not in cdict['categories']: 130 | col['categories'].append(FILLNA_VALUE_CAT) 131 | col['size'] += 1 132 | 133 | cat = CategoricalDtype(categories=cdict['categories'], ordered=True) 134 | col_data = col_data.astype(cat) 135 | dfcopy[col] = col_data.cat.codes 136 | 137 | return dfcopy.values 138 | 139 | def _impute_missing_values(self, df): 140 | dfImpute = df.copy() 141 | 142 | catCols = [] 143 | numCols = [] 144 | 145 | for attr, col in self.metadata.items(): 146 | if attr in list(dfImpute): 147 | if col['type'] in [CATEGORICAL, ORDINAL]: 148 | catCols.append(attr) 149 | elif col['type'] in NUMERICAL: 150 | numCols.append(attr) 151 | 152 | self.ImputerCat.fit(df[catCols]) 153 | dfImpute[catCols] = self.ImputerCat.transform(df[catCols]) 154 | 155 | self.ImputerNum.fit(df[numCols]) 156 | dfImpute[numCols] = self.ImputerNum.transform(df[numCols]) 157 | 158 | return dfImpute 159 | 160 | def _one_hot(self, col_data, categories): 161 | col_data_onehot = zeros((len(col_data), len(categories))) 162 | cidx = [categories.index(c) for c in col_data] 163 | 
col_data_onehot[arange(len(col_data)), cidx] = 1 164 | 165 | return col_data_onehot 166 | 167 | 168 | class LinRegAttack(AttributeInferenceAttack): 169 | """An AttributeInferenceAttack based on a simple Linear Regression model""" 170 | def __init__(self, sensitiveAttribute, metadata, quids=None): 171 | super().__init__(LinearRegression(fit_intercept=False), sensitiveAttribute, metadata, quids) 172 | 173 | self.scaleFactor = None 174 | self.coefficients = None 175 | self.sigma = None 176 | 177 | 178 | def train(self, data): 179 | """ 180 | Train a MLE attack to reconstruct an unknown sensitive value from a vector of known attributes 181 | :param data: type(DataFrame) A dataset of shape (n, k) 182 | """ 183 | features = self._encode_data(data.drop(self.sensitiveAttribute, axis=1)) 184 | labels = data[self.sensitiveAttribute].values 185 | 186 | n, k = features.shape 187 | 188 | # Center independent variables for better regression performance 189 | self.scaleFactor = mean(features, axis=0) 190 | featuresScaled = features - self.scaleFactor 191 | featuresScaled = concatenate([ones((n, 1)), featuresScaled], axis=1) # append all ones for inclu intercept in beta vector 192 | 193 | # Get MLE for linear coefficients 194 | self.PredictionModel.fit(featuresScaled, labels) 195 | self.coefficients = self.PredictionModel.coef_ 196 | self.sigma = sum((labels - featuresScaled.dot(self.coefficients))**2)/(n-k) 197 | 198 | LOGGER.debug('Finished training regression model') 199 | self.trained = True 200 | 201 | def _make_guess(self, targetAux): 202 | targetFeatures = self._encode_data(targetAux) 203 | targetFeaturesScaled = targetFeatures - self.scaleFactor 204 | targetFeaturesScaled = concatenate([ones((len(targetFeaturesScaled), 1)), targetFeatures], axis=1) 205 | 206 | guess = targetFeaturesScaled.dot(self.coefficients)[0] 207 | 208 | return guess 209 | 210 | def get_likelihood(self, targetAux, targetSensitive, attemptLinkage=False, data=None): 211 | assert self.trained, 'Attack must first be trained on some data before can predict sensitive target value' 212 | 213 | targetFeatures = self._encode_data(targetAux) 214 | targetFeaturesScaled = targetFeatures - self.scaleFactor 215 | targetFeaturesScaled = concatenate([ones((len(targetFeaturesScaled), 1)), targetFeatures], axis=1) 216 | 217 | if attemptLinkage: 218 | assert data is not None, "Need a dataset for linkage attack." 219 | try: 220 | groups = data.groupby(self.categoricalAttributes) 221 | targetCats = targetAux[self.categoricalAttributes].values 222 | groupSize = groups.size()[targetCats] 223 | if all(groupSize == 1): 224 | pCorrect = 1. 
225 | 226 | else: 227 | pdfLikelihood = norm(loc=targetFeaturesScaled.dot(self.coefficients), scale=sqrt(self.sigma)) 228 | pCorrect = pdfLikelihood.pdf(targetSensitive)[0] 229 | 230 | except: 231 | pdfLikelihood = norm(loc=targetFeaturesScaled.dot(self.coefficients), scale=sqrt(self.sigma)) 232 | pCorrect = pdfLikelihood.pdf(targetSensitive)[0] 233 | else: 234 | pdfLikelihood = norm(loc=targetFeaturesScaled.dot(self.coefficients), scale=sqrt(self.sigma)) 235 | pCorrect = pdfLikelihood.pdf(targetSensitive)[0] 236 | 237 | return pCorrect 238 | 239 | 240 | class RandForestAttack(AttributeInferenceAttack): 241 | """An AttributeInferenceAttack based on a simple Linear Regression model""" 242 | def __init__(self, sensitiveAttribute, metadata, quids=None): 243 | super().__init__(RandomForestClassifier(), sensitiveAttribute, metadata, quids) 244 | 245 | self.labels = {l:i for i, l in enumerate(self.metadata[self.sensitiveAttribute]['categories'])} 246 | self.labelsInv = {i:l for l, i in self.labels.items()} 247 | 248 | self.scaleFactor = None 249 | 250 | def train(self, data): 251 | """ 252 | Train a Classifier to reconstruct an unknown sensitive label from a vector of known attributes 253 | :param data: type(DataFrame) A dataset of shape (n, k) 254 | """ 255 | features = self._encode_data(data.drop(self.sensitiveAttribute, axis=1)) 256 | labels = data[self.sensitiveAttribute].apply(lambda x: self.labels[x]).values 257 | 258 | # Feature normalisation 259 | self.scaleFactor = mean(features, axis=0) 260 | featuresScaled = features - self.scaleFactor 261 | 262 | # Get MLE for linear coefficients 263 | self.PredictionModel.fit(featuresScaled, labels) 264 | 265 | LOGGER.debug('Finished training regression model') 266 | self.trained = True 267 | 268 | def _make_guess(self, targetAux): 269 | targetFeatures = self._encode_data(targetAux) 270 | targetFeaturesScaled = targetFeatures - self.scaleFactor 271 | 272 | guess = self.PredictionModel.predict(targetFeaturesScaled) 273 | 274 | return self.labelsInv[guess[0]] 275 | 276 | def get_likelihood(self, targetAux, targetSensitive, attemptLinkage=False, data=None): 277 | assert self.trained, 'Attack must first be trained on some data before can predict sensitive target value' 278 | 279 | targetFeatures = self._encode_data(targetAux) 280 | targetFeaturesScaled = targetFeatures - self.scaleFactor 281 | 282 | if attemptLinkage: 283 | assert data is not None, "Need a dataset for linkage attack." 284 | try: 285 | groups = data.groupby(self.categoricalAttributes) 286 | targetCats = targetAux[self.categoricalAttributes].values 287 | groupSize = groups.size()[targetCats] 288 | if all(groupSize == 1): 289 | pCorrect = 1. 
290 | 291 | else: 292 | probs = self.PredictionModel.predict_proba(targetFeaturesScaled).flatten() 293 | pCorrect = probs[self.labels[targetSensitive]] 294 | 295 | except: 296 | probs = self.PredictionModel.predict_proba(targetFeaturesScaled).flatten() 297 | pCorrect = probs[self.labels[targetSensitive]] 298 | else: 299 | probs = self.PredictionModel.predict_proba(targetFeaturesScaled).flatten() 300 | pCorrect = probs[self.labels[targetSensitive]] 301 | 302 | return pCorrect -------------------------------------------------------------------------------- /data/germancredit.json: -------------------------------------------------------------------------------- 1 | { 2 | "columns": [ 3 | { 4 | "name": "Age", 5 | "min": 19.0, 6 | "max": 75.0, 7 | "type": "Float" 8 | }, 9 | { 10 | "name": "Sex", 11 | "type": "Categorical", 12 | "size": 2, 13 | "i2s": [ 14 | "male", 15 | "female" 16 | ] 17 | }, 18 | { 19 | "name": "Job", 20 | "type": "Ordinal", 21 | "size": 4, 22 | "i2s": [ 23 | "unemployed", 24 | "unskilled", 25 | "skilled", 26 | "management" 27 | ] 28 | }, 29 | { 30 | "name": "Housing", 31 | "type": "Categorical", 32 | "size": 3, 33 | "i2s": [ 34 | "own", 35 | "free", 36 | "rent" 37 | ] 38 | }, 39 | { 40 | "name": "Saving accounts", 41 | "type": "Ordinal", 42 | "i2s": [ 43 | "no_info", 44 | "little", 45 | "moderate", 46 | "quite rich", 47 | "rich" 48 | ], 49 | "size": 5 50 | }, 51 | { 52 | "name": "Checking account", 53 | "type": "Ordinal", 54 | "size": 4, 55 | "i2s": [ 56 | "no_info", 57 | "little", 58 | "moderate", 59 | "rich" 60 | ] 61 | }, 62 | { 63 | "name": "Credit amount", 64 | "type": "Float", 65 | "min": 250.0, 66 | "max": 18424.0 67 | }, 68 | { 69 | "name": "Duration", 70 | "type": "Float", 71 | "min": 4.0, 72 | "max": 72.0 73 | }, 74 | { 75 | "name": "Purpose", 76 | "type": "Categorical", 77 | "size": 8, 78 | "i2s": [ 79 | "radio/TV", 80 | "education", 81 | "furniture/equipment", 82 | "car", 83 | "business", 84 | "domestic appliances", 85 | "repairs", 86 | "vacation/others" 87 | ] 88 | }, 89 | { 90 | "name": "Risk", 91 | "type": "Categorical", 92 | "size": 2, 93 | "i2s": [ 94 | "good", 95 | "bad" 96 | ] 97 | } 98 | ] 99 | } 100 | -------------------------------------------------------------------------------- /data/texas.json: -------------------------------------------------------------------------------- 1 | { 2 | "columns": [ 3 | { 4 | "name": "DISCHARGE", 5 | "type": "Categorical", 6 | "size": 9, 7 | "i2s": [ 8 | "2013Q4", 9 | "2013Q1", 10 | "2013Q3", 11 | "2013Q2", 12 | "2012Q4", 13 | "2014Q4", 14 | "2014Q1", 15 | "2014Q2", 16 | "2014Q3" 17 | ] 18 | }, 19 | { 20 | "name": "TYPE_OF_ADMISSION", 21 | "type": "Categorical", 22 | "size": 7, 23 | "i2s": [ 24 | "3", 25 | "4", 26 | "1", 27 | "2", 28 | "9", 29 | "5", 30 | "INVALID" 31 | ] 32 | }, 33 | { 34 | "name": "PAT_STATE", 35 | "type": "Categorical", 36 | "size": 9, 37 | "i2s": [ 38 | "TX", 39 | "NM", 40 | "AR", 41 | "ZZ", 42 | "OK", 43 | "FC", 44 | "LA", 45 | "XX", 46 | "INVALID" 47 | ] 48 | }, 49 | { 50 | "name": "PAT_STATUS", 51 | "type": "Categorical", 52 | "size": 23, 53 | "i2s": [ 54 | "6", 55 | "1", 56 | "3", 57 | "2", 58 | "51", 59 | "62", 60 | "50", 61 | "20", 62 | "65", 63 | "5", 64 | "63", 65 | "7", 66 | "9", 67 | "4", 68 | "INVALID", 69 | "66", 70 | "61", 71 | "64", 72 | "30", 73 | "43", 74 | "8", 75 | "41", 76 | "40" 77 | ] 78 | }, 79 | { 80 | "name": "SEX_CODE", 81 | "type": "Categorical", 82 | "size": 4, 83 | "i2s": [ 84 | "F", 85 | "M", 86 | "U", 87 | "INVALID" 88 | ] 89 | }, 90 | { 91 | "name": "RACE", 92 | 
"type": "Categorical", 93 | "size": 6, 94 | "i2s": [ 95 | "4", 96 | "5", 97 | "3", 98 | "2", 99 | "INVALID", 100 | "1" 101 | ] 102 | }, 103 | { 104 | "name": "ETHNICITY", 105 | "type": "Categorical", 106 | "size": 3, 107 | "i2s": [ 108 | "2", 109 | "1", 110 | "INVALID" 111 | ] 112 | }, 113 | { 114 | "name": "ADMIT_WEEKDAY", 115 | "type": "Categorical", 116 | "size": 7, 117 | "i2s": [ 118 | "3", 119 | "4", 120 | "1", 121 | "2", 122 | "7", 123 | "5", 124 | "6" 125 | ] 126 | }, 127 | { 128 | "name": "PAT_AGE", 129 | "type": "Ordinal", 130 | "size": 23, 131 | "i2s": [ 132 | "00", 133 | "01", 134 | "02", 135 | "03", 136 | "04", 137 | "05", 138 | "06", 139 | "07", 140 | "08", 141 | "09", 142 | "10", 143 | "11", 144 | "12", 145 | "13", 146 | "14", 147 | "15", 148 | "16", 149 | "17", 150 | "18", 151 | "19", 152 | "20", 153 | "21", 154 | "INVALID" 155 | ] 156 | }, 157 | { 158 | "name": "RISK_MORTALITY", 159 | "type": "Ordinal", 160 | "size": 5, 161 | "i2s": [ 162 | "0", 163 | "1", 164 | "2", 165 | "3", 166 | "4" 167 | ] 168 | }, 169 | { 170 | "name": "ILLNESS_SEVERITY", 171 | "type": "Ordinal", 172 | "size": 5, 173 | "i2s": [ 174 | "0", 175 | "1", 176 | "2", 177 | "3", 178 | "4" 179 | ] 180 | }, 181 | { 182 | "name": "LENGTH_OF_STAY", 183 | "type": "Integer", 184 | "min": 1, 185 | "max": 986 186 | }, 187 | { 188 | "name": "TOTAL_CHARGES", 189 | "type": "Float", 190 | "min": 0.0, 191 | "max": 3293072.0 192 | }, 193 | { 194 | "name": "TOTAL_NON_COV_CHARGES", 195 | "type": "Float", 196 | "min": 0.0, 197 | "max": 969641.0 198 | }, 199 | { 200 | "name": "TOTAL_CHARGES_ACCOMM", 201 | "type": "Float", 202 | "min": 0.0, 203 | "max": 974433.0 204 | }, 205 | { 206 | "name": "TOTAL_NON_COV_CHARGES_ACCOMM", 207 | "type": "Float", 208 | "min": 0.0, 209 | "max": 412751.0 210 | }, 211 | { 212 | "name": "TOTAL_CHARGES_ANCIL", 213 | "type": "Float", 214 | "min": 0.0, 215 | "max": 2994631.0 216 | }, 217 | { 218 | "name": "TOTAL_NON_COV_CHARGES_ANCIL", 219 | "type": "Float", 220 | "min": 0.0, 221 | "max": 642921.0 222 | } 223 | ] 224 | } 225 | -------------------------------------------------------------------------------- /docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvcr.io/nvidia/cuda:11.4.2-cudnn8-devel-ubuntu20.04 2 | 3 | RUN export DEBIAN_FRONTEND="noninteractive" && \ 4 | apt-get update && \ 5 | apt-get upgrade -y && \ 6 | apt-get autoremove -y && \ 7 | apt-get install --no-install-recommends -y \ 8 | cm-super \ 9 | cython3 \ 10 | dvipng \ 11 | git \ 12 | libfreetype6-dev \ 13 | pkgconf \ 14 | python3-dev \ 15 | python3-pip \ 16 | texlive \ 17 | texlive-latex-extra \ 18 | && \ 19 | apt-get clean -y && \ 20 | rm -rf /var/lib/apt/lists/* 21 | 22 | COPY ./requirements.txt /requirements.txt 23 | 24 | RUN cd / && \ 25 | python3 -m pip install --upgrade pip && \ 26 | python3 -m pip install --upgrade wheel && \ 27 | python3 -m pip install numpy==1.19.5 && \ 28 | python3 -m pip install -r requirements.txt 29 | 30 | RUN cd / && \ 31 | git clone https://github.com/spring-epfl/CTGAN.git && \ 32 | git clone https://github.com/spring-epfl/synthetic_data_release.git && \ 33 | cd CTGAN && \ 34 | python3 -m pip install . 
35 | 36 | ENV PYTHONPATH "${PYTHONPATH}:/CTGAN" 37 | 38 | ENTRYPOINT ["/bin/bash"] 39 | -------------------------------------------------------------------------------- /docker/requirements.txt: -------------------------------------------------------------------------------- 1 | husl==4.0.3 2 | loguru==0.5.3 3 | matplotlib==3.4.3 4 | notebook==6.4.6 5 | palettable==3.3.0 6 | pandas==0.25.3 7 | scipy==1.7.1 8 | seaborn==0.11.2 9 | sklearn==0.0 10 | tensorflow==2.6.0 11 | torch==1.9.1 12 | -------------------------------------------------------------------------------- /executables/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Module containing executable scripts. 4 | 5 | ----- 6 | Nampoina Andriamilanto 7 | """ 8 | -------------------------------------------------------------------------------- /executables/generate_metadata_file.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Generate the json metadata file given a dataset in csv format. 4 | 5 | Please set the two global variables IMPLICIT_ORDINAL_ATTRIBUTES and 6 | EXPLICIT_ORDINAL_ATTRIBUTES to correspond to the dataset that you use. 7 | 8 | Great care should be taken when using this script to infer the type and the 9 | domain of the attributes as it relies on the dataset that is given in 10 | parameter. 11 | 12 | 13 | usage: generate_metadata_file.py [-h] --dataset DATASET [--output OUTPUT] 14 | 15 | optional arguments: 16 | -h, --help show this help message and exit 17 | --dataset DATASET, -i DATASET 18 | Path to the dataset in csv format 19 | --output OUTPUT, -o OUTPUT 20 | Path where to write the json metadata file 21 | 22 | ----- 23 | Nampoina Andriamilanto 24 | """ 25 | 26 | import json 27 | from argparse import ArgumentParser 28 | from pathlib import Path 29 | from typing import Set 30 | 31 | import numpy as np 32 | import pandas as pd 33 | from loguru import logger 34 | 35 | from utils.constants import (CATEGORICAL, FLOAT, INTEGER, NUMERICAL, ORDINAL) 36 | from utils.utils import json_numpy_serialzer 37 | 38 | # Please define the set of the ordinal attributes which values can be 39 | # automatically sorted (using the sorted() python function) 40 | IMPLICIT_ORDINAL_ATTRIBUTES = {'age'} 41 | 42 | # Please define the set of the ordinal attributes which values are ordered 43 | # manually 44 | EXPLICIT_ORDINAL_ATTRIBUTES = { 45 | 'education': ['Preschool', '1st-4th', '5th-6th', '7th-8th', '9th', '10th', 46 | '11th', '12th', 'HS-grad', 'Prof-school', 'Assoc-acdm', 47 | 'Assoc-voc', 'Some-college', 'Bachelors', 'Masters', 48 | 'Doctorate']} 49 | 50 | ORDINAL_ATTRIBUTES = IMPLICIT_ORDINAL_ATTRIBUTES.union( 51 | set(EXPLICIT_ORDINAL_ATTRIBUTES.keys())) 52 | 53 | OUTPUT_FILE_SUFFIX = '.json' 54 | JSON_SPACE_INDENT = 2 55 | 56 | 57 | def main(): 58 | """Generate the json metadata file.""" 59 | # Parse the arguments 60 | argparser = ArgumentParser() 61 | argparser.add_argument('--dataset', '-i', type=str, required=True, 62 | help='Path to the dataset in csv format') 63 | argparser.add_argument('--output', '-o', type=str, 64 | help='Path where to write the json metadata file') 65 | args = argparser.parse_args() 66 | 67 | # Load the dataset 68 | logger.info(f'Loading the data from {args.dataset}') 69 | dataset_path = Path(args.dataset) 70 | dataset = pd.read_csv(dataset_path, header=0) 71 | logger.debug(f'Sample of the loaded dataset:\n{dataset}') 72 | dataset.info() 73 | 74 | # 
Generate the metadata of each attribute 75 | logger.info('Generating the metadata of the attributes') 76 | attributes = [] 77 | for column in dataset.columns: 78 | # Get the numpy type of the column 79 | numpy_type = dataset[column].dtype 80 | logger.debug(f'{column} has the numpy type {numpy_type}') 81 | 82 | # Infer its type among (Integer, Float, Ordinal, Categorical) 83 | inferred_type = infer_type(column, numpy_type, ORDINAL_ATTRIBUTES) 84 | column_infos = {'name': column, 'type': inferred_type} 85 | logger.debug(column_infos) 86 | 87 | # If the type is numerical, set the min and max value 88 | if inferred_type in NUMERICAL: 89 | column_infos['min'] = dataset[column].min() 90 | column_infos['max'] = dataset[column].max() 91 | else: 92 | # If the type is explicitely ordinal, we retrieve its ordered 93 | # values which are set manually in EXPLICIT_ORDINAL_ATTRIBUTES. 94 | # Otherwise (implicit ordinal or categorical), we get the sorted 95 | # list of values from the dataset (the second parameter of get()). 96 | ordered_values = EXPLICIT_ORDINAL_ATTRIBUTES.get( 97 | column, sorted(dataset[column].unique())) 98 | column_infos['size'] = len(ordered_values) 99 | 100 | # If the values are numbers, we cast them to strings as the 101 | # metadata configuration files seem to have the values of ordinal 102 | # and categorical attributes specified as strings 103 | if isinstance(ordered_values[0], np.number): 104 | ordered_values = [str(value) for value in ordered_values] 105 | 106 | column_infos['i2s'] = ordered_values 107 | 108 | attributes.append(column_infos) 109 | 110 | # Write the json metadata file 111 | if args.output: 112 | output_path = args.output 113 | else: 114 | output_path = dataset_path.with_name( 115 | dataset_path.stem + OUTPUT_FILE_SUFFIX) 116 | logger.info(f'Writting the metadata to {output_path}') 117 | 118 | with open(output_path, 'w+') as json_output_file: 119 | json.dump({'columns': attributes}, json_output_file, 120 | indent=JSON_SPACE_INDENT, default=json_numpy_serialzer) 121 | 122 | 123 | def infer_type(column: str, numpy_type: str, ordinal_attributes: Set[str] 124 | ) -> str: 125 | """Infer the type of an attribute given its numpy type. 126 | 127 | Args: 128 | column: The name of the column. 129 | numpy_type: The numpy type of the column. 130 | ordinal_attributes: The set of the ordinal attributes. 131 | """ 132 | if column in ordinal_attributes: 133 | return ORDINAL 134 | if np.issubdtype(numpy_type, np.integer): 135 | return INTEGER 136 | if np.issubdtype(numpy_type, np.floating): 137 | return FLOAT 138 | return CATEGORICAL 139 | 140 | 141 | if __name__ == "__main__": 142 | main() 143 | -------------------------------------------------------------------------------- /executables/generate_synthetic_dataset.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Launcher to process the PrivBayes mechanism. 4 | 5 | This script is an adaptation of the execution scripts from 6 | https://github.com/spring-epfl/synthetic_data_release. 
7 | 8 | ----- 9 | Nampoina Andriamilanto 10 | """ 11 | 12 | from argparse import ArgumentParser 13 | from ast import literal_eval 14 | from pathlib import Path 15 | from warnings import simplefilter 16 | simplefilter('ignore', category=FutureWarning) 17 | simplefilter('ignore', category=DeprecationWarning) 18 | 19 | from loguru import logger 20 | 21 | from generative_models.ctgan import CTGAN 22 | from generative_models.data_synthesiser import ( 23 | IndependentHistogram, BayesianNet, PrivBayes) 24 | from generative_models.pate_gan import PATEGAN 25 | from utils.datagen import load_s3_data_as_df, load_local_data_as_df 26 | 27 | 28 | DEFAULT_SAMPLE_SIZE = 1000 29 | 30 | 31 | def main(): 32 | """Execute the PrivBayes mechanism.""" 33 | # Parse the arguments 34 | argparser = ArgumentParser() 35 | datasource = argparser.add_mutually_exclusive_group() 36 | datasource.add_argument('--s3name', '-S3', type=str, choices=[ 37 | 'adult', 'census', 'credit', 'alarm', 'insurance'], 38 | help='Name of the dataset to run on') 39 | datasource.add_argument('--datapath', '-D', type=str, 40 | help='Path to a local data file') 41 | argparser.add_argument('--mechanism', '-M', type=str, choices=[ 42 | 'IndependentHistogram', 'BayesianNet', 'PrivBayes', 'CTGAN', 'PATEGAN' 43 | ], default='PrivBayes', help='The mechanism to use') 44 | argparser.add_argument('--parameters', '-P', type=str, default=None, 45 | help='The parameters of the mechanism to use ' 46 | 'separated by a colon') 47 | argparser.add_argument('--output-file', '-O', type=str, 48 | help='The file where to store the synthetic dataset' 49 | ) 50 | argparser.add_argument('--sample-size', '-N', type=int, 51 | default=DEFAULT_SAMPLE_SIZE, 52 | help='The size of the synthetic dataset') 53 | args = argparser.parse_args() 54 | 55 | # Load data 56 | if args.s3name: 57 | raw_pop, metadata = load_s3_data_as_df(args.s3name) 58 | dname = args.s3name 59 | elif args.datapath: 60 | raw_pop, metadata = load_local_data_as_df(Path(args.datapath)) 61 | dname = args.datapath.split('/')[-1] 62 | else: 63 | raise ValueError('Please provide a dataset') 64 | logger.info(f'Loaded data {dname}:\n{raw_pop}') 65 | logger.info(f'Loaded the corresponding metadata: {metadata}') 66 | 67 | # Initialize the mechanism 68 | parameters = [] 69 | if args.parameters: 70 | parameters = [literal_eval(param) 71 | for param in args.parameters.split(',')] 72 | logger.debug(f'Parameters: {parameters}') 73 | 74 | # IndependentHistogram parameters: 75 | # histogram_bins=10, infer_ranges=False, multiprocess=True 76 | if args.mechanism == 'IndependentHistogram': 77 | mechanism = IndependentHistogram(metadata, *parameters) 78 | 79 | # BayesianNet parameters: 80 | # histogram_bins=10, degree=1, infer_ranges=False, multiprocess=True, 81 | # seed=None 82 | elif args.mechanism == 'BayesianNet': 83 | mechanism = BayesianNet(metadata, *parameters) 84 | 85 | # PrivBayes parameters: 86 | # histogram_bins=10, degree=1, epsilon=.1, infer_ranges=False, 87 | # multiprocess=True, seed=None 88 | elif args.mechanism == 'PrivBayes': 89 | mechanism = PrivBayes(metadata, *parameters) 90 | 91 | # CTGAN parameters: 92 | # embedding_dim=128, gen_dim=(256, 256), dis_dim=(256, 256), l2scale=1e-6, 93 | # batch_size=500, epochs=300, multiprocess=False 94 | elif args.mechanism == 'CTGAN': 95 | mechanism = CTGAN(metadata, *parameters) 96 | 97 | # PATEGAN parameters: 98 | # eps=1, delta=1e-5, infer_ranges=False, num_teachers=10, n_iters=100, 99 | # batch_size=128, learning_rate=1e-4, multiprocess=False 100 | elif 
args.mechanism == 'PATEGAN': 101 | mechanism = PATEGAN(metadata, *parameters) 102 | 103 | # Unknown mechanism 104 | else: 105 | raise ValueError(f'Unknown mechanism {args.mechanism}') 106 | 107 | # Set the output path 108 | output_path = Path(f'{mechanism.__name__}.csv') 109 | if args.output_file: 110 | output_path = Path(args.output_file) 111 | 112 | # Generate the synthetic data 113 | logger.info('Generating the synthetic data, this can take time...') 114 | mechanism.fit(raw_pop) 115 | mechanism.generate_samples(args.sample_size).to_csv(output_path) 116 | 117 | 118 | if __name__ == "__main__": 119 | main() 120 | -------------------------------------------------------------------------------- /feature_sets/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spring-epfl/synthetic_data_release/eba9a43c75a64110b63d79e58c83859634b7339c/feature_sets/__init__.py -------------------------------------------------------------------------------- /feature_sets/bayes.py: -------------------------------------------------------------------------------- 1 | """A set of features that a Bayesian Net model is expected to extract from the raw data""" 2 | from pandas import DataFrame, get_dummies 3 | from pandas.api.types import CategoricalDtype 4 | from numpy import ndarray, all, corrcoef, concatenate, nan_to_num, zeros_like, triu_indices_from 5 | from itertools import combinations 6 | 7 | from utils.constants import * 8 | from utils.logging import LOGGER 9 | from feature_sets.feature_set import FeatureSet 10 | from feature_sets.independent_histograms import HistogramFeatureSet 11 | 12 | 13 | class CorrelationsFeatureSet(FeatureSet): 14 | def __init__(self, datatype, metadata, quids=None): 15 | assert datatype in [DataFrame, ndarray], 'Unknown data type {}'.format(datatype) 16 | self.datatype = datatype 17 | self.nfeatures = 0 18 | 19 | self.cat_attributes = [] 20 | self.num_attributes = [] 21 | 22 | self.category_codes = {} 23 | 24 | if quids is None: 25 | quids = [] 26 | 27 | for cdict in metadata['columns']: 28 | attr_name = cdict['name'] 29 | dtype = cdict['type'] 30 | 31 | if dtype == FLOAT or dtype == INTEGER: 32 | if attr_name not in quids: 33 | self.num_attributes.append(attr_name) 34 | else: 35 | self.cat_attributes.append(attr_name) 36 | cat_bins = cdict['bins'] 37 | cat_labels = [f'({cat_bins[i]},{cat_bins[i+1]}]' for i in range(len(cat_bins)-1)] 38 | self.category_codes[attr_name] = cat_labels 39 | self.nfeatures += len(cat_labels) 40 | 41 | elif dtype == CATEGORICAL or dtype == ORDINAL: 42 | self.cat_attributes.append(attr_name) 43 | self.category_codes[attr_name] = cdict['i2s'] 44 | self.nfeatures += len(cdict['i2s']) 45 | 46 | LOGGER.debug(f'Feature set will have length {self.nfeatures}') 47 | 48 | self.__name__ = 'Correlations' 49 | 50 | def extract(self, data, flatten=True): 51 | assert isinstance(data, self.datatype), f'Feature extraction expects {self.datatype} as input type' 52 | 53 | assert all([c in list(data) for c in self.cat_attributes]), 'Missing some categorical attributes in input data' 54 | assert all([c in list(data) for c in self.num_attributes]), 'Missing some numerical attributes in input data' 55 | 56 | encoded = data[self.num_attributes].copy() 57 | for c in self.cat_attributes: 58 | col = data[c] 59 | col = col.astype(CategoricalDtype(categories=self.category_codes[c], ordered=True)) 60 | encoded = encoded.merge(get_dummies(col, drop_first=True, prefix=c), left_index=True, right_index=True) 61 | 
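        # Pairwise correlation matrix of the encoded attributes; only the upper
        # triangle (k=1, i.e. excluding the diagonal) is kept so that each attribute
        # pair contributes exactly one feature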
62 | col_names = list(encoded) 63 | self.feature_names = list(combinations(col_names, r=2)) 64 | 65 | corr = encoded.corr().fillna(0).values 66 | 67 | mask = zeros_like(corr).astype(bool) 68 | mask[triu_indices_from(mask, k=1)] = True 69 | 70 | if flatten: 71 | features = corr[mask].flatten() 72 | else: 73 | features = corr 74 | 75 | return features 76 | 77 | 78 | class BayesFeatureSet(FeatureSet): 79 | def __init__(self, datatype, metadata, nbins=10, quids=None): 80 | assert datatype in [DataFrame, ndarray], 'Unknown data type {}'.format(datatype) 81 | self.datatype = datatype 82 | 83 | self.histograms = HistogramFeatureSet(datatype, metadata, nbins, quids) 84 | self.correlations = CorrelationsFeatureSet(datatype, metadata, quids) 85 | 86 | def extract(self, data): 87 | Hist = self.histograms.extract(data) 88 | Corr = self.correlations.extract(data) 89 | 90 | return concatenate([Hist, Corr]) 91 | -------------------------------------------------------------------------------- /feature_sets/feature_set.py: -------------------------------------------------------------------------------- 1 | """A parent class for all feature extraction layers""" 2 | 3 | 4 | class FeatureSet(object): 5 | def extract(self, data): 6 | return NotImplementedError('Method needs to be overwritten by subclass') -------------------------------------------------------------------------------- /feature_sets/independent_histograms.py: -------------------------------------------------------------------------------- 1 | from pandas import DataFrame 2 | from numpy import ndarray, array, linspace, all 3 | from pandas.api.types import CategoricalDtype 4 | 5 | from feature_sets.feature_set import FeatureSet 6 | from utils.logging import LOGGER 7 | from utils.constants import * 8 | 9 | from warnings import filterwarnings 10 | filterwarnings('ignore', message=r"Parsing", category=FutureWarning) 11 | 12 | 13 | class HistogramFeatureSet(FeatureSet): 14 | def __init__(self, datatype, metadata, nbins=10, quids=None): 15 | assert datatype in [DataFrame], 'Unknown data type {}'.format(datatype) 16 | self.datatype = datatype 17 | self.nfeatures = 0 18 | 19 | self.cat_attributes = [] 20 | self.num_attributes = [] 21 | 22 | self.histogram_bins = {} 23 | self.category_codes = {} 24 | 25 | if quids is None: 26 | quids = [] 27 | 28 | for cdict in metadata['columns']: 29 | attr_name = cdict['name'] 30 | dtype = cdict['type'] 31 | 32 | if dtype == FLOAT or dtype == INTEGER: 33 | if attr_name not in quids: 34 | self.num_attributes.append(attr_name) 35 | self.histogram_bins[attr_name] = linspace(cdict['min'], cdict['max'], nbins+1) 36 | self.nfeatures += nbins 37 | else: 38 | self.cat_attributes.append(attr_name) 39 | cat_bins = cdict['bins'] 40 | cat_labels = [f'({cat_bins[i]},{cat_bins[i+1]}]' for i in range(len(cat_bins)-1)] 41 | self.category_codes[attr_name] = cat_labels 42 | self.nfeatures += len(cat_labels) 43 | 44 | elif dtype == CATEGORICAL or dtype == ORDINAL: 45 | self.cat_attributes.append(attr_name) 46 | self.category_codes[attr_name] = cdict['i2s'] 47 | self.nfeatures += len(cdict['i2s']) 48 | 49 | LOGGER.debug(f'Feature set will have length {self.nfeatures}') 50 | 51 | self.__name__ = 'Histogram' 52 | 53 | def extract(self, data): 54 | assert isinstance(data, self.datatype), f'Feature extraction expects {self.datatype} as input type' 55 | 56 | assert all([c in list(data) for c in self.cat_attributes]), 'Missing some categorical attributes in input data' 57 | assert all([c in list(data) for c in self.num_attributes]), 'Missing 
some numerical attributes in input data' 58 | 59 | features = [] 60 | for attr in self.num_attributes: 61 | col = data[attr] 62 | F = col.value_counts(bins=self.histogram_bins[attr]).values 63 | features.extend(F.tolist()) 64 | 65 | for attr in self.cat_attributes: 66 | col = data[attr] 67 | col = col.astype(CategoricalDtype(categories=self.category_codes[attr], ordered=True)) 68 | F = col.value_counts().loc[self.category_codes[attr]].values 69 | features.extend(F.tolist()) 70 | 71 | assert len(features) == self.nfeatures, f'Expected number of features is {self.nfeatures} but found {len(features)}' 72 | 73 | return array(features) 74 | 75 | def _get_names(self): 76 | feature_names = [] 77 | for attr in self.num_attributes: 78 | bins = self.histogram_bins[attr] 79 | feature_names.extend([f'{attr}({int(bins[i-1])},{int(bins[i])}]' for i in range(1,len(bins))]) 80 | 81 | for attr in self.cat_attributes: 82 | feature_names.extend([f'{attr}_{c}' for c in self.category_codes[attr]]) 83 | 84 | return feature_names 85 | 86 | -------------------------------------------------------------------------------- /feature_sets/model_agnostic.py: -------------------------------------------------------------------------------- 1 | """A simple feature extraction layer for data with a mix of categorical and numerical attributes""" 2 | from os import path 3 | from pandas import DataFrame 4 | from numpy import ndarray, nanmean, nanmedian, nanvar, array, concatenate 5 | from pandas.api.types import is_numeric_dtype, CategoricalDtype 6 | 7 | from utils.logging import LOGGER 8 | from feature_sets.feature_set import FeatureSet 9 | from feature_sets.independent_histograms import HistogramFeatureSet 10 | from feature_sets.bayes import CorrelationsFeatureSet 11 | 12 | from warnings import filterwarnings 13 | filterwarnings('ignore', message=r"Parsing", category=FutureWarning) 14 | 15 | 16 | class NaiveFeatureSet(FeatureSet): 17 | def __init__(self, datatype): 18 | self.datatype = datatype 19 | self.attributes = None 20 | self.category_codes = {} 21 | assert self.datatype in [DataFrame, ndarray], 'Unknown data type {}'.format(datatype) 22 | 23 | self.__name__ = 'Naive' 24 | 25 | def extract(self, data): 26 | if self.datatype is DataFrame: 27 | assert isinstance(data, DataFrame), 'Feature extraction expects DataFrame as input' 28 | if self.attributes is not None: 29 | if bool(set(list(data)).difference(set(self.attributes))): 30 | raise ValueError('Data to filter does not match expected schema') 31 | else: 32 | self.attributes = list(data) 33 | features = DataFrame(columns=self.attributes) 34 | for c in self.attributes: 35 | col = data[c] 36 | if is_numeric_dtype(col): 37 | features[c] = [col.mean(), col.median(), col.var()] 38 | else: 39 | if c in self.category_codes.keys(): 40 | new_cats = set(col.astype('category').cat.categories).difference(set(self.category_codes[c])) 41 | self.category_codes[c] += list(new_cats) 42 | col = col.astype(CategoricalDtype(categories=self.category_codes[c])) 43 | else: 44 | col = col.astype('category') 45 | self.category_codes[c] = list(col.cat.categories) 46 | counts = list(col.cat.codes.value_counts().index) 47 | features[c] = [counts[0], counts[-1], len(counts)] 48 | features = features.values 49 | 50 | elif self.datatype is ndarray: 51 | assert isinstance(data, ndarray), 'Feature extraction expects ndarray as input' 52 | features = array([nanmean(data), nanmedian(data), nanvar(data)]) 53 | else: 54 | raise ValueError(f'Unknown data type {type(data)}') 55 | 56 | return 
features.flatten() 57 | 58 | 59 | class EnsembleFeatureSet(FeatureSet): 60 | """An ensemble of features that is not model specific""" 61 | def __init__(self, datatype, metadata, nbins=10, quasi_id_cols=None): 62 | assert datatype in [DataFrame, ndarray], 'Unknown data type {}'.format(datatype) 63 | self.datatype = datatype 64 | 65 | self.naive = NaiveFeatureSet(datatype) 66 | self.histograms = HistogramFeatureSet(datatype, metadata, nbins=nbins, quids=quasi_id_cols) 67 | self.correlations = CorrelationsFeatureSet(datatype, metadata, quids=quasi_id_cols) 68 | 69 | self.__name__ = 'Ensemble' 70 | 71 | def extract(self, data): 72 | F_naive = self.naive.extract(data) 73 | F_hist = self.histograms.extract(data) 74 | F_corr = self.correlations.extract(data) 75 | 76 | return concatenate([F_naive, F_hist, F_corr]) 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | -------------------------------------------------------------------------------- /generative_models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spring-epfl/synthetic_data_release/eba9a43c75a64110b63d79e58c83859634b7339c/generative_models/__init__.py -------------------------------------------------------------------------------- /generative_models/ctgan.py: -------------------------------------------------------------------------------- 1 | from pandas import DataFrame 2 | 3 | from utils.logging import LOGGER 4 | 5 | from generative_models.generative_model import GenerativeModel 6 | from ctgan import CTGANSynthesizer 7 | 8 | 9 | class CTGAN(GenerativeModel): 10 | """A conditional generative adversarial network for tabular data""" 11 | def __init__(self, metadata, 12 | embedding_dim=128, gen_dim=(256, 256), 13 | dis_dim=(256, 256), l2scale=1e-6, 14 | batch_size=500, epochs=300, 15 | multiprocess=False): 16 | 17 | self.synthesiser = CTGANSynthesizer(embedding_dim, gen_dim, dis_dim, 18 | l2scale, batch_size, epochs) 19 | 20 | self.metadata = metadata 21 | self.datatype = DataFrame 22 | 23 | self.multiprocess = bool(multiprocess) 24 | 25 | self.infer_ranges = True 26 | self.trained = False 27 | 28 | self.__name__ = 'CTGAN' 29 | 30 | def fit(self, data): 31 | """Train a generative adversarial network on tabular data. 32 | Input data is assumed to be of shape (n_samples, n_features) 33 | See https://github.com/DAI-Lab/SDGym for details""" 34 | assert isinstance(data, self.datatype), f'{self.__class__.__name__} expects {self.datatype} as input data but got {type(data)}' 35 | 36 | LOGGER.debug(f'Start fitting {self.__class__.__name__} to data of shape {data.shape}...') 37 | self.synthesiser.fit(data, self.metadata) 38 | 39 | LOGGER.debug(f'Finished fitting') 40 | self.trained = True 41 | 42 | def generate_samples(self, nsamples): 43 | """Generate random samples from the fitted Gaussian distribution""" 44 | assert self.trained, "Model must first be fitted to some data." 
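        # The wrapped CTGANSynthesizer draws nsamples records from the trained GAN
        # generator and decodes them back into the original tabular schema, so the
        # result can be written out directly as a DataFrame of synthetic records.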
45 | 46 | LOGGER.debug(f'Generate synthetic dataset of size {nsamples}') 47 | synthetic_data = self.synthesiser.sample(nsamples) 48 | 49 | return synthetic_data 50 | -------------------------------------------------------------------------------- /generative_models/data_synthesiser.py: -------------------------------------------------------------------------------- 1 | """Generative models adapted from https://github.com/DataResponsibly/DataSynthesizer""" 2 | # Copyright <2018> 3 | 4 | from numpy.random import seed, laplace, choice 5 | from pandas import DataFrame, merge 6 | from itertools import product 7 | 8 | from generative_models.data_synthesiser_utils.datatypes.FloatAttribute import FloatAttribute 9 | from generative_models.data_synthesiser_utils.datatypes.IntegerAttribute import IntegerAttribute 10 | from generative_models.data_synthesiser_utils.datatypes.StringAttribute import StringAttribute 11 | from generative_models.data_synthesiser_utils.utils import bayes_worker, normalize_given_distribution, exponential_mechanism 12 | 13 | from generative_models.generative_model import GenerativeModel 14 | 15 | from utils.constants import * 16 | from utils.logging import LOGGER 17 | 18 | 19 | class IndependentHistogram(GenerativeModel): 20 | 21 | def __init__(self, metadata, histogram_bins=10, infer_ranges=False, multiprocess=True): 22 | self.metadata = self._read_meta(metadata) 23 | self.histogram_bins = histogram_bins 24 | 25 | self.datatype = DataFrame 26 | self.multiprocess = bool(multiprocess) 27 | self.infer_ranges = bool(infer_ranges) 28 | 29 | self.DataDescriber = None 30 | 31 | self.trained = False 32 | 33 | self.__name__ = 'IndependentHistogram' 34 | 35 | def fit(self, data): 36 | assert isinstance(data, self.datatype), f'{self.__class__.__name__} expects {self.datatype} as input data but got {type(data)}' 37 | LOGGER.debug(f'Start fitting IndependentHistogram model to data of shape {data.shape}...') 38 | if self.trained: 39 | self.trained = False 40 | self.DataDescriber = None 41 | 42 | self.DataDescriber = DataDescriber(self.metadata, self.histogram_bins, self.infer_ranges) 43 | self.DataDescriber.describe(data) 44 | LOGGER.debug(f'Finished fitting IndependentHistogram') 45 | self.trained = True 46 | 47 | def generate_samples(self, nsamples): 48 | assert self.trained, "Model must be fitted to some data first" 49 | 50 | LOGGER.debug(f'Generate synthetic dataset of size {nsamples}') 51 | synthetic_dataset = DataFrame(columns=self.DataDescriber.attr_names) 52 | for attr_name, Attr in self.DataDescriber.attr_dict.items(): 53 | binning_indices = Attr.sample_binning_indices_in_independent_attribute_mode(nsamples) 54 | synthetic_dataset[attr_name] = Attr.sample_values_from_binning_indices(binning_indices) 55 | 56 | LOGGER.debug(f'Generated synthetic dataset of size {nsamples}') 57 | return synthetic_dataset 58 | 59 | def _read_meta(self, metadata): 60 | """ Read metadata from metadata file.""" 61 | metadict = {} 62 | 63 | for cdict in metadata['columns']: 64 | col = cdict['name'] 65 | coltype = cdict['type'] 66 | 67 | if coltype == FLOAT or coltype == INTEGER: 68 | metadict[col] = { 69 | 'type': coltype, 70 | 'min': cdict['min'], 71 | 'max': cdict['max'] 72 | } 73 | 74 | elif coltype == CATEGORICAL or coltype == ORDINAL: 75 | metadict[col] = { 76 | 'type': coltype, 77 | 'categories': cdict['i2s'], 78 | 'size': len(cdict['i2s']) 79 | } 80 | 81 | else: 82 | raise ValueError(f'Unknown data type {coltype} for attribute {col}') 83 | 84 | return metadict 85 | 86 | 87 | class 
BayesianNet(GenerativeModel): 88 | """ 89 | A BayesianNet model using non-private GreedyBayes to learn conditional probabilities 90 | """ 91 | def __init__(self, metadata, histogram_bins=10, degree=1, infer_ranges=False, multiprocess=True, seed=None): 92 | self.metadata = self._read_meta(metadata) 93 | self.histogram_bins = histogram_bins 94 | self.degree = degree 95 | self.num_attributes = len(metadata['columns']) 96 | 97 | self.multiprocess = bool(multiprocess) 98 | self.infer_ranges = bool(infer_ranges) 99 | self.seed = seed 100 | self.datatype = DataFrame 101 | 102 | self.bayesian_network = None 103 | self.conditional_probabilities = None 104 | self.DataDescriber = None 105 | self.trained = False 106 | 107 | self.__name__ = 'BayesianNet' 108 | 109 | def fit(self, data): 110 | assert isinstance(data, self.datatype), f'{self.__class__.__name__} expects {self.datatype} as input data but got {type(data)}' 111 | assert len(list(data)) >= 2, "BayesianNet requires at least 2 attributes(i.e., columns) in dataset." 112 | LOGGER.debug(f'Start training BayesianNet on data of shape {data.shape}...') 113 | if self.trained: 114 | self.trained = False 115 | self.DataDescriber = None 116 | self.bayesian_network = None 117 | self.conditional_probabilities = None 118 | 119 | self.DataDescriber = DataDescriber(self.metadata, self.histogram_bins, self.infer_ranges) 120 | self.DataDescriber.describe(data) 121 | 122 | encoded_df = DataFrame(columns=self.DataDescriber.attr_names) 123 | for attr_name, column in self.DataDescriber.attr_dict.items(): 124 | encoded_df[attr_name] = column.encode_values_into_bin_idx() 125 | 126 | self.bayesian_network = self._greedy_bayes_linear(encoded_df, self.degree) 127 | 128 | self.conditional_probabilities = self._construct_conditional_probabilities(self.bayesian_network, encoded_df) 129 | 130 | LOGGER.debug(f'Finished training Bayesian net') 131 | self.trained = True 132 | 133 | def generate_samples(self, nsamples): 134 | LOGGER.debug(f'Generate synthetic dataset of size {nsamples}') 135 | assert self.trained, "Model must be fitted to some real data first" 136 | synthetic_data = DataFrame(columns=self.DataDescriber.attr_names) 137 | 138 | # Get samples for attributes modelled in Bayesian net 139 | encoded_dataset = self._generate_encoded_dataset(nsamples) 140 | 141 | for attr in self.DataDescriber.attr_names: 142 | column = self.DataDescriber.attr_dict[attr] 143 | if attr in encoded_dataset: 144 | synthetic_data[attr] = column.sample_values_from_binning_indices(encoded_dataset[attr]) 145 | else: 146 | # For attributes not in BN use independent attribute mode 147 | binning_indices = column.sample_binning_indices_in_independent_attribute_mode(nsamples) 148 | synthetic_data[attr] = column.sample_values_from_binning_indices(binning_indices) 149 | 150 | return synthetic_data 151 | 152 | def _generate_encoded_dataset(self, nsamples): 153 | encoded_df = DataFrame(columns=self._get_sampling_order(self.bayesian_network)) 154 | 155 | bn_root_attr = self.bayesian_network[0][1][0] 156 | root_attr_dist = self.conditional_probabilities[bn_root_attr] 157 | encoded_df[bn_root_attr] = choice(len(root_attr_dist), size=nsamples, p=root_attr_dist) 158 | 159 | for child, parents in self.bayesian_network: 160 | child_conditional_distributions = self.conditional_probabilities[child] 161 | 162 | for parents_instance in child_conditional_distributions.keys(): 163 | dist = child_conditional_distributions[parents_instance] 164 | parents_instance = list(eval(parents_instance)) 165 | 166 | 
filter_condition = '' 167 | for parent, value in zip(parents, parents_instance): 168 | filter_condition += f"(encoded_df['{parent}']=={value})&" 169 | 170 | filter_condition = eval(filter_condition[:-1]) 171 | size = encoded_df[filter_condition].shape[0] 172 | if size: 173 | encoded_df.loc[filter_condition, child] = choice(len(dist), size=size, p=dist) 174 | 175 | # Fill any nan values by sampling from marginal child distribution 176 | marginal_dist = self.DataDescriber.attr_dict[child].distribution_probabilities 177 | null_idx = encoded_df[child].isnull() 178 | encoded_df.loc[null_idx, child] = choice(len(marginal_dist), size=null_idx.sum(), p=marginal_dist) 179 | 180 | encoded_df[encoded_df.columns] = encoded_df[encoded_df.columns].astype(int) 181 | 182 | return encoded_df 183 | 184 | def _get_sampling_order(self, bayesian_net): 185 | order = [bayesian_net[0][1][0]] 186 | for child, _ in bayesian_net: 187 | order.append(child) 188 | return order 189 | 190 | def _greedy_bayes_linear(self, encoded_df, k=1): 191 | """Construct a Bayesian Network (BN) using greedy algorithm.""" 192 | dataset = encoded_df.astype(str, copy=False) 193 | 194 | # Optional: Fix sed for reproducibility 195 | if self.seed is not None: 196 | seed(self.seed) 197 | 198 | root_attribute = choice(dataset.columns) 199 | V = [root_attribute] 200 | rest_attributes = set(dataset.columns) 201 | rest_attributes.remove(root_attribute) 202 | bayesian_net = [] 203 | while rest_attributes: 204 | parents_pair_list = [] 205 | mutual_info_list = [] 206 | 207 | num_parents = min(len(V), k) 208 | for child, split in product(rest_attributes, range(len(V) - num_parents + 1)): 209 | task = (child, V, num_parents, split, dataset) 210 | res = bayes_worker(task) 211 | parents_pair_list += res[0] 212 | mutual_info_list += res[1] 213 | 214 | idx = mutual_info_list.index(max(mutual_info_list)) 215 | 216 | bayesian_net.append(parents_pair_list[idx]) 217 | adding_attribute = parents_pair_list[idx][0] 218 | V.append(adding_attribute) 219 | rest_attributes.remove(adding_attribute) 220 | 221 | return bayesian_net 222 | 223 | def _construct_conditional_probabilities(self, bayesian_network, encoded_dataset): 224 | k = len(bayesian_network[-1][1]) 225 | conditional_distributions = {} 226 | 227 | # first k+1 attributes 228 | root = bayesian_network[0][1][0] 229 | kplus1_attributes = [root] 230 | for child, _ in bayesian_network[:k]: 231 | kplus1_attributes.append(child) 232 | 233 | freqs_of_kplus1_attributes = self._get_attribute_frequency_counts(kplus1_attributes, encoded_dataset) 234 | 235 | # get distribution of root attribute 236 | root_marginal_freqs = freqs_of_kplus1_attributes.loc[:, [root, 'count']].groupby(root).sum()['count'] 237 | conditional_distributions[root] = normalize_given_distribution(root_marginal_freqs).tolist() 238 | 239 | for idx, (child, parents) in enumerate(bayesian_network): 240 | conditional_distributions[child] = {} 241 | 242 | if idx < k: 243 | stats = freqs_of_kplus1_attributes.copy().loc[:, parents + [child, 'count']] 244 | else: 245 | stats = self._get_attribute_frequency_counts(parents + [child], encoded_dataset) 246 | 247 | stats = DataFrame(stats.loc[:, parents + [child, 'count']].groupby(parents + [child]).sum()) 248 | 249 | if len(parents) == 1: 250 | for parent_instance in stats.index.levels[0]: 251 | dist = normalize_given_distribution(stats.loc[parent_instance]['count']).tolist() 252 | conditional_distributions[child][str([parent_instance])] = dist 253 | else: 254 | for parents_instance in 
product(*stats.index.levels[:-1]): 255 | dist = normalize_given_distribution(stats.loc[parents_instance]['count']).tolist() 256 | conditional_distributions[child][str(list(parents_instance))] = dist 257 | 258 | return conditional_distributions 259 | 260 | def _get_attribute_frequency_counts(self, attributes, encoded_dataset): 261 | # Get attribute counts for category combinations present in data 262 | counts = encoded_dataset.groupby(attributes).size() 263 | counts.name = 'count' 264 | counts = counts.reset_index() 265 | 266 | # Get all possible attribute combinations 267 | attr_combs = [range(self.DataDescriber.attr_dict[attr].domain_size) for attr in attributes] 268 | full_space = DataFrame(columns=attributes, data=list(product(*attr_combs))) 269 | # stats.reset_index(inplace=True) 270 | full_counts = merge(full_space, counts, how='left') 271 | full_counts.fillna(0, inplace=True) 272 | 273 | return full_counts 274 | 275 | def _read_meta(self, metadata): 276 | """ Read metadata from metadata file.""" 277 | metadict = {} 278 | 279 | for cdict in metadata['columns']: 280 | col = cdict['name'] 281 | coltype = cdict['type'] 282 | 283 | if coltype == FLOAT or coltype == INTEGER: 284 | metadict[col] = { 285 | 'type': coltype, 286 | 'min': cdict['min'], 287 | 'max': cdict['max'] 288 | } 289 | 290 | elif coltype == CATEGORICAL or coltype == ORDINAL: 291 | metadict[col] = { 292 | 'type': coltype, 293 | 'categories': cdict['i2s'], 294 | 'size': len(cdict['i2s']) 295 | } 296 | 297 | else: 298 | raise ValueError(f'Unknown data type {coltype} for attribute {col}') 299 | 300 | return metadict 301 | 302 | 303 | class PrivBayes(BayesianNet): 304 | """" 305 | A differentially private BayesianNet model using GreedyBayes 306 | """ 307 | def __init__(self, metadata, histogram_bins=10, degree=1, epsilon=.1, infer_ranges=False, multiprocess=True, seed=None): 308 | super().__init__(metadata=metadata, histogram_bins=histogram_bins, degree=degree, infer_ranges=infer_ranges, multiprocess=multiprocess, seed=seed) 309 | 310 | self.epsilon = float(epsilon) 311 | 312 | self.__name__ = f'PrivBayesEps{self.epsilon}' 313 | 314 | @property 315 | def laplace_noise_scale(self): 316 | return 2 * (self.num_attributes - self.degree) / (self.epsilon / 2) 317 | 318 | def _greedy_bayes_linear(self, encoded_df, k=1): 319 | """Construct a Bayesian Network (BN) using greedy algorithm.""" 320 | dataset = encoded_df.astype(str, copy=False) 321 | num_tuples, num_attributes = dataset.shape 322 | 323 | # Optional: Fix seed for reproducibility 324 | if self.seed is not None: 325 | seed(self.seed) 326 | 327 | attr_to_is_binary = {attr: dataset[attr].unique().size <= 2 for attr in dataset} 328 | 329 | root_attribute = choice(dataset.columns) 330 | V = [root_attribute] 331 | rest_attributes = set(dataset.columns) 332 | rest_attributes.remove(root_attribute) 333 | bayesian_net = [] 334 | while rest_attributes: 335 | parents_pair_list = [] 336 | mutual_info_list = [] 337 | 338 | num_parents = min(len(V), k) 339 | for child, split in product(rest_attributes, range(len(V) - num_parents + 1)): 340 | task = (child, V, num_parents, split, dataset) 341 | res = bayes_worker(task) 342 | parents_pair_list += res[0] 343 | mutual_info_list += res[1] 344 | 345 | sampling_distribution = exponential_mechanism(self.epsilon/2, mutual_info_list, parents_pair_list, attr_to_is_binary, 346 | num_tuples, num_attributes) 347 | idx = choice(list(range(len(mutual_info_list))), p=sampling_distribution) 348 | 349 | bayesian_net.append(parents_pair_list[idx]) 350 | 
adding_attribute = parents_pair_list[idx][0] 351 | V.append(adding_attribute) 352 | rest_attributes.remove(adding_attribute) 353 | 354 | return bayesian_net 355 | 356 | def _get_attribute_frequency_counts(self, attributes, encoded_dataset): 357 | """ Differentially private mechanism to get attribute frequency counts""" 358 | # Get attribute counts for category combinations present in data 359 | counts = encoded_dataset.groupby(attributes).size() 360 | counts.name = 'count' 361 | counts = counts.reset_index() 362 | 363 | # Get all possible attribute combinations 364 | attr_combs = [range(self.DataDescriber.attr_dict[attr].domain_size) for attr in attributes] 365 | full_space = DataFrame(columns=attributes, data=list(product(*attr_combs))) 366 | full_counts = merge(full_space, counts, how='left') 367 | full_counts.fillna(0, inplace=True) 368 | 369 | # Get Laplace noise sample 370 | noise_sample = laplace(0, scale=self.laplace_noise_scale, size=full_counts.index.size) 371 | full_counts['count'] += noise_sample 372 | full_counts.loc[full_counts['count'] < 0, 'count'] = 0 373 | 374 | return full_counts 375 | 376 | 377 | class DataDescriber(object): 378 | def __init__(self, metadata, histogram_bins, infer_ranges=False): 379 | self.metadata = metadata 380 | self.histogram_bins = histogram_bins 381 | self.infer_ranges = infer_ranges 382 | 383 | self.attr_dict = None 384 | self.attr_names = None 385 | 386 | def describe(self, df): 387 | self.attr_names = self._get_attr_names() 388 | self.attr_dict = self._represent_input_dataset_by_columns(df) 389 | 390 | for col, Attribute in self.attr_dict.items(): 391 | Attribute.infer_distribution() 392 | 393 | def _get_attr_names(self): 394 | return [c for c in self.metadata.keys()] 395 | 396 | def _represent_input_dataset_by_columns(self, df): 397 | attr_dict = {} 398 | 399 | for col, cdict in self.metadata.items(): 400 | coltype = cdict['type'] 401 | 402 | paras = (col, df[col], self.histogram_bins) 403 | if coltype in NUMERICAL: 404 | if coltype == FLOAT: 405 | Attribute = FloatAttribute(*paras) 406 | else: 407 | Attribute = IntegerAttribute(*paras) 408 | 409 | if self.infer_ranges: 410 | cmin, cmax = min(df[col]), max(df[col]) 411 | else: 412 | cmin, cmax = cdict['min'], cdict['max'] 413 | 414 | Attribute.set_domain(domain=(cmin, cmax)) 415 | 416 | elif coltype in STRINGS: 417 | Attribute = StringAttribute(*paras) 418 | if self.infer_ranges: 419 | ccats = list(df[col].unique()) 420 | else: 421 | ccats = cdict['categories'] 422 | 423 | Attribute.set_domain(domain=ccats) 424 | 425 | else: 426 | raise Exception(f'The DataType of {col} is unknown.') 427 | 428 | attr_dict[col] = Attribute 429 | 430 | return attr_dict 431 | 432 | 433 | 434 | 435 | 436 | 437 | 438 | -------------------------------------------------------------------------------- /generative_models/data_synthesiser_utils/__init__.py: -------------------------------------------------------------------------------- 1 | """Generative models adapted from https://github.com/DataResponsibly/DataSynthesizer""" 2 | # Copyright <2018> -------------------------------------------------------------------------------- /generative_models/data_synthesiser_utils/datatypes/AbstractAttribute.py: -------------------------------------------------------------------------------- 1 | from abc import ABCMeta, abstractmethod 2 | from bisect import bisect_right 3 | from random import uniform 4 | 5 | import numpy as np 6 | from numpy.random import choice 7 | from pandas import Series 8 | 9 | from 
generative_models.data_synthesiser_utils.utils import normalize_given_distribution 10 | 11 | 12 | class AbstractAttribute(object): 13 | __metaclass__ = ABCMeta 14 | 15 | def __init__(self, name, data, histogram_size): 16 | self.name = name 17 | self.data = data 18 | self.histogram_size = histogram_size 19 | 20 | self.data_dropna = self.data.dropna() 21 | self.missing_rate = (self.data.size - self.data_dropna.size) / (self.data.size or 1) 22 | 23 | self.is_categorical = None 24 | self.is_numerical = None 25 | self.data_type = None 26 | self.min = None 27 | self.max = None 28 | self.distribution_bins = None 29 | self.distribution_probabilities = None 30 | self.domain_size = None 31 | 32 | def set_domain(self, domain): 33 | return NotImplementedError('Method needs to be overwritten.') 34 | 35 | @abstractmethod 36 | def infer_distribution(self): 37 | if self.is_categorical: 38 | histogram = self.data_dropna.value_counts() 39 | for value in set(self.distribution_bins) - set(histogram.index): 40 | histogram[value] = 0 41 | histogram = histogram[self.distribution_bins] 42 | self.distribution_probabilities = normalize_given_distribution(histogram) 43 | 44 | else: 45 | histogram, _ = np.histogram(self.data_dropna, bins=self.distribution_bins) 46 | self.distribution_probabilities = normalize_given_distribution(histogram) 47 | 48 | def encode_values_into_bin_idx(self): 49 | """ 50 | Encode values into bin indices for Bayesian Network construction. 51 | """ 52 | if self.is_categorical: 53 | value_to_bin_idx = {value: idx for idx, value in enumerate(self.distribution_bins)} 54 | encoded = self.data.map(lambda x: value_to_bin_idx[x], na_action='ignore') 55 | else: 56 | encoded = self.data.map(lambda x: bisect_right(self.distribution_bins[:-1], x) - 1, na_action='ignore') 57 | 58 | encoded.fillna(len(self.distribution_bins), inplace=True) 59 | return encoded.astype(int, copy=False) 60 | 61 | def to_json(self): 62 | """Encode attribution information in JSON format / Python dictionary. 63 | 64 | """ 65 | return {"name": self.name, 66 | "data_type": self.data_type.value, 67 | "is_categorical": self.is_categorical, 68 | "min": self.min, 69 | "max": self.max, 70 | "missing_rate": self.missing_rate, 71 | "distribution_bins": self.distribution_bins.tolist(), 72 | "distribution_probabilities": self.distribution_probabilities.tolist()} 73 | 74 | @abstractmethod 75 | def generate_values_as_candidate_key(self, n): 76 | """When attribute should be a candidate key in output dataset. 77 | 78 | """ 79 | return np.arange(n) 80 | 81 | def sample_binning_indices_in_independent_attribute_mode(self, n): 82 | """Sample an array of binning indices. 83 | 84 | """ 85 | return Series(choice(len(self.distribution_probabilities), size=n, p=self.distribution_probabilities)) 86 | 87 | @abstractmethod 88 | def sample_values_from_binning_indices(self, binning_indices): 89 | """Convert binning indices into values in domain. Used by both independent and correlated attribute mode. 
90 | 91 | """ 92 | return binning_indices.apply(lambda x: self.uniform_sampling_within_a_bin(x)) 93 | 94 | def uniform_sampling_within_a_bin(self, bin_idx): 95 | num_bins = len(self.distribution_probabilities) 96 | if bin_idx == num_bins: 97 | return np.nan 98 | elif self.is_categorical: 99 | return self.distribution_bins[bin_idx] 100 | else: 101 | return uniform(self.distribution_bins[bin_idx], self.distribution_bins[bin_idx + 1]) 102 | 103 | -------------------------------------------------------------------------------- /generative_models/data_synthesiser_utils/datatypes/FloatAttribute.py: -------------------------------------------------------------------------------- 1 | from numpy import linspace, histogram, arange 2 | 3 | from generative_models.data_synthesiser_utils.datatypes.AbstractAttribute import AbstractAttribute 4 | from generative_models.data_synthesiser_utils.datatypes.utils.DataType import DataType 5 | from generative_models.data_synthesiser_utils.utils import normalize_given_distribution 6 | 7 | 8 | class FloatAttribute(AbstractAttribute): 9 | def __init__(self, name, data, histogram_size): 10 | 11 | super().__init__(name, data, histogram_size) 12 | self.is_categorical = False 13 | self.is_numerical = True 14 | self.data_type = DataType.FLOAT 15 | self.data = self.data.astype(float) 16 | self.data_dropna = self.data_dropna.astype(float) 17 | 18 | def set_domain(self, domain=None): 19 | if domain is not None: 20 | self.min, self.max = domain 21 | else: 22 | self.min = float(self.data_dropna.min()) 23 | self.max = float(self.data_dropna.max()) 24 | 25 | self.distribution_bins = linspace(self.min, self.max, self.histogram_size+1) 26 | self.domain_size = self.histogram_size 27 | 28 | def infer_distribution(self): 29 | frequency_counts, _ = histogram(self.data_dropna, bins=self.distribution_bins) 30 | self.distribution_probabilities = normalize_given_distribution(frequency_counts) 31 | 32 | def generate_values_as_candidate_key(self, n): 33 | return arange(self.min, self.max, (self.max - self.min) / n) 34 | 35 | def sample_values_from_binning_indices(self, binning_indices): 36 | return super().sample_values_from_binning_indices(binning_indices) 37 | -------------------------------------------------------------------------------- /generative_models/data_synthesiser_utils/datatypes/IntegerAttribute.py: -------------------------------------------------------------------------------- 1 | from numpy import linspace, histogram 2 | 3 | from generative_models.data_synthesiser_utils.datatypes.AbstractAttribute import AbstractAttribute 4 | from generative_models.data_synthesiser_utils.datatypes.utils.DataType import DataType 5 | from generative_models.data_synthesiser_utils.utils import normalize_given_distribution 6 | 7 | 8 | class IntegerAttribute(AbstractAttribute): 9 | def __init__(self, name, data, histogram_size): 10 | super().__init__(name, data, histogram_size) 11 | self.is_categorical = False 12 | self.is_numerical = True 13 | self.data_type = DataType.INTEGER 14 | self.data = self.data.astype(int) 15 | self.data_dropna = self.data_dropna.astype(int) 16 | 17 | def set_domain(self, domain=None): 18 | if domain is not None: 19 | self.min, self.max = domain 20 | else: 21 | self.min = self.data_dropna.min() 22 | self.max = self.data_dropna.max() 23 | 24 | self.min = int(self.min) 25 | self.max = int(self.max) 26 | self.distribution_bins = linspace(self.min, self.max, self.histogram_size + 1).astype(int) 27 | self.domain_size = self.histogram_size 28 | 29 | def 
infer_distribution(self): 30 | frequency_counts, _ = histogram(self.data_dropna, bins=self.distribution_bins) 31 | self.distribution_probabilities = normalize_given_distribution(frequency_counts) 32 | 33 | def generate_values_as_candidate_key(self, n): 34 | return super().generate_values_as_candidate_key(n) 35 | 36 | def sample_values_from_binning_indices(self, binning_indices): 37 | column = super().sample_values_from_binning_indices(binning_indices) 38 | column = column.round() 39 | column = column.astype(int) 40 | # column[~column.isnull()] = column[~column.isnull()].astype(int) 41 | return column 42 | -------------------------------------------------------------------------------- /generative_models/data_synthesiser_utils/datatypes/StringAttribute.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from generative_models.data_synthesiser_utils.datatypes.AbstractAttribute import AbstractAttribute 4 | from generative_models.data_synthesiser_utils.datatypes.utils.DataType import DataType 5 | from generative_models.data_synthesiser_utils.utils import normalize_given_distribution, generate_random_string 6 | 7 | 8 | class StringAttribute(AbstractAttribute): 9 | """Variable min and max are the lengths of the shortest and longest strings. 10 | 11 | """ 12 | 13 | def __init__(self, name, data, histogram_size): 14 | super().__init__(name, data, histogram_size) 15 | self.is_categorical = True 16 | self.is_numerical = False 17 | self.data_type = DataType.STRING 18 | self.data_dropna_len = self.data_dropna.astype(str).map(len) 19 | 20 | def set_domain(self, domain=None): 21 | if domain is not None: 22 | lengths = [len(i) for i in domain] 23 | self.min = min(lengths) 24 | self.max = max(lengths) 25 | self.distribution_bins = np.array(domain) 26 | else: 27 | self.min = int(self.data_dropna_len.min()) 28 | self.max = int(self.data_dropna_len.max()) 29 | self.distribution_bins = self.data_dropna.unique() 30 | 31 | self.domain_size = len(self.distribution_bins) 32 | 33 | def infer_distribution(self): 34 | 35 | histogram = self.data_dropna.value_counts() 36 | for attr_cat in set(self.distribution_bins) - set(histogram.index): 37 | histogram[attr_cat] = 0 38 | histogram = histogram[self.distribution_bins] 39 | self.distribution_probabilities = normalize_given_distribution(histogram) 40 | 41 | def generate_values_as_candidate_key(self, n): 42 | length = np.random.randint(self.min, self.max) 43 | vectorized = np.vectorize(lambda x: '{}{}'.format(generate_random_string(length), x)) 44 | return vectorized(np.arange(n)) 45 | 46 | def sample_values_from_binning_indices(self, binning_indices): 47 | return super().sample_values_from_binning_indices(binning_indices) 48 | -------------------------------------------------------------------------------- /generative_models/data_synthesiser_utils/datatypes/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spring-epfl/synthetic_data_release/eba9a43c75a64110b63d79e58c83859634b7339c/generative_models/data_synthesiser_utils/datatypes/__init__.py -------------------------------------------------------------------------------- /generative_models/data_synthesiser_utils/datatypes/constants.py: -------------------------------------------------------------------------------- 1 | CONTINUOUS = 'Continuous' 2 | CATEGORICAL = 'Categorical' 3 | ORDINAL = 'Ordinal' 4 | INTEGER = 'Integer' 5 | FLOAT = 'Float' 6 | STRING = 'String' 
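
# Illustrative sketch: the metadata JSON files under data/ describe each column with
# type labels like the constants above. Assuming those labels match these strings, a
# single column entry might look roughly as follows (attribute names and bounds are
# hypothetical examples, not taken from the shipped datasets):
#
#   {"name": "age", "type": "Integer", "min": 18, "max": 90}
#   {"name": "sex", "type": "Categorical", "i2s": ["male", "female"]}
#
# Numerical columns carry 'min'/'max' bounds and categorical or ordinal columns an
# 'i2s' list of category labels, which is what the generative models' _read_meta
# helpers and the feature sets expect to find.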
-------------------------------------------------------------------------------- /generative_models/data_synthesiser_utils/datatypes/utils/DataType.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | 4 | class DataType(Enum): 5 | INTEGER = 'Integer' 6 | FLOAT = 'Float' 7 | STRING = 'String' 8 | DATETIME = 'DateTime' 9 | SOCIAL_SECURITY_NUMBER = 'SocialSecurityNumber' 10 | -------------------------------------------------------------------------------- /generative_models/data_synthesiser_utils/datatypes/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spring-epfl/synthetic_data_release/eba9a43c75a64110b63d79e58c83859634b7339c/generative_models/data_synthesiser_utils/datatypes/utils/__init__.py -------------------------------------------------------------------------------- /generative_models/data_synthesiser_utils/utils.py: -------------------------------------------------------------------------------- 1 | from math import log, ceil 2 | from numpy import array, exp, isinf, full_like 3 | from numpy.random import choice 4 | from string import ascii_lowercase 5 | from itertools import combinations 6 | from pandas import Series, DataFrame 7 | from sklearn.metrics import mutual_info_score, normalized_mutual_info_score 8 | 9 | 10 | def mutual_information(labels_x: Series, labels_y: DataFrame): 11 | """Mutual information of distributions in format of Series or DataFrame. 12 | 13 | Parameters 14 | ---------- 15 | labels_x : Series 16 | labels_y : DataFrame 17 | """ 18 | if labels_y.shape[1] == 1: 19 | labels_y = labels_y.iloc[:, 0] 20 | else: 21 | labels_y = labels_y.apply(lambda x: ' '.join(x.values), axis=1) 22 | 23 | return mutual_info_score(labels_x, labels_y) 24 | 25 | 26 | def pairwise_attributes_mutual_information(dataset): 27 | """Compute normalized mutual information for all pairwise attributes. Return a DataFrame.""" 28 | sorted_columns = sorted(dataset.columns) 29 | mi_df = DataFrame(columns=sorted_columns, index=sorted_columns, dtype=float) 30 | for row in mi_df.columns: 31 | for col in mi_df.columns: 32 | mi_df.loc[row, col] = normalized_mutual_info_score(dataset[row].astype(str), 33 | dataset[col].astype(str), 34 | average_method='arithmetic') 35 | return mi_df 36 | 37 | 38 | def normalize_given_distribution(frequencies): 39 | distribution = array(frequencies, dtype=float) 40 | distribution = distribution.clip(0) # replace negative values with 0 41 | summation = distribution.sum() 42 | if summation > 0: 43 | if isinf(summation): 44 | return normalize_given_distribution(isinf(distribution)) 45 | else: 46 | return distribution / summation 47 | else: 48 | return full_like(distribution, 1 / distribution.size) 49 | 50 | 51 | def infer_numerical_attributes_in_dataframe(dataframe): 52 | describe = dataframe.describe() 53 | # DataFrame.describe() usually returns 8 rows. 54 | if describe.shape[0] == 8: 55 | return set(describe.columns) 56 | # DataFrame.describe() returns less than 8 rows when there is no numerical attribute. 
57 | else: 58 | return set() 59 | 60 | 61 | def display_bayesian_network(bn): 62 | length = 0 63 | for child, _ in bn: 64 | if len(child) > length: 65 | length = len(child) 66 | 67 | print('Constructed Bayesian network:') 68 | for child, parents in bn: 69 | print(" {0:{width}} has parents {1}.".format(child, parents, width=length)) 70 | 71 | 72 | def generate_random_string(length): 73 | return ''.join(choice(list(ascii_lowercase), size=length)) 74 | 75 | 76 | def bayes_worker(paras): 77 | child, V, num_parents, split, dataset = paras 78 | parents_pair_list = [] 79 | mutual_info_list = [] 80 | 81 | if split + num_parents - 1 < len(V): 82 | for other_parents in combinations(V[split + 1:], num_parents - 1): 83 | parents = list(other_parents) 84 | parents.append(V[split]) 85 | parents_pair_list.append((child, parents)) 86 | mi = mutual_information(dataset[child], dataset[parents]) 87 | mutual_info_list.append(mi) 88 | 89 | return parents_pair_list, mutual_info_list 90 | 91 | 92 | def calculate_sensitivity(num_tuples, child, parents, attr_to_is_binary): 93 | """Sensitivity function for Bayesian network construction. PrivBayes Lemma 1. 94 | Parameters 95 | ---------- 96 | num_tuples : int 97 | Number of tuples in sensitive dataset. 98 | Return 99 | -------- 100 | int 101 | Sensitivity value. 102 | """ 103 | if attr_to_is_binary[child] or (len(parents) == 1 and attr_to_is_binary[parents[0]]): 104 | a = log(num_tuples) / num_tuples 105 | b = (num_tuples - 1) / num_tuples 106 | b_inv = num_tuples / (num_tuples - 1) 107 | return a + b * log(b_inv) 108 | else: 109 | a = (2 / num_tuples) * log((num_tuples + 1) / 2) 110 | b = (1 - 1 / num_tuples) * log(1 + 2 / (num_tuples - 1)) 111 | return a + b 112 | 113 | 114 | def calculate_delta(num_attributes, sensitivity, epsilon): 115 | """Computing delta, which is a factor when applying differential privacy. 116 | More info is in PrivBayes Section 4.2 "A First-Cut Solution". 117 | Parameters 118 | ---------- 119 | num_attributes : int 120 | Number of attributes in dataset. 121 | sensitivity : float 122 | Sensitivity of removing one tuple. 123 | epsilon : float 124 | Parameter of differential privacy. 
125 | """ 126 | return (num_attributes - 1) * sensitivity / epsilon 127 | 128 | 129 | def exponential_mechanism(epsilon, mutual_info_list, parents_pair_list, attr_to_is_binary, num_tuples, num_attributes): 130 | """Applied in Exponential Mechanism to sample outcomes.""" 131 | delta_array = [] 132 | for (child, parents) in parents_pair_list: 133 | sensitivity = calculate_sensitivity(num_tuples, child, parents, attr_to_is_binary) 134 | delta = calculate_delta(num_attributes, sensitivity, epsilon) 135 | delta_array.append(delta) 136 | 137 | mi_array = array(mutual_info_list) / (2 * array(delta_array)) 138 | mi_array = exp(mi_array) 139 | mi_array = normalize_given_distribution(mi_array) 140 | return mi_array -------------------------------------------------------------------------------- /generative_models/generative_model.py: -------------------------------------------------------------------------------- 1 | """Parent class for all generative models""" 2 | 3 | class GenerativeModel(object): 4 | 5 | def fit(self, data): 6 | """Fit a generative model to the input dataset""" 7 | return NotImplementedError('Method needs to be overwritten by a subclass.') 8 | 9 | def generate_samples(self, nsamples): 10 | """Generate a synthetic dataset of size nsamples""" 11 | return NotImplementedError('Method needs to be overwritten by a subclass.') -------------------------------------------------------------------------------- /generative_models/gmm.py: -------------------------------------------------------------------------------- 1 | from os import path 2 | from sklearn.mixture import GaussianMixture 3 | from generative_models.generative_model import GenerativeModel 4 | 5 | import logging 6 | from logging.config import fileConfig 7 | dirname = path.dirname(__file__) 8 | logconfig = path.join(dirname, '../logging_config.ini') 9 | fileConfig(logconfig) 10 | logger = logging.getLogger(__name__) 11 | 12 | class GaussianMixtureModel(GenerativeModel): 13 | 14 | def __init__(self): 15 | self.gm = GaussianMixture() 16 | self.trained = False 17 | 18 | def fit(self, data): 19 | """Fit a gaussian mixture model to the input data. Input data is assumed to be of shape (n_samples, n_features) 20 | See https://scikit-learn.org/stable/modules/generated/sklearn.mixture.GaussianMixture.html#sklearn.mixture.GaussianMixture.fit for details""" 21 | logger.debug(f'Start fitting GaussianMixtureModel to data of shape {data.shape}...') 22 | self.gm.fit(data) 23 | logger.debug(f'Finished fitting GMM') 24 | self.trained = True 25 | 26 | def generate_samples(self, nsamples): 27 | """Generate random samples from the fitted Gaussian distribution""" 28 | assert self.trained, "Model must first be fitted to some data." 29 | logger.debug(f'Generate synthetic dataset of size {nsamples}') 30 | synthetic_data, _ = self.gm.sample(nsamples) 31 | return synthetic_data 32 | 33 | -------------------------------------------------------------------------------- /generative_models/pate_gan.py: -------------------------------------------------------------------------------- 1 | """ 2 | A generative model training algorithm based on 3 | "PATE-GAN: Generating Synthetic Data with Differential Privacy Guarantees" 4 | by J. Yoon, J. Jordon, M. 
van der Schaar, published in International Conference on Learning Representations (ICLR), 2019 5 | Adapted from: https://bitbucket.org/mvdschaar/mlforhealthlabpub/src/82d7f91d46db54d256ff4fc920d513499ddd2ab8/alg/pategan/ 6 | """ 7 | 8 | import tensorflow.compat.v1 as tf 9 | tf.disable_v2_behavior() 10 | 11 | import numpy as np 12 | from pandas import DataFrame 13 | 14 | from generative_models.generative_model import GenerativeModel 15 | from utils.logging import LOGGER 16 | from utils.constants import * 17 | 18 | 19 | ZERO_TOL = 1e-8 20 | 21 | 22 | class PATEGAN(GenerativeModel): 23 | """ A generative adversarial network trained under the PATE framework to achieve differential privacy """ 24 | 25 | def __init__(self, metadata, 26 | eps=1, delta=1e-5, infer_ranges=False, 27 | num_teachers=10, n_iters=100, batch_size=128, 28 | learning_rate=1e-4, multiprocess=False): 29 | """ 30 | :param metadata: dict: Attribute metadata describing the data domain of the synthetic target data 31 | :param eps: float: Privacy parameter 32 | :param delta: float: Privacy parameter 33 | :param target: str: Name of the target variable for downstream classification tasks 34 | :param num_teachers: int: Number of teacher discriminators 35 | :param n_iters: int: Number of training iterations 36 | """ 37 | # Data description 38 | self.metadata, self.attribute_list = self.read_meta(metadata) 39 | self.datatype = DataFrame 40 | self.nfeatures = self.get_num_features() 41 | 42 | # Privacy params 43 | self.epsilon = eps 44 | self.delta = delta 45 | self.infer_ranges = infer_ranges 46 | 47 | # Training params 48 | self.num_teachers = num_teachers 49 | self.n_iters = n_iters 50 | self.batch_size = batch_size 51 | self.learning_rate = learning_rate 52 | self.z_dim = int(self.nfeatures / 4) 53 | self.h_dim = int(self.nfeatures) 54 | 55 | # Configure device 56 | device_name = tf.test.gpu_device_name() 57 | if device_name is '': 58 | self.device_spec = tf.DeviceSpec(device_type='CPU', device_index=0) 59 | else: 60 | self.device_spec = tf.DeviceSpec(device_type='GPU', device_index=0) 61 | 62 | with tf.device(self.device_spec.to_string()): 63 | # Variable init 64 | # Feature matrix 65 | self.X = tf.placeholder(tf.float32, shape=[None, self.nfeatures]) 66 | # Latent space 67 | self.Z = tf.placeholder(tf.float32, shape=[None, self.z_dim]) 68 | # Noise variable 69 | self.M = tf.placeholder(tf.float32, shape=[None, 1]) 70 | # Generator 71 | self.GDist = None 72 | self._generator() 73 | # Discriminator 74 | self._discriminator() 75 | self.sess = tf.Session() 76 | 77 | self.multiprocess = multiprocess 78 | 79 | self.trained = False 80 | 81 | self.__name__ = f'PateGanEps{self.epsilon}' 82 | 83 | @property 84 | def laplace_noise_scale(self): 85 | return np.sqrt(2 * np.log(1.25 * 10**self.delta)) / self.epsilon 86 | 87 | def get_num_features(self): 88 | nfeatures = 0 89 | 90 | for cname, cdict in self.metadata.items(): 91 | data_type = cdict['type'] 92 | if data_type == FLOAT or data_type == INTEGER: 93 | nfeatures += 1 94 | 95 | elif data_type == CATEGORICAL or data_type == ORDINAL: 96 | nfeatures += len(cdict['categories']) 97 | 98 | else: 99 | raise ValueError(f'Unkown data type {data_type} for attribute {cname}') 100 | 101 | return nfeatures 102 | 103 | def read_meta(self, metadata): 104 | meta_dict = {} 105 | attr_names = [] 106 | for cdict in metadata['columns']: 107 | attr_name = cdict['name'] 108 | data_type = cdict['type'] 109 | if data_type == FLOAT or data_type == INTEGER: 110 | meta_dict[attr_name] = { 111 | 'type': data_type, 
112 | 'min': cdict['min'], 113 | 'max': cdict['max'] 114 | } 115 | 116 | elif data_type == CATEGORICAL or data_type == ORDINAL: 117 | meta_dict[attr_name] = { 118 | 'type': data_type, 119 | 'categories': cdict['i2s'] 120 | } 121 | 122 | else: 123 | raise ValueError(f'Unknown data type {data_type} for attribute {attr_name}') 124 | 125 | attr_names.append(attr_name) 126 | 127 | return meta_dict, attr_names 128 | 129 | def _generator(self): 130 | self.G_W1 = tf.Variable(self._xavier_init([self.z_dim, self.h_dim])) 131 | self.G_b1 = tf.Variable(tf.zeros(shape=[self.h_dim])) 132 | 133 | self.G_W2 = tf.Variable(self._xavier_init([self.h_dim, self.h_dim])) 134 | self.G_b2 = tf.Variable(tf.zeros(shape=[self.h_dim])) 135 | 136 | self.G_W3 = tf.Variable(self._xavier_init([self.h_dim, self.nfeatures])) 137 | self.G_b3 = tf.Variable(tf.zeros(shape=[self.nfeatures])) 138 | 139 | self.theta_G = [self.G_W1, self.G_W2, self.G_W3, self.G_b1, self.G_b2, self.G_b3] 140 | 141 | def _discriminator(self): 142 | self.D_W1 = tf.Variable(self._xavier_init([self.nfeatures, self.h_dim])) 143 | self.D_b1 = tf.Variable(tf.zeros(shape=[self.h_dim])) 144 | 145 | self.D_W2 = tf.Variable(self._xavier_init([self.h_dim, self.h_dim])) 146 | self.D_b2 = tf.Variable(tf.zeros(shape=[self.h_dim])) 147 | 148 | self.D_W3 = tf.Variable(self._xavier_init([self.h_dim, 1])) 149 | self.D_b3 = tf.Variable(tf.zeros(shape=[1])) 150 | 151 | self.theta_D = [self.D_W1, self.D_W2, self.D_W3, self.D_b1, self.D_b2, self.D_b3] 152 | 153 | def fit(self, data): 154 | """Fit a generative model of the training data distribution. 155 | :param data: DataFrame: Training set 156 | """ 157 | assert isinstance(data, self.datatype), f'{self.__class__.__name__} expects {self.datatype} as input data but got {type(data)}' 158 | 159 | # Clean up 160 | if self.trained: 161 | self._generator() 162 | self._discriminator() 163 | self.sess = tf.Session() 164 | self.trained = False 165 | 166 | LOGGER.debug(f'Start fitting {self.__class__.__name__} to data of shape {data.shape}...') 167 | nsamples = len(data) 168 | features_train = self._encode_data(data) 169 | 170 | with tf.device(self.device_spec.to_string()): 171 | # Generator 172 | self.GDist = self.gen_out(self.Z) 173 | 174 | # Discriminator 175 | D_real = self.discriminator_out(self.X) 176 | D_fake = self.discriminator_out(self.GDist) 177 | D_entire = tf.concat(axis=0, values=[D_real, D_fake]) 178 | 179 | # Replacement of Clipping algorithm to Penalty term 180 | # 1. Line 6 in Algorithm 1 181 | noisy_vals = tf.random_uniform([self.batch_size, 1], minval=0., maxval=1.) 182 | X_inter = noisy_vals * self.X + (1. - noisy_vals) * self.GDist 183 | 184 | # 2. 
Line 7 in Algorithm 1 185 | grad = tf.gradients(self.discriminator_out(X_inter), [X_inter])[0] 186 | grad_norm = tf.sqrt(tf.reduce_sum(grad ** 2 + ZERO_TOL, axis=1)) 187 | grad_pen = self.num_teachers * tf.reduce_mean((grad_norm - 1) ** 2) 188 | 189 | # Loss function 190 | discriminator_loss = tf.reduce_mean((1 - self.M) * D_entire) - tf.reduce_mean(self.M * D_entire) + grad_pen 191 | generator_loss = -tf.reduce_mean(D_fake) 192 | 193 | # Solver 194 | discriminator_solver = (tf.train.AdamOptimizer(learning_rate=self.learning_rate, beta1=0.5).minimize(discriminator_loss, var_list=self.theta_D)) 195 | generator_solver = (tf.train.AdamOptimizer(learning_rate=self.learning_rate, beta1=0.5).minimize(generator_loss, var_list=self.theta_G)) 196 | 197 | # Start session 198 | self.sess.run(tf.global_variables_initializer()) 199 | 200 | # Training iterations 201 | for _ in range(self.n_iters): 202 | # TODO: Move dataset splitting here 203 | # For fixed generator weights run teacher training 204 | for _ in range(self.num_teachers): 205 | # Sample latent vars 206 | latent_batch = self._sample_latent_z(self.batch_size, self.z_dim) 207 | 208 | # Sample real 209 | train_idx_teach = self._sample_real_x(nsamples, self.batch_size) # Does this way of sampling satisfy DP? Should be disjoint subsets! 210 | features_train_batch = features_train[train_idx_teach, :] 211 | 212 | labels_real = np.ones([self.batch_size, ]) 213 | labels_fake = np.zeros([self.batch_size, ]) 214 | 215 | labels_batch = np.concatenate((labels_real, labels_fake), 0) 216 | 217 | gaussian_noise = np.random.normal(loc=0.0, scale=self.laplace_noise_scale, size=self.batch_size * 2) 218 | 219 | labels_batch = labels_batch + gaussian_noise 220 | 221 | labels_batch = (labels_batch > 0.5) 222 | 223 | labels_batch = np.reshape(labels_batch.astype(float), (2 * self.batch_size, 1)) 224 | 225 | _, discriminator_loss_iter = self.sess.run([discriminator_solver, discriminator_loss], feed_dict={self.X: features_train_batch, self.Z: latent_batch, self.M: labels_batch}) 226 | 227 | # Update generator weights 228 | latent_batch = self._sample_latent_z(self.batch_size, self.z_dim) 229 | 230 | _, generator_loss_iter = self.sess.run([generator_solver, generator_loss], feed_dict={self.Z: latent_batch}) 231 | 232 | self.trained = True 233 | 234 | def generate_samples(self, nsamples): 235 | """"" 236 | Samples synthetic data records from the fitted generative distribution 237 | :param nsamples: int: Number of synthetic records to generate 238 | :return synData: DataFrame: A synthetic dataset 239 | """ 240 | with tf.device(self.device_spec.to_string()): 241 | # Output generation 242 | features_synthetic_encoded = self.sess.run([self.GDist], feed_dict={self.Z: self._sample_latent_z(nsamples, self.z_dim)})[0] 243 | 244 | # Revers numerical encoding 245 | synthetic_data = self._decode_data(features_synthetic_encoded) 246 | synthetic_data = synthetic_data.iloc[np.random.permutation(synthetic_data.index)].reset_index(drop=True) 247 | 248 | return synthetic_data 249 | 250 | 251 | def gen_out(self, z): 252 | G_h1 = tf.nn.tanh(tf.matmul(z, self.G_W1) + self.G_b1) 253 | G_h2 = tf.nn.tanh(tf.matmul(G_h1, self.G_W2) + self.G_b2) 254 | G_log_prob = tf.nn.sigmoid(tf.matmul(G_h2, self.G_W3) + self.G_b3) 255 | 256 | return G_log_prob 257 | 258 | def discriminator_out(self, x): 259 | D_h1 = tf.nn.relu(tf.matmul(x, self.D_W1) + self.D_b1) 260 | D_h2 = tf.nn.relu(tf.matmul(D_h1, self.D_W2) + self.D_b2) 261 | out = (tf.matmul(D_h2, self.D_W3) + self.D_b3) 262 | 263 | return out 264 | 
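    # Helper routines used when building and training the networks: _xavier_init
    # draws layer weights with standard deviation 1/sqrt(fan_in/2), _sample_latent_z
    # draws the generator's latent inputs uniformly from [-1, 1], and _sample_real_x
    # picks a random mini-batch of row indices from the training data.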
265 | def _xavier_init(self,size): 266 | in_dim = size[0] 267 | xavier_stddev = 1. / tf.sqrt(in_dim / 2.) 268 | 269 | return tf.random_normal(shape=size, stddev=xavier_stddev) 270 | 271 | def _sample_latent_z(self, nsamples, ndims): 272 | return np.random.uniform(-1., 1., size=[nsamples, ndims]) 273 | 274 | def _sample_real_x(self, data_size, batch_size): 275 | return np.random.permutation(data_size)[:batch_size] 276 | 277 | def _encode_data(self, data): 278 | n_samples = len(data) 279 | features_encoded = np.empty((n_samples, self.nfeatures)) 280 | cidx = 0 281 | 282 | for attr_name, cdict in self.metadata.items(): 283 | data_type = cdict['type'] 284 | col_data = data[attr_name].to_numpy() 285 | 286 | if data_type == FLOAT or data_type == INTEGER: 287 | # Normalise continuous data 288 | if self.infer_ranges: 289 | col_max = max(col_data) 290 | col_min = min(col_data) 291 | 292 | self.metadata[attr_name]['max'] = col_max 293 | self.metadata[attr_name]['min'] = col_min 294 | 295 | else: 296 | col_max = cdict['max'] 297 | col_min = cdict['min'] 298 | 299 | features_encoded[:, cidx] = np.true_divide(col_data - col_min, col_max + ZERO_TOL) 300 | 301 | cidx += 1 302 | 303 | elif data_type == CATEGORICAL or data_type == ORDINAL: 304 | # One-hot encoded categorical columns 305 | col_cats = cdict['categories'] 306 | col_data_onehot = self._one_hot(col_data, col_cats) 307 | features_encoded[:, cidx : cidx + len(col_cats)] = col_data_onehot 308 | 309 | cidx += len(col_cats) 310 | 311 | return features_encoded 312 | 313 | def _decode_data(self, features_encoded): 314 | """ Revers feature encoding. """ 315 | data = DataFrame(columns=self.attribute_list) 316 | 317 | cidx = 0 318 | 319 | for attr_name, cdict in self.metadata.items(): 320 | data_type = cdict['type'] 321 | 322 | if data_type == FLOAT: 323 | col_min = cdict['min'] 324 | col_max = cdict['max'] 325 | 326 | col_data = features_encoded[:, cidx] 327 | col_data = col_data * (col_max + ZERO_TOL) + col_min 328 | data[attr_name] = col_data.astype(float) 329 | cidx += 1 330 | 331 | elif data_type == INTEGER: 332 | col_min = cdict['min'] 333 | col_max = cdict['max'] 334 | 335 | col_data = features_encoded[:, cidx] 336 | col_data = col_data * (col_max + ZERO_TOL) + col_min 337 | data[attr_name] = col_data.astype(int) 338 | cidx += 1 339 | 340 | elif data_type == CATEGORICAL or data_type == ORDINAL: 341 | col_cats = cdict['categories'] 342 | ncats = len(col_cats) 343 | 344 | col_data_onehot = features_encoded[:, cidx : cidx + ncats] 345 | col_data = self._reverse_one_hot(col_data_onehot, col_cats) 346 | data[attr_name] = col_data.astype(str) 347 | 348 | cidx += ncats 349 | 350 | return data 351 | 352 | def _one_hot(self, col_data, categories): 353 | col_data_onehot = np.zeros((len(col_data), len(categories))) 354 | cidx = [categories.index(c) for c in col_data] 355 | col_data_onehot[np.arange(len(col_data)), cidx] = 1 356 | 357 | return col_data_onehot 358 | 359 | def _reverse_one_hot(self, col_encoded, categories): 360 | cat_idx = np.argmax(col_encoded, axis=1) 361 | col_data = np.array([categories[i] for i in cat_idx]) 362 | 363 | return col_data -------------------------------------------------------------------------------- /inference_cli.py: -------------------------------------------------------------------------------- 1 | """ 2 | Command-line interface for running privacy evaluation under an attribute inference adversary 3 | """ 4 | 5 | import json 6 | 7 | from os import mkdir, path 8 | from numpy.random import choice, seed 9 | from argparse 
import ArgumentParser 10 | 11 | from utils.datagen import load_s3_data_as_df, load_local_data_as_df 12 | from utils.utils import json_numpy_serialzer 13 | from utils.logging import LOGGER 14 | from utils.constants import * 15 | 16 | from generative_models.ctgan import CTGAN 17 | from generative_models.data_synthesiser import IndependentHistogram, BayesianNet, PrivBayes 18 | from generative_models.pate_gan import PATEGAN 19 | from sanitisation_techniques.sanitiser import SanitiserNHS 20 | from attack_models.reconstruction import LinRegAttack, RandForestAttack 21 | 22 | from warnings import simplefilter 23 | simplefilter('ignore', category=FutureWarning) 24 | simplefilter('ignore', category=DeprecationWarning) 25 | 26 | cwd = path.dirname(__file__) 27 | 28 | SEED = 42 29 | 30 | 31 | def main(): 32 | argparser = ArgumentParser() 33 | datasource = argparser.add_mutually_exclusive_group() 34 | datasource.add_argument('--s3name', '-S3', type=str, choices=['adult', 'census', 'credit', 'alarm', 'insurance'], help='Name of the dataset to run on') 35 | datasource.add_argument('--datapath', '-D', type=str, help='Relative path to cwd of a local data file') 36 | argparser.add_argument('--runconfig', '-RC', default='runconfig_mia.json', type=str, help='Path relative to cwd of runconfig file') 37 | argparser.add_argument('--outdir', '-O', default='tests', type=str, help='Path relative to cwd for storing output files') 38 | args = argparser.parse_args() 39 | 40 | # Load runconfig 41 | with open(path.join(cwd, args.runconfig)) as f: 42 | runconfig = json.load(f) 43 | print('Runconfig:') 44 | print(runconfig) 45 | 46 | # Load data 47 | if args.s3name is not None: 48 | rawPop, metadata = load_s3_data_as_df(args.s3name) 49 | dname = args.s3name 50 | else: 51 | rawPop, metadata = load_local_data_as_df(path.join(cwd, args.datapath)) 52 | dname = args.datapath.split('/')[-1] 53 | 54 | print(f'Loaded data {dname}:') 55 | print(rawPop.info()) 56 | 57 | # Make sure outdir exists 58 | if not path.isdir(args.outdir): 59 | mkdir(args.outdir) 60 | 61 | seed(SEED) 62 | 63 | ######################## 64 | #### GAME INPUTS ####### 65 | ######################## 66 | # Pick targets 67 | targetIDs = choice(list(rawPop.index), size=runconfig['nTargets'], replace=False).tolist() 68 | 69 | # If specified: Add specific target records 70 | if runconfig['Targets'] is not None: 71 | targetIDs.extend(runconfig['Targets']) 72 | 73 | targets = rawPop.loc[targetIDs, :] 74 | 75 | # Drop targets from population 76 | rawPopDropTargets = rawPop.drop(targetIDs) 77 | 78 | # List of candidate generative models to evaluate 79 | gmList = [] 80 | if 'generativeModels' in runconfig.keys(): 81 | for gm, paramsList in runconfig['generativeModels'].items(): 82 | if gm == 'IndependentHistogram': 83 | for params in paramsList: 84 | gmList.append(IndependentHistogram(metadata, *params)) 85 | elif gm == 'BayesianNet': 86 | for params in paramsList: 87 | gmList.append(BayesianNet(metadata, *params)) 88 | elif gm == 'PrivBayes': 89 | for params in paramsList: 90 | gmList.append(PrivBayes(metadata, *params)) 91 | elif gm == 'CTGAN': 92 | for params in paramsList: 93 | gmList.append(CTGAN(metadata, *params)) 94 | elif gm == 'PATEGAN': 95 | for params in paramsList: 96 | gmList.append(PATEGAN(metadata, *params)) 97 | else: 98 | raise ValueError(f'Unknown GM {gm}') 99 | 100 | # List of candidate sanitisation techniques to evaluate 101 | sanList = [] 102 | if 'sanitisationTechniques' in runconfig.keys(): 103 | for name, paramsList in 
runconfig['sanitisationTechniques'].items(): 104 | if name == 'SanitiserNHS': 105 | for params in paramsList: 106 | sanList.append(SanitiserNHS(metadata, *params)) 107 | else: 108 | raise ValueError(f'Unknown sanitisation technique {name}') 109 | 110 | ################################## 111 | ######### EVALUATION ############# 112 | ################################## 113 | resultsTargetPrivacy = {tid: {sa: {gm.__name__: {} for gm in gmList + sanList} for sa in runconfig['sensitiveAttributes']} for tid in targetIDs} 114 | # Add entry for raw 115 | for tid in targetIDs: 116 | for sa in runconfig['sensitiveAttributes']: 117 | resultsTargetPrivacy[tid][sa]['Raw'] = {} 118 | 119 | print('\n---- Start the game ----') 120 | for nr in range(runconfig['nIter']): 121 | print(f'\n--- Game iteration {nr + 1} ---') 122 | # Draw a raw dataset 123 | rIdx = choice(list(rawPopDropTargets.index), size=runconfig['sizeRawT'], replace=False).tolist() 124 | rawTout = rawPopDropTargets.loc[rIdx] 125 | 126 | ############### 127 | ## ATTACKS #### 128 | ############### 129 | attacks = {} 130 | for sa, atype in runconfig['sensitiveAttributes'].items(): 131 | if atype == 'LinReg': 132 | attacks[sa] = LinRegAttack(sensitiveAttribute=sa, metadata=metadata) 133 | elif atype == 'Classification': 134 | attacks[sa] = RandForestAttack(sensitiveAttribute=sa, metadata=metadata) 135 | 136 | #### Assess advantage raw 137 | for sa, Attack in attacks.items(): 138 | Attack.train(rawTout) 139 | 140 | for tid in targetIDs: 141 | target = targets.loc[[tid]] 142 | targetAux = target.loc[[tid], Attack.knownAttributes] 143 | targetSecret = target.loc[tid, Attack.sensitiveAttribute] 144 | 145 | guess = Attack.attack(targetAux, attemptLinkage=True, data=rawTout) 146 | pCorrect = Attack.get_likelihood(targetAux, targetSecret, attemptLinkage=True, data=rawTout) 147 | 148 | resultsTargetPrivacy[tid][sa]['Raw'][nr] = { 149 | 'AttackerGuess': [guess], 150 | 'ProbCorrect': [pCorrect], 151 | 'TargetPresence': [LABEL_OUT] 152 | } 153 | 154 | for tid in targetIDs: 155 | target = targets.loc[[tid]] 156 | rawTin = rawTout.append(target) 157 | 158 | for sa, Attack in attacks.items(): 159 | targetAux = target.loc[[tid], Attack.knownAttributes] 160 | targetSecret = target.loc[tid, Attack.sensitiveAttribute] 161 | 162 | guess = Attack.attack(targetAux, attemptLinkage=True, data=rawTin) 163 | pCorrect = Attack.get_likelihood(targetAux, targetSecret, attemptLinkage=True, data=rawTin) 164 | 165 | resultsTargetPrivacy[tid][sa]['Raw'][nr]['AttackerGuess'].append(guess) 166 | resultsTargetPrivacy[tid][sa]['Raw'][nr]['ProbCorrect'].append(pCorrect) 167 | resultsTargetPrivacy[tid][sa]['Raw'][nr]['TargetPresence'].append(LABEL_IN) 168 | 169 | ##### Assess advantage Syn 170 | for GenModel in gmList: 171 | LOGGER.info(f'Start: Evaluation for model {GenModel.__name__}...') 172 | GenModel.fit(rawTout) 173 | synTwithoutTarget = [GenModel.generate_samples(runconfig['sizeSynT']) for _ in range(runconfig['nSynT'])] 174 | 175 | for sa, Attack in attacks.items(): 176 | for tid in targetIDs: 177 | resultsTargetPrivacy[tid][sa][GenModel.__name__][nr] = { 178 | 'AttackerGuess': [], 179 | 'ProbCorrect': [], 180 | 'TargetPresence': [LABEL_OUT for _ in range(runconfig['nSynT'])] 181 | } 182 | 183 | for syn in synTwithoutTarget: 184 | Attack.train(syn) 185 | 186 | for tid in targetIDs: 187 | target = targets.loc[[tid]] 188 | targetAux = target.loc[[tid], Attack.knownAttributes] 189 | targetSecret = target.loc[tid, Attack.sensitiveAttribute] 190 | 191 | guess = 
Attack.attack(targetAux) 192 | pCorrect = Attack.get_likelihood(targetAux, targetSecret) 193 | 194 | resultsTargetPrivacy[tid][sa][GenModel.__name__][nr]['AttackerGuess'].append(guess) 195 | resultsTargetPrivacy[tid][sa][GenModel.__name__][nr]['ProbCorrect'].append(pCorrect) 196 | 197 | del synTwithoutTarget 198 | 199 | for tid in targetIDs: 200 | LOGGER.info(f'Target: {tid}') 201 | target = targets.loc[[tid]] 202 | rawTin = rawTout.append(target) 203 | 204 | GenModel.fit(rawTin) 205 | synTwithTarget = [GenModel.generate_samples(runconfig['sizeSynT']) for _ in range(runconfig['nSynT'])] 206 | 207 | for sa, Attack in attacks.items(): 208 | targetAux = target.loc[[tid], Attack.knownAttributes] 209 | targetSecret = target.loc[tid, Attack.sensitiveAttribute] 210 | 211 | for syn in synTwithTarget: 212 | Attack.train(syn) 213 | 214 | guess = Attack.attack(targetAux) 215 | pCorrect = Attack.get_likelihood(targetAux, targetSecret) 216 | 217 | resultsTargetPrivacy[tid][sa][GenModel.__name__][nr]['AttackerGuess'].append(guess) 218 | resultsTargetPrivacy[tid][sa][GenModel.__name__][nr]['ProbCorrect'].append(pCorrect) 219 | resultsTargetPrivacy[tid][sa][GenModel.__name__][nr]['TargetPresence'].append(LABEL_IN) 220 | del synTwithTarget 221 | 222 | for San in sanList: 223 | LOGGER.info(f'Start: Evaluation for sanitiser {San.__name__}...') 224 | attacks = {} 225 | for sa, atype in runconfig['sensitiveAttributes'].items(): 226 | if atype == 'LinReg': 227 | attacks[sa] = LinRegAttack(sensitiveAttribute=sa, metadata=metadata, quids=San.quids) 228 | elif atype == 'Classification': 229 | attacks[sa] = RandForestAttack(sensitiveAttribute=sa, metadata=metadata, quids=San.quids) 230 | 231 | sanOut = San.sanitise(rawTout) 232 | 233 | for sa, Attack in attacks.items(): 234 | Attack.train(sanOut) 235 | 236 | for tid in targetIDs: 237 | target = targets.loc[[tid]] 238 | targetAux = target.loc[[tid], Attack.knownAttributes] 239 | targetSecret = target.loc[tid, Attack.sensitiveAttribute] 240 | 241 | guess = Attack.attack(targetAux, attemptLinkage=True, data=sanOut) 242 | pCorrect = Attack.get_likelihood(targetAux, targetSecret, attemptLinkage=True, data=sanOut) 243 | 244 | resultsTargetPrivacy[tid][sa][San.__name__][nr] = { 245 | 'AttackerGuess': [guess], 246 | 'ProbCorrect': [pCorrect], 247 | 'TargetPresence': [LABEL_OUT] 248 | } 249 | 250 | for tid in targetIDs: 251 | LOGGER.info(f'Target: {tid}') 252 | target = targets.loc[[tid]] 253 | rawTin = rawTout.append(target) 254 | sanIn = San.sanitise(rawTin) 255 | 256 | for sa, Attack in attacks.items(): 257 | targetAux = target.loc[[tid], Attack.knownAttributes] 258 | targetSecret = target.loc[tid, Attack.sensitiveAttribute] 259 | 260 | 261 | Attack.train(sanIn) 262 | 263 | guess = Attack.attack(targetAux, attemptLinkage=True, data=sanIn) 264 | pCorrect = Attack.get_likelihood(targetAux, targetSecret, attemptLinkage=True, data=sanIn) 265 | 266 | resultsTargetPrivacy[tid][sa][San.__name__][nr]['AttackerGuess'].append(guess) 267 | resultsTargetPrivacy[tid][sa][San.__name__][nr]['ProbCorrect'].append(pCorrect) 268 | resultsTargetPrivacy[tid][sa][San.__name__][nr]['TargetPresence'].append(LABEL_IN) 269 | 270 | outfile = f"ResultsMLEAI_{dname}" 271 | LOGGER.info(f"Write results to {path.join(f'{args.outdir}', f'{outfile}')}") 272 | 273 | with open(path.join(f'{args.outdir}', f'{outfile}.json'), 'w') as f: 274 | json.dump(resultsTargetPrivacy, f, indent=2, default=json_numpy_serialzer) 275 | 276 | if __name__ == "__main__": 277 | main() 
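The results file written by this CLI maps each target ID to, per sensitive attribute and per generative model or sanitiser (plus the 'Raw' baseline), the attacker's guesses, correctness probabilities, and the target's presence label for every game iteration. Below is a minimal, hedged sketch of how such a ResultsMLEAI_*.json file could be reduced to a per-target advantage score; it is not the project's own analysis code (see utils/analyse_results.py and the notebook for that), the helper name summarise_inference_results and the file paths are hypothetical, and the exact privacy-gain definition used in the paper's analysis may differ.

import json
from statistics import mean

from utils.constants import LABEL_IN


def summarise_inference_results(results_path):
    """Reduce a ResultsMLEAI_*.json file to per-target, per-attribute advantage scores.

    The advantage is taken here as the gap in mean correctness probability between
    game iterations in which the target was part of the training data (LABEL_IN)
    and those in which it was not. This is a simplification, not the analysis
    implemented in utils/analyse_results.py.
    """
    with open(results_path) as f:
        results = json.load(f)

    summary = {}
    for tid, per_attribute in results.items():
        summary[tid] = {}
        for sa, per_model in per_attribute.items():
            summary[tid][sa] = {}
            for model, per_iteration in per_model.items():
                p_in, p_out = [], []
                for res in per_iteration.values():
                    # Assumes the presence labels survive the JSON round-trip unchanged.
                    for p, label in zip(res['ProbCorrect'], res['TargetPresence']):
                        (p_in if label == LABEL_IN else p_out).append(p)
                summary[tid][sa][model] = mean(p_in) - mean(p_out)
    return summary


# Hypothetical usage: privacy gain of each model relative to publishing raw data.
# advantage = summarise_inference_results('tests/ResultsMLEAI_texas.json')
# privacy_gain = {model: advantage['ID26241']['RACE']['Raw'] - adv
#                 for model, adv in advantage['ID26241']['RACE'].items()}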
-------------------------------------------------------------------------------- /linkage_cli.py: -------------------------------------------------------------------------------- 1 | """ 2 | Command-line interface for running privacy evaluation with respect to the risk of linkability 3 | """ 4 | 5 | import json 6 | 7 | from os import mkdir, path 8 | from numpy.random import choice, seed 9 | from argparse import ArgumentParser 10 | from pandas import DataFrame 11 | 12 | from utils.datagen import load_s3_data_as_df, load_local_data_as_df 13 | from utils.utils import json_numpy_serialzer 14 | from utils.logging import LOGGER 15 | from utils.constants import * 16 | 17 | from feature_sets.independent_histograms import HistogramFeatureSet 18 | from feature_sets.model_agnostic import NaiveFeatureSet, EnsembleFeatureSet 19 | from feature_sets.bayes import CorrelationsFeatureSet 20 | 21 | from sanitisation_techniques.sanitiser import SanitiserNHS 22 | 23 | from generative_models.ctgan import CTGAN 24 | from generative_models.pate_gan import PATEGAN 25 | from generative_models.data_synthesiser import (IndependentHistogram, 26 | BayesianNet, 27 | PrivBayes) 28 | 29 | from attack_models.mia_classifier import (MIAttackClassifierRandomForest, 30 | generate_mia_shadow_data, 31 | generate_mia_anon_data) 32 | 33 | from warnings import simplefilter 34 | simplefilter('ignore', category=FutureWarning) 35 | simplefilter('ignore', category=DeprecationWarning) 36 | 37 | cwd = path.dirname(__file__) 38 | 39 | 40 | SEED = 42 41 | 42 | 43 | def main(): 44 | argparser = ArgumentParser() 45 | datasource = argparser.add_mutually_exclusive_group() 46 | datasource.add_argument('--s3name', '-S3', type=str, choices=['adult', 'census', 'credit', 'alarm', 'insurance'], help='Name of the dataset to run on') 47 | datasource.add_argument('--datapath', '-D', type=str, help='Relative path to cwd of a local data file') 48 | argparser.add_argument('--runconfig', '-RC', default='runconfig_mia.json', type=str, help='Path relative to cwd of runconfig file') 49 | argparser.add_argument('--outdir', '-O', default='tests', type=str, help='Path relative to cwd for storing output files') 50 | args = argparser.parse_args() 51 | 52 | # Load runconfig 53 | with open(path.join(cwd, args.runconfig)) as f: 54 | runconfig = json.load(f) 55 | print('Runconfig:') 56 | print(runconfig) 57 | 58 | # Load data 59 | if args.s3name is not None: 60 | rawPop, metadata = load_s3_data_as_df(args.s3name) 61 | dname = args.s3name 62 | else: 63 | rawPop, metadata = load_local_data_as_df(path.join(cwd, args.datapath)) 64 | dname = args.datapath.split('/')[-1] 65 | 66 | print(f'Loaded data {dname}:') 67 | print(rawPop.info()) 68 | 69 | # Make sure outdir exists 70 | if not path.isdir(args.outdir): 71 | mkdir(args.outdir) 72 | 73 | seed(SEED) 74 | 75 | ######################## 76 | #### GAME INPUTS ####### 77 | ######################## 78 | # Pick targets 79 | targetIDs = choice(list(rawPop.index), size=runconfig['nTargets'], replace=False).tolist() 80 | 81 | # If specified: Add specific target records 82 | if runconfig['Targets'] is not None: 83 | targetIDs.extend(runconfig['Targets']) 84 | 85 | targets = rawPop.loc[targetIDs, :] 86 | 87 | # Drop targets from population 88 | rawPopDropTargets = rawPop.drop(targetIDs) 89 | 90 | # Init adversary's prior knowledge 91 | rawAidx = choice(list(rawPopDropTargets.index), size=runconfig['sizeRawA'], replace=False).tolist() 92 | rawA = rawPop.loc[rawAidx, :] 93 | 94 | # List of candidate generative models to evaluate 95 | 
gmList = [] 96 | if 'generativeModels' in runconfig.keys(): 97 | for gm, paramsList in runconfig['generativeModels'].items(): 98 | if gm == 'IndependentHistogram': 99 | for params in paramsList: 100 | gmList.append(IndependentHistogram(metadata, *params)) 101 | elif gm == 'BayesianNet': 102 | for params in paramsList: 103 | gmList.append(BayesianNet(metadata, *params)) 104 | elif gm == 'PrivBayes': 105 | for params in paramsList: 106 | gmList.append(PrivBayes(metadata, *params)) 107 | elif gm == 'CTGAN': 108 | for params in paramsList: 109 | gmList.append(CTGAN(metadata, *params)) 110 | elif gm == 'PATEGAN': 111 | for params in paramsList: 112 | gmList.append(PATEGAN(metadata, *params)) 113 | else: 114 | raise ValueError(f'Unknown GM {gm}') 115 | 116 | # List of candidate sanitisation techniques to evaluate 117 | sanList = [] 118 | if 'sanitisationTechniques' in runconfig.keys(): 119 | for name, paramsList in runconfig['sanitisationTechniques'].items(): 120 | if name == 'SanitiserNHS': 121 | for params in paramsList: 122 | sanList.append(SanitiserNHS(metadata, *params)) 123 | else: 124 | raise ValueError(f'Unknown sanitisation technique {name}') 125 | 126 | ################################### 127 | #### ATTACK TRAINING ############# 128 | ################################## 129 | print('\n---- Attack training ----') 130 | attacks = {} 131 | 132 | for tid in targetIDs: 133 | print(f'\n--- Adversary picks target {tid} ---') 134 | target = targets.loc[[tid]] 135 | attacks[tid] = {} 136 | 137 | for San in sanList: 138 | LOGGER.info(f'Start: Attack training for {San.__name__}...') 139 | 140 | attacks[tid][San.__name__] = {} 141 | 142 | # Generate example datasets for training attack classifier 143 | sanA, labelsA = generate_mia_anon_data(San, target, rawA, runconfig['sizeRawT'], runconfig['nShadows'] * runconfig['nSynA']) 144 | 145 | # Train attack on shadow data 146 | for Feature in [NaiveFeatureSet(DataFrame), 147 | HistogramFeatureSet(DataFrame, metadata, nbins=San.histogram_size, quids=San.quids), 148 | CorrelationsFeatureSet(DataFrame, metadata, quids=San.quids), 149 | EnsembleFeatureSet(DataFrame, metadata, nbins=San.histogram_size, quasi_id_cols=San.quids)]: 150 | 151 | Attack = MIAttackClassifierRandomForest(metadata=metadata, FeatureSet=Feature, quids=San.quids) 152 | Attack.train(sanA, labelsA) 153 | attacks[tid][San.__name__][f'{Feature.__name__}'] = Attack 154 | 155 | # Clean up 156 | del sanA, labelsA 157 | 158 | LOGGER.info(f'Finished: Attack training.') 159 | 160 | for GenModel in gmList: 161 | LOGGER.info(f'Start: Attack training for {GenModel.__name__}...') 162 | 163 | attacks[tid][GenModel.__name__] = {} 164 | 165 | # Generate shadow model data for training attacks on this target 166 | synA, labelsSA = generate_mia_shadow_data(GenModel, target, rawA, runconfig['sizeRawT'], runconfig['sizeSynT'], runconfig['nShadows'], runconfig['nSynA']) 167 | 168 | # Train attack on shadow data 169 | for Feature in [NaiveFeatureSet(GenModel.datatype), HistogramFeatureSet(GenModel.datatype, metadata), CorrelationsFeatureSet(GenModel.datatype, metadata)]: 170 | Attack = MIAttackClassifierRandomForest(metadata, Feature) 171 | Attack.train(synA, labelsSA) 172 | attacks[tid][GenModel.__name__][f'{Feature.__name__}'] = Attack 173 | 174 | # Clean up 175 | del synA, labelsSA 176 | 177 | LOGGER.info(f'Finished: Attack training.') 178 | 179 | ################################## 180 | ######### EVALUATION ############# 181 | ################################## 182 | resultsTargetPrivacy = {tid: 
{gm.__name__: {} for gm in gmList + sanList} for tid in targetIDs} 183 | 184 | print('\n---- Start the game ----') 185 | for nr in range(runconfig['nIter']): 186 | print(f'\n--- Game iteration {nr + 1} ---') 187 | # Draw a raw dataset 188 | rIdx = choice(list(rawPopDropTargets.index), size=runconfig['sizeRawT'], replace=False).tolist() 189 | rawTout = rawPopDropTargets.loc[rIdx] 190 | 191 | for GenModel in gmList: 192 | LOGGER.info(f'Start: Evaluation for model {GenModel.__name__}...') 193 | # Train a generative model 194 | GenModel.fit(rawTout) 195 | synTwithoutTarget = [GenModel.generate_samples(runconfig['sizeSynT']) for _ in range(runconfig['nSynT'])] 196 | synLabelsOut = [LABEL_OUT for _ in range(runconfig['nSynT'])] 197 | 198 | for tid in targetIDs: 199 | LOGGER.info(f'Target: {tid}') 200 | target = targets.loc[[tid]] 201 | resultsTargetPrivacy[tid][f'{GenModel.__name__}'][nr] = {} 202 | 203 | rawTin = rawTout.append(target) 204 | GenModel.fit(rawTin) 205 | synTwithTarget = [GenModel.generate_samples(runconfig['sizeSynT']) for _ in range(runconfig['nSynT'])] 206 | synLabelsIn = [LABEL_IN for _ in range(runconfig['nSynT'])] 207 | 208 | synT = synTwithoutTarget + synTwithTarget 209 | synTlabels = synLabelsOut + synLabelsIn 210 | 211 | # Run attacks 212 | for feature, Attack in attacks[tid][f'{GenModel.__name__}'].items(): 213 | # Produce a guess for each synthetic dataset 214 | attackerGuesses = Attack.attack(synT) 215 | 216 | resDict = { 217 | 'Secret': synTlabels, 218 | 'AttackerGuess': attackerGuesses 219 | } 220 | resultsTargetPrivacy[tid][f'{GenModel.__name__}'][nr][feature] = resDict 221 | 222 | del synT, synTwithoutTarget, synTwithTarget 223 | 224 | LOGGER.info(f'Finished: Evaluation for model {GenModel.__name__}.') 225 | 226 | for San in sanList: 227 | LOGGER.info(f'Start: Evaluation for sanitiser {San.__name__}...') 228 | sanOut = San.sanitise(rawTout) 229 | 230 | for tid in targetIDs: 231 | LOGGER.info(f'Target: {tid}') 232 | target = targets.loc[[tid]] 233 | resultsTargetPrivacy[tid][San.__name__][nr] = {} 234 | 235 | rawTin = rawTout.append(target) 236 | sanIn = San.sanitise(rawTin) 237 | 238 | sanT = [sanOut, sanIn] 239 | sanTLabels = [LABEL_OUT, LABEL_IN] 240 | 241 | # Run attacks 242 | for feature, Attack in attacks[tid][San.__name__].items(): 243 | # Produce a guess for each synthetic dataset 244 | attackerGuesses = Attack.attack(sanT, attemptLinkage=True, target=target) 245 | 246 | resDict = { 247 | 'Secret': sanTLabels, 248 | 'AttackerGuess': attackerGuesses 249 | } 250 | resultsTargetPrivacy[tid][San.__name__][nr][feature] = resDict 251 | 252 | del sanT, sanOut, sanIn 253 | 254 | LOGGER.info(f'Finished: Evaluation for model {San.__name__}.') 255 | 256 | outfile = f"ResultsMIA_{dname}" 257 | LOGGER.info(f"Write results to {path.join(f'{args.outdir}', f'{outfile}')}") 258 | 259 | with open(path.join(f'{args.outdir}', f'{outfile}.json'), 'w') as f: 260 | json.dump(resultsTargetPrivacy, f, indent=2, default=json_numpy_serialzer) 261 | 262 | 263 | if __name__ == "__main__": 264 | main() -------------------------------------------------------------------------------- /notebooks/Analyse Results.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "%load_ext autoreload\n", 10 | "%autoreload 2\n", 11 | "%matplotlib inline\n", 12 | "\n", 13 | "from warnings import filterwarnings\n", 14 | 
"filterwarnings('ignore')" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "import sys\n", 24 | "sys.path.append('../')\n", 25 | "\n", 26 | "from utils.analyse_results import *" 27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "metadata": {}, 32 | "source": [ 33 | "# Linkage" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "dirname = '../tests/linkage/'\n", 43 | "linkage_gain = load_results_linkage(dirname)" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "metadata": {}, 50 | "outputs": [], 51 | "source": [ 52 | "models = ['SanitiserNHSk10', 'BayesianNet', 'PrivBayesEps1.0']\n", 53 | "fig = plt_per_target_pg(linkage_gain, models, resFilter=('FeatureSet', 'Naive'))" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": null, 59 | "metadata": {}, 60 | "outputs": [], 61 | "source": [ 62 | "models = ['SanitiserNHSk10', 'BayesianNet', 'PrivBayesEps1.0']\n", 63 | "fig = plt_per_target_pg(linkage_gain, models, resFilter=('FeatureSet', 'Correlations'))" 64 | ] 65 | }, 66 | { 67 | "cell_type": "markdown", 68 | "metadata": {}, 69 | "source": [ 70 | "# Inference" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": null, 76 | "metadata": {}, 77 | "outputs": [], 78 | "source": [ 79 | "dirname = '../tests/inference/'\n", 80 | "dpath = '../data/texas'\n", 81 | "inference_gain = load_results_inference(dirname, dpath)" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "models = ['SanitiserNHSk10', 'BayesianNet', 'PrivBayesEps1.0']\n", 91 | "fig = plt_per_target_pg(inference_gain, models, resFilter=('SensitiveAttribute', 'RACE'))" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": null, 97 | "metadata": {}, 98 | "outputs": [], 99 | "source": [ 100 | "fig = plt_per_target_pg(inference_gain, models, resFilter=('SensitiveAttribute', 'LengthOfStay'))" 101 | ] 102 | }, 103 | { 104 | "cell_type": "markdown", 105 | "metadata": {}, 106 | "source": [ 107 | "# Aggregate Utility" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": null, 113 | "metadata": {}, 114 | "outputs": [], 115 | "source": [ 116 | "dirname = '../tests/utility/'\n", 117 | "utility_record, utility_agg = load_results_utility(dirname)" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": null, 123 | "metadata": {}, 124 | "outputs": [], 125 | "source": [ 126 | "labelVar = 'RiskMortality'\n", 127 | "models = ['Raw','SanitiserNHSk10', 'BayesianNet', 'PrivBayesEps1.0']\n", 128 | "fig = plt_avg_accuracy(utility_agg, models)" 129 | ] 130 | } 131 | ], 132 | "metadata": { 133 | "kernelspec": { 134 | "display_name": "venv_syn", 135 | "language": "python", 136 | "name": "venv_syn" 137 | }, 138 | "language_info": { 139 | "codemirror_mode": { 140 | "name": "ipython", 141 | "version": 3 142 | }, 143 | "file_extension": ".py", 144 | "mimetype": "text/x-python", 145 | "name": "python", 146 | "nbconvert_exporter": "python", 147 | "pygments_lexer": "ipython3", 148 | "version": "3.6.8" 149 | } 150 | }, 151 | "nbformat": 4, 152 | "nbformat_minor": 4 153 | } 154 | -------------------------------------------------------------------------------- /predictive_models/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/spring-epfl/synthetic_data_release/eba9a43c75a64110b63d79e58c83859634b7339c/predictive_models/__init__.py -------------------------------------------------------------------------------- /predictive_models/predictive_model.py: -------------------------------------------------------------------------------- 1 | """ Some predictive models to represent a simple analysis task. """ 2 | from sklearn.impute import SimpleImputer 3 | from sklearn.linear_model import LinearRegression 4 | from sklearn.linear_model import LogisticRegression 5 | from sklearn.ensemble import RandomForestClassifier 6 | from pandas import DataFrame 7 | from numpy import empty, true_divide, zeros, arange 8 | 9 | from utils.logging import LOGGER 10 | from utils.constants import * 11 | 12 | 13 | class PredictiveModel(object): 14 | """ A predictive model. """ 15 | def __init__(self, metadata, labelCol): 16 | """ 17 | :param metadata: dict: Metadata description 18 | :param labelCol: str: Name of the target variable 19 | """ 20 | self.metadata = metadata 21 | self.labelCol = labelCol 22 | self.nfeatures = self._get_num_features() 23 | 24 | self.ImputerCat = SimpleImputer(strategy='most_frequent') 25 | self.ImputerNum = SimpleImputer(strategy='median') 26 | 27 | self.datatype = DataFrame 28 | self.trained = False 29 | 30 | def train(self, data): 31 | return NotImplementedError("Method needs to be overwritten by a subclass") 32 | 33 | def predict(self, features): 34 | return NotImplementedError("Method needs to be overwritten by a subclass") 35 | 36 | def evalute(self, data): 37 | return NotImplementedError("Method needs to be overwritten by a subclass") 38 | 39 | def _encode_data(self, data): 40 | n_samples = len(data) 41 | features_encoded = empty((n_samples, self.nfeatures)) 42 | cidx = 0 43 | 44 | for cdict in self.metadata['columns']: 45 | data_type = cdict['type'] 46 | attr_name = cdict['name'] 47 | if attr_name != self.labelCol: 48 | col_data = data[attr_name].to_numpy() 49 | 50 | if data_type == FLOAT or data_type == INTEGER: 51 | col_max = cdict['max'] 52 | col_min = cdict['min'] 53 | features_encoded[:, cidx] = true_divide(col_data - col_min, col_max + ZERO_TOL) 54 | cidx += 1 55 | 56 | elif data_type == CATEGORICAL or data_type == ORDINAL: 57 | # One-hot encoded categorical columns 58 | col_cats = cdict['i2s'] 59 | col_data_onehot = self._one_hot(col_data, col_cats) 60 | features_encoded[:, cidx : cidx + len(col_cats)] = col_data_onehot 61 | cidx += len(col_cats) 62 | 63 | return features_encoded 64 | 65 | def _get_num_features(self): 66 | nfeatures = 0 67 | 68 | for cdict in self.metadata['columns']: 69 | data_type = cdict['type'] 70 | attr_name = cdict['name'] 71 | 72 | if attr_name != self.labelCol: 73 | if data_type == FLOAT or data_type == INTEGER: 74 | nfeatures += 1 75 | 76 | elif data_type == CATEGORICAL or data_type == ORDINAL: 77 | nfeatures += len(cdict['i2s']) 78 | 79 | else: 80 | raise ValueError(f'Unkown data type {data_type} for attribute {attr_name}') 81 | 82 | return nfeatures 83 | 84 | def _get_feature_names(self): 85 | featureNames = [] 86 | 87 | for i, cdict in enumerate(self.metadata['columns']): 88 | data_type = cdict['type'] 89 | attr_name = cdict['name'] 90 | 91 | if attr_name != self.labelCol: 92 | if data_type == FLOAT or data_type == INTEGER: 93 | featureNames.append(attr_name) 94 | 95 | elif data_type == CATEGORICAL or data_type == ORDINAL: 96 | col_cats = cdict['i2s'] 97 | 
featureNames.extend([f'{attr_name}_{c}' for c in col_cats]) 98 | 99 | return featureNames 100 | 101 | def _impute_missing_values(self, df): 102 | dfImpute = df.copy() 103 | 104 | catCols = [] 105 | numCols = [] 106 | 107 | for col in self.metadata['columns']: 108 | if col['name'] in list(dfImpute): 109 | if col['type'] in [CATEGORICAL, ORDINAL]: 110 | catCols.append(col['name']) 111 | elif col['type'] in NUMERICAL: 112 | numCols.append(col['name']) 113 | 114 | self.ImputerCat.fit(df[catCols]) 115 | dfImpute[catCols] = self.ImputerCat.transform(df[catCols]) 116 | 117 | self.ImputerNum.fit(df[numCols]) 118 | dfImpute[numCols] = self.ImputerNum.transform(df[numCols]) 119 | 120 | return dfImpute 121 | 122 | def _one_hot(self, col_data, categories): 123 | col_data_onehot = zeros((len(col_data), len(categories))) 124 | cidx = [categories.index(c) for c in col_data] 125 | col_data_onehot[arange(len(col_data)), cidx] = 1 126 | 127 | return col_data_onehot 128 | 129 | 130 | class ClassificationTask(PredictiveModel): 131 | """ A binary or multiclass classification model. """ 132 | 133 | def __init__(self, Distinguisher, metadata, labelCol): 134 | """ 135 | :param Distinguisher: sklearn.Classifier: A classification model 136 | :param metadata: dict: Metadata description 137 | :param labelCol: str: Name of the target variable 138 | """ 139 | super().__init__(metadata, labelCol) 140 | self.Distinguisher = Distinguisher 141 | 142 | labels = self._get_labels() 143 | self.labels = {l:i for i, l in enumerate(labels)} 144 | self.labelsInv = {i:l for l, i in self.labels.items()} 145 | 146 | self.__name__ = f'{self.Distinguisher.__class__.__name__}{self.labelCol}' 147 | 148 | def train(self, data): 149 | if not isinstance(data, self.datatype): 150 | raise ValueError(f"Model expects input as {self.datatype} but got {type(data)}") 151 | 152 | data = self._impute_missing_values(data) 153 | features = self._encode_data(data.drop(self.labelCol, axis=1)) 154 | labels = data[self.labelCol].apply(lambda x: self.labels[x]).values 155 | 156 | self.Distinguisher.fit(features, labels) 157 | 158 | LOGGER.debug('Finished training MIA distinguisher') 159 | self.trained = True 160 | 161 | def predict(self, data): 162 | if not isinstance(data, self.datatype): 163 | raise ValueError(f"Model expects input as {self.datatype} but got {type(data)}") 164 | 165 | features = self._encode_data(data.drop(self.labelCol, axis=1)) 166 | labels = self.Distinguisher.predict(features) 167 | 168 | return [self.labelsInv[i] for i in labels] 169 | 170 | def evaluate(self, data): 171 | if not isinstance(data, self.datatype): 172 | raise ValueError(f"Model expects input as {self.datatype} but got {type(data)}") 173 | 174 | features = self._encode_data(data.drop(self.labelCol, axis=1)) 175 | labelsTrue = data[self.labelCol].apply(lambda x: self.labels[x]).values 176 | labelsPred = self.Distinguisher.predict(features) 177 | 178 | return [int(l == p) for l, p in zip(labelsTrue, labelsPred)] 179 | 180 | def _get_accuracy(self, trueLabels, predLabels): 181 | return sum([g == l for g, l in zip(trueLabels, predLabels)])/len(trueLabels) 182 | 183 | def _get_labels(self): 184 | for cdict in self.metadata['columns']: 185 | if cdict['name'] == self.labelCol: 186 | if not cdict['type'] in [CATEGORICAL, ORDINAL]: 187 | raise ValueError('Label column must be discrete data type.') 188 | 189 | return cdict['i2s'] 190 | 191 | 192 | class RandForestClassTask(ClassificationTask): 193 | def __init__(self, metadata, labelCol): 194 | 
super().__init__(RandomForestClassifier(), metadata, labelCol) 195 | 196 | 197 | class LogRegClassTask(ClassificationTask): 198 | def __init__(self, metadata, labelCol): 199 | super().__init__(LogisticRegression(), metadata, labelCol) 200 | 201 | 202 | class RegressionTask(PredictiveModel): 203 | """ A binary or multiclass classification model. """ 204 | 205 | def __init__(self, Regressor, metadata, labelCol): 206 | """ 207 | 208 | :param Regressor: sklearn.Regressor: A regression model 209 | :param metadata: dict: Metadata description 210 | :param labels: list: Label names 211 | :param FeatureSet: object: Feature extraction object 212 | """ 213 | super().__init__(metadata, labelCol) 214 | self.Regressor = Regressor 215 | 216 | self.__name__ = f'{self.Regressor.__class__.__name__}{self.labelCol}' 217 | 218 | def train(self, data): 219 | if not isinstance(data, self.datatype): 220 | raise ValueError(f"Model expects input as {self.datatype} but got {type(data)}") 221 | 222 | data = self._impute_missing_values(data) 223 | features = self._encode_data(data.drop(self.labelCol, axis=1)) 224 | labels = data[self.labelCol].values 225 | 226 | self.Regressor.fit(features, labels) 227 | 228 | LOGGER.debug('Finished training regression model') 229 | self.trained = True 230 | 231 | def predict(self, features): 232 | if not isinstance(features, self.datatype): 233 | raise ValueError(f"Model expects input as {self.datatype} but got {type(features)}") 234 | 235 | features = self._encode_data(features) 236 | labels = self.Regressor.predict(features) 237 | 238 | return list(labels) 239 | 240 | def evaluate(self, data): 241 | if not isinstance(data, self.datatype): 242 | raise ValueError(f"Model expects input as {self.datatype} but got {type(data)}") 243 | 244 | features = self._encode_data(data.drop(self.labelCol, axis=1)) 245 | labelsTrue = data[self.labelCol].values 246 | labelsPred = self.Regressor.predict(features) 247 | 248 | return [true - pred for true, pred in zip(labelsTrue, labelsPred)] 249 | 250 | 251 | class LinRegTask(RegressionTask): 252 | def __init__(self, metadata, labelCol): 253 | super().__init__(LinearRegression(), metadata, labelCol) 254 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | husl==4.0.3 2 | loguru==0.5.3 3 | matplotlib==3.4.3 4 | palettable==3.3.0 5 | pandas==0.25.3 6 | scipy==1.7.1 7 | seaborn==0.11.2 8 | sklearn==0.0 9 | tensorflow==2.6.0 10 | torch==1.9.1 11 | -------------------------------------------------------------------------------- /sanitisation_techniques/sanitiser.py: -------------------------------------------------------------------------------- 1 | """ Parent class for sanitisers """ 2 | from pandas import DataFrame, cut 3 | from sklearn.impute import SimpleImputer 4 | from pandas.api.types import is_numeric_dtype 5 | 6 | from utils.constants import * 7 | 8 | class Sanitiser(object): 9 | 10 | def sanitise(self, data): 11 | """ Apply a privacy policy to the data. """ 12 | return NotImplementedError('Method needs to be overwritten by a subclass') 13 | 14 | 15 | class SanitiserNHS(Sanitiser): 16 | """ A sanitisation mechanism that follows the strategy described by NHS England. 
""" 17 | def __init__(self, metadata, 18 | nbins=10, thresh_rare=0, 19 | max_quantile = 1, anonymity_set_size=1, 20 | drop_cols=None, quids=None): 21 | 22 | self.metadata = self._read_meta(metadata, drop_cols, quids) 23 | self.datatype = DataFrame 24 | 25 | self.histogram_size = nbins 26 | self.unique_threshold = thresh_rare 27 | self.quids = quids 28 | self.max_quantile = max_quantile 29 | self.anonymity_set_size = anonymity_set_size 30 | 31 | self.ImputerCat = SimpleImputer(strategy='most_frequent') 32 | self.ImputerNum = SimpleImputer(strategy='median') 33 | 34 | self.trained = False 35 | 36 | self.__name__ = f'SanitiserNHSk{self.anonymity_set_size}' 37 | 38 | def sanitise(self, data): 39 | """ 40 | Sanitise a sensitive dataset 41 | 42 | :param data: DataFrame: Sensitive raw dataset 43 | :return: san_data: DataFrame: Sanitised dataset 44 | """ 45 | san_data = DataFrame(index=data.index) 46 | data = self._impute_missing_values(data) 47 | drop_records = [] 48 | 49 | for col, cdict in self.metadata.items(): 50 | coltype = cdict['type'] 51 | col_data = data[col].copy() 52 | 53 | if coltype == FLOAT or coltype == INTEGER: 54 | col_data = col_data.astype(int) 55 | 56 | # Cap numerical attributes 57 | cap = col_data.quantile(self.max_quantile) 58 | idx = col_data[col_data > cap].index 59 | col_data.loc[idx] = int(cap) 60 | 61 | elif coltype == CATEGORICAL or coltype == ORDINAL: 62 | if is_numeric_dtype(col_data): 63 | # Bins numerical cols marked as quid into specified bins 64 | col_data = cut(col_data, bins=cdict['bins'], labels=cdict['categories']) 65 | col_data = col_data.astype(str) 66 | 67 | # Remove any records with rare categories 68 | frequencies = col_data.value_counts() 69 | drop_cats = frequencies[frequencies <= self.unique_threshold].index 70 | 71 | for c in drop_cats: 72 | ridx = list(col_data[col_data == c].index) 73 | drop_records.extend(ridx) 74 | 75 | san_data[col] = col_data.values 76 | 77 | drop_records = list(set(drop_records)) 78 | san_data = san_data.drop(drop_records) 79 | 80 | # Enforce k-anonymity constraint 81 | if self.quids is not None: 82 | anonymity_sets = san_data.groupby(self.quids).size() 83 | groups = anonymity_sets[anonymity_sets < self.anonymity_set_size].index 84 | for g in groups: 85 | conditions = [f"{k} == '{v}'" for k,v in zip(self.quids, g)] 86 | query = " and ".join(conditions) 87 | didx = san_data.query(query).index 88 | san_data = san_data.drop(didx) 89 | 90 | return san_data 91 | 92 | def _read_meta(self, metadata, drop_cols, quids): 93 | """ Read metadata from metadata file.""" 94 | if quids is None: 95 | quids = [] 96 | 97 | if drop_cols is None: 98 | drop_cols = [] 99 | 100 | metadict = {} 101 | 102 | for cdict in metadata['columns']: 103 | col = cdict['name'] 104 | coltype = cdict['type'] 105 | 106 | if col not in drop_cols: 107 | if coltype == FLOAT or coltype == INTEGER: 108 | if col in quids: 109 | cbins = cdict['bins'] 110 | cats = [f'({cbins[i]},{cbins[i+1]}]' for i in range(len(cbins)-1)] 111 | 112 | metadict[col] = { 113 | 'type': CATEGORICAL, 114 | 'categories': cats, 115 | 'bins': cbins, 116 | 'size': len(cats) 117 | } 118 | 119 | else: 120 | metadict[col] = { 121 | 'type': coltype, 122 | 'min': cdict['min'], 123 | 'max': cdict['max'] 124 | } 125 | 126 | elif coltype == CATEGORICAL or coltype == ORDINAL: 127 | metadict[col] = { 128 | 'type': coltype, 129 | 'categories': cdict['i2s'], 130 | 'size': len(cdict['i2s']) 131 | } 132 | 133 | else: 134 | raise ValueError(f'Unknown data type {coltype} for attribute {col}') 135 | 136 | return 
metadict 137 | 138 | def _impute_missing_values(self, df): 139 | df_impute = df.copy() 140 | 141 | cat_cols = [] 142 | num_cols = [] 143 | 144 | for col, cdict in self.metadata.items(): 145 | if col in list(df_impute): 146 | if cdict['type'] in [CATEGORICAL, ORDINAL]: 147 | cat_cols.append(col) 148 | 149 | elif cdict['type'] in NUMERICAL: 150 | num_cols.append(col) 151 | 152 | self.ImputerCat.fit(df[cat_cols]) 153 | df_impute[cat_cols] = self.ImputerCat.transform(df[cat_cols]) 154 | 155 | self.ImputerNum.fit(df[num_cols]) 156 | df_impute[num_cols] = self.ImputerNum.transform(df[num_cols]) 157 | 158 | return df_impute -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | import sys, os 2 | path = os.path.dirname(__file__) 3 | print(path) 4 | if path not in sys.path: 5 | sys.path.append(path) -------------------------------------------------------------------------------- /tests/germancredit_test.csv: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:dac84a41b87e3f7b7ae067262b6ba9b23419339d5513aa75ca7b77b2eefb84c5 3 | size 18560 4 | -------------------------------------------------------------------------------- /tests/germancredit_test.json: -------------------------------------------------------------------------------- 1 | { 2 | "columns": [ 3 | { 4 | "name": "Age", 5 | "min": 19, 6 | "max": 75, 7 | "type": "Integer", 8 | "bins": [18, 25, 30, 40, 50, 60, 80] 9 | }, 10 | { 11 | "name": "Sex", 12 | "type": "Categorical", 13 | "size": 2, 14 | "i2s": [ 15 | "male", 16 | "female" 17 | ] 18 | }, 19 | { 20 | "name": "Job", 21 | "type": "Ordinal", 22 | "size": 4, 23 | "i2s": [ 24 | "unemployed", 25 | "unskilled", 26 | "skilled", 27 | "management" 28 | ] 29 | }, 30 | { 31 | "name": "Housing", 32 | "type": "Categorical", 33 | "size": 3, 34 | "i2s": [ 35 | "own", 36 | "free", 37 | "rent" 38 | ] 39 | }, 40 | { 41 | "name": "Saving accounts", 42 | "type": "Ordinal", 43 | "i2s": [ 44 | "no_info", 45 | "little", 46 | "moderate", 47 | "quite rich", 48 | "rich" 49 | ], 50 | "size": 5 51 | }, 52 | { 53 | "name": "Checking account", 54 | "type": "Ordinal", 55 | "size": 4, 56 | "i2s": [ 57 | "no_info", 58 | "little", 59 | "moderate", 60 | "rich" 61 | ] 62 | }, 63 | { 64 | "name": "Credit amount", 65 | "type": "Float", 66 | "min": 250.0, 67 | "max": 18424.0 68 | }, 69 | { 70 | "name": "Duration", 71 | "type": "Integer", 72 | "min": 4, 73 | "max": 72 74 | }, 75 | { 76 | "name": "Purpose", 77 | "type": "Categorical", 78 | "size": 8, 79 | "i2s": [ 80 | "radio/TV", 81 | "education", 82 | "furniture/equipment", 83 | "car", 84 | "business", 85 | "domestic appliances", 86 | "repairs", 87 | "vacation/others" 88 | ] 89 | }, 90 | { 91 | "name": "Risk", 92 | "type": "Categorical", 93 | "size": 2, 94 | "i2s": [ 95 | "good", 96 | "bad" 97 | ] 98 | } 99 | ] 100 | } -------------------------------------------------------------------------------- /tests/inference/runconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "sensitiveAttributes": {"LENGTH_OF_STAY": "LinReg", 3 | "RACE": "Classification"}, 4 | "nIter": 15, 5 | "sizeRawT": 1000, 6 | "sizeSynT": 1000, 7 | "nSynT": 10, 8 | "nTargets": 0, 9 | "Targets": ["ID26241", "ID31432", "ID27428", "ID29265", "ID14086"], 10 | "generativeModels": { 11 | "BayesianNet": [[25, 1]], 12 | "PrivBayes": [[25, 1, 1.0]] 
13 | }, 14 | "sanitisationTechniques": { 15 | "SanitiserNHS": [[10, 1, 0.99, 10, [], ["PAT_STATE", "SEX_CODE", "RACE", "ETHNICITY", "PAT_AGE"]]] 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /tests/linkage/runconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "nIter": 15, 3 | "sizeRawA": 10000, 4 | "nSynA": 10, 5 | "nShadows": 10, 6 | "sizeRawT": 1000, 7 | "sizeSynT": 1000, 8 | "nSynT": 5, 9 | "nTargets": 0, 10 | "Targets": ["ID26241", "ID31432", "ID27428", "ID29265", "ID14086"], 11 | "generativeModels": { 12 | "BayesianNet": [[25, 1]], 13 | "PrivBayes": [[25, 1, 1.0]] 14 | }, 15 | "sanitisationTechniques": { 16 | "SanitiserNHS": [[10, 1, 0.99, 10, [], ["PAT_STATE", "SEX_CODE", "RACE", "ETHNICITY", "PAT_AGE"]]] 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /tests/test_attacks.py: -------------------------------------------------------------------------------- 1 | """A template file for writing a simple test for a new attack model""" 2 | from unittest import TestCase 3 | from pandas import DataFrame 4 | 5 | from warnings import filterwarnings 6 | filterwarnings('ignore') 7 | 8 | from os import path 9 | cwd = path.dirname(__file__) 10 | 11 | from attack_models.mia_classifier import (MIAttackClassifierLinearSVC, 12 | MIAttackClassifierLogReg, 13 | MIAttackClassifierRandomForest, 14 | generate_mia_shadow_data, 15 | generate_mia_anon_data) 16 | 17 | from generative_models.data_synthesiser import IndependentHistogram 18 | from sanitisation_techniques.sanitiser import SanitiserNHS 19 | from feature_sets.independent_histograms import HistogramFeatureSet 20 | from utils.datagen import load_local_data_as_df 21 | 22 | class TestAttacks(TestCase): 23 | @classmethod 24 | def setUp(self) -> None: 25 | self.raw, self.metadata = load_local_data_as_df(path.join(cwd, 'germancredit_test')) 26 | self.sizeS = int(len(self.raw)/2) 27 | self.GenModel = IndependentHistogram(self.metadata) 28 | self.San = SanitiserNHS(self.metadata) 29 | self.FeatureSet = HistogramFeatureSet(DataFrame, metadata=self.metadata) 30 | 31 | self.target = self.raw.sample() 32 | self.shadowDataSyn = generate_mia_shadow_data(self.GenModel, self.target, self.raw, self.sizeS, self.sizeS, numModels=2, numCopies=2) 33 | self.shadowDataSan = generate_mia_anon_data(self.San, self.target, self.raw, self.sizeS, numSamples=2) 34 | 35 | self.GenModel.fit(self.raw) 36 | self.synthetic = [self.GenModel.generate_samples(self.sizeS) for _ in range(10)] 37 | self.sanitised = [self.San.sanitise(self.raw) for _ in range(10)] 38 | 39 | def test_mia_randforest(self): 40 | print('\nTest MIA RandForest') 41 | ## Default without feature extraction 42 | Attack = MIAttackClassifierRandomForest(metadata=self.metadata) 43 | Attack.train(*self.shadowDataSyn) 44 | 45 | guesses = Attack.attack(self.synthetic) 46 | self.assertEqual(len(guesses), len(self.synthetic)) 47 | 48 | ## With FeatureSet 49 | Attack = MIAttackClassifierRandomForest(metadata=self.metadata, FeatureSet=self.FeatureSet) 50 | Attack.train(*self.shadowDataSyn) 51 | 52 | guesses = Attack.attack(self.synthetic) 53 | self.assertEqual(len(guesses), len(self.synthetic)) 54 | 55 | ## Test linkage 56 | Attack.train(*self.shadowDataSan) 57 | guesses = Attack.attack(self.sanitised, attemptLinkage=True, target=self.target) 58 | self.assertEqual(len(guesses), len(self.sanitised)) 59 | 60 | 61 | def test_mia_logreg(self): 62 | print('\nTest MIA LogReg') 
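        # The steps below mirror the random-forest test above: train the logistic
        # regression attack classifier on the shadow data generated for the sampled
        # target, expect one membership guess per synthetic dataset, then retrain on
        # the sanitised shadow data and re-run the attack with linkage enabled.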
63 | Attack = MIAttackClassifierLogReg(metadata=self.metadata, FeatureSet=self.FeatureSet) 64 | Attack.train(*self.shadowDataSyn) 65 | 66 | guesses = Attack.attack(self.synthetic) 67 | self.assertEqual(len(guesses), len(self.synthetic)) 68 | 69 | ## Test linkage 70 | Attack.train(*self.shadowDataSan) 71 | guesses = Attack.attack(self.sanitised, attemptLinkage=True, target=self.target) 72 | self.assertEqual(len(guesses), len(self.sanitised)) 73 | 74 | def test_mia_svc(self): 75 | print('\nTest MIA SVC') 76 | Attack = MIAttackClassifierLinearSVC(metadata=self.metadata, FeatureSet=self.FeatureSet) 77 | Attack.train(*self.shadowDataSyn) 78 | 79 | guesses = Attack.attack(self.synthetic) 80 | self.assertEqual(len(guesses), len(self.synthetic)) 81 | 82 | ## Test linkage 83 | Attack.train(*self.shadowDataSan) 84 | guesses = Attack.attack(self.sanitised, attemptLinkage=True, target=self.target) 85 | self.assertEqual(len(guesses), len(self.sanitised)) 86 | 87 | 88 | 89 | 90 | 91 | 92 | -------------------------------------------------------------------------------- /tests/test_gms.py: -------------------------------------------------------------------------------- 1 | """A template file for writing a simple test for a new generative model""" 2 | from unittest import TestCase 3 | 4 | from warnings import filterwarnings 5 | filterwarnings('ignore') 6 | 7 | from os import path 8 | cwd = path.dirname(__file__) 9 | 10 | from generative_models.data_synthesiser import IndependentHistogram, BayesianNet, PrivBayes 11 | from generative_models.ctgan import CTGAN 12 | from generative_models.pate_gan import PATEGAN 13 | 14 | from utils.datagen import * 15 | 16 | SEED = 42 17 | 18 | class TestGenerativeModel(TestCase): 19 | 20 | @classmethod 21 | def setUp(self) -> None: 22 | self.raw, self.metadata = load_local_data_as_df(path.join(cwd, 'germancredit_test')) 23 | self.sizeS = len(self.raw) 24 | 25 | def test_independent_histogram(self): 26 | print('\nTest IndependentHistogram') 27 | ## Test default params 28 | gm = IndependentHistogram(self.metadata) 29 | gm.fit(self.raw) 30 | synthetic_data = gm.generate_samples(self.sizeS) 31 | 32 | self.assertListEqual(list(synthetic_data), list(self.raw)) 33 | 34 | ## Changing nbins 35 | gm = IndependentHistogram(self.metadata, histogram_bins=25) 36 | gm.fit(self.raw) 37 | synthetic_data = gm.generate_samples(self.sizeS) 38 | 39 | self.assertListEqual(list(synthetic_data), list(self.raw)) 40 | 41 | def test_bayesian_net(self): 42 | print('\nTest BayesianNet') 43 | ## Test default params 44 | gm = BayesianNet(self.metadata) 45 | gm.fit(self.raw) 46 | synthetic_data = gm.generate_samples(self.sizeS) 47 | 48 | self.assertListEqual(list(synthetic_data), list(self.raw)) 49 | 50 | ## Change network degree 51 | gm = BayesianNet(self.metadata, degree=2) 52 | gm.fit(self.raw) 53 | synthetic_data = gm.generate_samples(self.sizeS) 54 | 55 | self.assertListEqual(list(synthetic_data), list(self.raw)) 56 | 57 | ## Infer ranges 58 | gm = BayesianNet(self.metadata, infer_ranges=True) 59 | gm.fit(self.raw) 60 | synthetic_data = gm.generate_samples(self.sizeS) 61 | 62 | self.assertListEqual(list(synthetic_data), list(self.raw)) 63 | 64 | ## Fix seed 65 | gm = BayesianNet(self.metadata, seed=SEED) 66 | gm.fit(self.raw) 67 | synthetic_data = gm.generate_samples(self.sizeS) 68 | 69 | self.assertListEqual(list(synthetic_data), list(self.raw)) 70 | 71 | def test_priv_bayes(self): 72 | print('\nTest PrivBayes') 73 | ## Test default params 74 | gm = PrivBayes(self.metadata) 75 | gm.fit(self.raw) 76 
| synthetic_data = gm.generate_samples(self.sizeS) 77 | 78 | self.assertListEqual(list(synthetic_data), list(self.raw)) 79 | 80 | ## Change privacy param 81 | gm = PrivBayes(self.metadata, epsilon=1e-9) 82 | gm.fit(self.raw) 83 | synthetic_data = gm.generate_samples(self.sizeS) 84 | 85 | self.assertListEqual(list(synthetic_data), list(self.raw)) 86 | 87 | ## Fix seed 88 | gm = PrivBayes(self.metadata, seed=SEED) 89 | gm.fit(self.raw) 90 | synthetic_data = gm.generate_samples(self.sizeS) 91 | 92 | self.assertListEqual(list(synthetic_data), list(self.raw)) 93 | 94 | def test_ctgan(self): 95 | print('\nTest CTGAN') 96 | 97 | gm = CTGAN(self.metadata, batch_size=10, epochs=2) 98 | gm.fit(self.raw) 99 | synthetic_data = gm.generate_samples(self.sizeS) 100 | 101 | self.assertListEqual(list(synthetic_data), list(self.raw)) 102 | 103 | 104 | def test_pategan(self): 105 | # Default params 106 | gm = PATEGAN(self.metadata) 107 | gm.fit(self.raw) 108 | synthetic_data = gm.generate_samples(self.sizeS) 109 | 110 | self.assertTupleEqual(synthetic_data.shape, self.raw.shape) 111 | 112 | # Change privacy params 113 | gm = PATEGAN(self.metadata, eps=10, delta=1e-1) 114 | gm.fit(self.raw) 115 | synthetic_data = gm.generate_samples(self.sizeS) 116 | 117 | self.assertTupleEqual(synthetic_data.shape, self.raw.shape) 118 | 119 | # Infer ranges 120 | gm = PATEGAN(self.metadata, infer_ranges=True) 121 | gm.fit(self.raw) 122 | synthetic_data = gm.generate_samples(self.sizeS) 123 | 124 | self.assertTupleEqual(synthetic_data.shape, self.raw.shape) 125 | 126 | 127 | 128 | 129 | 130 | 131 | -------------------------------------------------------------------------------- /tests/test_sanitisation.py: -------------------------------------------------------------------------------- 1 | """A template file for writing a simple test for a sanitisation technique""" 2 | from unittest import TestCase 3 | 4 | from warnings import filterwarnings 5 | filterwarnings('ignore') 6 | 7 | from os import path 8 | cwd = path.dirname(__file__) 9 | 10 | from sanitisation_techniques.sanitiser import SanitiserNHS 11 | 12 | from utils.datagen import load_local_data_as_df 13 | from utils.constants import * 14 | 15 | 16 | class TestSanitisation(TestCase): 17 | 18 | @classmethod 19 | def setUp(self) -> None: 20 | self.raw, self.metadata = load_local_data_as_df(path.join(cwd, 'germancredit_test')) 21 | self.sizeS = len(self.raw) 22 | 23 | def test_sanitise_nhs(self): 24 | print('\nTest SanitiserNHS') 25 | 26 | ## Test default params 27 | sanitiser = SanitiserNHS(self.metadata) 28 | san = sanitiser.sanitise(self.raw) 29 | 30 | # Expect no columns to be dropped or rows removed 31 | self.assertTupleEqual(san.shape, self.raw.shape) 32 | 33 | ## Test dropping columns 34 | sanitiser = SanitiserNHS(self.metadata, drop_cols=['Purpose']) 35 | san = sanitiser.sanitise(self.raw) 36 | 37 | # Purpose should be dropped 38 | self.assertTrue('Purpose' not in list(san)) 39 | 40 | ## Test rare value threshold 41 | sanitiser = SanitiserNHS(self.metadata, thresh_rare=2) 42 | san = sanitiser.sanitise(self.raw) 43 | 44 | for cdict in self.metadata['columns']: 45 | if cdict['type'] == CATEGORICAL or cdict['type'] == ORDINAL: 46 | counts = san[cdict['name']].value_counts() 47 | self.assertTrue(len(counts[counts > 2]) == len(counts)) 48 | 49 | ## Test converting numerical into categorical attributes 50 | demographics = ['Age', 'Sex', 'Job', 'Housing'] 51 | sanitiser = SanitiserNHS(self.metadata, quids=demographics) 52 | san = sanitiser.sanitise(self.raw) 53 | 54 | 
self.assertListEqual([type(str) for _ in demographics], list(san[demographics].dtypes)) 55 | 56 | ## Test k-anonymity constraint 57 | sanitiser = SanitiserNHS(self.metadata, quids=demographics, anonymity_set_size=7) 58 | san = sanitiser.sanitise(self.raw) 59 | 60 | counts = san.groupby(demographics).size() 61 | self.assertTrue(len(counts[counts >= 7]) == len(counts)) 62 | 63 | 64 | def write_to_dict(nr, results): 65 | results[nr] = 'a' 66 | 67 | -------------------------------------------------------------------------------- /tests/utility/runconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "nIter": 15, 3 | "sizeRawT": 1000, 4 | "sizeSynT": 1000, 5 | "nSynT": 10, 6 | "nTargets": 0, 7 | "Targets": ["ID26241", "ID31432", "ID27428", "ID29265", "ID14086"], 8 | "TestRecords": ["ID71120", "ID84282", "ID88763", "ID79216", "ID92777"], 9 | "generativeModels": { 10 | "BayesianNet": [[25, 1]], 11 | "PrivBayes": [[25, 1, 1.0]] 12 | }, 13 | "sanitisationTechniques": { 14 | "SanitiserNHS": [[10, 1, 0.99, 10, [], ["PAT_STATE", "SEX_CODE", "RACE", "ETHNICITY", "PAT_AGE"]]] 15 | }, 16 | "utilityTasks": { 17 | "RandForestClass": [["RISK_MORTALITY"]] 18 | }, 19 | "dataFilter": { 20 | "train": "DISCHARGE in ['2013Q1', '2013Q2', '2013Q3', '2013Q4']", 21 | "test": "DISCHARGE in ['2014Q1', '2014Q2', '2014Q3', '2014Q4']" 22 | } 23 | } 24 | 25 | -------------------------------------------------------------------------------- /utility_cli.py: -------------------------------------------------------------------------------- 1 | """ 2 | Command-line interface for running utility evaluation 3 | """ 4 | 5 | import json 6 | 7 | from os import mkdir, path 8 | from numpy import mean 9 | from numpy.random import choice, seed 10 | from argparse import ArgumentParser 11 | 12 | from utils.datagen import load_s3_data_as_df, load_local_data_as_df 13 | from utils.utils import json_numpy_serialzer 14 | from utils.logging import LOGGER 15 | 16 | from sanitisation_techniques.sanitiser import SanitiserNHS 17 | from generative_models.data_synthesiser import BayesianNet, PrivBayes, IndependentHistogram 18 | from generative_models.ctgan import CTGAN 19 | from generative_models.pate_gan import PATEGAN 20 | from predictive_models.predictive_model import RandForestClassTask, LogRegClassTask, LinRegTask 21 | 22 | from warnings import simplefilter 23 | simplefilter('ignore', category=FutureWarning) 24 | simplefilter('ignore', category=DeprecationWarning) 25 | 26 | cwd = path.dirname(__file__) 27 | 28 | SEED = 42 29 | 30 | 31 | def main(): 32 | argparser = ArgumentParser() 33 | datasource = argparser.add_mutually_exclusive_group() 34 | datasource.add_argument('--s3name', '-S3', type=str, choices=['adult', 'census', 'credit', 'alarm', 'insurance'], help='Name of the dataset to run on') 35 | datasource.add_argument('--datapath', '-D', type=str, help='Relative path to cwd of a local data file') 36 | argparser.add_argument('--runconfig', '-RC', default='runconfig_mia.json', type=str, help='Path relative to cwd of runconfig file') 37 | argparser.add_argument('--outdir', '-O', default='outputs/test', type=str, help='Path relative to cwd for storing output files') 38 | args = argparser.parse_args() 39 | 40 | seed(SEED) 41 | # Load runconfig 42 | with open(path.join(cwd, args.runconfig)) as f: 43 | runconfig = json.load(f) 44 | print('Runconfig:') 45 | print(runconfig) 46 | 47 | # Load data 48 | if args.s3name is not None: 49 | rawPop, metadata = load_s3_data_as_df(args.s3name) 50 | dname = 
args.s3name 51 | else: 52 | rawPop, metadata = load_local_data_as_df(path.join(cwd, args.datapath)) 53 | dname = args.datapath.split('/')[-1] 54 | 55 | print(f'Loaded data {dname}:') 56 | print(rawPop.info()) 57 | 58 | # Make sure outdir exists 59 | if not path.isdir(args.outdir): 60 | mkdir(args.outdir) 61 | 62 | ######################## 63 | #### GAME INPUTS ####### 64 | ######################## 65 | # Train test split 66 | rawTrain = rawPop.query(runconfig['dataFilter']['train']) 67 | rawTest = rawPop.query(runconfig['dataFilter']['test']) 68 | 69 | # Pick targets 70 | targetIDs = choice(list(rawTrain.index), size=runconfig['nTargets'], replace=False).tolist() 71 | 72 | # If specified: Add specific target records 73 | if runconfig['Targets'] is not None: 74 | targetIDs.extend(runconfig['Targets']) 75 | 76 | targets = rawTrain.loc[targetIDs, :] 77 | 78 | # Drop targets from population 79 | rawTrainWoTargets = rawTrain.drop(targetIDs) 80 | 81 | # Get test target records 82 | testRecordIDs = choice(list(rawTest.index), size=runconfig['nTargets'], replace=False).tolist() 83 | 84 | # If specified: Add specific target records 85 | if runconfig['TestRecords'] is not None: 86 | testRecordIDs.extend(runconfig['TestRecords']) 87 | 88 | testRecords = rawTest.loc[testRecordIDs, :] 89 | 90 | # List of candidate generative models to evaluate 91 | gmList = [] 92 | if 'generativeModels' in runconfig.keys(): 93 | for gm, paramsList in runconfig['generativeModels'].items(): 94 | if gm == 'IndependentHistogram': 95 | for params in paramsList: 96 | gmList.append(IndependentHistogram(metadata, *params)) 97 | elif gm == 'BayesianNet': 98 | for params in paramsList: 99 | gmList.append(BayesianNet(metadata, *params)) 100 | elif gm == 'PrivBayes': 101 | for params in paramsList: 102 | gmList.append(PrivBayes(metadata, *params)) 103 | elif gm == 'CTGAN': 104 | for params in paramsList: 105 | gmList.append(CTGAN(metadata, *params)) 106 | elif gm == 'PATEGAN': 107 | for params in paramsList: 108 | gmList.append(PATEGAN(metadata, *params)) 109 | else: 110 | raise ValueError(f'Unknown GM {gm}') 111 | 112 | # List of candidate sanitisation techniques to evaluate 113 | sanList = [] 114 | if 'sanitisationTechniques' in runconfig.keys(): 115 | for name, paramsList in runconfig['sanitisationTechniques'].items(): 116 | if name == 'SanitiserNHS': 117 | for params in paramsList: 118 | sanList.append(SanitiserNHS(metadata, *params)) 119 | else: 120 | raise ValueError(f'Unknown sanitisation technique {name}') 121 | 122 | utilityTasks = [] 123 | for taskName, paramsList in runconfig['utilityTasks'].items(): 124 | if taskName == 'RandForestClass': 125 | for params in paramsList: 126 | utilityTasks.append(RandForestClassTask(metadata, *params)) 127 | elif taskName == 'LogRegClass': 128 | for params in paramsList: 129 | utilityTasks.append(LogRegClassTask(metadata, *params)) 130 | elif taskName == 'LinReg': 131 | for params in paramsList: 132 | utilityTasks.append(LinRegTask(metadata, *params)) 133 | 134 | ################################## 135 | ######### EVALUATION ############# 136 | ################################## 137 | resultsTargetUtility = {ut.__name__: {gm.__name__: {} for gm in gmList + sanList} for ut in utilityTasks} 138 | resultsAggUtility = {ut.__name__: {gm.__name__: {'TargetID': [], 139 | 'Accuracy': []} for gm in gmList + sanList} for ut in utilityTasks} 140 | 141 | # Add entry for raw 142 | for ut in utilityTasks: 143 | resultsTargetUtility[ut.__name__]['Raw'] = {} 144 | resultsAggUtility[ut.__name__]['Raw'] = 
{'TargetID': [], 145 | 'Accuracy': []} 146 | 147 | print('\n---- Start the game ----') 148 | for nr in range(runconfig['nIter']): 149 | print(f'\n--- Game iteration {nr + 1} ---') 150 | # Draw a raw dataset 151 | rIdx = choice(list(rawTrainWoTargets.index), size=runconfig['sizeRawT'], replace=False).tolist() 152 | rawTout = rawTrain.loc[rIdx] 153 | 154 | LOGGER.info('Start: Utility evaluation on Raw...') 155 | # Get utility from raw without targets 156 | for ut in utilityTasks: 157 | resultsTargetUtility[ut.__name__]['Raw'][nr] = {} 158 | 159 | predErrorTargets = [] 160 | predErrorAggr = [] 161 | for _ in range(runconfig['nSynT']): 162 | ut.train(rawTout) 163 | predErrorTargets.append(ut.evaluate(testRecords)) 164 | predErrorAggr.append(ut.evaluate(rawTest)) 165 | 166 | resultsTargetUtility[ut.__name__]['Raw'][nr]['OUT'] = { 167 | 'TestRecordID': testRecordIDs, 168 | 'Accuracy': list(mean(predErrorTargets, axis=0)) 169 | } 170 | 171 | resultsAggUtility[ut.__name__]['Raw']['TargetID'].append('OUT') 172 | resultsAggUtility[ut.__name__]['Raw']['Accuracy'].append(mean(predErrorAggr)) 173 | 174 | # Get utility from raw with each target 175 | for tid in targetIDs: 176 | target = targets.loc[[tid]] 177 | rawIn = rawTout.append(target) 178 | 179 | for ut in utilityTasks: 180 | predErrorTargets = [] 181 | predErrorAggr = [] 182 | for _ in range(runconfig['nSynT']): 183 | ut.train(rawIn) 184 | predErrorTargets.append(ut.evaluate(testRecords)) 185 | predErrorAggr.append(ut.evaluate(rawTest)) 186 | 187 | resultsTargetUtility[ut.__name__]['Raw'][nr][tid] = { 188 | 'TestRecordID': testRecordIDs, 189 | 'Accuracy': list(mean(predErrorTargets, axis=0)) 190 | } 191 | 192 | resultsAggUtility[ut.__name__]['Raw']['TargetID'].append(tid) 193 | resultsAggUtility[ut.__name__]['Raw']['Accuracy'].append(mean(predErrorAggr)) 194 | 195 | LOGGER.info('Finished: Utility evaluation on Raw.') 196 | 197 | for GenModel in gmList: 198 | LOGGER.info(f'Start: Evaluation for model {GenModel.__name__}...') 199 | GenModel.fit(rawTout) 200 | synTwithoutTarget = [GenModel.generate_samples(runconfig['sizeSynT']) for _ in range(runconfig['nSynT'])] 201 | 202 | # Util evaluation for synthetic without all targets 203 | for ut in utilityTasks: 204 | resultsTargetUtility[ut.__name__][GenModel.__name__][nr] = {} 205 | 206 | predErrorTargets = [] 207 | predErrorAggr = [] 208 | for syn in synTwithoutTarget: 209 | ut.train(syn) 210 | predErrorTargets.append(ut.evaluate(testRecords)) 211 | predErrorAggr.append(ut.evaluate(rawTest)) 212 | 213 | resultsTargetUtility[ut.__name__][GenModel.__name__][nr]['OUT'] = { 214 | 'TestRecordID': testRecordIDs, 215 | 'Accuracy': list(mean(predErrorTargets, axis=0)) 216 | } 217 | 218 | resultsAggUtility[ut.__name__][GenModel.__name__]['TargetID'].append('OUT') 219 | resultsAggUtility[ut.__name__][GenModel.__name__]['Accuracy'].append(mean(predErrorAggr)) 220 | 221 | for tid in targetIDs: 222 | LOGGER.info(f'Target: {tid}') 223 | target = targets.loc[[tid]] 224 | 225 | rawTin = rawTout.append(target) 226 | GenModel.fit(rawTin) 227 | synTwithTarget = [GenModel.generate_samples(runconfig['sizeSynT']) for _ in range(runconfig['nSynT'])] 228 | 229 | # Util evaluation for synthetic with this target 230 | for ut in utilityTasks: 231 | predErrorTargets = [] 232 | predErrorAggr = [] 233 | for syn in synTwithTarget: 234 | ut.train(syn) 235 | predErrorTargets.append(ut.evaluate(testRecords)) 236 | predErrorAggr.append(ut.evaluate(rawTest)) 237 | 238 | resultsTargetUtility[ut.__name__][GenModel.__name__][nr][tid] = { 
239 | 'TestRecordID': testRecordIDs, 240 | 'Accuracy': list(mean(predErrorTargets, axis=0)) 241 | } 242 | 243 | resultsAggUtility[ut.__name__][GenModel.__name__]['TargetID'].append(tid) 244 | resultsAggUtility[ut.__name__][GenModel.__name__]['Accuracy'].append(mean(predErrorAggr)) 245 | 246 | del synTwithoutTarget, synTwithTarget 247 | 248 | LOGGER.info(f'Finished: Evaluation for model {GenModel.__name__}.') 249 | 250 | for San in sanList: 251 | LOGGER.info(f'Start: Evaluation for sanitiser {San.__name__}...') 252 | sanOut = San.sanitise(rawTout) 253 | 254 | for ut in utilityTasks: 255 | resultsTargetUtility[ut.__name__][San.__name__][nr] = {} 256 | 257 | predErrorTargets = [] 258 | predErrorAggr = [] 259 | for _ in range(runconfig['nSynT']): 260 | ut.train(sanOut) 261 | predErrorTargets.append(ut.evaluate(testRecords)) 262 | predErrorAggr.append(ut.evaluate(rawTest)) 263 | 264 | resultsTargetUtility[ut.__name__][San.__name__][nr]['OUT'] = { 265 | 'TestRecordID': testRecordIDs, 266 | 'Accuracy': list(mean(predErrorTargets, axis=0)) 267 | } 268 | 269 | resultsAggUtility[ut.__name__][San.__name__]['TargetID'].append('OUT') 270 | resultsAggUtility[ut.__name__][San.__name__]['Accuracy'].append(mean(predErrorAggr)) 271 | 272 | for tid in targetIDs: 273 | LOGGER.info(f'Target: {tid}') 274 | target = targets.loc[[tid]] 275 | 276 | rawTin = rawTout.append(target) 277 | sanIn = San.sanitise(rawTin) 278 | 279 | for ut in utilityTasks: 280 | predErrorTargets = [] 281 | predErrorAggr = [] 282 | for _ in range(runconfig['nSynT']): 283 | ut.train(sanIn) 284 | predErrorTargets.append(ut.evaluate(testRecords)) 285 | predErrorAggr.append(ut.evaluate(rawTest)) 286 | 287 | resultsTargetUtility[ut.__name__][San.__name__][nr][tid] = { 288 | 'TestRecordID': testRecordIDs, 289 | 'Accuracy': list(mean(predErrorTargets, axis=0)) 290 | } 291 | 292 | resultsAggUtility[ut.__name__][San.__name__]['TargetID'].append(tid) 293 | resultsAggUtility[ut.__name__][San.__name__]['Accuracy'].append(mean(predErrorAggr)) 294 | 295 | del sanOut, sanIn 296 | 297 | LOGGER.info(f'Finished: Evaluation for model {San.__name__}.') 298 | 299 | outfile = f"ResultsUtilTargets_{dname}" 300 | LOGGER.info(f"Write results to {path.join(f'{args.outdir}', f'{outfile}')}") 301 | 302 | with open(path.join(f'{args.outdir}', f'{outfile}.json'), 'w') as f: 303 | json.dump(resultsTargetUtility, f, indent=2, default=json_numpy_serialzer) 304 | 305 | outfile = f"ResultsUtilAgg_{dname}" 306 | LOGGER.info(f"Write results to {path.join(f'{args.outdir}', f'{outfile}')}") 307 | 308 | with open(path.join(f'{args.outdir}', f'{outfile}.json'), 'w') as f: 309 | json.dump(resultsAggUtility, f, indent=2, default=json_numpy_serialzer) 310 | 311 | 312 | if __name__ == "__main__": 313 | main() -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spring-epfl/synthetic_data_release/eba9a43c75a64110b63d79e58c83859634b7339c/utils/__init__.py -------------------------------------------------------------------------------- /utils/analyse_results.py: -------------------------------------------------------------------------------- 1 | import json 2 | import matplotlib.pyplot as plt 3 | import seaborn as sns 4 | 5 | from glob import glob 6 | from pandas import DataFrame, concat 7 | from itertools import cycle 8 | from os import path 9 | 10 | from warnings import filterwarnings 11 | filterwarnings('ignore') 12 | 13 | 
from .datagen import load_local_data_as_df 14 | from .plot_setup import set_style, pltmarkers as MARKERS, fontsizelabels as FSIZELABELS, fontsizeticks as FSIZETICKS 15 | from .evaluation_framework import * 16 | set_style() 17 | 18 | PREDTASKS = ['RandomForestClassifier', 'LogisticRegression', 'LinearRegression'] 19 | 20 | MARKERCYCLE = cycle(MARKERS) 21 | HUEMARKERS = [next(MARKERCYCLE) for _ in range(20)] 22 | 23 | 24 | ###### Load results 25 | def load_results_linkage(dirname): 26 | """ 27 | Helper function to load results of privacy evaluation under risk of linkability 28 | :param dirname: str: Directory that contains results files 29 | :return: results: DataFrame: Results of privacy evaluation 30 | """ 31 | 32 | files = glob(path.join(dirname, f'ResultsMIA_*.json')) 33 | 34 | resList = [] 35 | for fpath in files: 36 | with open(fpath) as f: 37 | resDict = json.load(f) 38 | 39 | dataset = fpath.split('.json')[0].split('_')[-1] 40 | 41 | for tid, tres in resDict.items(): 42 | for gm, gmDict in tres.items(): 43 | for nr, nrDict in gmDict.items(): 44 | for fset, fsetDict in nrDict.items(): 45 | df = DataFrame(fsetDict) 46 | 47 | df['Run'] = nr 48 | df['FeatureSet'] = fset 49 | df['TargetModel'] = gm 50 | df['TargetID'] = tid 51 | df['Dataset'] = dataset 52 | 53 | resList.append(df) 54 | 55 | results = concat(resList) 56 | 57 | resAgg = [] 58 | 59 | games = results.groupby(['TargetID', 'TargetModel', 'FeatureSet', 'Run']) 60 | for gameParams, gameRes in games: 61 | tpSyn, fpSyn = get_tp_fp_rates(gameRes['AttackerGuess'], gameRes['Secret']) 62 | advantageSyn = get_mia_advantage(tpSyn, fpSyn) 63 | advantageRaw = 1 64 | 65 | resAgg.append(gameParams + (tpSyn, fpSyn, advantageSyn, advantageRaw)) 66 | 67 | resAgg = DataFrame(resAgg) 68 | 69 | resAgg.columns = ['TargetID','TargetModel', 'FeatureSet', 'Run', 'TPSyn', 'FPSyn', 'AdvantageSyn', 'AdvantageRaw'] 70 | 71 | resAgg['PrivacyGain'] = resAgg['AdvantageRaw'] - resAgg['AdvantageSyn'] 72 | 73 | return resAgg 74 | 75 | 76 | def load_results_inference(dirname, dpath): 77 | """ 78 | Helper function to load results of privacy evaluation under risk of inference 79 | :param dirname: str: Directory that contains results files 80 | :param dpath: str: Dataset path (needed to extract some metadata) 81 | :return: results: DataFrame: Results of privacy evaluation 82 | """ 83 | df, metadata = load_local_data_as_df(dpath) 84 | 85 | files = glob(path.join(dirname, f'ResultsMLEAI_*.json')) 86 | resList = [] 87 | for fpath in files: 88 | 89 | with open(fpath) as f: 90 | resDict = json.load(f) 91 | 92 | dataset = fpath.split('.json')[0].split('_')[-1] 93 | 94 | for tid, tdict in resDict.items(): 95 | for sa, sdict in tdict.items(): 96 | tsecret = df.loc[tid, sa] 97 | satype = None 98 | 99 | for cdict in metadata['columns']: 100 | if cdict['name'] == sa: 101 | satype = cdict['type'] 102 | 103 | if '_' in sa: 104 | sa = ''.join([s.capitalize() for s in sa.split('_')]) 105 | elif '-' in sa: 106 | sa = ''.join([s.capitalize() for s in sa.split('-')]) 107 | 108 | for gm, gdict in sdict.items(): 109 | for nr, res in gdict.items(): 110 | 111 | resDF = DataFrame(res) 112 | resDF['TargetID'] = tid 113 | resDF['TargetSecret'] = tsecret 114 | resDF['SensitiveType'] = satype 115 | resDF['TargetModel'] = gm 116 | resDF['Run'] = nr 117 | resDF['SensitiveAttribute'] = sa 118 | resDF['Dataset'] = dataset 119 | 120 | resList.append(resDF) 121 | 122 | results = concat(resList) 123 | 124 | resAdv = [] 125 | for gameParams, game in results.groupby(['Dataset', 'TargetID', 
'SensitiveAttribute', 'Run']): 126 | rawRes = game.groupby(['TargetModel']).get_group('Raw') 127 | if all(game['SensitiveType'].isin([INTEGER, FLOAT])): 128 | pCorrectRIn, pCorrectROut = get_probs_correct(rawRes['ProbCorrect'], rawRes['TargetPresence']) 129 | 130 | elif all(game['SensitiveType'].isin([CATEGORICAL, ORDINAL])): 131 | pCorrectRIn, pCorrectROut = get_accuracy(rawRes['AttackerGuess'], rawRes['TargetSecret'], rawRes['TargetPresence']) 132 | 133 | else: 134 | raise ValueError('Unknown sensitive attribute type.') 135 | 136 | advR = get_ai_advantage(pCorrectRIn, pCorrectROut) 137 | 138 | for gm, gmRes in game.groupby(['TargetModel']): 139 | if gm != 'Raw': 140 | if all(gmRes['SensitiveType'].isin([INTEGER, FLOAT])): 141 | pCorrectSIn, pCorrectSOut = get_probs_correct(gmRes['ProbCorrect'], gmRes['TargetPresence']) 142 | 143 | elif all(gmRes['SensitiveType'].isin([CATEGORICAL, ORDINAL])): 144 | pCorrectSIn, pCorrectSOut = get_accuracy(gmRes['AttackerGuess'], gmRes['TargetSecret'], gmRes['TargetPresence']) 145 | 146 | else: 147 | raise ValueError('Unknown sensitive attribute type.') 148 | 149 | advS = get_ai_advantage(pCorrectSIn, pCorrectSOut) 150 | 151 | 152 | resAdv.append(gameParams + (gm, pCorrectRIn, pCorrectROut, advR, pCorrectSIn, pCorrectSOut, advS)) 153 | 154 | 155 | resAdv = DataFrame(resAdv) 156 | resAdv.columns =['Dataset', 'TargetID', 'SensitiveAttribute','Run', 'TargetModel', 157 | 'ProbCorrectRawIn', 'ProbCorrectRawOut', 'AdvantageRaw', 158 | 'ProbCorrectSynIn', 'ProbCorrectSynOut', 'AdvantageSyn'] 159 | 160 | resAdv['PrivacyGain'] = resAdv['AdvantageRaw'] - resAdv['AdvantageSyn'] 161 | 162 | return resAdv 163 | 164 | 165 | def load_results_utility(dirname): 166 | """ 167 | Helper function to load results of utility evaluation 168 | :param dirname: str: Directory that contains results files 169 | :return: resultsTarget: DataFrame: Results of utility evaluation on individual records 170 | :return: resultsAgg: DataFrame: Results of average utility evaluation 171 | """ 172 | 173 | # Load individual target utility results 174 | files = glob(path.join(dirname, f'ResultsUtilTargets_*.json')) 175 | 176 | resList = [] 177 | for fpath in files: 178 | with open(fpath) as f: 179 | results = json.load(f) 180 | 181 | dataset = fpath.split('.json')[0].split('_')[-1] 182 | 183 | for ut, ures in results.items(): 184 | model = [m for m in PREDTASKS if m in ut][0] 185 | labelVar = ut.split(model)[-1] 186 | 187 | if '_' in labelVar: 188 | labelVar = ''.join([s.capitalize() for s in labelVar.split('_')]) 189 | 190 | if '-' in labelVar: 191 | labelVar = ''.join([s.capitalize() for s in labelVar.split('-')]) 192 | 193 | for gm, gmres in ures.items(): 194 | for n, nres in gmres.items(): 195 | for tid, tres in nres.items(): 196 | res = DataFrame(tres) 197 | 198 | res['TargetID'] = tid 199 | res['Run'] = f'Run {n}' 200 | res['TargetModel'] = gm 201 | res['PredictionModel'] = model 202 | res['LabelVar'] = labelVar 203 | res['Dataset'] = dataset 204 | 205 | resList.append(res) 206 | 207 | resultsTargets = concat(resList) 208 | 209 | # Load aggregate utility results 210 | files = glob(path.join(dirname, f'ResultsUtilAgg_*.json')) 211 | 212 | resList = [] 213 | for fpath in files: 214 | with open(fpath) as f: 215 | results = json.load(f) 216 | 217 | dataset = fpath.split('.json')[0].split('_')[-1] 218 | 219 | for ut, utres in results.items(): 220 | model = [m for m in PREDTASKS if m in ut][0] 221 | labelVar = ut.split(model)[-1] 222 | 223 | if '_' in labelVar: 224 | labelVar = 
''.join([s.capitalize() for s in labelVar.split('_')]) 225 | 226 | if '-' in labelVar: 227 | labelVar = ''.join([s.capitalize() for s in labelVar.split('-')]) 228 | 229 | for gm, gmres in utres.items(): 230 | resDF = DataFrame(gmres) 231 | resDF['PredictionModel'] = model 232 | resDF['LabelVar'] = labelVar 233 | resDF['TargetModel'] = gm 234 | resDF['Dataset'] = dataset 235 | 236 | resList.append(resDF) 237 | 238 | resultsAgg = concat(resList) 239 | 240 | return resultsTargets, resultsAgg 241 | 242 | 243 | ### Plotting 244 | def plt_per_target_pg(results, models, resFilter=('FeatureSet', 'Naive')): 245 | """ Plot per record average privacy gain. """ 246 | results = results[results[resFilter[0]] == resFilter[1]] 247 | 248 | fig, ax = plt.subplots(figsize=(10, 6)) 249 | pointplot(results, 'TargetModel', 'PrivacyGain', 'TargetID', ax, models) 250 | 251 | ax.set_title(f'Attack on {resFilter[0]}: {resFilter[1]}', fontsize=FSIZELABELS) 252 | ax.legend(loc='upper center', bbox_to_anchor=(.5, 1.3), ncol=5, title='TargetID') 253 | ax.set_ylabel('$\mathtt{PG}$', fontsize=FSIZELABELS) 254 | 255 | return fig 256 | 257 | 258 | def plt_avg_accuracy(results, models): 259 | fig, ax = plt.subplots(figsize=(12, 5)) 260 | 261 | pltdata = results[results['TargetID'] == 'OUT'] 262 | 263 | boxplot(pltdata, 'TargetModel', 'Accuracy', 'LabelVar', ax, models) 264 | 265 | ax.hlines(0.2, *ax.get_xlim(), 'grey', '--') 266 | ax.set_ylabel('$\mathtt{Accuracy}$', fontsize=FSIZELABELS) 267 | ax.set_xlabel('') 268 | 269 | return fig 270 | 271 | 272 | def pointplot(data, x, y, hue, ax, order): 273 | ncats = data[hue].nunique() 274 | huemarkers = HUEMARKERS[:ncats] 275 | 276 | sns.pointplot(data=data, y=y, 277 | x=x, hue=hue, 278 | order=order, 279 | ax=ax, dodge=True, 280 | join=False, markers=huemarkers, 281 | scale=1.2, errwidth=2, 282 | linestyles='--') 283 | 284 | # Remove legend 285 | ax.get_legend().remove() 286 | 287 | # Set x- and y-label 288 | ax.set_xlabel('') 289 | 290 | # Resize y-tick labels 291 | for tick in ax.yaxis.get_major_ticks(): 292 | tick.label.set_fontsize(FSIZETICKS) 293 | 294 | # Resize x-tick labels 295 | for tick in ax.xaxis.get_major_ticks(): 296 | tick.label.set_fontsize(FSIZETICKS) 297 | 298 | 299 | def boxplot(data, x, y, hue, ax, order, hue_order=None): 300 | sns.boxenplot(data=data, y=y, 301 | x=x, hue=hue, 302 | order=order, hue_order=hue_order, 303 | ax=ax, dodge=True) 304 | 305 | # Resize y-tick labels 306 | for tick in ax.yaxis.get_major_ticks(): 307 | tick.label.set_fontsize(FSIZETICKS) 308 | 309 | # Resize x-tick labels 310 | for tick in ax.xaxis.get_major_ticks(): 311 | tick.label.set_fontsize(FSIZETICKS) 312 | -------------------------------------------------------------------------------- /utils/constants.py: -------------------------------------------------------------------------------- 1 | # Data coding constants 2 | FILLNA_VALUE_CAT = "NaN" 3 | CATEGORICAL = "Categorical" 4 | ORDINAL = "Ordinal" 5 | INTEGER = "Integer" 6 | FLOAT = "Float" 7 | NUMERICAL = [INTEGER, FLOAT] 8 | STRINGS = [CATEGORICAL, ORDINAL] 9 | 10 | # Runtime constant 11 | PROCESSES = 16 12 | 13 | # Experiment constants 14 | LABEL_IN = 1 15 | LABEL_OUT = 0 16 | ZERO_TOL = 1e-12 -------------------------------------------------------------------------------- /utils/datagen.py: -------------------------------------------------------------------------------- 1 | """ 2 | Helper functions for loading, converting, reshaping data 3 | _load_file, _load_json, _get_columns are copies of utility functions in 4 | 
https://github.com/sdv-dev/SDGym published under MIT License Copyright (c) 2019, MIT Data To AI Lab 5 | """ 6 | import numpy as np 7 | import pandas as pd 8 | import json 9 | import urllib 10 | from os import path 11 | from os import makedirs 12 | from pandas.api.types import CategoricalDtype 13 | 14 | from utils.constants import * 15 | 16 | BASE_URL = 'http://sdgym.s3.amazonaws.com/datasets/' 17 | DATA_PATH = path.join(path.dirname(__file__), 'data') 18 | MNIST_IMAGE_SZIE = (28, 28, 1) 19 | 20 | 21 | def load_mnist(filename): 22 | """Load and prepare MNIST dataset""" 23 | train = pd.read_csv(filename, sep=" ") 24 | y_train = np.array(train.values[:, -1], dtype=np.float32) 25 | X_train = np.array(train.values[:, :-1], dtype=np.float32) 26 | X_train = X_train.astype("float32") 27 | y_train = y_train.astype("float32") 28 | X_train /= 255 29 | 30 | X_train = X_train.reshape(len(X_train), *MNIST_IMAGE_SZIE) 31 | 32 | return (X_train, y_train) 33 | 34 | 35 | def load_local_data_as_df(filename): 36 | with open(f'{filename}.json') as f: 37 | metadata = json.load(f) 38 | dtypes = {cd['name']:_get_dtype(cd) for cd in metadata['columns']} 39 | df = pd.read_csv(f'{filename}.csv', dtype=dtypes) 40 | metadata['categorical_columns'], metadata['ordinal_columns'], metadata['continuous_columns'] = _get_columns(metadata) 41 | 42 | df['ID'] = [f'ID{i}' for i in np.arange(len(df))] 43 | df = df.set_index('ID') 44 | 45 | return df, metadata 46 | 47 | 48 | def load_local_data_as_array(filename): 49 | df = pd.read_csv(f'{filename}.csv') 50 | with open(f'{filename}.json') as f: 51 | metadata = json.load(f) 52 | metadata['categorical_columns'], metadata['ordinal_columns'], metadata['continuous_columns'] = _get_columns(metadata) 53 | 54 | data = convert_df_to_array(df, metadata) 55 | 56 | return data, metadata 57 | 58 | 59 | def load_s3_data_as_array(filename): 60 | data = _load_file(filename + '.npz', np.load) 61 | metadata = _load_file(filename + '.json', _load_json) 62 | metadata['categorical_columns'], metadata['ordinal_columns'], metadata['continuous_columns'] = _get_columns(metadata) 63 | 64 | return np.concatenate([data['train'], data['test']]), metadata 65 | 66 | 67 | def load_s3_data_as_df(filename): 68 | data = _load_file(filename + '.npz', np.load) 69 | metadata = _load_file(filename + '.json', _load_json) 70 | metadata['categorical_columns'], metadata['ordinal_columns'], metadata['continuous_columns'] = _get_columns(metadata) 71 | 72 | df = convert_array_to_df(np.concatenate([data['train'], data['test']]), metadata) 73 | 74 | df['ID'] = [f'ID{i}' for i in np.arange(len(df))] 75 | df = df.set_index('ID') 76 | 77 | return df, metadata 78 | 79 | 80 | def _get_dtype(cd): 81 | if cd['type'] == FLOAT: 82 | return np.float 83 | elif cd['type'] == INTEGER: 84 | return np.int 85 | else: 86 | return np.object 87 | 88 | 89 | def _get_columns(metadata): 90 | categorical_columns = list() 91 | ordinal_columns = list() 92 | continuous_columns = list() 93 | for column_idx, column in enumerate(metadata['columns']): 94 | if column['type'] == CATEGORICAL: 95 | categorical_columns.append(column_idx) 96 | elif column['type'] == ORDINAL: 97 | ordinal_columns.append(column_idx) 98 | elif column['type'] in NUMERICAL: 99 | continuous_columns.append(column_idx) 100 | 101 | return categorical_columns, ordinal_columns, continuous_columns 102 | 103 | 104 | def _load_json(path): 105 | with open(path) as json_file: 106 | return json.load(json_file) 107 | 108 | 109 | def _load_file(filename, loader): 110 | local_path = 
path.join(DATA_PATH, filename) 111 | if not path.exists(local_path): 112 | makedirs(DATA_PATH, exist_ok=True) 113 | urllib.request.urlretrieve(BASE_URL + filename, local_path) 114 | 115 | return loader(local_path) 116 | 117 | 118 | def convert_array_to_df(data, metadata): 119 | df = pd.DataFrame(data) 120 | column_names = [] 121 | for i, col in enumerate(metadata['columns']): 122 | column_names.append(col['name']) 123 | if col['type'] in [CATEGORICAL, ORDINAL]: 124 | df.iloc[:, i] = df.iloc[:, i].astype('object') 125 | df.iloc[:, i] = df.iloc[:, i].map(pd.Series(col['i2s'])) 126 | 127 | df.columns = column_names 128 | return df 129 | 130 | 131 | def convert_df_to_array(df, metadata): 132 | dfcopy = df.copy() 133 | for col in metadata['columns']: 134 | if col['name'] in list(dfcopy): 135 | col_data = dfcopy[col['name']] 136 | if col['type'] in [CATEGORICAL, ORDINAL]: 137 | if len(col_data) > len(col_data.dropna()): 138 | col_data = col_data.fillna(FILLNA_VALUE_CAT) 139 | if FILLNA_VALUE_CAT not in col['i2s']: 140 | col['i2s'].append(FILLNA_VALUE_CAT) 141 | col['size'] += 1 142 | cat = CategoricalDtype(categories=col['i2s'], ordered=True) 143 | col_data = col_data.astype(cat) 144 | dfcopy[col['name']] = col_data.cat.codes 145 | 146 | return dfcopy.values 147 | 148 | 149 | def convert_series_to_array(scopy, metadata): 150 | scopy = scopy.copy() 151 | for col in metadata['columns']: 152 | if col['name'] == scopy.name: 153 | if col['type'] in [CATEGORICAL, ORDINAL]: 154 | if len(scopy) > len(scopy.dropna()): 155 | scopy = scopy.fillna(FILLNA_VALUE_CAT) 156 | if FILLNA_VALUE_CAT not in col['i2s']: 157 | col['i2s'].append(FILLNA_VALUE_CAT) 158 | col['size'] += 1 159 | cat = CategoricalDtype(categories=col['i2s'], ordered=True) 160 | scopy = scopy.astype(cat) 161 | scopy = scopy.cat.codes 162 | 163 | return scopy.values 164 | 165 | 166 | -------------------------------------------------------------------------------- /utils/evaluation_framework.py: -------------------------------------------------------------------------------- 1 | """ 2 | Procedures for running a privacy evaluation on a generative model 3 | """ 4 | 5 | from numpy import where, mean 6 | 7 | from utils.constants import * 8 | 9 | def get_accuracy(guesses, labels, targetPresence): 10 | idxIn = where(targetPresence == LABEL_IN)[0] 11 | idxOut = where(targetPresence == LABEL_OUT)[0] 12 | 13 | pIn = sum([g == l for g,l in zip(guesses[idxIn], labels[idxIn])])/len(idxIn) 14 | pOut = sum([g == l for g,l in zip(guesses[idxOut], labels[idxOut])])/len(idxOut) 15 | return pIn, pOut 16 | 17 | 18 | def get_tp_fp_rates(guesses, labels): 19 | targetIn = where(labels == LABEL_IN)[0] 20 | targetOut = where(labels == LABEL_OUT)[0] 21 | return sum(guesses[targetIn] == LABEL_IN)/len(targetIn), sum(guesses[targetOut] == LABEL_IN)/len(targetOut) 22 | 23 | 24 | def get_probs_correct(pdf, targetPresence): 25 | idxIn = where(targetPresence == LABEL_IN)[0] 26 | idxOut = where(targetPresence == LABEL_OUT)[0] 27 | 28 | pdf[pdf > 1.] = 1. 
29 | return mean(pdf[idxIn]), mean(pdf[idxOut]) 30 | 31 | 32 | def get_mia_advantage(tp_rate, fp_rate): 33 | return tp_rate - fp_rate 34 | 35 | 36 | def get_ai_advantage(pCorrectIn, pCorrectOut): 37 | return pCorrectIn - pCorrectOut 38 | 39 | 40 | def get_util_advantage(pCorrectIn, pCorrectOut): 41 | return pCorrectIn - pCorrectOut 42 | 43 | 44 | def get_prob_removed(before, after): 45 | idxIn = where(before == LABEL_IN)[0] 46 | return 1.0 - sum(after[idxIn]/len(idxIn)) 47 | 48 | 49 | 50 | 51 | -------------------------------------------------------------------------------- /utils/logging.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import sys 3 | 4 | 5 | def setup_logger(stream=sys.stderr): 6 | """Setup a logger.""" 7 | logger = logging.getLogger() 8 | logger.setLevel(logging.INFO) 9 | handler = logging.StreamHandler(stream=stream) 10 | handler.setLevel(logging.INFO) 11 | formatter = logging.Formatter("%(asctime)s:%(name)s:%(levelname)s:%(message)s") 12 | handler.setFormatter(formatter) 13 | logger.addHandler(handler) 14 | 15 | return logger 16 | 17 | 18 | LOGGER = setup_logger() 19 | -------------------------------------------------------------------------------- /utils/plot_setup.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | from matplotlib.colors import to_rgb 3 | import seaborn as sns 4 | from palettable import cartocolors 5 | from husl import hex_to_husl 6 | 7 | colours = cartocolors.qualitative.Safe_10.hex_colors 8 | cpalette = sns.color_palette(colours) 9 | cpalette_light = sns.light_palette(hex_to_husl(colours[1]), input="husl") 10 | colours_rgb = [to_rgb(c) for c in colours] 11 | 12 | cmap_qualitative = cartocolors.qualitative.Safe_10.mpl_colormap 13 | cmap_light = sns.light_palette(hex_to_husl(colours[1]), input="husl", as_cmap=True) 14 | 15 | pltmarkers = ['o', 'X', 'D', 'P', '^'] 16 | 17 | fontsizelabels = 26 18 | fontsizeticks = 24 19 | 20 | def set_style(): 21 | sns.set_palette(cpalette) 22 | sns.set_style('whitegrid', {'axes.spines.right': True, 23 | 'axes.spines.top': True, 24 | 'axes.edgecolor': 'k', 25 | 'xtick.color': 'k', 26 | 'ytick.color': 'k', 27 | 'grid.color':'0.7', 28 | 'font.family': 'serif', 29 | 'font.sans-serif': 'cm', 30 | 'text.usetex': True}) 31 | 32 | plt.rcParams.update({ 33 | 'font.family': 'serif', 34 | 'font.sans-serif': 'cm', 35 | 'text.usetex': True, 36 | 'font.size': 14, 37 | 38 | 'xtick.labelsize': 14, 39 | 'ytick.labelsize': 14, 40 | 'axes.labelsize': 16, 41 | 'axes.titlesize': 18, 42 | 43 | 'savefig.dpi': 75, 44 | 45 | 'figure.autolayout': False, 46 | 'figure.figsize': (13, 7), 47 | 'figure.titlesize': 20, 48 | 49 | 'lines.linewidth': 2.0, 50 | 'lines.markersize': 8, 51 | 'legend.fontsize': 14 52 | }) -------------------------------------------------------------------------------- /utils/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | import random 3 | 4 | import numpy as np 5 | import multiprocessing as mp 6 | 7 | from warnings import simplefilter 8 | simplefilter('ignore', category=FutureWarning) 9 | simplefilter('ignore', category=DeprecationWarning) 10 | 11 | 12 | def json_numpy_serialzer(o): 13 | """ Serialize numpy types for json 14 | 15 | Parameters: 16 | o (object): any python object which fails to be serialized by json 17 | 18 | Example: 19 | 20 | >>> import json 21 | >>> a = np.array([1, 2, 3]) 22 | >>> json.dumps(a, 
default=json_numpy_serialzer) 23 | 24 | """ 25 | numpy_types = ( 26 | np.bool_, 27 | np.float16, 28 | np.float32, 29 | np.float64, 30 | # np.float128, -- special handling below 31 | np.int8, 32 | np.int16, 33 | np.int32, 34 | np.int64, 35 | np.str_, 36 | np.timedelta64, 37 | np.uint8, 38 | np.uint16, 39 | np.uint32, 40 | np.uint64, 41 | np.void, 42 | ) 43 | 44 | if isinstance(o, np.ndarray): 45 | return o.tolist() 46 | elif isinstance(o, numpy_types): 47 | return o.item() 48 | elif isinstance(o, np.float128): 49 | return o.astype(np.float64).item() 50 | else: 51 | raise TypeError("{} of type {} is not JSON serializable".format(repr(o), type(o))) 52 | 53 | 54 | def set_random_seed(seed=0): 55 | random.seed(seed) 56 | np.random.seed(seed) 57 | 58 | 59 | def read_json_file(json_file): 60 | with open(json_file, 'r') as file: 61 | return json.load(file) 62 | 63 | 64 | def get_mia_gain(pCorrectSyn): 65 | # return min(1, 2*(1 - pCorrectSyn)) 66 | return 2 * (1 - pCorrectSyn) 67 | 68 | 69 | def get_accuracy(guesses, labels): 70 | return sum([g == l for g, l in zip(guesses, labels)])/len(labels) 71 | 72 | 73 | class CustomProcess(mp.Process): 74 | def run(self, *args, **kwargs): 75 | import warnings 76 | with warnings.catch_warnings(): 77 | warnings.simplefilter('ignore', category=FutureWarning) 78 | warnings.simplefilter('ignore', category=DeprecationWarning) 79 | return mp.Process.run(self, *args, **kwargs) 80 | 81 | 82 | 83 | 84 | 85 | 86 | --------------------------------------------------------------------------------
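The snippet below is a usage sketch, not a file from the repository: it shows how the helpers above could be combined to serialise numpy-typed results and to aggregate and plot linkage-attack results. The directory 'outputs' and the output filenames are hypothetical placeholders; the sketch assumes the project's requirements (seaborn, palettable, husl, and the LaTeX setup used by utils/plot_setup.py) are installed and that linkage_cli.py has already written ResultsMIA_*.json files.

    # Usage sketch only; paths and filenames below are placeholders, not repository defaults.
    import json
    import numpy as np

    from utils.utils import json_numpy_serialzer, set_random_seed
    from utils.analyse_results import load_results_linkage, plt_per_target_pg

    set_random_seed(0)

    # Any intermediate result containing numpy scalars or arrays can be dumped
    # with the serializer defined in utils/utils.py.
    example = {'TPSyn': np.float64(0.8), 'Guesses': np.array([1, 0, 1])}
    with open('example.json', 'w') as f:
        json.dump(example, f, indent=2, default=json_numpy_serialzer)

    # Aggregate linkage-attack results (ResultsMIA_*.json files produced by
    # linkage_cli.py) and plot per-target privacy gain for the 'Naive' feature set.
    resAgg = load_results_linkage('outputs')
    models = sorted(resAgg['TargetModel'].unique())
    fig = plt_per_target_pg(resAgg, models, resFilter=('FeatureSet', 'Naive'))
    fig.savefig('privacy_gain.png', bbox_inches='tight')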