├── .gitignore ├── LICENSE ├── LICENSE.MIT ├── README.md ├── attack_models ├── __init__.py ├── attack_model.py ├── mia_classifier.py └── reconstruction.py ├── data ├── germancredit.json ├── texas.csv └── texas.json ├── docker ├── Dockerfile └── requirements.txt ├── executables ├── __init__.py ├── generate_metadata_file.py └── generate_synthetic_dataset.py ├── feature_sets ├── __init__.py ├── bayes.py ├── feature_set.py ├── independent_histograms.py └── model_agnostic.py ├── generative_models ├── __init__.py ├── ctgan.py ├── data_synthesiser.py ├── data_synthesiser_utils │ ├── __init__.py │ ├── datatypes │ │ ├── AbstractAttribute.py │ │ ├── FloatAttribute.py │ │ ├── IntegerAttribute.py │ │ ├── StringAttribute.py │ │ ├── __init__.py │ │ ├── constants.py │ │ └── utils │ │ │ ├── DataType.py │ │ │ └── __init__.py │ └── utils.py ├── generative_model.py ├── gmm.py └── pate_gan.py ├── inference_cli.py ├── linkage_cli.py ├── notebooks └── Analyse Results.ipynb ├── predictive_models ├── __init__.py └── predictive_model.py ├── requirements.txt ├── sanitisation_techniques └── sanitiser.py ├── tests ├── __init__.py ├── germancredit_test.csv ├── germancredit_test.json ├── inference │ └── runconfig.json ├── linkage │ └── runconfig.json ├── test_attacks.py ├── test_gms.py ├── test_sanitisation.py └── utility │ └── runconfig.json ├── utility_cli.py └── utils ├── __init__.py ├── analyse_results.py ├── constants.py ├── datagen.py ├── evaluation_framework.py ├── logging.py ├── plot_setup.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | venv* 2 | outputs/* 3 | .idea/* 4 | spring_synthetic_data.iml 5 | .DS_Store 6 | __pycache__/* 7 | */__pycache__/* 8 | */__pycache__ 9 | __pycache__ 10 | paper/*.aux 11 | paper/*.synctex.gz 12 | paper/*.pdf 13 | paper/*.log 14 | paper/*.bbl 15 | paper/*.blg 16 | paper/*.out 17 | syn_data_files/ 18 | notebooks_local/ 19 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD-3-Clause License 2 | 3 | Copyright 2021 Theresa Stadler (EPFL SPRING Lab), Bristena Oprisanu (UCL) 4 | 5 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 6 | 7 | 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 8 | 9 | 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 10 | 11 | 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 12 | 13 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
14 | 
--------------------------------------------------------------------------------
/LICENSE.MIT:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright <2018>
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
23 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Privacy evaluation framework for synthetic data publishing
2 | A practical framework to evaluate the privacy-utility tradeoff of synthetic data publishing.
3 | 
4 | Based on "Synthetic Data - Anonymisation Groundhog Day" by Theresa Stadler, Bristena Oprisanu, and Carmela Troncoso, [arXiv](https://arxiv.org/abs/2011.07018), 2020.
5 | 
6 | # Attack models
7 | The module `attack_models` currently includes:
8 | 
9 | A privacy adversary to test for privacy gain with respect to linkage attacks, modelled as a membership inference attack `MIAttackClassifier`.
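A minimal sketch of how such an attack might be assembled from the classes in this repository is shown below (the dataset path, the choice of generative model, and the sample sizes are illustrative only; class and function names are taken from `attack_models/mia_classifier.py`, `feature_sets/model_agnostic.py`, `generative_models/data_synthesiser.py` and `utils/datagen.py`):

```python
from pathlib import Path
from pandas import DataFrame

from attack_models.mia_classifier import (MIAttackClassifierRandomForest,
                                          generate_mia_shadow_data)
from feature_sets.model_agnostic import NaiveFeatureSet
from generative_models.data_synthesiser import IndependentHistogram
from utils.datagen import load_local_data_as_df

# Load a local dataset by its stem (here the Texas hospital data shipped with the repo)
raw, metadata = load_local_data_as_df(Path('data/texas'))
target = raw.sample(1)  # record whose membership the adversary tries to infer

# Train shadow models with and without the target to obtain labelled synthetic datasets
gm = IndependentHistogram(metadata)
synA, labels = generate_mia_shadow_data(gm, target, raw, sizeRaw=1000,
                                        sizeSyn=1000, numModels=10, numCopies=5)

# Fit the distinguisher on the labelled shadow data and produce membership guesses
attack = MIAttackClassifierRandomForest(metadata, FeatureSet=NaiveFeatureSet(DataFrame))
attack.train(synA, labels)
guesses = attack.attack(synA)  # in practice, applied to the published synthetic datasets under evaluation
```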
10 | 
11 | A simple attribute inference attack `AttributeInferenceAttack` that aims to infer a target's sensitive value given partial knowledge about the target record.
12 | 
13 | # Generative models
14 | The module `generative_models` currently includes:
15 | - `IndependentHistogram`: An independent histogram model adapted from [Data Responsibly's DataSynthesizer](https://github.com/DataResponsibly/DataSynthesizer)
16 | - `BayesianNet`: A generative model based on a Bayesian Network adapted from [Data Responsibly's DataSynthesizer](https://github.com/DataResponsibly/DataSynthesizer)
17 | - `PrivBayes`: A differentially private version of the BayesianNet model adapted from [Data Responsibly's DataSynthesizer](https://github.com/DataResponsibly/DataSynthesizer)
18 | - `CTGAN`: A conditional tabular generative adversarial network that integrates the CTGAN model from [CTGAN](https://github.com/sdv-dev/CTGAN)
19 | - `PATE-GAN`: A differentially private generative adversarial network adapted from its original implementation by the [MLforHealth Lab](https://bitbucket.org/mvdschaar/mlforhealthlabpub/src/82d7f91d46db54d256ff4fc920d513499ddd2ab8/alg/pategan/)
20 | 
21 | # Setup
22 | 
23 | ## Docker Distribution
24 | 
25 | For your convenience, Synthetic Data is also distributed as a ready-to-use Docker image containing Python 3.9 and CUDA 11.4.2, along with all required dependencies, including Jupyter Notebook to visualise and analyse the results.
26 | 
27 | **Note:** This distribution includes CUDA binaries. Before downloading the image, make sure you have read [its EULA](https://docs.nvidia.com/cuda/eula/index.html) and agree to its terms.
28 | 
29 | Pull the image and run a container (binding a volume where you want to save the data):
30 | 
31 | ```
32 | docker pull springepfl/synthetic-data:latest
33 | docker run -it --rm -v "$(pwd)/output:/output" -p 8888:8888 springepfl/synthetic-data
34 | ```
35 | 
36 | The Synthetic Data directory is placed at the root of the container's filesystem:
37 | ```
38 | cd /synthetic_data_release
39 | ```
40 | 
41 | You should now be able to run the examples without any problems and to visualise the results with Jupyter by running
42 | ```
43 | jupyter notebook --allow-root --ip=0.0.0.0
44 | ```
45 | 
46 | and opening the notebook in your favourite web browser at the URL `http://127.0.0.1:8888/?token=`.
47 | 
48 | 
49 | ## Direct Installation
50 | 
51 | ### Requirements
52 | The framework and its building blocks have been developed and tested under Python 3.9.
53 | 
54 | We recommend creating a virtual environment to install all dependencies and run the code:
55 | ```
56 | python3 -m venv pyvenv3
57 | source pyvenv3/bin/activate
58 | pip install numpy==1.19.5 && pip install -r requirements.txt
59 | ```
60 | 
61 | Note: Some users have encountered problems because the NumPy API changed between versions; to ensure all dependencies are compiled against the same NumPy version, it needs to be installed first.
62 | 
63 | ### Dependencies
64 | The `CTGAN` model depends on a fork of the original model training code, which can be found at
65 | [CTGAN-SPRING](https://github.com/spring-epfl/CTGAN.git).
66 | 
67 | To install the correct version, clone the repository above and run
68 | ```
69 | cd CTGAN
70 | make install
71 | ```
72 | 
73 | Add the path to this directory to your `PYTHONPATH`. You can also add this line
74 | in your shell configuration file (e.g., `~/.bashrc`) to load it automatically.
75 | ```bash
76 | # Execute this in the CTGAN folder, otherwise replace `pwd` with the actual path
77 | export PYTHONPATH=$PYTHONPATH:`pwd`
78 | ```
79 | 
80 | To test your installation, try running
81 | ```
82 | import ctgan
83 | ```
84 | from within your virtualenv's `python` interpreter.
85 | 
86 | # Example runs
87 | To run a privacy evaluation with respect to the privacy concern of linkability, you can run
88 | 
89 | ```
90 | python3 linkage_cli.py -D data/texas -RC tests/linkage/runconfig.json -O tests/linkage
91 | ```
92 | 
93 | The results file produced after successfully running the script will be written to `tests/linkage` and can be parsed with the function `load_results_linkage` provided in `utils/analyse_results.py`.
94 | A Jupyter notebook to visualise and analyse the results is included at `notebooks/Analyse Results.ipynb`.
95 | 
96 | 
97 | To run a privacy evaluation with respect to the privacy concern of inference, you can run
98 | 
99 | ```
100 | python3 inference_cli.py -D data/texas -RC tests/inference/runconfig.json -O tests/inference
101 | ```
102 | 
103 | The results file produced after successfully running the script can be parsed with the function `load_results_inference` provided in `utils/analyse_results.py`.
104 | A Jupyter notebook to visualise and analyse the results is included at `notebooks/Analyse Results.ipynb`.
105 | 
106 | 
107 | To run a utility evaluation that uses a simple classification task as the utility function, run
108 | 
109 | ```
110 | python3 utility_cli.py -D data/texas -RC tests/utility/runconfig.json -O tests/utility
111 | ```
112 | 
113 | The results file produced after successfully running the script can be parsed with the function `load_results_utility` provided in `utils/analyse_results.py`.
114 | 
115 | 
--------------------------------------------------------------------------------
/attack_models/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/spring-epfl/synthetic_data_release/eba9a43c75a64110b63d79e58c83859634b7339c/attack_models/__init__.py
--------------------------------------------------------------------------------
/attack_models/attack_model.py:
--------------------------------------------------------------------------------
1 | """Parent class for all privacy attacks"""
2 | 
3 | class PrivacyAttack(object):
4 | 
5 |     def train(self, *args):
6 |         """Train privacy adversary"""
7 |         raise NotImplementedError('Method needs to be overwritten by a subclass.')
8 | 
9 |     def attack(self, *args):
10 |         """Make a guess about target's secret"""
11 |         raise NotImplementedError('Method needs to be overwritten by a subclass.')
--------------------------------------------------------------------------------
/attack_models/mia_classifier.py:
--------------------------------------------------------------------------------
1 | """Parent class for launching a membership inference attack on the output of a generative model"""
2 | from pandas import DataFrame
3 | from pandas.api.types import CategoricalDtype
4 | from numpy import ndarray, concatenate, stack, array, round, zeros, arange
5 | 
6 | from sklearn.svm import SVC
7 | from sklearn.linear_model import LogisticRegression
8 | from sklearn.ensemble import RandomForestClassifier
9 | from sklearn.neighbors import KNeighborsClassifier
10 | from sklearn.neural_network import MLPClassifier
11 | from sklearn.model_selection import ShuffleSplit
12 | 
13 | from utils.datagen import convert_df_to_array
14 | from utils.utils import CustomProcess
15 | from
utils.constants import * 16 | 17 | from attack_models.attack_model import PrivacyAttack 18 | 19 | from warnings import simplefilter 20 | simplefilter('ignore', category=FutureWarning) 21 | simplefilter('ignore', category=DeprecationWarning) 22 | 23 | import multiprocessing as mp 24 | 25 | class MIAttackClassifier(PrivacyAttack): 26 | """"Parent class for membership inference attack on the output of a generative model using sklearn classifier""" 27 | def __init__(self, Distinguisher, metadata, FeatureSet=None, quids=None): 28 | 29 | self.Distinguisher = Distinguisher 30 | self.FeatureSet = FeatureSet 31 | 32 | self.metadata, self.categoricalAttributes, self.numericalAttributes = self._read_meta(metadata, quids) 33 | 34 | self.trained = False 35 | 36 | self.__name__ = f'{self.Distinguisher.__class__.__name__}{self.FeatureSet.__class__.__name__}' 37 | 38 | def train(self, synA, labels): 39 | """Train a membership inference attack on a labelled training set""" 40 | 41 | if self.FeatureSet is not None: 42 | synA = stack([self.FeatureSet.extract(s) for s in synA]) 43 | else: 44 | synA = stack([self._df_to_array(s).flatten() for s in synA]) 45 | 46 | if not isinstance(labels, ndarray): 47 | labels = array(labels) 48 | 49 | self.Distinguisher.fit(synA, labels) 50 | 51 | self.trained = True 52 | 53 | def attack(self, datasets, attemptLinkage=False, target=None): 54 | """ 55 | Make a guess about the target's membership in the training data of the 56 | generative model that produced the synthetic input data 57 | 58 | :param datasets: list: A list of synthetic or sanitised datasets 59 | :return: guess: list: A guess about the target's membership for each of the synthetic input datasets 60 | """ 61 | assert self.trained, 'Attack must first be trained.' 62 | 63 | if attemptLinkage: 64 | assert target is not None, 'Attacker needs target record to attempt linkage' 65 | 66 | guesses = [] 67 | for df in datasets: 68 | if attemptLinkage: 69 | try: 70 | k = df.groupby(self.categoricalAttributes).size()[target[self.categoricalAttributes].values] 71 | if all(k == 1): 72 | guess = LABEL_IN 73 | else: 74 | guess = self._make_guess(df) 75 | except: 76 | guess = self._make_guess(df) 77 | else: 78 | guess = self._make_guess(df) 79 | 80 | guesses.append(guess) 81 | 82 | return guesses 83 | 84 | def _make_guess(self, df): 85 | if self.FeatureSet is not None: 86 | f = self.FeatureSet.extract(df).reshape(1, -1) 87 | else: 88 | f = self._df_to_array(df).reshape(1, -1) 89 | 90 | return round(self.Distinguisher.predict(f), 0).astype(int)[0] 91 | 92 | 93 | def get_confidence(self, synT, secret): 94 | """Calculate probability that attacker correctly predicts whether target was present in model's training data""" 95 | assert self.trained, 'Attack must first be trained.' 
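        # Represent each input dataset either by its extracted feature vector or by its
        # flattened raw encoding, then score it with the trained distinguisher below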
96 | if self.FeatureSet is not None: 97 | synT = stack([self.FeatureSet.extract(s) for s in synT]) 98 | else: 99 | if isinstance(synT[0], DataFrame): 100 | synT = stack([convert_df_to_array(s, self.metadata).flatten() for s in synT]) 101 | else: 102 | synT = stack([s.flatten() for s in synT]) 103 | 104 | probs = self.Distinguisher.predict_proba(synT) 105 | 106 | return [p[s] for p,s in zip(probs, secret)] 107 | 108 | def _read_meta(self, metadata, quids): 109 | if quids is None: 110 | quids = [] 111 | 112 | meta_dict = {} 113 | categoricalAttributes = [] 114 | numericalAttributes = [] 115 | 116 | for cdict in metadata['columns']: 117 | attr_name = cdict['name'] 118 | data_type = cdict['type'] 119 | 120 | if data_type == FLOAT or data_type == INTEGER: 121 | if attr_name in quids: 122 | cat_bins = cdict['bins'] 123 | cat_labels = [f'({cat_bins[i]},{cat_bins[i+1]}]' for i in range(len(cat_bins)-1)] 124 | 125 | meta_dict[attr_name] = { 126 | 'type': CATEGORICAL, 127 | 'categories': cat_labels, 128 | 'size': len(cat_labels) 129 | } 130 | 131 | categoricalAttributes.append(attr_name) 132 | 133 | else: 134 | meta_dict[attr_name] = { 135 | 'type': data_type, 136 | 'min': cdict['min'], 137 | 'max': cdict['max'] 138 | } 139 | 140 | numericalAttributes.append(attr_name) 141 | 142 | elif data_type == CATEGORICAL or data_type == ORDINAL: 143 | meta_dict[attr_name] = { 144 | 'type': data_type, 145 | 'categories': cdict['i2s'], 146 | 'size': len(cdict['i2s']) 147 | } 148 | 149 | categoricalAttributes.append(attr_name) 150 | 151 | else: 152 | raise ValueError(f'Unknown data type {data_type} for attribute {attr_name}') 153 | 154 | return meta_dict, categoricalAttributes, numericalAttributes 155 | 156 | def _df_to_array(self, data): 157 | dfAsArray = [] 158 | for col, cdict in self.metadata.items(): 159 | if col in list(data): 160 | colData = data[col].copy() 161 | coltype = cdict['type'] 162 | 163 | if coltype in STRINGS: 164 | if len(colData) > len(colData.dropna()): 165 | colData = colData.fillna(FILLNA_VALUE_CAT) 166 | if FILLNA_VALUE_CAT not in cdict['categories']: 167 | col['categories'].append(FILLNA_VALUE_CAT) 168 | col['size'] += 1 169 | 170 | if coltype == ORDINAL: 171 | cat = CategoricalDtype(categories=cdict['categories'], ordered=True) 172 | colData = colData.astype(cat) 173 | colArray = colData.cat.codes.values.reshape(-1, 1) 174 | 175 | else: 176 | colArray = self._one_hot(colData.values, cdict['categories']) 177 | 178 | elif coltype in NUMERICAL: 179 | colArray = colData.values.reshape(-1, 1) 180 | 181 | else: 182 | raise ValueError(f'Unknown type {coltype} for col {col}') 183 | 184 | dfAsArray.append(colArray) 185 | 186 | return concatenate(dfAsArray, axis=1) 187 | 188 | def _one_hot(self, col_data, categories): 189 | col_data_onehot = zeros((len(col_data), len(categories))) 190 | cidx = [categories.index(c) for c in col_data] 191 | col_data_onehot[arange(len(col_data)), cidx] = 1 192 | 193 | return col_data_onehot 194 | 195 | 196 | class MIAttackClassifierLinearSVC(MIAttackClassifier): 197 | 198 | def __init__(self, metadata, FeatureSet=None): 199 | super().__init__(SVC(kernel='linear', probability=True), metadata, FeatureSet) 200 | 201 | 202 | class MIAttackClassifierSVC(MIAttackClassifier): 203 | 204 | def __init__(self, metadata, FeatureSet=None): 205 | super().__init__(SVC(probability=True), metadata, FeatureSet) 206 | 207 | 208 | class MIAttackClassifierLogReg(MIAttackClassifier): 209 | 210 | def __init__(self, metadata, FeatureSet=None): 211 | super().__init__(LogisticRegression(), 
metadata, FeatureSet) 212 | 213 | 214 | class MIAttackClassifierRandomForest(MIAttackClassifier): 215 | 216 | def __init__(self, metadata, FeatureSet=None, quids=None): 217 | super().__init__(RandomForestClassifier(), metadata=metadata, FeatureSet=FeatureSet, quids=quids) 218 | 219 | 220 | class MIAttackClassifierKNN(MIAttackClassifier): 221 | 222 | def __init__(self, metadata, FeatureSet=None, quids=None): 223 | super().__init__(KNeighborsClassifier(n_neighbors=5), metadata=metadata, FeatureSet=FeatureSet, quids=quids) 224 | 225 | 226 | class MIAttackClassifierMLP(MIAttackClassifier): 227 | 228 | def __init__(self, metadata, FeatureSet=None, quids=None): 229 | super().__init__(MLPClassifier((200,), solver='lbfgs'), metadata=metadata, FeatureSet=FeatureSet, quids=quids) 230 | 231 | 232 | def generate_mia_shadow_data(GenModel, target, rawA, sizeRaw, sizeSyn, numModels, numCopies): 233 | assert isinstance(rawA, GenModel.datatype), f"GM expects datatype {GenModel.datatype} but got {type(rawA)}" 234 | assert isinstance(target, type(rawA)), f"Mismatch of datatypes between target record and raw data" 235 | 236 | kf = ShuffleSplit(n_splits=numModels, train_size=sizeRaw) 237 | 238 | if GenModel.multiprocess: 239 | 240 | manager = mp.Manager() 241 | synA = manager.list() 242 | labelsA = manager.list() 243 | jobs = [] 244 | tasks = [(rawA, train_index, GenModel, target, sizeSyn, numCopies, synA, labelsA) for train_index, _ in kf.split(rawA)] 245 | 246 | for task in tasks: 247 | p = CustomProcess(target=worker_train_shadow, args=task) 248 | jobs.append(p) 249 | p.start() 250 | 251 | for p in jobs: 252 | p.join() 253 | 254 | else: 255 | synA, labelsA = [], [] 256 | for train_index, _ in kf.split(rawA): 257 | worker_train_shadow(rawA, train_index, GenModel, target, sizeSyn, numCopies, synA, labelsA) 258 | 259 | return synA, labelsA 260 | 261 | 262 | def worker_train_shadow(rawA, train_index, GenModel, target, sizeSyn, numCopies, synA, labelsA): 263 | # Fit GM to data without target's data 264 | if isinstance(rawA, DataFrame): 265 | rawAout = rawA.iloc[train_index] 266 | else: 267 | rawAout = rawA[train_index, :] 268 | GenModel.fit(rawAout) 269 | 270 | # Generate synthetic sample for data without target 271 | synOut = [GenModel.generate_samples(sizeSyn) for _ in range(numCopies)] 272 | labelsOut = [LABEL_OUT for _ in range(numCopies)] 273 | 274 | # Insert targets into training data 275 | if isinstance(rawA, DataFrame): 276 | rawAin = rawAout.append(target) 277 | else: 278 | if len(target.shape) == 1: 279 | target = target.reshape(1, len(target)) 280 | rawAin = concatenate([rawAout, target]) 281 | 282 | # Fit generative model to data including target 283 | GenModel.fit(rawAin) 284 | 285 | # Generate synthetic sample for data including target 286 | synIn = [GenModel.generate_samples(sizeSyn) for _ in range(numCopies)] 287 | labelsIn = [LABEL_IN for _ in range(numCopies)] 288 | 289 | syn = synOut + synIn 290 | labels = labelsOut + labelsIn 291 | 292 | synA.extend(syn) 293 | labelsA.extend(labels) 294 | 295 | 296 | def generate_mia_anon_data(Sanitiser, target, rawA, sizeRaw, numSamples): 297 | assert isinstance(rawA, Sanitiser.datatype), f"GM expects datatype {Sanitiser.datatype} but got {type(rawA)}" 298 | assert isinstance(target, type(rawA)), f"Mismatch of datatypes between target record and raw data" 299 | 300 | kf = ShuffleSplit(n_splits=numSamples, train_size=sizeRaw) 301 | 302 | sanA, labelsA = [], [] 303 | for train_index, _ in kf.split(rawA): 304 | worker_sanitise_data(rawA, train_index, Sanitiser, 
target, sanA, labelsA) 305 | 306 | return sanA, labelsA 307 | 308 | 309 | def worker_sanitise_data(rawA, train_index, Sanitiser, target, sanA, labelsA): 310 | # Fit GM to data without target's data 311 | if isinstance(rawA, DataFrame): 312 | rawAout = rawA.iloc[train_index] 313 | else: 314 | rawAout = rawA[train_index, :] 315 | sanOut = Sanitiser.sanitise(rawAout) 316 | sanA.append(sanOut) 317 | labelsA.append(LABEL_OUT) 318 | 319 | # Insert targets into training data 320 | if isinstance(rawA, DataFrame): 321 | rawAin = rawAout.append(target) 322 | else: 323 | if len(target.shape) == 1: 324 | target = target.reshape(1, len(target)) 325 | rawAin = concatenate([rawAout, target]) 326 | 327 | # Fit generative model to data including target 328 | sanIn = Sanitiser.sanitise(rawAin) 329 | sanA.append(sanIn) 330 | labelsA.append(LABEL_IN) 331 | 332 | 333 | 334 | -------------------------------------------------------------------------------- /attack_models/reconstruction.py: -------------------------------------------------------------------------------- 1 | from os import path 2 | from pandas.api.types import CategoricalDtype 3 | from numpy import mean, concatenate, ones, sqrt, zeros, arange 4 | from scipy.stats import norm 5 | from sklearn.impute import SimpleImputer 6 | from sklearn.linear_model import LinearRegression 7 | from sklearn.ensemble import RandomForestClassifier 8 | 9 | from attack_models.attack_model import PrivacyAttack 10 | from utils.constants import * 11 | from utils.logging import LOGGER 12 | 13 | 14 | class AttributeInferenceAttack(PrivacyAttack): 15 | """A privacy attack that aims to reconstruct a sensitive attribute c given a partial target record T""" 16 | 17 | def __init__(self, PredictionModel, sensitiveAttribute, metadata, quids=None): 18 | """ 19 | Parent class for simple regression attribute inference attack 20 | 21 | :param PredictionModel: object: sklearn-type prediction model 22 | :param sensitiveAttribute: string: name of a column in a DataFrame that is considered the unknown, sensitive attribute 23 | :param metadata: dict: schema for the data to be attacked 24 | :param backgroundKnowledge: pd.DataFrame: adversary's background knowledge dataset 25 | """ 26 | 27 | self.PredictionModel = PredictionModel 28 | self.sensitiveAttribute = sensitiveAttribute 29 | 30 | self.metadata, self.knownAttributes, self.categoricalAttributes, self.nfeatures = self._read_meta(metadata, quids) 31 | 32 | self.ImputerCat = SimpleImputer(strategy='most_frequent') 33 | self.ImputerNum = SimpleImputer(strategy='median') 34 | 35 | self.trained = False 36 | 37 | self.__name__ = f'{self.PredictionModel.__class__.__name__}' 38 | 39 | def attack(self, targetAux, attemptLinkage=False, data=None): 40 | """Makes a guess about the target's secret attribute""" 41 | assert self.trained, 'Attack must first be trained on some data before can predict sensitive target value' 42 | 43 | if attemptLinkage: 44 | assert data is not None, "Need a dataset for linkage attack." 
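            # Linkage shortcut: if the target's combination of categorical attributes is unique
            # in the provided dataset, the sensitive value can be read directly from the matching
            # record; otherwise fall back to the trained prediction model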
45 | try: 46 | groups = data.groupby(self.categoricalAttributes) 47 | targetCats = targetAux[self.categoricalAttributes].values 48 | groupSize = groups.size()[targetCats] 49 | if all(groupSize == 1): 50 | guess = groups.get_group(tuple(targetCats[0]))[self.sensitiveAttribute].values[0] 51 | else: 52 | guess = self._make_guess(targetAux) 53 | except: 54 | guess = self._make_guess(targetAux) 55 | else: 56 | guess = self._make_guess(targetAux) 57 | 58 | return guess 59 | 60 | def _make_guess(self, targetAux): 61 | raise NotImplementedError('Method must be overriden by a subclass') 62 | 63 | def _read_meta(self, metadata, quids): 64 | if quids is None: 65 | quids = [] 66 | 67 | meta_dict = {} 68 | knownAttributes = [] 69 | categoricalAttributes = [] 70 | nfeatures = 0 71 | 72 | for cdict in metadata['columns']: 73 | attr_name = cdict['name'] 74 | data_type = cdict['type'] 75 | 76 | if data_type == FLOAT or data_type == INTEGER: 77 | if attr_name in quids: 78 | cat_bins = cdict['bins'] 79 | cat_labels = [f'({cat_bins[i]},{cat_bins[i+1]}]' for i in range(len(cat_bins)-1)] 80 | 81 | meta_dict[attr_name] = { 82 | 'type': CATEGORICAL, 83 | 'categories': cat_labels, 84 | 'size': len(cat_labels) 85 | } 86 | 87 | nfeatures += len(cat_labels) 88 | 89 | if attr_name != self.sensitiveAttribute: 90 | categoricalAttributes.append(attr_name) 91 | 92 | else: 93 | meta_dict[attr_name] = { 94 | 'type': data_type, 95 | 'min': cdict['min'], 96 | 'max': cdict['max'] 97 | } 98 | 99 | nfeatures += 1 100 | 101 | elif data_type == CATEGORICAL or data_type == ORDINAL: 102 | meta_dict[attr_name] = { 103 | 'type': data_type, 104 | 'categories': cdict['i2s'], 105 | 'size': len(cdict['i2s']) 106 | } 107 | 108 | nfeatures += len(cdict['i2s']) 109 | 110 | if attr_name != self.sensitiveAttribute: 111 | categoricalAttributes.append(attr_name) 112 | 113 | else: 114 | raise ValueError(f'Unknown data type {data_type} for attribute {attr_name}') 115 | 116 | if attr_name != self.sensitiveAttribute: 117 | knownAttributes.append(attr_name) 118 | 119 | return meta_dict, knownAttributes, categoricalAttributes, nfeatures 120 | 121 | def _encode_data(self, data): 122 | dfcopy = data.copy() 123 | for col, cdict in self.metadata.items(): 124 | if col in list(dfcopy): 125 | col_data = dfcopy[col] 126 | if cdict['type'] in [CATEGORICAL, ORDINAL]: 127 | if len(col_data) > len(col_data.dropna()): 128 | col_data = col_data.fillna(FILLNA_VALUE_CAT) 129 | if FILLNA_VALUE_CAT not in cdict['categories']: 130 | col['categories'].append(FILLNA_VALUE_CAT) 131 | col['size'] += 1 132 | 133 | cat = CategoricalDtype(categories=cdict['categories'], ordered=True) 134 | col_data = col_data.astype(cat) 135 | dfcopy[col] = col_data.cat.codes 136 | 137 | return dfcopy.values 138 | 139 | def _impute_missing_values(self, df): 140 | dfImpute = df.copy() 141 | 142 | catCols = [] 143 | numCols = [] 144 | 145 | for attr, col in self.metadata.items(): 146 | if attr in list(dfImpute): 147 | if col['type'] in [CATEGORICAL, ORDINAL]: 148 | catCols.append(attr) 149 | elif col['type'] in NUMERICAL: 150 | numCols.append(attr) 151 | 152 | self.ImputerCat.fit(df[catCols]) 153 | dfImpute[catCols] = self.ImputerCat.transform(df[catCols]) 154 | 155 | self.ImputerNum.fit(df[numCols]) 156 | dfImpute[numCols] = self.ImputerNum.transform(df[numCols]) 157 | 158 | return dfImpute 159 | 160 | def _one_hot(self, col_data, categories): 161 | col_data_onehot = zeros((len(col_data), len(categories))) 162 | cidx = [categories.index(c) for c in col_data] 163 | 
col_data_onehot[arange(len(col_data)), cidx] = 1 164 | 165 | return col_data_onehot 166 | 167 | 168 | class LinRegAttack(AttributeInferenceAttack): 169 | """An AttributeInferenceAttack based on a simple Linear Regression model""" 170 | def __init__(self, sensitiveAttribute, metadata, quids=None): 171 | super().__init__(LinearRegression(fit_intercept=False), sensitiveAttribute, metadata, quids) 172 | 173 | self.scaleFactor = None 174 | self.coefficients = None 175 | self.sigma = None 176 | 177 | 178 | def train(self, data): 179 | """ 180 | Train a MLE attack to reconstruct an unknown sensitive value from a vector of known attributes 181 | :param data: type(DataFrame) A dataset of shape (n, k) 182 | """ 183 | features = self._encode_data(data.drop(self.sensitiveAttribute, axis=1)) 184 | labels = data[self.sensitiveAttribute].values 185 | 186 | n, k = features.shape 187 | 188 | # Center independent variables for better regression performance 189 | self.scaleFactor = mean(features, axis=0) 190 | featuresScaled = features - self.scaleFactor 191 | featuresScaled = concatenate([ones((n, 1)), featuresScaled], axis=1) # append all ones for inclu intercept in beta vector 192 | 193 | # Get MLE for linear coefficients 194 | self.PredictionModel.fit(featuresScaled, labels) 195 | self.coefficients = self.PredictionModel.coef_ 196 | self.sigma = sum((labels - featuresScaled.dot(self.coefficients))**2)/(n-k) 197 | 198 | LOGGER.debug('Finished training regression model') 199 | self.trained = True 200 | 201 | def _make_guess(self, targetAux): 202 | targetFeatures = self._encode_data(targetAux) 203 | targetFeaturesScaled = targetFeatures - self.scaleFactor 204 | targetFeaturesScaled = concatenate([ones((len(targetFeaturesScaled), 1)), targetFeatures], axis=1) 205 | 206 | guess = targetFeaturesScaled.dot(self.coefficients)[0] 207 | 208 | return guess 209 | 210 | def get_likelihood(self, targetAux, targetSensitive, attemptLinkage=False, data=None): 211 | assert self.trained, 'Attack must first be trained on some data before can predict sensitive target value' 212 | 213 | targetFeatures = self._encode_data(targetAux) 214 | targetFeaturesScaled = targetFeatures - self.scaleFactor 215 | targetFeaturesScaled = concatenate([ones((len(targetFeaturesScaled), 1)), targetFeatures], axis=1) 216 | 217 | if attemptLinkage: 218 | assert data is not None, "Need a dataset for linkage attack." 219 | try: 220 | groups = data.groupby(self.categoricalAttributes) 221 | targetCats = targetAux[self.categoricalAttributes].values 222 | groupSize = groups.size()[targetCats] 223 | if all(groupSize == 1): 224 | pCorrect = 1. 
225 | 226 | else: 227 | pdfLikelihood = norm(loc=targetFeaturesScaled.dot(self.coefficients), scale=sqrt(self.sigma)) 228 | pCorrect = pdfLikelihood.pdf(targetSensitive)[0] 229 | 230 | except: 231 | pdfLikelihood = norm(loc=targetFeaturesScaled.dot(self.coefficients), scale=sqrt(self.sigma)) 232 | pCorrect = pdfLikelihood.pdf(targetSensitive)[0] 233 | else: 234 | pdfLikelihood = norm(loc=targetFeaturesScaled.dot(self.coefficients), scale=sqrt(self.sigma)) 235 | pCorrect = pdfLikelihood.pdf(targetSensitive)[0] 236 | 237 | return pCorrect 238 | 239 | 240 | class RandForestAttack(AttributeInferenceAttack): 241 | """An AttributeInferenceAttack based on a simple Linear Regression model""" 242 | def __init__(self, sensitiveAttribute, metadata, quids=None): 243 | super().__init__(RandomForestClassifier(), sensitiveAttribute, metadata, quids) 244 | 245 | self.labels = {l:i for i, l in enumerate(self.metadata[self.sensitiveAttribute]['categories'])} 246 | self.labelsInv = {i:l for l, i in self.labels.items()} 247 | 248 | self.scaleFactor = None 249 | 250 | def train(self, data): 251 | """ 252 | Train a Classifier to reconstruct an unknown sensitive label from a vector of known attributes 253 | :param data: type(DataFrame) A dataset of shape (n, k) 254 | """ 255 | features = self._encode_data(data.drop(self.sensitiveAttribute, axis=1)) 256 | labels = data[self.sensitiveAttribute].apply(lambda x: self.labels[x]).values 257 | 258 | # Feature normalisation 259 | self.scaleFactor = mean(features, axis=0) 260 | featuresScaled = features - self.scaleFactor 261 | 262 | # Get MLE for linear coefficients 263 | self.PredictionModel.fit(featuresScaled, labels) 264 | 265 | LOGGER.debug('Finished training regression model') 266 | self.trained = True 267 | 268 | def _make_guess(self, targetAux): 269 | targetFeatures = self._encode_data(targetAux) 270 | targetFeaturesScaled = targetFeatures - self.scaleFactor 271 | 272 | guess = self.PredictionModel.predict(targetFeaturesScaled) 273 | 274 | return self.labelsInv[guess[0]] 275 | 276 | def get_likelihood(self, targetAux, targetSensitive, attemptLinkage=False, data=None): 277 | assert self.trained, 'Attack must first be trained on some data before can predict sensitive target value' 278 | 279 | targetFeatures = self._encode_data(targetAux) 280 | targetFeaturesScaled = targetFeatures - self.scaleFactor 281 | 282 | if attemptLinkage: 283 | assert data is not None, "Need a dataset for linkage attack." 284 | try: 285 | groups = data.groupby(self.categoricalAttributes) 286 | targetCats = targetAux[self.categoricalAttributes].values 287 | groupSize = groups.size()[targetCats] 288 | if all(groupSize == 1): 289 | pCorrect = 1. 
290 | 291 | else: 292 | probs = self.PredictionModel.predict_proba(targetFeaturesScaled).flatten() 293 | pCorrect = probs[self.labels[targetSensitive]] 294 | 295 | except: 296 | probs = self.PredictionModel.predict_proba(targetFeaturesScaled).flatten() 297 | pCorrect = probs[self.labels[targetSensitive]] 298 | else: 299 | probs = self.PredictionModel.predict_proba(targetFeaturesScaled).flatten() 300 | pCorrect = probs[self.labels[targetSensitive]] 301 | 302 | return pCorrect -------------------------------------------------------------------------------- /data/germancredit.json: -------------------------------------------------------------------------------- 1 | { 2 | "columns": [ 3 | { 4 | "name": "Age", 5 | "min": 19.0, 6 | "max": 75.0, 7 | "type": "Float" 8 | }, 9 | { 10 | "name": "Sex", 11 | "type": "Categorical", 12 | "size": 2, 13 | "i2s": [ 14 | "male", 15 | "female" 16 | ] 17 | }, 18 | { 19 | "name": "Job", 20 | "type": "Ordinal", 21 | "size": 4, 22 | "i2s": [ 23 | "unemployed", 24 | "unskilled", 25 | "skilled", 26 | "management" 27 | ] 28 | }, 29 | { 30 | "name": "Housing", 31 | "type": "Categorical", 32 | "size": 3, 33 | "i2s": [ 34 | "own", 35 | "free", 36 | "rent" 37 | ] 38 | }, 39 | { 40 | "name": "Saving accounts", 41 | "type": "Ordinal", 42 | "i2s": [ 43 | "no_info", 44 | "little", 45 | "moderate", 46 | "quite rich", 47 | "rich" 48 | ], 49 | "size": 5 50 | }, 51 | { 52 | "name": "Checking account", 53 | "type": "Ordinal", 54 | "size": 4, 55 | "i2s": [ 56 | "no_info", 57 | "little", 58 | "moderate", 59 | "rich" 60 | ] 61 | }, 62 | { 63 | "name": "Credit amount", 64 | "type": "Float", 65 | "min": 250.0, 66 | "max": 18424.0 67 | }, 68 | { 69 | "name": "Duration", 70 | "type": "Float", 71 | "min": 4.0, 72 | "max": 72.0 73 | }, 74 | { 75 | "name": "Purpose", 76 | "type": "Categorical", 77 | "size": 8, 78 | "i2s": [ 79 | "radio/TV", 80 | "education", 81 | "furniture/equipment", 82 | "car", 83 | "business", 84 | "domestic appliances", 85 | "repairs", 86 | "vacation/others" 87 | ] 88 | }, 89 | { 90 | "name": "Risk", 91 | "type": "Categorical", 92 | "size": 2, 93 | "i2s": [ 94 | "good", 95 | "bad" 96 | ] 97 | } 98 | ] 99 | } 100 | -------------------------------------------------------------------------------- /data/texas.json: -------------------------------------------------------------------------------- 1 | { 2 | "columns": [ 3 | { 4 | "name": "DISCHARGE", 5 | "type": "Categorical", 6 | "size": 9, 7 | "i2s": [ 8 | "2013Q4", 9 | "2013Q1", 10 | "2013Q3", 11 | "2013Q2", 12 | "2012Q4", 13 | "2014Q4", 14 | "2014Q1", 15 | "2014Q2", 16 | "2014Q3" 17 | ] 18 | }, 19 | { 20 | "name": "TYPE_OF_ADMISSION", 21 | "type": "Categorical", 22 | "size": 7, 23 | "i2s": [ 24 | "3", 25 | "4", 26 | "1", 27 | "2", 28 | "9", 29 | "5", 30 | "INVALID" 31 | ] 32 | }, 33 | { 34 | "name": "PAT_STATE", 35 | "type": "Categorical", 36 | "size": 9, 37 | "i2s": [ 38 | "TX", 39 | "NM", 40 | "AR", 41 | "ZZ", 42 | "OK", 43 | "FC", 44 | "LA", 45 | "XX", 46 | "INVALID" 47 | ] 48 | }, 49 | { 50 | "name": "PAT_STATUS", 51 | "type": "Categorical", 52 | "size": 23, 53 | "i2s": [ 54 | "6", 55 | "1", 56 | "3", 57 | "2", 58 | "51", 59 | "62", 60 | "50", 61 | "20", 62 | "65", 63 | "5", 64 | "63", 65 | "7", 66 | "9", 67 | "4", 68 | "INVALID", 69 | "66", 70 | "61", 71 | "64", 72 | "30", 73 | "43", 74 | "8", 75 | "41", 76 | "40" 77 | ] 78 | }, 79 | { 80 | "name": "SEX_CODE", 81 | "type": "Categorical", 82 | "size": 4, 83 | "i2s": [ 84 | "F", 85 | "M", 86 | "U", 87 | "INVALID" 88 | ] 89 | }, 90 | { 91 | "name": "RACE", 92 | 
"type": "Categorical", 93 | "size": 6, 94 | "i2s": [ 95 | "4", 96 | "5", 97 | "3", 98 | "2", 99 | "INVALID", 100 | "1" 101 | ] 102 | }, 103 | { 104 | "name": "ETHNICITY", 105 | "type": "Categorical", 106 | "size": 3, 107 | "i2s": [ 108 | "2", 109 | "1", 110 | "INVALID" 111 | ] 112 | }, 113 | { 114 | "name": "ADMIT_WEEKDAY", 115 | "type": "Categorical", 116 | "size": 7, 117 | "i2s": [ 118 | "3", 119 | "4", 120 | "1", 121 | "2", 122 | "7", 123 | "5", 124 | "6" 125 | ] 126 | }, 127 | { 128 | "name": "PAT_AGE", 129 | "type": "Ordinal", 130 | "size": 23, 131 | "i2s": [ 132 | "00", 133 | "01", 134 | "02", 135 | "03", 136 | "04", 137 | "05", 138 | "06", 139 | "07", 140 | "08", 141 | "09", 142 | "10", 143 | "11", 144 | "12", 145 | "13", 146 | "14", 147 | "15", 148 | "16", 149 | "17", 150 | "18", 151 | "19", 152 | "20", 153 | "21", 154 | "INVALID" 155 | ] 156 | }, 157 | { 158 | "name": "RISK_MORTALITY", 159 | "type": "Ordinal", 160 | "size": 5, 161 | "i2s": [ 162 | "0", 163 | "1", 164 | "2", 165 | "3", 166 | "4" 167 | ] 168 | }, 169 | { 170 | "name": "ILLNESS_SEVERITY", 171 | "type": "Ordinal", 172 | "size": 5, 173 | "i2s": [ 174 | "0", 175 | "1", 176 | "2", 177 | "3", 178 | "4" 179 | ] 180 | }, 181 | { 182 | "name": "LENGTH_OF_STAY", 183 | "type": "Integer", 184 | "min": 1, 185 | "max": 986 186 | }, 187 | { 188 | "name": "TOTAL_CHARGES", 189 | "type": "Float", 190 | "min": 0.0, 191 | "max": 3293072.0 192 | }, 193 | { 194 | "name": "TOTAL_NON_COV_CHARGES", 195 | "type": "Float", 196 | "min": 0.0, 197 | "max": 969641.0 198 | }, 199 | { 200 | "name": "TOTAL_CHARGES_ACCOMM", 201 | "type": "Float", 202 | "min": 0.0, 203 | "max": 974433.0 204 | }, 205 | { 206 | "name": "TOTAL_NON_COV_CHARGES_ACCOMM", 207 | "type": "Float", 208 | "min": 0.0, 209 | "max": 412751.0 210 | }, 211 | { 212 | "name": "TOTAL_CHARGES_ANCIL", 213 | "type": "Float", 214 | "min": 0.0, 215 | "max": 2994631.0 216 | }, 217 | { 218 | "name": "TOTAL_NON_COV_CHARGES_ANCIL", 219 | "type": "Float", 220 | "min": 0.0, 221 | "max": 642921.0 222 | } 223 | ] 224 | } 225 | -------------------------------------------------------------------------------- /docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvcr.io/nvidia/cuda:11.4.2-cudnn8-devel-ubuntu20.04 2 | 3 | RUN export DEBIAN_FRONTEND="noninteractive" && \ 4 | apt-get update && \ 5 | apt-get upgrade -y && \ 6 | apt-get autoremove -y && \ 7 | apt-get install --no-install-recommends -y \ 8 | cm-super \ 9 | cython3 \ 10 | dvipng \ 11 | git \ 12 | libfreetype6-dev \ 13 | pkgconf \ 14 | python3-dev \ 15 | python3-pip \ 16 | texlive \ 17 | texlive-latex-extra \ 18 | && \ 19 | apt-get clean -y && \ 20 | rm -rf /var/lib/apt/lists/* 21 | 22 | COPY ./requirements.txt /requirements.txt 23 | 24 | RUN cd / && \ 25 | python3 -m pip install --upgrade pip && \ 26 | python3 -m pip install --upgrade wheel && \ 27 | python3 -m pip install numpy==1.19.5 && \ 28 | python3 -m pip install -r requirements.txt 29 | 30 | RUN cd / && \ 31 | git clone https://github.com/spring-epfl/CTGAN.git && \ 32 | git clone https://github.com/spring-epfl/synthetic_data_release.git && \ 33 | cd CTGAN && \ 34 | python3 -m pip install . 
35 | 36 | ENV PYTHONPATH "${PYTHONPATH}:/CTGAN" 37 | 38 | ENTRYPOINT ["/bin/bash"] 39 | -------------------------------------------------------------------------------- /docker/requirements.txt: -------------------------------------------------------------------------------- 1 | husl==4.0.3 2 | loguru==0.5.3 3 | matplotlib==3.4.3 4 | notebook==6.4.6 5 | palettable==3.3.0 6 | pandas==0.25.3 7 | scipy==1.7.1 8 | seaborn==0.11.2 9 | sklearn==0.0 10 | tensorflow==2.6.0 11 | torch==1.9.1 12 | -------------------------------------------------------------------------------- /executables/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Module containing executable scripts. 4 | 5 | ----- 6 | Nampoina Andriamilanto 7 | """ 8 | -------------------------------------------------------------------------------- /executables/generate_metadata_file.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Generate the json metadata file given a dataset in csv format. 4 | 5 | Please set the two global variables IMPLICIT_ORDINAL_ATTRIBUTES and 6 | EXPLICIT_ORDINAL_ATTRIBUTES to correspond to the dataset that you use. 7 | 8 | Great care should be taken when using this script to infer the type and the 9 | domain of the attributes as it relies on the dataset that is given in 10 | parameter. 11 | 12 | 13 | usage: generate_metadata_file.py [-h] --dataset DATASET [--output OUTPUT] 14 | 15 | optional arguments: 16 | -h, --help show this help message and exit 17 | --dataset DATASET, -i DATASET 18 | Path to the dataset in csv format 19 | --output OUTPUT, -o OUTPUT 20 | Path where to write the json metadata file 21 | 22 | ----- 23 | Nampoina Andriamilanto 24 | """ 25 | 26 | import json 27 | from argparse import ArgumentParser 28 | from pathlib import Path 29 | from typing import Set 30 | 31 | import numpy as np 32 | import pandas as pd 33 | from loguru import logger 34 | 35 | from utils.constants import (CATEGORICAL, FLOAT, INTEGER, NUMERICAL, ORDINAL) 36 | from utils.utils import json_numpy_serialzer 37 | 38 | # Please define the set of the ordinal attributes which values can be 39 | # automatically sorted (using the sorted() python function) 40 | IMPLICIT_ORDINAL_ATTRIBUTES = {'age'} 41 | 42 | # Please define the set of the ordinal attributes which values are ordered 43 | # manually 44 | EXPLICIT_ORDINAL_ATTRIBUTES = { 45 | 'education': ['Preschool', '1st-4th', '5th-6th', '7th-8th', '9th', '10th', 46 | '11th', '12th', 'HS-grad', 'Prof-school', 'Assoc-acdm', 47 | 'Assoc-voc', 'Some-college', 'Bachelors', 'Masters', 48 | 'Doctorate']} 49 | 50 | ORDINAL_ATTRIBUTES = IMPLICIT_ORDINAL_ATTRIBUTES.union( 51 | set(EXPLICIT_ORDINAL_ATTRIBUTES.keys())) 52 | 53 | OUTPUT_FILE_SUFFIX = '.json' 54 | JSON_SPACE_INDENT = 2 55 | 56 | 57 | def main(): 58 | """Generate the json metadata file.""" 59 | # Parse the arguments 60 | argparser = ArgumentParser() 61 | argparser.add_argument('--dataset', '-i', type=str, required=True, 62 | help='Path to the dataset in csv format') 63 | argparser.add_argument('--output', '-o', type=str, 64 | help='Path where to write the json metadata file') 65 | args = argparser.parse_args() 66 | 67 | # Load the dataset 68 | logger.info(f'Loading the data from {args.dataset}') 69 | dataset_path = Path(args.dataset) 70 | dataset = pd.read_csv(dataset_path, header=0) 71 | logger.debug(f'Sample of the loaded dataset:\n{dataset}') 72 | dataset.info() 73 | 74 | # 
Generate the metadata of each attribute 75 | logger.info('Generating the metadata of the attributes') 76 | attributes = [] 77 | for column in dataset.columns: 78 | # Get the numpy type of the column 79 | numpy_type = dataset[column].dtype 80 | logger.debug(f'{column} has the numpy type {numpy_type}') 81 | 82 | # Infer its type among (Integer, Float, Ordinal, Categorical) 83 | inferred_type = infer_type(column, numpy_type, ORDINAL_ATTRIBUTES) 84 | column_infos = {'name': column, 'type': inferred_type} 85 | logger.debug(column_infos) 86 | 87 | # If the type is numerical, set the min and max value 88 | if inferred_type in NUMERICAL: 89 | column_infos['min'] = dataset[column].min() 90 | column_infos['max'] = dataset[column].max() 91 | else: 92 | # If the type is explicitely ordinal, we retrieve its ordered 93 | # values which are set manually in EXPLICIT_ORDINAL_ATTRIBUTES. 94 | # Otherwise (implicit ordinal or categorical), we get the sorted 95 | # list of values from the dataset (the second parameter of get()). 96 | ordered_values = EXPLICIT_ORDINAL_ATTRIBUTES.get( 97 | column, sorted(dataset[column].unique())) 98 | column_infos['size'] = len(ordered_values) 99 | 100 | # If the values are numbers, we cast them to strings as the 101 | # metadata configuration files seem to have the values of ordinal 102 | # and categorical attributes specified as strings 103 | if isinstance(ordered_values[0], np.number): 104 | ordered_values = [str(value) for value in ordered_values] 105 | 106 | column_infos['i2s'] = ordered_values 107 | 108 | attributes.append(column_infos) 109 | 110 | # Write the json metadata file 111 | if args.output: 112 | output_path = args.output 113 | else: 114 | output_path = dataset_path.with_name( 115 | dataset_path.stem + OUTPUT_FILE_SUFFIX) 116 | logger.info(f'Writting the metadata to {output_path}') 117 | 118 | with open(output_path, 'w+') as json_output_file: 119 | json.dump({'columns': attributes}, json_output_file, 120 | indent=JSON_SPACE_INDENT, default=json_numpy_serialzer) 121 | 122 | 123 | def infer_type(column: str, numpy_type: str, ordinal_attributes: Set[str] 124 | ) -> str: 125 | """Infer the type of an attribute given its numpy type. 126 | 127 | Args: 128 | column: The name of the column. 129 | numpy_type: The numpy type of the column. 130 | ordinal_attributes: The set of the ordinal attributes. 131 | """ 132 | if column in ordinal_attributes: 133 | return ORDINAL 134 | if np.issubdtype(numpy_type, np.integer): 135 | return INTEGER 136 | if np.issubdtype(numpy_type, np.floating): 137 | return FLOAT 138 | return CATEGORICAL 139 | 140 | 141 | if __name__ == "__main__": 142 | main() 143 | -------------------------------------------------------------------------------- /executables/generate_synthetic_dataset.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Launcher to process the PrivBayes mechanism. 4 | 5 | This script is an adaptation of the execution scripts from 6 | https://github.com/spring-epfl/synthetic_data_release. 
7 | 8 | ----- 9 | Nampoina Andriamilanto 10 | """ 11 | 12 | from argparse import ArgumentParser 13 | from ast import literal_eval 14 | from pathlib import Path 15 | from warnings import simplefilter 16 | simplefilter('ignore', category=FutureWarning) 17 | simplefilter('ignore', category=DeprecationWarning) 18 | 19 | from loguru import logger 20 | 21 | from generative_models.ctgan import CTGAN 22 | from generative_models.data_synthesiser import ( 23 | IndependentHistogram, BayesianNet, PrivBayes) 24 | from generative_models.pate_gan import PATEGAN 25 | from utils.datagen import load_s3_data_as_df, load_local_data_as_df 26 | 27 | 28 | DEFAULT_SAMPLE_SIZE = 1000 29 | 30 | 31 | def main(): 32 | """Execute the PrivBayes mechanism.""" 33 | # Parse the arguments 34 | argparser = ArgumentParser() 35 | datasource = argparser.add_mutually_exclusive_group() 36 | datasource.add_argument('--s3name', '-S3', type=str, choices=[ 37 | 'adult', 'census', 'credit', 'alarm', 'insurance'], 38 | help='Name of the dataset to run on') 39 | datasource.add_argument('--datapath', '-D', type=str, 40 | help='Path to a local data file') 41 | argparser.add_argument('--mechanism', '-M', type=str, choices=[ 42 | 'IndependentHistogram', 'BayesianNet', 'PrivBayes', 'CTGAN', 'PATEGAN' 43 | ], default='PrivBayes', help='The mechanism to use') 44 | argparser.add_argument('--parameters', '-P', type=str, default=None, 45 | help='The parameters of the mechanism to use ' 46 | 'separated by a colon') 47 | argparser.add_argument('--output-file', '-O', type=str, 48 | help='The file where to store the synthetic dataset' 49 | ) 50 | argparser.add_argument('--sample-size', '-N', type=int, 51 | default=DEFAULT_SAMPLE_SIZE, 52 | help='The size of the synthetic dataset') 53 | args = argparser.parse_args() 54 | 55 | # Load data 56 | if args.s3name: 57 | raw_pop, metadata = load_s3_data_as_df(args.s3name) 58 | dname = args.s3name 59 | elif args.datapath: 60 | raw_pop, metadata = load_local_data_as_df(Path(args.datapath)) 61 | dname = args.datapath.split('/')[-1] 62 | else: 63 | raise ValueError('Please provide a dataset') 64 | logger.info(f'Loaded data {dname}:\n{raw_pop}') 65 | logger.info(f'Loaded the corresponding metadata: {metadata}') 66 | 67 | # Initialize the mechanism 68 | parameters = [] 69 | if args.parameters: 70 | parameters = [literal_eval(param) 71 | for param in args.parameters.split(',')] 72 | logger.debug(f'Parameters: {parameters}') 73 | 74 | # IndependentHistogram parameters: 75 | # histogram_bins=10, infer_ranges=False, multiprocess=True 76 | if args.mechanism == 'IndependentHistogram': 77 | mechanism = IndependentHistogram(metadata, *parameters) 78 | 79 | # BayesianNet parameters: 80 | # histogram_bins=10, degree=1, infer_ranges=False, multiprocess=True, 81 | # seed=None 82 | elif args.mechanism == 'BayesianNet': 83 | mechanism = BayesianNet(metadata, *parameters) 84 | 85 | # PrivBayes parameters: 86 | # histogram_bins=10, degree=1, epsilon=.1, infer_ranges=False, 87 | # multiprocess=True, seed=None 88 | elif args.mechanism == 'PrivBayes': 89 | mechanism = PrivBayes(metadata, *parameters) 90 | 91 | # CTGAN parameters: 92 | # embedding_dim=128, gen_dim=(256, 256), dis_dim=(256, 256), l2scale=1e-6, 93 | # batch_size=500, epochs=300, multiprocess=False 94 | elif args.mechanism == 'CTGAN': 95 | mechanism = CTGAN(metadata, *parameters) 96 | 97 | # PATEGAN parameters: 98 | # eps=1, delta=1e-5, infer_ranges=False, num_teachers=10, n_iters=100, 99 | # batch_size=128, learning_rate=1e-4, multiprocess=False 100 | elif 
args.mechanism == 'PATEGAN': 101 | mechanism = PATEGAN(metadata, *parameters) 102 | 103 | # Unknown mechanism 104 | else: 105 | raise ValueError(f'Unknown mechanism {args.mechanism}') 106 | 107 | # Set the output path 108 | output_path = Path(f'{mechanism.__name__}.csv') 109 | if args.output_file: 110 | output_path = Path(args.output_file) 111 | 112 | # Generate the synthetic data 113 | logger.info('Generating the synthetic data, this can take time...') 114 | mechanism.fit(raw_pop) 115 | mechanism.generate_samples(args.sample_size).to_csv(output_path) 116 | 117 | 118 | if __name__ == "__main__": 119 | main() 120 | -------------------------------------------------------------------------------- /feature_sets/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spring-epfl/synthetic_data_release/eba9a43c75a64110b63d79e58c83859634b7339c/feature_sets/__init__.py -------------------------------------------------------------------------------- /feature_sets/bayes.py: -------------------------------------------------------------------------------- 1 | """A set of features that a Bayesian Net model is expected to extract from the raw data""" 2 | from pandas import DataFrame, get_dummies 3 | from pandas.api.types import CategoricalDtype 4 | from numpy import ndarray, all, corrcoef, concatenate, nan_to_num, zeros_like, triu_indices_from 5 | from itertools import combinations 6 | 7 | from utils.constants import * 8 | from utils.logging import LOGGER 9 | from feature_sets.feature_set import FeatureSet 10 | from feature_sets.independent_histograms import HistogramFeatureSet 11 | 12 | 13 | class CorrelationsFeatureSet(FeatureSet): 14 | def __init__(self, datatype, metadata, quids=None): 15 | assert datatype in [DataFrame, ndarray], 'Unknown data type {}'.format(datatype) 16 | self.datatype = datatype 17 | self.nfeatures = 0 18 | 19 | self.cat_attributes = [] 20 | self.num_attributes = [] 21 | 22 | self.category_codes = {} 23 | 24 | if quids is None: 25 | quids = [] 26 | 27 | for cdict in metadata['columns']: 28 | attr_name = cdict['name'] 29 | dtype = cdict['type'] 30 | 31 | if dtype == FLOAT or dtype == INTEGER: 32 | if attr_name not in quids: 33 | self.num_attributes.append(attr_name) 34 | else: 35 | self.cat_attributes.append(attr_name) 36 | cat_bins = cdict['bins'] 37 | cat_labels = [f'({cat_bins[i]},{cat_bins[i+1]}]' for i in range(len(cat_bins)-1)] 38 | self.category_codes[attr_name] = cat_labels 39 | self.nfeatures += len(cat_labels) 40 | 41 | elif dtype == CATEGORICAL or dtype == ORDINAL: 42 | self.cat_attributes.append(attr_name) 43 | self.category_codes[attr_name] = cdict['i2s'] 44 | self.nfeatures += len(cdict['i2s']) 45 | 46 | LOGGER.debug(f'Feature set will have length {self.nfeatures}') 47 | 48 | self.__name__ = 'Correlations' 49 | 50 | def extract(self, data, flatten=True): 51 | assert isinstance(data, self.datatype), f'Feature extraction expects {self.datatype} as input type' 52 | 53 | assert all([c in list(data) for c in self.cat_attributes]), 'Missing some categorical attributes in input data' 54 | assert all([c in list(data) for c in self.num_attributes]), 'Missing some numerical attributes in input data' 55 | 56 | encoded = data[self.num_attributes].copy() 57 | for c in self.cat_attributes: 58 | col = data[c] 59 | col = col.astype(CategoricalDtype(categories=self.category_codes[c], ordered=True)) 60 | encoded = encoded.merge(get_dummies(col, drop_first=True, prefix=c), left_index=True, right_index=True) 61 | 
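        # Pairwise correlation matrix of the encoded attributes; only the upper
        # triangle (k=1, i.e. excluding the diagonal) is kept so that each attribute
        # pair contributes exactly one feature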
62 | col_names = list(encoded) 63 | self.feature_names = list(combinations(col_names, r=2)) 64 | 65 | corr = encoded.corr().fillna(0).values 66 | 67 | mask = zeros_like(corr).astype(bool) 68 | mask[triu_indices_from(mask, k=1)] = True 69 | 70 | if flatten: 71 | features = corr[mask].flatten() 72 | else: 73 | features = corr 74 | 75 | return features 76 | 77 | 78 | class BayesFeatureSet(FeatureSet): 79 | def __init__(self, datatype, metadata, nbins=10, quids=None): 80 | assert datatype in [DataFrame, ndarray], 'Unknown data type {}'.format(datatype) 81 | self.datatype = datatype 82 | 83 | self.histograms = HistogramFeatureSet(datatype, metadata, nbins, quids) 84 | self.correlations = CorrelationsFeatureSet(datatype, metadata, quids) 85 | 86 | def extract(self, data): 87 | Hist = self.histograms.extract(data) 88 | Corr = self.correlations.extract(data) 89 | 90 | return concatenate([Hist, Corr]) 91 | -------------------------------------------------------------------------------- /feature_sets/feature_set.py: -------------------------------------------------------------------------------- 1 | """A parent class for all feature extraction layers""" 2 | 3 | 4 | class FeatureSet(object): 5 | def extract(self, data): 6 | return NotImplementedError('Method needs to be overwritten by subclass') -------------------------------------------------------------------------------- /feature_sets/independent_histograms.py: -------------------------------------------------------------------------------- 1 | from pandas import DataFrame 2 | from numpy import ndarray, array, linspace, all 3 | from pandas.api.types import CategoricalDtype 4 | 5 | from feature_sets.feature_set import FeatureSet 6 | from utils.logging import LOGGER 7 | from utils.constants import * 8 | 9 | from warnings import filterwarnings 10 | filterwarnings('ignore', message=r"Parsing", category=FutureWarning) 11 | 12 | 13 | class HistogramFeatureSet(FeatureSet): 14 | def __init__(self, datatype, metadata, nbins=10, quids=None): 15 | assert datatype in [DataFrame], 'Unknown data type {}'.format(datatype) 16 | self.datatype = datatype 17 | self.nfeatures = 0 18 | 19 | self.cat_attributes = [] 20 | self.num_attributes = [] 21 | 22 | self.histogram_bins = {} 23 | self.category_codes = {} 24 | 25 | if quids is None: 26 | quids = [] 27 | 28 | for cdict in metadata['columns']: 29 | attr_name = cdict['name'] 30 | dtype = cdict['type'] 31 | 32 | if dtype == FLOAT or dtype == INTEGER: 33 | if attr_name not in quids: 34 | self.num_attributes.append(attr_name) 35 | self.histogram_bins[attr_name] = linspace(cdict['min'], cdict['max'], nbins+1) 36 | self.nfeatures += nbins 37 | else: 38 | self.cat_attributes.append(attr_name) 39 | cat_bins = cdict['bins'] 40 | cat_labels = [f'({cat_bins[i]},{cat_bins[i+1]}]' for i in range(len(cat_bins)-1)] 41 | self.category_codes[attr_name] = cat_labels 42 | self.nfeatures += len(cat_labels) 43 | 44 | elif dtype == CATEGORICAL or dtype == ORDINAL: 45 | self.cat_attributes.append(attr_name) 46 | self.category_codes[attr_name] = cdict['i2s'] 47 | self.nfeatures += len(cdict['i2s']) 48 | 49 | LOGGER.debug(f'Feature set will have length {self.nfeatures}') 50 | 51 | self.__name__ = 'Histogram' 52 | 53 | def extract(self, data): 54 | assert isinstance(data, self.datatype), f'Feature extraction expects {self.datatype} as input type' 55 | 56 | assert all([c in list(data) for c in self.cat_attributes]), 'Missing some categorical attributes in input data' 57 | assert all([c in list(data) for c in self.num_attributes]), 'Missing 
some numerical attributes in input data' 58 | 59 | features = [] 60 | for attr in self.num_attributes: 61 | col = data[attr] 62 | F = col.value_counts(bins=self.histogram_bins[attr]).values 63 | features.extend(F.tolist()) 64 | 65 | for attr in self.cat_attributes: 66 | col = data[attr] 67 | col = col.astype(CategoricalDtype(categories=self.category_codes[attr], ordered=True)) 68 | F = col.value_counts().loc[self.category_codes[attr]].values 69 | features.extend(F.tolist()) 70 | 71 | assert len(features) == self.nfeatures, f'Expected number of features is {self.nfeatures} but found {len(features)}' 72 | 73 | return array(features) 74 | 75 | def _get_names(self): 76 | feature_names = [] 77 | for attr in self.num_attributes: 78 | bins = self.histogram_bins[attr] 79 | feature_names.extend([f'{attr}({int(bins[i-1])},{int(bins[i])}]' for i in range(1,len(bins))]) 80 | 81 | for attr in self.cat_attributes: 82 | feature_names.extend([f'{attr}_{c}' for c in self.category_codes[attr]]) 83 | 84 | return feature_names 85 | 86 | -------------------------------------------------------------------------------- /feature_sets/model_agnostic.py: -------------------------------------------------------------------------------- 1 | """A simple feature extraction layer for data with a mix of categorical and numerical attributes""" 2 | from os import path 3 | from pandas import DataFrame 4 | from numpy import ndarray, nanmean, nanmedian, nanvar, array, concatenate 5 | from pandas.api.types import is_numeric_dtype, CategoricalDtype 6 | 7 | from utils.logging import LOGGER 8 | from feature_sets.feature_set import FeatureSet 9 | from feature_sets.independent_histograms import HistogramFeatureSet 10 | from feature_sets.bayes import CorrelationsFeatureSet 11 | 12 | from warnings import filterwarnings 13 | filterwarnings('ignore', message=r"Parsing", category=FutureWarning) 14 | 15 | 16 | class NaiveFeatureSet(FeatureSet): 17 | def __init__(self, datatype): 18 | self.datatype = datatype 19 | self.attributes = None 20 | self.category_codes = {} 21 | assert self.datatype in [DataFrame, ndarray], 'Unknown data type {}'.format(datatype) 22 | 23 | self.__name__ = 'Naive' 24 | 25 | def extract(self, data): 26 | if self.datatype is DataFrame: 27 | assert isinstance(data, DataFrame), 'Feature extraction expects DataFrame as input' 28 | if self.attributes is not None: 29 | if bool(set(list(data)).difference(set(self.attributes))): 30 | raise ValueError('Data to filter does not match expected schema') 31 | else: 32 | self.attributes = list(data) 33 | features = DataFrame(columns=self.attributes) 34 | for c in self.attributes: 35 | col = data[c] 36 | if is_numeric_dtype(col): 37 | features[c] = [col.mean(), col.median(), col.var()] 38 | else: 39 | if c in self.category_codes.keys(): 40 | new_cats = set(col.astype('category').cat.categories).difference(set(self.category_codes[c])) 41 | self.category_codes[c] += list(new_cats) 42 | col = col.astype(CategoricalDtype(categories=self.category_codes[c])) 43 | else: 44 | col = col.astype('category') 45 | self.category_codes[c] = list(col.cat.categories) 46 | counts = list(col.cat.codes.value_counts().index) 47 | features[c] = [counts[0], counts[-1], len(counts)] 48 | features = features.values 49 | 50 | elif self.datatype is ndarray: 51 | assert isinstance(data, ndarray), 'Feature extraction expects ndarray as input' 52 | features = array([nanmean(data), nanmedian(data), nanvar(data)]) 53 | else: 54 | raise ValueError(f'Unknown data type {type(data)}') 55 | 56 | return 
features.flatten() 57 | 58 | 59 | class EnsembleFeatureSet(FeatureSet): 60 | """An ensemble of features that is not model specific""" 61 | def __init__(self, datatype, metadata, nbins=10, quasi_id_cols=None): 62 | assert datatype in [DataFrame, ndarray], 'Unknown data type {}'.format(datatype) 63 | self.datatype = datatype 64 | 65 | self.naive = NaiveFeatureSet(datatype) 66 | self.histograms = HistogramFeatureSet(datatype, metadata, nbins=nbins, quids=quasi_id_cols) 67 | self.correlations = CorrelationsFeatureSet(datatype, metadata, quids=quasi_id_cols) 68 | 69 | self.__name__ = 'Ensemble' 70 | 71 | def extract(self, data): 72 | F_naive = self.naive.extract(data) 73 | F_hist = self.histograms.extract(data) 74 | F_corr = self.correlations.extract(data) 75 | 76 | return concatenate([F_naive, F_hist, F_corr]) 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | -------------------------------------------------------------------------------- /generative_models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spring-epfl/synthetic_data_release/eba9a43c75a64110b63d79e58c83859634b7339c/generative_models/__init__.py -------------------------------------------------------------------------------- /generative_models/ctgan.py: -------------------------------------------------------------------------------- 1 | from pandas import DataFrame 2 | 3 | from utils.logging import LOGGER 4 | 5 | from generative_models.generative_model import GenerativeModel 6 | from ctgan import CTGANSynthesizer 7 | 8 | 9 | class CTGAN(GenerativeModel): 10 | """A conditional generative adversarial network for tabular data""" 11 | def __init__(self, metadata, 12 | embedding_dim=128, gen_dim=(256, 256), 13 | dis_dim=(256, 256), l2scale=1e-6, 14 | batch_size=500, epochs=300, 15 | multiprocess=False): 16 | 17 | self.synthesiser = CTGANSynthesizer(embedding_dim, gen_dim, dis_dim, 18 | l2scale, batch_size, epochs) 19 | 20 | self.metadata = metadata 21 | self.datatype = DataFrame 22 | 23 | self.multiprocess = bool(multiprocess) 24 | 25 | self.infer_ranges = True 26 | self.trained = False 27 | 28 | self.__name__ = 'CTGAN' 29 | 30 | def fit(self, data): 31 | """Train a generative adversarial network on tabular data. 32 | Input data is assumed to be of shape (n_samples, n_features) 33 | See https://github.com/DAI-Lab/SDGym for details""" 34 | assert isinstance(data, self.datatype), f'{self.__class__.__name__} expects {self.datatype} as input data but got {type(data)}' 35 | 36 | LOGGER.debug(f'Start fitting {self.__class__.__name__} to data of shape {data.shape}...') 37 | self.synthesiser.fit(data, self.metadata) 38 | 39 | LOGGER.debug(f'Finished fitting') 40 | self.trained = True 41 | 42 | def generate_samples(self, nsamples): 43 | """Generate random samples from the fitted Gaussian distribution""" 44 | assert self.trained, "Model must first be fitted to some data." 
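        # The wrapped CTGANSynthesizer draws nsamples records from the trained GAN
        # generator and decodes them back into the original tabular schema, so the
        # result can be written out directly as a DataFrame of synthetic records.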
45 | 46 | LOGGER.debug(f'Generate synthetic dataset of size {nsamples}') 47 | synthetic_data = self.synthesiser.sample(nsamples) 48 | 49 | return synthetic_data 50 | -------------------------------------------------------------------------------- /generative_models/data_synthesiser.py: -------------------------------------------------------------------------------- 1 | """Generative models adapted from https://github.com/DataResponsibly/DataSynthesizer""" 2 | # Copyright <2018> 3 | 4 | from numpy.random import seed, laplace, choice 5 | from pandas import DataFrame, merge 6 | from itertools import product 7 | 8 | from generative_models.data_synthesiser_utils.datatypes.FloatAttribute import FloatAttribute 9 | from generative_models.data_synthesiser_utils.datatypes.IntegerAttribute import IntegerAttribute 10 | from generative_models.data_synthesiser_utils.datatypes.StringAttribute import StringAttribute 11 | from generative_models.data_synthesiser_utils.utils import bayes_worker, normalize_given_distribution, exponential_mechanism 12 | 13 | from generative_models.generative_model import GenerativeModel 14 | 15 | from utils.constants import * 16 | from utils.logging import LOGGER 17 | 18 | 19 | class IndependentHistogram(GenerativeModel): 20 | 21 | def __init__(self, metadata, histogram_bins=10, infer_ranges=False, multiprocess=True): 22 | self.metadata = self._read_meta(metadata) 23 | self.histogram_bins = histogram_bins 24 | 25 | self.datatype = DataFrame 26 | self.multiprocess = bool(multiprocess) 27 | self.infer_ranges = bool(infer_ranges) 28 | 29 | self.DataDescriber = None 30 | 31 | self.trained = False 32 | 33 | self.__name__ = 'IndependentHistogram' 34 | 35 | def fit(self, data): 36 | assert isinstance(data, self.datatype), f'{self.__class__.__name__} expects {self.datatype} as input data but got {type(data)}' 37 | LOGGER.debug(f'Start fitting IndependentHistogram model to data of shape {data.shape}...') 38 | if self.trained: 39 | self.trained = False 40 | self.DataDescriber = None 41 | 42 | self.DataDescriber = DataDescriber(self.metadata, self.histogram_bins, self.infer_ranges) 43 | self.DataDescriber.describe(data) 44 | LOGGER.debug(f'Finished fitting IndependentHistogram') 45 | self.trained = True 46 | 47 | def generate_samples(self, nsamples): 48 | assert self.trained, "Model must be fitted to some data first" 49 | 50 | LOGGER.debug(f'Generate synthetic dataset of size {nsamples}') 51 | synthetic_dataset = DataFrame(columns=self.DataDescriber.attr_names) 52 | for attr_name, Attr in self.DataDescriber.attr_dict.items(): 53 | binning_indices = Attr.sample_binning_indices_in_independent_attribute_mode(nsamples) 54 | synthetic_dataset[attr_name] = Attr.sample_values_from_binning_indices(binning_indices) 55 | 56 | LOGGER.debug(f'Generated synthetic dataset of size {nsamples}') 57 | return synthetic_dataset 58 | 59 | def _read_meta(self, metadata): 60 | """ Read metadata from metadata file.""" 61 | metadict = {} 62 | 63 | for cdict in metadata['columns']: 64 | col = cdict['name'] 65 | coltype = cdict['type'] 66 | 67 | if coltype == FLOAT or coltype == INTEGER: 68 | metadict[col] = { 69 | 'type': coltype, 70 | 'min': cdict['min'], 71 | 'max': cdict['max'] 72 | } 73 | 74 | elif coltype == CATEGORICAL or coltype == ORDINAL: 75 | metadict[col] = { 76 | 'type': coltype, 77 | 'categories': cdict['i2s'], 78 | 'size': len(cdict['i2s']) 79 | } 80 | 81 | else: 82 | raise ValueError(f'Unknown data type {coltype} for attribute {col}') 83 | 84 | return metadict 85 | 86 | 87 | class 
BayesianNet(GenerativeModel): 88 | """ 89 | A BayesianNet model using non-private GreedyBayes to learn conditional probabilities 90 | """ 91 | def __init__(self, metadata, histogram_bins=10, degree=1, infer_ranges=False, multiprocess=True, seed=None): 92 | self.metadata = self._read_meta(metadata) 93 | self.histogram_bins = histogram_bins 94 | self.degree = degree 95 | self.num_attributes = len(metadata['columns']) 96 | 97 | self.multiprocess = bool(multiprocess) 98 | self.infer_ranges = bool(infer_ranges) 99 | self.seed = seed 100 | self.datatype = DataFrame 101 | 102 | self.bayesian_network = None 103 | self.conditional_probabilities = None 104 | self.DataDescriber = None 105 | self.trained = False 106 | 107 | self.__name__ = 'BayesianNet' 108 | 109 | def fit(self, data): 110 | assert isinstance(data, self.datatype), f'{self.__class__.__name__} expects {self.datatype} as input data but got {type(data)}' 111 | assert len(list(data)) >= 2, "BayesianNet requires at least 2 attributes(i.e., columns) in dataset." 112 | LOGGER.debug(f'Start training BayesianNet on data of shape {data.shape}...') 113 | if self.trained: 114 | self.trained = False 115 | self.DataDescriber = None 116 | self.bayesian_network = None 117 | self.conditional_probabilities = None 118 | 119 | self.DataDescriber = DataDescriber(self.metadata, self.histogram_bins, self.infer_ranges) 120 | self.DataDescriber.describe(data) 121 | 122 | encoded_df = DataFrame(columns=self.DataDescriber.attr_names) 123 | for attr_name, column in self.DataDescriber.attr_dict.items(): 124 | encoded_df[attr_name] = column.encode_values_into_bin_idx() 125 | 126 | self.bayesian_network = self._greedy_bayes_linear(encoded_df, self.degree) 127 | 128 | self.conditional_probabilities = self._construct_conditional_probabilities(self.bayesian_network, encoded_df) 129 | 130 | LOGGER.debug(f'Finished training Bayesian net') 131 | self.trained = True 132 | 133 | def generate_samples(self, nsamples): 134 | LOGGER.debug(f'Generate synthetic dataset of size {nsamples}') 135 | assert self.trained, "Model must be fitted to some real data first" 136 | synthetic_data = DataFrame(columns=self.DataDescriber.attr_names) 137 | 138 | # Get samples for attributes modelled in Bayesian net 139 | encoded_dataset = self._generate_encoded_dataset(nsamples) 140 | 141 | for attr in self.DataDescriber.attr_names: 142 | column = self.DataDescriber.attr_dict[attr] 143 | if attr in encoded_dataset: 144 | synthetic_data[attr] = column.sample_values_from_binning_indices(encoded_dataset[attr]) 145 | else: 146 | # For attributes not in BN use independent attribute mode 147 | binning_indices = column.sample_binning_indices_in_independent_attribute_mode(nsamples) 148 | synthetic_data[attr] = column.sample_values_from_binning_indices(binning_indices) 149 | 150 | return synthetic_data 151 | 152 | def _generate_encoded_dataset(self, nsamples): 153 | encoded_df = DataFrame(columns=self._get_sampling_order(self.bayesian_network)) 154 | 155 | bn_root_attr = self.bayesian_network[0][1][0] 156 | root_attr_dist = self.conditional_probabilities[bn_root_attr] 157 | encoded_df[bn_root_attr] = choice(len(root_attr_dist), size=nsamples, p=root_attr_dist) 158 | 159 | for child, parents in self.bayesian_network: 160 | child_conditional_distributions = self.conditional_probabilities[child] 161 | 162 | for parents_instance in child_conditional_distributions.keys(): 163 | dist = child_conditional_distributions[parents_instance] 164 | parents_instance = list(eval(parents_instance)) 165 | 166 | 
filter_condition = '' 167 | for parent, value in zip(parents, parents_instance): 168 | filter_condition += f"(encoded_df['{parent}']=={value})&" 169 | 170 | filter_condition = eval(filter_condition[:-1]) 171 | size = encoded_df[filter_condition].shape[0] 172 | if size: 173 | encoded_df.loc[filter_condition, child] = choice(len(dist), size=size, p=dist) 174 | 175 | # Fill any nan values by sampling from marginal child distribution 176 | marginal_dist = self.DataDescriber.attr_dict[child].distribution_probabilities 177 | null_idx = encoded_df[child].isnull() 178 | encoded_df.loc[null_idx, child] = choice(len(marginal_dist), size=null_idx.sum(), p=marginal_dist) 179 | 180 | encoded_df[encoded_df.columns] = encoded_df[encoded_df.columns].astype(int) 181 | 182 | return encoded_df 183 | 184 | def _get_sampling_order(self, bayesian_net): 185 | order = [bayesian_net[0][1][0]] 186 | for child, _ in bayesian_net: 187 | order.append(child) 188 | return order 189 | 190 | def _greedy_bayes_linear(self, encoded_df, k=1): 191 | """Construct a Bayesian Network (BN) using greedy algorithm.""" 192 | dataset = encoded_df.astype(str, copy=False) 193 | 194 | # Optional: Fix sed for reproducibility 195 | if self.seed is not None: 196 | seed(self.seed) 197 | 198 | root_attribute = choice(dataset.columns) 199 | V = [root_attribute] 200 | rest_attributes = set(dataset.columns) 201 | rest_attributes.remove(root_attribute) 202 | bayesian_net = [] 203 | while rest_attributes: 204 | parents_pair_list = [] 205 | mutual_info_list = [] 206 | 207 | num_parents = min(len(V), k) 208 | for child, split in product(rest_attributes, range(len(V) - num_parents + 1)): 209 | task = (child, V, num_parents, split, dataset) 210 | res = bayes_worker(task) 211 | parents_pair_list += res[0] 212 | mutual_info_list += res[1] 213 | 214 | idx = mutual_info_list.index(max(mutual_info_list)) 215 | 216 | bayesian_net.append(parents_pair_list[idx]) 217 | adding_attribute = parents_pair_list[idx][0] 218 | V.append(adding_attribute) 219 | rest_attributes.remove(adding_attribute) 220 | 221 | return bayesian_net 222 | 223 | def _construct_conditional_probabilities(self, bayesian_network, encoded_dataset): 224 | k = len(bayesian_network[-1][1]) 225 | conditional_distributions = {} 226 | 227 | # first k+1 attributes 228 | root = bayesian_network[0][1][0] 229 | kplus1_attributes = [root] 230 | for child, _ in bayesian_network[:k]: 231 | kplus1_attributes.append(child) 232 | 233 | freqs_of_kplus1_attributes = self._get_attribute_frequency_counts(kplus1_attributes, encoded_dataset) 234 | 235 | # get distribution of root attribute 236 | root_marginal_freqs = freqs_of_kplus1_attributes.loc[:, [root, 'count']].groupby(root).sum()['count'] 237 | conditional_distributions[root] = normalize_given_distribution(root_marginal_freqs).tolist() 238 | 239 | for idx, (child, parents) in enumerate(bayesian_network): 240 | conditional_distributions[child] = {} 241 | 242 | if idx < k: 243 | stats = freqs_of_kplus1_attributes.copy().loc[:, parents + [child, 'count']] 244 | else: 245 | stats = self._get_attribute_frequency_counts(parents + [child], encoded_dataset) 246 | 247 | stats = DataFrame(stats.loc[:, parents + [child, 'count']].groupby(parents + [child]).sum()) 248 | 249 | if len(parents) == 1: 250 | for parent_instance in stats.index.levels[0]: 251 | dist = normalize_given_distribution(stats.loc[parent_instance]['count']).tolist() 252 | conditional_distributions[child][str([parent_instance])] = dist 253 | else: 254 | for parents_instance in 
product(*stats.index.levels[:-1]): 255 | dist = normalize_given_distribution(stats.loc[parents_instance]['count']).tolist() 256 | conditional_distributions[child][str(list(parents_instance))] = dist 257 | 258 | return conditional_distributions 259 | 260 | def _get_attribute_frequency_counts(self, attributes, encoded_dataset): 261 | # Get attribute counts for category combinations present in data 262 | counts = encoded_dataset.groupby(attributes).size() 263 | counts.name = 'count' 264 | counts = counts.reset_index() 265 | 266 | # Get all possible attribute combinations 267 | attr_combs = [range(self.DataDescriber.attr_dict[attr].domain_size) for attr in attributes] 268 | full_space = DataFrame(columns=attributes, data=list(product(*attr_combs))) 269 | # stats.reset_index(inplace=True) 270 | full_counts = merge(full_space, counts, how='left') 271 | full_counts.fillna(0, inplace=True) 272 | 273 | return full_counts 274 | 275 | def _read_meta(self, metadata): 276 | """ Read metadata from metadata file.""" 277 | metadict = {} 278 | 279 | for cdict in metadata['columns']: 280 | col = cdict['name'] 281 | coltype = cdict['type'] 282 | 283 | if coltype == FLOAT or coltype == INTEGER: 284 | metadict[col] = { 285 | 'type': coltype, 286 | 'min': cdict['min'], 287 | 'max': cdict['max'] 288 | } 289 | 290 | elif coltype == CATEGORICAL or coltype == ORDINAL: 291 | metadict[col] = { 292 | 'type': coltype, 293 | 'categories': cdict['i2s'], 294 | 'size': len(cdict['i2s']) 295 | } 296 | 297 | else: 298 | raise ValueError(f'Unknown data type {coltype} for attribute {col}') 299 | 300 | return metadict 301 | 302 | 303 | class PrivBayes(BayesianNet): 304 | """" 305 | A differentially private BayesianNet model using GreedyBayes 306 | """ 307 | def __init__(self, metadata, histogram_bins=10, degree=1, epsilon=.1, infer_ranges=False, multiprocess=True, seed=None): 308 | super().__init__(metadata=metadata, histogram_bins=histogram_bins, degree=degree, infer_ranges=infer_ranges, multiprocess=multiprocess, seed=seed) 309 | 310 | self.epsilon = float(epsilon) 311 | 312 | self.__name__ = f'PrivBayesEps{self.epsilon}' 313 | 314 | @property 315 | def laplace_noise_scale(self): 316 | return 2 * (self.num_attributes - self.degree) / (self.epsilon / 2) 317 | 318 | def _greedy_bayes_linear(self, encoded_df, k=1): 319 | """Construct a Bayesian Network (BN) using greedy algorithm.""" 320 | dataset = encoded_df.astype(str, copy=False) 321 | num_tuples, num_attributes = dataset.shape 322 | 323 | # Optional: Fix seed for reproducibility 324 | if self.seed is not None: 325 | seed(self.seed) 326 | 327 | attr_to_is_binary = {attr: dataset[attr].unique().size <= 2 for attr in dataset} 328 | 329 | root_attribute = choice(dataset.columns) 330 | V = [root_attribute] 331 | rest_attributes = set(dataset.columns) 332 | rest_attributes.remove(root_attribute) 333 | bayesian_net = [] 334 | while rest_attributes: 335 | parents_pair_list = [] 336 | mutual_info_list = [] 337 | 338 | num_parents = min(len(V), k) 339 | for child, split in product(rest_attributes, range(len(V) - num_parents + 1)): 340 | task = (child, V, num_parents, split, dataset) 341 | res = bayes_worker(task) 342 | parents_pair_list += res[0] 343 | mutual_info_list += res[1] 344 | 345 | sampling_distribution = exponential_mechanism(self.epsilon/2, mutual_info_list, parents_pair_list, attr_to_is_binary, 346 | num_tuples, num_attributes) 347 | idx = choice(list(range(len(mutual_info_list))), p=sampling_distribution) 348 | 349 | bayesian_net.append(parents_pair_list[idx]) 350 | 
adding_attribute = parents_pair_list[idx][0] 351 | V.append(adding_attribute) 352 | rest_attributes.remove(adding_attribute) 353 | 354 | return bayesian_net 355 | 356 | def _get_attribute_frequency_counts(self, attributes, encoded_dataset): 357 | """ Differentially private mechanism to get attribute frequency counts""" 358 | # Get attribute counts for category combinations present in data 359 | counts = encoded_dataset.groupby(attributes).size() 360 | counts.name = 'count' 361 | counts = counts.reset_index() 362 | 363 | # Get all possible attribute combinations 364 | attr_combs = [range(self.DataDescriber.attr_dict[attr].domain_size) for attr in attributes] 365 | full_space = DataFrame(columns=attributes, data=list(product(*attr_combs))) 366 | full_counts = merge(full_space, counts, how='left') 367 | full_counts.fillna(0, inplace=True) 368 | 369 | # Get Laplace noise sample 370 | noise_sample = laplace(0, scale=self.laplace_noise_scale, size=full_counts.index.size) 371 | full_counts['count'] += noise_sample 372 | full_counts.loc[full_counts['count'] < 0, 'count'] = 0 373 | 374 | return full_counts 375 | 376 | 377 | class DataDescriber(object): 378 | def __init__(self, metadata, histogram_bins, infer_ranges=False): 379 | self.metadata = metadata 380 | self.histogram_bins = histogram_bins 381 | self.infer_ranges = infer_ranges 382 | 383 | self.attr_dict = None 384 | self.attr_names = None 385 | 386 | def describe(self, df): 387 | self.attr_names = self._get_attr_names() 388 | self.attr_dict = self._represent_input_dataset_by_columns(df) 389 | 390 | for col, Attribute in self.attr_dict.items(): 391 | Attribute.infer_distribution() 392 | 393 | def _get_attr_names(self): 394 | return [c for c in self.metadata.keys()] 395 | 396 | def _represent_input_dataset_by_columns(self, df): 397 | attr_dict = {} 398 | 399 | for col, cdict in self.metadata.items(): 400 | coltype = cdict['type'] 401 | 402 | paras = (col, df[col], self.histogram_bins) 403 | if coltype in NUMERICAL: 404 | if coltype == FLOAT: 405 | Attribute = FloatAttribute(*paras) 406 | else: 407 | Attribute = IntegerAttribute(*paras) 408 | 409 | if self.infer_ranges: 410 | cmin, cmax = min(df[col]), max(df[col]) 411 | else: 412 | cmin, cmax = cdict['min'], cdict['max'] 413 | 414 | Attribute.set_domain(domain=(cmin, cmax)) 415 | 416 | elif coltype in STRINGS: 417 | Attribute = StringAttribute(*paras) 418 | if self.infer_ranges: 419 | ccats = list(df[col].unique()) 420 | else: 421 | ccats = cdict['categories'] 422 | 423 | Attribute.set_domain(domain=ccats) 424 | 425 | else: 426 | raise Exception(f'The DataType of {col} is unknown.') 427 | 428 | attr_dict[col] = Attribute 429 | 430 | return attr_dict 431 | 432 | 433 | 434 | 435 | 436 | 437 | 438 | -------------------------------------------------------------------------------- /generative_models/data_synthesiser_utils/__init__.py: -------------------------------------------------------------------------------- 1 | """Generative models adapted from https://github.com/DataResponsibly/DataSynthesizer""" 2 | # Copyright <2018> -------------------------------------------------------------------------------- /generative_models/data_synthesiser_utils/datatypes/AbstractAttribute.py: -------------------------------------------------------------------------------- 1 | from abc import ABCMeta, abstractmethod 2 | from bisect import bisect_right 3 | from random import uniform 4 | 5 | import numpy as np 6 | from numpy.random import choice 7 | from pandas import Series 8 | 9 | from 
generative_models.data_synthesiser_utils.utils import normalize_given_distribution 10 | 11 | 12 | class AbstractAttribute(object): 13 | __metaclass__ = ABCMeta 14 | 15 | def __init__(self, name, data, histogram_size): 16 | self.name = name 17 | self.data = data 18 | self.histogram_size = histogram_size 19 | 20 | self.data_dropna = self.data.dropna() 21 | self.missing_rate = (self.data.size - self.data_dropna.size) / (self.data.size or 1) 22 | 23 | self.is_categorical = None 24 | self.is_numerical = None 25 | self.data_type = None 26 | self.min = None 27 | self.max = None 28 | self.distribution_bins = None 29 | self.distribution_probabilities = None 30 | self.domain_size = None 31 | 32 | def set_domain(self, domain): 33 | return NotImplementedError('Method needs to be overwritten.') 34 | 35 | @abstractmethod 36 | def infer_distribution(self): 37 | if self.is_categorical: 38 | histogram = self.data_dropna.value_counts() 39 | for value in set(self.distribution_bins) - set(histogram.index): 40 | histogram[value] = 0 41 | histogram = histogram[self.distribution_bins] 42 | self.distribution_probabilities = normalize_given_distribution(histogram) 43 | 44 | else: 45 | histogram, _ = np.histogram(self.data_dropna, bins=self.distribution_bins) 46 | self.distribution_probabilities = normalize_given_distribution(histogram) 47 | 48 | def encode_values_into_bin_idx(self): 49 | """ 50 | Encode values into bin indices for Bayesian Network construction. 51 | """ 52 | if self.is_categorical: 53 | value_to_bin_idx = {value: idx for idx, value in enumerate(self.distribution_bins)} 54 | encoded = self.data.map(lambda x: value_to_bin_idx[x], na_action='ignore') 55 | else: 56 | encoded = self.data.map(lambda x: bisect_right(self.distribution_bins[:-1], x) - 1, na_action='ignore') 57 | 58 | encoded.fillna(len(self.distribution_bins), inplace=True) 59 | return encoded.astype(int, copy=False) 60 | 61 | def to_json(self): 62 | """Encode attribution information in JSON format / Python dictionary. 63 | 64 | """ 65 | return {"name": self.name, 66 | "data_type": self.data_type.value, 67 | "is_categorical": self.is_categorical, 68 | "min": self.min, 69 | "max": self.max, 70 | "missing_rate": self.missing_rate, 71 | "distribution_bins": self.distribution_bins.tolist(), 72 | "distribution_probabilities": self.distribution_probabilities.tolist()} 73 | 74 | @abstractmethod 75 | def generate_values_as_candidate_key(self, n): 76 | """When attribute should be a candidate key in output dataset. 77 | 78 | """ 79 | return np.arange(n) 80 | 81 | def sample_binning_indices_in_independent_attribute_mode(self, n): 82 | """Sample an array of binning indices. 83 | 84 | """ 85 | return Series(choice(len(self.distribution_probabilities), size=n, p=self.distribution_probabilities)) 86 | 87 | @abstractmethod 88 | def sample_values_from_binning_indices(self, binning_indices): 89 | """Convert binning indices into values in domain. Used by both independent and correlated attribute mode. 
90 | 91 | """ 92 | return binning_indices.apply(lambda x: self.uniform_sampling_within_a_bin(x)) 93 | 94 | def uniform_sampling_within_a_bin(self, bin_idx): 95 | num_bins = len(self.distribution_probabilities) 96 | if bin_idx == num_bins: 97 | return np.nan 98 | elif self.is_categorical: 99 | return self.distribution_bins[bin_idx] 100 | else: 101 | return uniform(self.distribution_bins[bin_idx], self.distribution_bins[bin_idx + 1]) 102 | 103 | -------------------------------------------------------------------------------- /generative_models/data_synthesiser_utils/datatypes/FloatAttribute.py: -------------------------------------------------------------------------------- 1 | from numpy import linspace, histogram, arange 2 | 3 | from generative_models.data_synthesiser_utils.datatypes.AbstractAttribute import AbstractAttribute 4 | from generative_models.data_synthesiser_utils.datatypes.utils.DataType import DataType 5 | from generative_models.data_synthesiser_utils.utils import normalize_given_distribution 6 | 7 | 8 | class FloatAttribute(AbstractAttribute): 9 | def __init__(self, name, data, histogram_size): 10 | 11 | super().__init__(name, data, histogram_size) 12 | self.is_categorical = False 13 | self.is_numerical = True 14 | self.data_type = DataType.FLOAT 15 | self.data = self.data.astype(float) 16 | self.data_dropna = self.data_dropna.astype(float) 17 | 18 | def set_domain(self, domain=None): 19 | if domain is not None: 20 | self.min, self.max = domain 21 | else: 22 | self.min = float(self.data_dropna.min()) 23 | self.max = float(self.data_dropna.max()) 24 | 25 | self.distribution_bins = linspace(self.min, self.max, self.histogram_size+1) 26 | self.domain_size = self.histogram_size 27 | 28 | def infer_distribution(self): 29 | frequency_counts, _ = histogram(self.data_dropna, bins=self.distribution_bins) 30 | self.distribution_probabilities = normalize_given_distribution(frequency_counts) 31 | 32 | def generate_values_as_candidate_key(self, n): 33 | return arange(self.min, self.max, (self.max - self.min) / n) 34 | 35 | def sample_values_from_binning_indices(self, binning_indices): 36 | return super().sample_values_from_binning_indices(binning_indices) 37 | -------------------------------------------------------------------------------- /generative_models/data_synthesiser_utils/datatypes/IntegerAttribute.py: -------------------------------------------------------------------------------- 1 | from numpy import linspace, histogram 2 | 3 | from generative_models.data_synthesiser_utils.datatypes.AbstractAttribute import AbstractAttribute 4 | from generative_models.data_synthesiser_utils.datatypes.utils.DataType import DataType 5 | from generative_models.data_synthesiser_utils.utils import normalize_given_distribution 6 | 7 | 8 | class IntegerAttribute(AbstractAttribute): 9 | def __init__(self, name, data, histogram_size): 10 | super().__init__(name, data, histogram_size) 11 | self.is_categorical = False 12 | self.is_numerical = True 13 | self.data_type = DataType.INTEGER 14 | self.data = self.data.astype(int) 15 | self.data_dropna = self.data_dropna.astype(int) 16 | 17 | def set_domain(self, domain=None): 18 | if domain is not None: 19 | self.min, self.max = domain 20 | else: 21 | self.min = self.data_dropna.min() 22 | self.max = self.data_dropna.max() 23 | 24 | self.min = int(self.min) 25 | self.max = int(self.max) 26 | self.distribution_bins = linspace(self.min, self.max, self.histogram_size + 1).astype(int) 27 | self.domain_size = self.histogram_size 28 | 29 | def 
infer_distribution(self): 30 | frequency_counts, _ = histogram(self.data_dropna, bins=self.distribution_bins) 31 | self.distribution_probabilities = normalize_given_distribution(frequency_counts) 32 | 33 | def generate_values_as_candidate_key(self, n): 34 | return super().generate_values_as_candidate_key(n) 35 | 36 | def sample_values_from_binning_indices(self, binning_indices): 37 | column = super().sample_values_from_binning_indices(binning_indices) 38 | column = column.round() 39 | column = column.astype(int) 40 | # column[~column.isnull()] = column[~column.isnull()].astype(int) 41 | return column 42 | -------------------------------------------------------------------------------- /generative_models/data_synthesiser_utils/datatypes/StringAttribute.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from generative_models.data_synthesiser_utils.datatypes.AbstractAttribute import AbstractAttribute 4 | from generative_models.data_synthesiser_utils.datatypes.utils.DataType import DataType 5 | from generative_models.data_synthesiser_utils.utils import normalize_given_distribution, generate_random_string 6 | 7 | 8 | class StringAttribute(AbstractAttribute): 9 | """Variable min and max are the lengths of the shortest and longest strings. 10 | 11 | """ 12 | 13 | def __init__(self, name, data, histogram_size): 14 | super().__init__(name, data, histogram_size) 15 | self.is_categorical = True 16 | self.is_numerical = False 17 | self.data_type = DataType.STRING 18 | self.data_dropna_len = self.data_dropna.astype(str).map(len) 19 | 20 | def set_domain(self, domain=None): 21 | if domain is not None: 22 | lengths = [len(i) for i in domain] 23 | self.min = min(lengths) 24 | self.max = max(lengths) 25 | self.distribution_bins = np.array(domain) 26 | else: 27 | self.min = int(self.data_dropna_len.min()) 28 | self.max = int(self.data_dropna_len.max()) 29 | self.distribution_bins = self.data_dropna.unique() 30 | 31 | self.domain_size = len(self.distribution_bins) 32 | 33 | def infer_distribution(self): 34 | 35 | histogram = self.data_dropna.value_counts() 36 | for attr_cat in set(self.distribution_bins) - set(histogram.index): 37 | histogram[attr_cat] = 0 38 | histogram = histogram[self.distribution_bins] 39 | self.distribution_probabilities = normalize_given_distribution(histogram) 40 | 41 | def generate_values_as_candidate_key(self, n): 42 | length = np.random.randint(self.min, self.max) 43 | vectorized = np.vectorize(lambda x: '{}{}'.format(generate_random_string(length), x)) 44 | return vectorized(np.arange(n)) 45 | 46 | def sample_values_from_binning_indices(self, binning_indices): 47 | return super().sample_values_from_binning_indices(binning_indices) 48 | -------------------------------------------------------------------------------- /generative_models/data_synthesiser_utils/datatypes/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spring-epfl/synthetic_data_release/eba9a43c75a64110b63d79e58c83859634b7339c/generative_models/data_synthesiser_utils/datatypes/__init__.py -------------------------------------------------------------------------------- /generative_models/data_synthesiser_utils/datatypes/constants.py: -------------------------------------------------------------------------------- 1 | CONTINUOUS = 'Continuous' 2 | CATEGORICAL = 'Categorical' 3 | ORDINAL = 'Ordinal' 4 | INTEGER = 'Integer' 5 | FLOAT = 'Float' 6 | STRING = 'String' 
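
# Illustrative sketch: the metadata JSON files under data/ describe each column with
# type labels like the constants above. Assuming those labels match these strings, a
# single column entry might look roughly as follows (attribute names and bounds are
# hypothetical examples, not taken from the shipped datasets):
#
#   {"name": "age", "type": "Integer", "min": 18, "max": 90}
#   {"name": "sex", "type": "Categorical", "i2s": ["male", "female"]}
#
# Numerical columns carry 'min'/'max' bounds and categorical or ordinal columns an
# 'i2s' list of category labels, which is what the generative models' _read_meta
# helpers and the feature sets expect to find.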
-------------------------------------------------------------------------------- /generative_models/data_synthesiser_utils/datatypes/utils/DataType.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | 4 | class DataType(Enum): 5 | INTEGER = 'Integer' 6 | FLOAT = 'Float' 7 | STRING = 'String' 8 | DATETIME = 'DateTime' 9 | SOCIAL_SECURITY_NUMBER = 'SocialSecurityNumber' 10 | -------------------------------------------------------------------------------- /generative_models/data_synthesiser_utils/datatypes/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spring-epfl/synthetic_data_release/eba9a43c75a64110b63d79e58c83859634b7339c/generative_models/data_synthesiser_utils/datatypes/utils/__init__.py -------------------------------------------------------------------------------- /generative_models/data_synthesiser_utils/utils.py: -------------------------------------------------------------------------------- 1 | from math import log, ceil 2 | from numpy import array, exp, isinf, full_like 3 | from numpy.random import choice 4 | from string import ascii_lowercase 5 | from itertools import combinations 6 | from pandas import Series, DataFrame 7 | from sklearn.metrics import mutual_info_score, normalized_mutual_info_score 8 | 9 | 10 | def mutual_information(labels_x: Series, labels_y: DataFrame): 11 | """Mutual information of distributions in format of Series or DataFrame. 12 | 13 | Parameters 14 | ---------- 15 | labels_x : Series 16 | labels_y : DataFrame 17 | """ 18 | if labels_y.shape[1] == 1: 19 | labels_y = labels_y.iloc[:, 0] 20 | else: 21 | labels_y = labels_y.apply(lambda x: ' '.join(x.values), axis=1) 22 | 23 | return mutual_info_score(labels_x, labels_y) 24 | 25 | 26 | def pairwise_attributes_mutual_information(dataset): 27 | """Compute normalized mutual information for all pairwise attributes. Return a DataFrame.""" 28 | sorted_columns = sorted(dataset.columns) 29 | mi_df = DataFrame(columns=sorted_columns, index=sorted_columns, dtype=float) 30 | for row in mi_df.columns: 31 | for col in mi_df.columns: 32 | mi_df.loc[row, col] = normalized_mutual_info_score(dataset[row].astype(str), 33 | dataset[col].astype(str), 34 | average_method='arithmetic') 35 | return mi_df 36 | 37 | 38 | def normalize_given_distribution(frequencies): 39 | distribution = array(frequencies, dtype=float) 40 | distribution = distribution.clip(0) # replace negative values with 0 41 | summation = distribution.sum() 42 | if summation > 0: 43 | if isinf(summation): 44 | return normalize_given_distribution(isinf(distribution)) 45 | else: 46 | return distribution / summation 47 | else: 48 | return full_like(distribution, 1 / distribution.size) 49 | 50 | 51 | def infer_numerical_attributes_in_dataframe(dataframe): 52 | describe = dataframe.describe() 53 | # DataFrame.describe() usually returns 8 rows. 54 | if describe.shape[0] == 8: 55 | return set(describe.columns) 56 | # DataFrame.describe() returns less than 8 rows when there is no numerical attribute. 
57 | else: 58 | return set() 59 | 60 | 61 | def display_bayesian_network(bn): 62 | length = 0 63 | for child, _ in bn: 64 | if len(child) > length: 65 | length = len(child) 66 | 67 | print('Constructed Bayesian network:') 68 | for child, parents in bn: 69 | print(" {0:{width}} has parents {1}.".format(child, parents, width=length)) 70 | 71 | 72 | def generate_random_string(length): 73 | return ''.join(choice(list(ascii_lowercase), size=length)) 74 | 75 | 76 | def bayes_worker(paras): 77 | child, V, num_parents, split, dataset = paras 78 | parents_pair_list = [] 79 | mutual_info_list = [] 80 | 81 | if split + num_parents - 1 < len(V): 82 | for other_parents in combinations(V[split + 1:], num_parents - 1): 83 | parents = list(other_parents) 84 | parents.append(V[split]) 85 | parents_pair_list.append((child, parents)) 86 | mi = mutual_information(dataset[child], dataset[parents]) 87 | mutual_info_list.append(mi) 88 | 89 | return parents_pair_list, mutual_info_list 90 | 91 | 92 | def calculate_sensitivity(num_tuples, child, parents, attr_to_is_binary): 93 | """Sensitivity function for Bayesian network construction. PrivBayes Lemma 1. 94 | Parameters 95 | ---------- 96 | num_tuples : int 97 | Number of tuples in sensitive dataset. 98 | Return 99 | -------- 100 | int 101 | Sensitivity value. 102 | """ 103 | if attr_to_is_binary[child] or (len(parents) == 1 and attr_to_is_binary[parents[0]]): 104 | a = log(num_tuples) / num_tuples 105 | b = (num_tuples - 1) / num_tuples 106 | b_inv = num_tuples / (num_tuples - 1) 107 | return a + b * log(b_inv) 108 | else: 109 | a = (2 / num_tuples) * log((num_tuples + 1) / 2) 110 | b = (1 - 1 / num_tuples) * log(1 + 2 / (num_tuples - 1)) 111 | return a + b 112 | 113 | 114 | def calculate_delta(num_attributes, sensitivity, epsilon): 115 | """Computing delta, which is a factor when applying differential privacy. 116 | More info is in PrivBayes Section 4.2 "A First-Cut Solution". 117 | Parameters 118 | ---------- 119 | num_attributes : int 120 | Number of attributes in dataset. 121 | sensitivity : float 122 | Sensitivity of removing one tuple. 123 | epsilon : float 124 | Parameter of differential privacy. 
125 | """ 126 | return (num_attributes - 1) * sensitivity / epsilon 127 | 128 | 129 | def exponential_mechanism(epsilon, mutual_info_list, parents_pair_list, attr_to_is_binary, num_tuples, num_attributes): 130 | """Applied in Exponential Mechanism to sample outcomes.""" 131 | delta_array = [] 132 | for (child, parents) in parents_pair_list: 133 | sensitivity = calculate_sensitivity(num_tuples, child, parents, attr_to_is_binary) 134 | delta = calculate_delta(num_attributes, sensitivity, epsilon) 135 | delta_array.append(delta) 136 | 137 | mi_array = array(mutual_info_list) / (2 * array(delta_array)) 138 | mi_array = exp(mi_array) 139 | mi_array = normalize_given_distribution(mi_array) 140 | return mi_array -------------------------------------------------------------------------------- /generative_models/generative_model.py: -------------------------------------------------------------------------------- 1 | """Parent class for all generative models""" 2 | 3 | class GenerativeModel(object): 4 | 5 | def fit(self, data): 6 | """Fit a generative model to the input dataset""" 7 | return NotImplementedError('Method needs to be overwritten by a subclass.') 8 | 9 | def generate_samples(self, nsamples): 10 | """Generate a synthetic dataset of size nsamples""" 11 | return NotImplementedError('Method needs to be overwritten by a subclass.') -------------------------------------------------------------------------------- /generative_models/gmm.py: -------------------------------------------------------------------------------- 1 | from os import path 2 | from sklearn.mixture import GaussianMixture 3 | from generative_models.generative_model import GenerativeModel 4 | 5 | import logging 6 | from logging.config import fileConfig 7 | dirname = path.dirname(__file__) 8 | logconfig = path.join(dirname, '../logging_config.ini') 9 | fileConfig(logconfig) 10 | logger = logging.getLogger(__name__) 11 | 12 | class GaussianMixtureModel(GenerativeModel): 13 | 14 | def __init__(self): 15 | self.gm = GaussianMixture() 16 | self.trained = False 17 | 18 | def fit(self, data): 19 | """Fit a gaussian mixture model to the input data. Input data is assumed to be of shape (n_samples, n_features) 20 | See https://scikit-learn.org/stable/modules/generated/sklearn.mixture.GaussianMixture.html#sklearn.mixture.GaussianMixture.fit for details""" 21 | logger.debug(f'Start fitting GaussianMixtureModel to data of shape {data.shape}...') 22 | self.gm.fit(data) 23 | logger.debug(f'Finished fitting GMM') 24 | self.trained = True 25 | 26 | def generate_samples(self, nsamples): 27 | """Generate random samples from the fitted Gaussian distribution""" 28 | assert self.trained, "Model must first be fitted to some data." 29 | logger.debug(f'Generate synthetic dataset of size {nsamples}') 30 | synthetic_data, _ = self.gm.sample(nsamples) 31 | return synthetic_data 32 | 33 | -------------------------------------------------------------------------------- /generative_models/pate_gan.py: -------------------------------------------------------------------------------- 1 | """ 2 | A generative model training algorithm based on 3 | "PATE-GAN: Generating Synthetic Data with Differential Privacy Guarantees" 4 | by J. Yoon, J. Jordon, M. 
van der Schaar, published in International Conference on Learning Representations (ICLR), 2019 5 | Adapted from: https://bitbucket.org/mvdschaar/mlforhealthlabpub/src/82d7f91d46db54d256ff4fc920d513499ddd2ab8/alg/pategan/ 6 | """ 7 | 8 | import tensorflow.compat.v1 as tf 9 | tf.disable_v2_behavior() 10 | 11 | import numpy as np 12 | from pandas import DataFrame 13 | 14 | from generative_models.generative_model import GenerativeModel 15 | from utils.logging import LOGGER 16 | from utils.constants import * 17 | 18 | 19 | ZERO_TOL = 1e-8 20 | 21 | 22 | class PATEGAN(GenerativeModel): 23 | """ A generative adversarial network trained under the PATE framework to achieve differential privacy """ 24 | 25 | def __init__(self, metadata, 26 | eps=1, delta=1e-5, infer_ranges=False, 27 | num_teachers=10, n_iters=100, batch_size=128, 28 | learning_rate=1e-4, multiprocess=False): 29 | """ 30 | :param metadata: dict: Attribute metadata describing the data domain of the synthetic target data 31 | :param eps: float: Privacy parameter 32 | :param delta: float: Privacy parameter 33 | :param target: str: Name of the target variable for downstream classification tasks 34 | :param num_teachers: int: Number of teacher discriminators 35 | :param n_iters: int: Number of training iterations 36 | """ 37 | # Data description 38 | self.metadata, self.attribute_list = self.read_meta(metadata) 39 | self.datatype = DataFrame 40 | self.nfeatures = self.get_num_features() 41 | 42 | # Privacy params 43 | self.epsilon = eps 44 | self.delta = delta 45 | self.infer_ranges = infer_ranges 46 | 47 | # Training params 48 | self.num_teachers = num_teachers 49 | self.n_iters = n_iters 50 | self.batch_size = batch_size 51 | self.learning_rate = learning_rate 52 | self.z_dim = int(self.nfeatures / 4) 53 | self.h_dim = int(self.nfeatures) 54 | 55 | # Configure device 56 | device_name = tf.test.gpu_device_name() 57 | if device_name is '': 58 | self.device_spec = tf.DeviceSpec(device_type='CPU', device_index=0) 59 | else: 60 | self.device_spec = tf.DeviceSpec(device_type='GPU', device_index=0) 61 | 62 | with tf.device(self.device_spec.to_string()): 63 | # Variable init 64 | # Feature matrix 65 | self.X = tf.placeholder(tf.float32, shape=[None, self.nfeatures]) 66 | # Latent space 67 | self.Z = tf.placeholder(tf.float32, shape=[None, self.z_dim]) 68 | # Noise variable 69 | self.M = tf.placeholder(tf.float32, shape=[None, 1]) 70 | # Generator 71 | self.GDist = None 72 | self._generator() 73 | # Discriminator 74 | self._discriminator() 75 | self.sess = tf.Session() 76 | 77 | self.multiprocess = multiprocess 78 | 79 | self.trained = False 80 | 81 | self.__name__ = f'PateGanEps{self.epsilon}' 82 | 83 | @property 84 | def laplace_noise_scale(self): 85 | return np.sqrt(2 * np.log(1.25 * 10**self.delta)) / self.epsilon 86 | 87 | def get_num_features(self): 88 | nfeatures = 0 89 | 90 | for cname, cdict in self.metadata.items(): 91 | data_type = cdict['type'] 92 | if data_type == FLOAT or data_type == INTEGER: 93 | nfeatures += 1 94 | 95 | elif data_type == CATEGORICAL or data_type == ORDINAL: 96 | nfeatures += len(cdict['categories']) 97 | 98 | else: 99 | raise ValueError(f'Unkown data type {data_type} for attribute {cname}') 100 | 101 | return nfeatures 102 | 103 | def read_meta(self, metadata): 104 | meta_dict = {} 105 | attr_names = [] 106 | for cdict in metadata['columns']: 107 | attr_name = cdict['name'] 108 | data_type = cdict['type'] 109 | if data_type == FLOAT or data_type == INTEGER: 110 | meta_dict[attr_name] = { 111 | 'type': data_type, 
112 | 'min': cdict['min'], 113 | 'max': cdict['max'] 114 | } 115 | 116 | elif data_type == CATEGORICAL or data_type == ORDINAL: 117 | meta_dict[attr_name] = { 118 | 'type': data_type, 119 | 'categories': cdict['i2s'] 120 | } 121 | 122 | else: 123 | raise ValueError(f'Unknown data type {data_type} for attribute {attr_name}') 124 | 125 | attr_names.append(attr_name) 126 | 127 | return meta_dict, attr_names 128 | 129 | def _generator(self): 130 | self.G_W1 = tf.Variable(self._xavier_init([self.z_dim, self.h_dim])) 131 | self.G_b1 = tf.Variable(tf.zeros(shape=[self.h_dim])) 132 | 133 | self.G_W2 = tf.Variable(self._xavier_init([self.h_dim, self.h_dim])) 134 | self.G_b2 = tf.Variable(tf.zeros(shape=[self.h_dim])) 135 | 136 | self.G_W3 = tf.Variable(self._xavier_init([self.h_dim, self.nfeatures])) 137 | self.G_b3 = tf.Variable(tf.zeros(shape=[self.nfeatures])) 138 | 139 | self.theta_G = [self.G_W1, self.G_W2, self.G_W3, self.G_b1, self.G_b2, self.G_b3] 140 | 141 | def _discriminator(self): 142 | self.D_W1 = tf.Variable(self._xavier_init([self.nfeatures, self.h_dim])) 143 | self.D_b1 = tf.Variable(tf.zeros(shape=[self.h_dim])) 144 | 145 | self.D_W2 = tf.Variable(self._xavier_init([self.h_dim, self.h_dim])) 146 | self.D_b2 = tf.Variable(tf.zeros(shape=[self.h_dim])) 147 | 148 | self.D_W3 = tf.Variable(self._xavier_init([self.h_dim, 1])) 149 | self.D_b3 = tf.Variable(tf.zeros(shape=[1])) 150 | 151 | self.theta_D = [self.D_W1, self.D_W2, self.D_W3, self.D_b1, self.D_b2, self.D_b3] 152 | 153 | def fit(self, data): 154 | """Fit a generative model of the training data distribution. 155 | :param data: DataFrame: Training set 156 | """ 157 | assert isinstance(data, self.datatype), f'{self.__class__.__name__} expects {self.datatype} as input data but got {type(data)}' 158 | 159 | # Clean up 160 | if self.trained: 161 | self._generator() 162 | self._discriminator() 163 | self.sess = tf.Session() 164 | self.trained = False 165 | 166 | LOGGER.debug(f'Start fitting {self.__class__.__name__} to data of shape {data.shape}...') 167 | nsamples = len(data) 168 | features_train = self._encode_data(data) 169 | 170 | with tf.device(self.device_spec.to_string()): 171 | # Generator 172 | self.GDist = self.gen_out(self.Z) 173 | 174 | # Discriminator 175 | D_real = self.discriminator_out(self.X) 176 | D_fake = self.discriminator_out(self.GDist) 177 | D_entire = tf.concat(axis=0, values=[D_real, D_fake]) 178 | 179 | # Replacement of Clipping algorithm to Penalty term 180 | # 1. Line 6 in Algorithm 1 181 | noisy_vals = tf.random_uniform([self.batch_size, 1], minval=0., maxval=1.) 182 | X_inter = noisy_vals * self.X + (1. - noisy_vals) * self.GDist 183 | 184 | # 2. 
Line 7 in Algorithm 1 185 | grad = tf.gradients(self.discriminator_out(X_inter), [X_inter])[0] 186 | grad_norm = tf.sqrt(tf.reduce_sum(grad ** 2 + ZERO_TOL, axis=1)) 187 | grad_pen = self.num_teachers * tf.reduce_mean((grad_norm - 1) ** 2) 188 | 189 | # Loss function 190 | discriminator_loss = tf.reduce_mean((1 - self.M) * D_entire) - tf.reduce_mean(self.M * D_entire) + grad_pen 191 | generator_loss = -tf.reduce_mean(D_fake) 192 | 193 | # Solver 194 | discriminator_solver = (tf.train.AdamOptimizer(learning_rate=self.learning_rate, beta1=0.5).minimize(discriminator_loss, var_list=self.theta_D)) 195 | generator_solver = (tf.train.AdamOptimizer(learning_rate=self.learning_rate, beta1=0.5).minimize(generator_loss, var_list=self.theta_G)) 196 | 197 | # Start session 198 | self.sess.run(tf.global_variables_initializer()) 199 | 200 | # Training iterations 201 | for _ in range(self.n_iters): 202 | # TODO: Move dataset splitting here 203 | # For fixed generator weights run teacher training 204 | for _ in range(self.num_teachers): 205 | # Sample latent vars 206 | latent_batch = self._sample_latent_z(self.batch_size, self.z_dim) 207 | 208 | # Sample real 209 | train_idx_teach = self._sample_real_x(nsamples, self.batch_size) # Does this way of sampling satisfy DP? Should be disjoint subsets! 210 | features_train_batch = features_train[train_idx_teach, :] 211 | 212 | labels_real = np.ones([self.batch_size, ]) 213 | labels_fake = np.zeros([self.batch_size, ]) 214 | 215 | labels_batch = np.concatenate((labels_real, labels_fake), 0) 216 | 217 | gaussian_noise = np.random.normal(loc=0.0, scale=self.laplace_noise_scale, size=self.batch_size * 2) 218 | 219 | labels_batch = labels_batch + gaussian_noise 220 | 221 | labels_batch = (labels_batch > 0.5) 222 | 223 | labels_batch = np.reshape(labels_batch.astype(float), (2 * self.batch_size, 1)) 224 | 225 | _, discriminator_loss_iter = self.sess.run([discriminator_solver, discriminator_loss], feed_dict={self.X: features_train_batch, self.Z: latent_batch, self.M: labels_batch}) 226 | 227 | # Update generator weights 228 | latent_batch = self._sample_latent_z(self.batch_size, self.z_dim) 229 | 230 | _, generator_loss_iter = self.sess.run([generator_solver, generator_loss], feed_dict={self.Z: latent_batch}) 231 | 232 | self.trained = True 233 | 234 | def generate_samples(self, nsamples): 235 | """"" 236 | Samples synthetic data records from the fitted generative distribution 237 | :param nsamples: int: Number of synthetic records to generate 238 | :return synData: DataFrame: A synthetic dataset 239 | """ 240 | with tf.device(self.device_spec.to_string()): 241 | # Output generation 242 | features_synthetic_encoded = self.sess.run([self.GDist], feed_dict={self.Z: self._sample_latent_z(nsamples, self.z_dim)})[0] 243 | 244 | # Revers numerical encoding 245 | synthetic_data = self._decode_data(features_synthetic_encoded) 246 | synthetic_data = synthetic_data.iloc[np.random.permutation(synthetic_data.index)].reset_index(drop=True) 247 | 248 | return synthetic_data 249 | 250 | 251 | def gen_out(self, z): 252 | G_h1 = tf.nn.tanh(tf.matmul(z, self.G_W1) + self.G_b1) 253 | G_h2 = tf.nn.tanh(tf.matmul(G_h1, self.G_W2) + self.G_b2) 254 | G_log_prob = tf.nn.sigmoid(tf.matmul(G_h2, self.G_W3) + self.G_b3) 255 | 256 | return G_log_prob 257 | 258 | def discriminator_out(self, x): 259 | D_h1 = tf.nn.relu(tf.matmul(x, self.D_W1) + self.D_b1) 260 | D_h2 = tf.nn.relu(tf.matmul(D_h1, self.D_W2) + self.D_b2) 261 | out = (tf.matmul(D_h2, self.D_W3) + self.D_b3) 262 | 263 | return out 264 | 
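    # Helper routines used when building and training the networks: _xavier_init
    # draws layer weights with standard deviation 1/sqrt(fan_in/2), _sample_latent_z
    # draws the generator's latent inputs uniformly from [-1, 1], and _sample_real_x
    # picks a random mini-batch of row indices from the training data.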
265 | def _xavier_init(self,size): 266 | in_dim = size[0] 267 | xavier_stddev = 1. / tf.sqrt(in_dim / 2.) 268 | 269 | return tf.random_normal(shape=size, stddev=xavier_stddev) 270 | 271 | def _sample_latent_z(self, nsamples, ndims): 272 | return np.random.uniform(-1., 1., size=[nsamples, ndims]) 273 | 274 | def _sample_real_x(self, data_size, batch_size): 275 | return np.random.permutation(data_size)[:batch_size] 276 | 277 | def _encode_data(self, data): 278 | n_samples = len(data) 279 | features_encoded = np.empty((n_samples, self.nfeatures)) 280 | cidx = 0 281 | 282 | for attr_name, cdict in self.metadata.items(): 283 | data_type = cdict['type'] 284 | col_data = data[attr_name].to_numpy() 285 | 286 | if data_type == FLOAT or data_type == INTEGER: 287 | # Normalise continuous data 288 | if self.infer_ranges: 289 | col_max = max(col_data) 290 | col_min = min(col_data) 291 | 292 | self.metadata[attr_name]['max'] = col_max 293 | self.metadata[attr_name]['min'] = col_min 294 | 295 | else: 296 | col_max = cdict['max'] 297 | col_min = cdict['min'] 298 | 299 | features_encoded[:, cidx] = np.true_divide(col_data - col_min, col_max + ZERO_TOL) 300 | 301 | cidx += 1 302 | 303 | elif data_type == CATEGORICAL or data_type == ORDINAL: 304 | # One-hot encoded categorical columns 305 | col_cats = cdict['categories'] 306 | col_data_onehot = self._one_hot(col_data, col_cats) 307 | features_encoded[:, cidx : cidx + len(col_cats)] = col_data_onehot 308 | 309 | cidx += len(col_cats) 310 | 311 | return features_encoded 312 | 313 | def _decode_data(self, features_encoded): 314 | """ Revers feature encoding. """ 315 | data = DataFrame(columns=self.attribute_list) 316 | 317 | cidx = 0 318 | 319 | for attr_name, cdict in self.metadata.items(): 320 | data_type = cdict['type'] 321 | 322 | if data_type == FLOAT: 323 | col_min = cdict['min'] 324 | col_max = cdict['max'] 325 | 326 | col_data = features_encoded[:, cidx] 327 | col_data = col_data * (col_max + ZERO_TOL) + col_min 328 | data[attr_name] = col_data.astype(float) 329 | cidx += 1 330 | 331 | elif data_type == INTEGER: 332 | col_min = cdict['min'] 333 | col_max = cdict['max'] 334 | 335 | col_data = features_encoded[:, cidx] 336 | col_data = col_data * (col_max + ZERO_TOL) + col_min 337 | data[attr_name] = col_data.astype(int) 338 | cidx += 1 339 | 340 | elif data_type == CATEGORICAL or data_type == ORDINAL: 341 | col_cats = cdict['categories'] 342 | ncats = len(col_cats) 343 | 344 | col_data_onehot = features_encoded[:, cidx : cidx + ncats] 345 | col_data = self._reverse_one_hot(col_data_onehot, col_cats) 346 | data[attr_name] = col_data.astype(str) 347 | 348 | cidx += ncats 349 | 350 | return data 351 | 352 | def _one_hot(self, col_data, categories): 353 | col_data_onehot = np.zeros((len(col_data), len(categories))) 354 | cidx = [categories.index(c) for c in col_data] 355 | col_data_onehot[np.arange(len(col_data)), cidx] = 1 356 | 357 | return col_data_onehot 358 | 359 | def _reverse_one_hot(self, col_encoded, categories): 360 | cat_idx = np.argmax(col_encoded, axis=1) 361 | col_data = np.array([categories[i] for i in cat_idx]) 362 | 363 | return col_data -------------------------------------------------------------------------------- /inference_cli.py: -------------------------------------------------------------------------------- 1 | """ 2 | Command-line interface for running privacy evaluation under an attribute inference adversary 3 | """ 4 | 5 | import json 6 | 7 | from os import mkdir, path 8 | from numpy.random import choice, seed 9 | from argparse 
import ArgumentParser 10 | 11 | from utils.datagen import load_s3_data_as_df, load_local_data_as_df 12 | from utils.utils import json_numpy_serialzer 13 | from utils.logging import LOGGER 14 | from utils.constants import * 15 | 16 | from generative_models.ctgan import CTGAN 17 | from generative_models.data_synthesiser import IndependentHistogram, BayesianNet, PrivBayes 18 | from generative_models.pate_gan import PATEGAN 19 | from sanitisation_techniques.sanitiser import SanitiserNHS 20 | from attack_models.reconstruction import LinRegAttack, RandForestAttack 21 | 22 | from warnings import simplefilter 23 | simplefilter('ignore', category=FutureWarning) 24 | simplefilter('ignore', category=DeprecationWarning) 25 | 26 | cwd = path.dirname(__file__) 27 | 28 | SEED = 42 29 | 30 | 31 | def main(): 32 | argparser = ArgumentParser() 33 | datasource = argparser.add_mutually_exclusive_group() 34 | datasource.add_argument('--s3name', '-S3', type=str, choices=['adult', 'census', 'credit', 'alarm', 'insurance'], help='Name of the dataset to run on') 35 | datasource.add_argument('--datapath', '-D', type=str, help='Relative path to cwd of a local data file') 36 | argparser.add_argument('--runconfig', '-RC', default='runconfig_mia.json', type=str, help='Path relative to cwd of runconfig file') 37 | argparser.add_argument('--outdir', '-O', default='tests', type=str, help='Path relative to cwd for storing output files') 38 | args = argparser.parse_args() 39 | 40 | # Load runconfig 41 | with open(path.join(cwd, args.runconfig)) as f: 42 | runconfig = json.load(f) 43 | print('Runconfig:') 44 | print(runconfig) 45 | 46 | # Load data 47 | if args.s3name is not None: 48 | rawPop, metadata = load_s3_data_as_df(args.s3name) 49 | dname = args.s3name 50 | else: 51 | rawPop, metadata = load_local_data_as_df(path.join(cwd, args.datapath)) 52 | dname = args.datapath.split('/')[-1] 53 | 54 | print(f'Loaded data {dname}:') 55 | print(rawPop.info()) 56 | 57 | # Make sure outdir exists 58 | if not path.isdir(args.outdir): 59 | mkdir(args.outdir) 60 | 61 | seed(SEED) 62 | 63 | ######################## 64 | #### GAME INPUTS ####### 65 | ######################## 66 | # Pick targets 67 | targetIDs = choice(list(rawPop.index), size=runconfig['nTargets'], replace=False).tolist() 68 | 69 | # If specified: Add specific target records 70 | if runconfig['Targets'] is not None: 71 | targetIDs.extend(runconfig['Targets']) 72 | 73 | targets = rawPop.loc[targetIDs, :] 74 | 75 | # Drop targets from population 76 | rawPopDropTargets = rawPop.drop(targetIDs) 77 | 78 | # List of candidate generative models to evaluate 79 | gmList = [] 80 | if 'generativeModels' in runconfig.keys(): 81 | for gm, paramsList in runconfig['generativeModels'].items(): 82 | if gm == 'IndependentHistogram': 83 | for params in paramsList: 84 | gmList.append(IndependentHistogram(metadata, *params)) 85 | elif gm == 'BayesianNet': 86 | for params in paramsList: 87 | gmList.append(BayesianNet(metadata, *params)) 88 | elif gm == 'PrivBayes': 89 | for params in paramsList: 90 | gmList.append(PrivBayes(metadata, *params)) 91 | elif gm == 'CTGAN': 92 | for params in paramsList: 93 | gmList.append(CTGAN(metadata, *params)) 94 | elif gm == 'PATEGAN': 95 | for params in paramsList: 96 | gmList.append(PATEGAN(metadata, *params)) 97 | else: 98 | raise ValueError(f'Unknown GM {gm}') 99 | 100 | # List of candidate sanitisation techniques to evaluate 101 | sanList = [] 102 | if 'sanitisationTechniques' in runconfig.keys(): 103 | for name, paramsList in 
runconfig['sanitisationTechniques'].items(): 104 | if name == 'SanitiserNHS': 105 | for params in paramsList: 106 | sanList.append(SanitiserNHS(metadata, *params)) 107 | else: 108 | raise ValueError(f'Unknown sanitisation technique {name}') 109 | 110 | ################################## 111 | ######### EVALUATION ############# 112 | ################################## 113 | resultsTargetPrivacy = {tid: {sa: {gm.__name__: {} for gm in gmList + sanList} for sa in runconfig['sensitiveAttributes']} for tid in targetIDs} 114 | # Add entry for raw 115 | for tid in targetIDs: 116 | for sa in runconfig['sensitiveAttributes']: 117 | resultsTargetPrivacy[tid][sa]['Raw'] = {} 118 | 119 | print('\n---- Start the game ----') 120 | for nr in range(runconfig['nIter']): 121 | print(f'\n--- Game iteration {nr + 1} ---') 122 | # Draw a raw dataset 123 | rIdx = choice(list(rawPopDropTargets.index), size=runconfig['sizeRawT'], replace=False).tolist() 124 | rawTout = rawPopDropTargets.loc[rIdx] 125 | 126 | ############### 127 | ## ATTACKS #### 128 | ############### 129 | attacks = {} 130 | for sa, atype in runconfig['sensitiveAttributes'].items(): 131 | if atype == 'LinReg': 132 | attacks[sa] = LinRegAttack(sensitiveAttribute=sa, metadata=metadata) 133 | elif atype == 'Classification': 134 | attacks[sa] = RandForestAttack(sensitiveAttribute=sa, metadata=metadata) 135 | 136 | #### Assess advantage raw 137 | for sa, Attack in attacks.items(): 138 | Attack.train(rawTout) 139 | 140 | for tid in targetIDs: 141 | target = targets.loc[[tid]] 142 | targetAux = target.loc[[tid], Attack.knownAttributes] 143 | targetSecret = target.loc[tid, Attack.sensitiveAttribute] 144 | 145 | guess = Attack.attack(targetAux, attemptLinkage=True, data=rawTout) 146 | pCorrect = Attack.get_likelihood(targetAux, targetSecret, attemptLinkage=True, data=rawTout) 147 | 148 | resultsTargetPrivacy[tid][sa]['Raw'][nr] = { 149 | 'AttackerGuess': [guess], 150 | 'ProbCorrect': [pCorrect], 151 | 'TargetPresence': [LABEL_OUT] 152 | } 153 | 154 | for tid in targetIDs: 155 | target = targets.loc[[tid]] 156 | rawTin = rawTout.append(target) 157 | 158 | for sa, Attack in attacks.items(): 159 | targetAux = target.loc[[tid], Attack.knownAttributes] 160 | targetSecret = target.loc[tid, Attack.sensitiveAttribute] 161 | 162 | guess = Attack.attack(targetAux, attemptLinkage=True, data=rawTin) 163 | pCorrect = Attack.get_likelihood(targetAux, targetSecret, attemptLinkage=True, data=rawTin) 164 | 165 | resultsTargetPrivacy[tid][sa]['Raw'][nr]['AttackerGuess'].append(guess) 166 | resultsTargetPrivacy[tid][sa]['Raw'][nr]['ProbCorrect'].append(pCorrect) 167 | resultsTargetPrivacy[tid][sa]['Raw'][nr]['TargetPresence'].append(LABEL_IN) 168 | 169 | ##### Assess advantage Syn 170 | for GenModel in gmList: 171 | LOGGER.info(f'Start: Evaluation for model {GenModel.__name__}...') 172 | GenModel.fit(rawTout) 173 | synTwithoutTarget = [GenModel.generate_samples(runconfig['sizeSynT']) for _ in range(runconfig['nSynT'])] 174 | 175 | for sa, Attack in attacks.items(): 176 | for tid in targetIDs: 177 | resultsTargetPrivacy[tid][sa][GenModel.__name__][nr] = { 178 | 'AttackerGuess': [], 179 | 'ProbCorrect': [], 180 | 'TargetPresence': [LABEL_OUT for _ in range(runconfig['nSynT'])] 181 | } 182 | 183 | for syn in synTwithoutTarget: 184 | Attack.train(syn) 185 | 186 | for tid in targetIDs: 187 | target = targets.loc[[tid]] 188 | targetAux = target.loc[[tid], Attack.knownAttributes] 189 | targetSecret = target.loc[tid, Attack.sensitiveAttribute] 190 | 191 | guess = 
Attack.attack(targetAux) 192 | pCorrect = Attack.get_likelihood(targetAux, targetSecret) 193 | 194 | resultsTargetPrivacy[tid][sa][GenModel.__name__][nr]['AttackerGuess'].append(guess) 195 | resultsTargetPrivacy[tid][sa][GenModel.__name__][nr]['ProbCorrect'].append(pCorrect) 196 | 197 | del synTwithoutTarget 198 | 199 | for tid in targetIDs: 200 | LOGGER.info(f'Target: {tid}') 201 | target = targets.loc[[tid]] 202 | rawTin = rawTout.append(target) 203 | 204 | GenModel.fit(rawTin) 205 | synTwithTarget = [GenModel.generate_samples(runconfig['sizeSynT']) for _ in range(runconfig['nSynT'])] 206 | 207 | for sa, Attack in attacks.items(): 208 | targetAux = target.loc[[tid], Attack.knownAttributes] 209 | targetSecret = target.loc[tid, Attack.sensitiveAttribute] 210 | 211 | for syn in synTwithTarget: 212 | Attack.train(syn) 213 | 214 | guess = Attack.attack(targetAux) 215 | pCorrect = Attack.get_likelihood(targetAux, targetSecret) 216 | 217 | resultsTargetPrivacy[tid][sa][GenModel.__name__][nr]['AttackerGuess'].append(guess) 218 | resultsTargetPrivacy[tid][sa][GenModel.__name__][nr]['ProbCorrect'].append(pCorrect) 219 | resultsTargetPrivacy[tid][sa][GenModel.__name__][nr]['TargetPresence'].append(LABEL_IN) 220 | del synTwithTarget 221 | 222 | for San in sanList: 223 | LOGGER.info(f'Start: Evaluation for sanitiser {San.__name__}...') 224 | attacks = {} 225 | for sa, atype in runconfig['sensitiveAttributes'].items(): 226 | if atype == 'LinReg': 227 | attacks[sa] = LinRegAttack(sensitiveAttribute=sa, metadata=metadata, quids=San.quids) 228 | elif atype == 'Classification': 229 | attacks[sa] = RandForestAttack(sensitiveAttribute=sa, metadata=metadata, quids=San.quids) 230 | 231 | sanOut = San.sanitise(rawTout) 232 | 233 | for sa, Attack in attacks.items(): 234 | Attack.train(sanOut) 235 | 236 | for tid in targetIDs: 237 | target = targets.loc[[tid]] 238 | targetAux = target.loc[[tid], Attack.knownAttributes] 239 | targetSecret = target.loc[tid, Attack.sensitiveAttribute] 240 | 241 | guess = Attack.attack(targetAux, attemptLinkage=True, data=sanOut) 242 | pCorrect = Attack.get_likelihood(targetAux, targetSecret, attemptLinkage=True, data=sanOut) 243 | 244 | resultsTargetPrivacy[tid][sa][San.__name__][nr] = { 245 | 'AttackerGuess': [guess], 246 | 'ProbCorrect': [pCorrect], 247 | 'TargetPresence': [LABEL_OUT] 248 | } 249 | 250 | for tid in targetIDs: 251 | LOGGER.info(f'Target: {tid}') 252 | target = targets.loc[[tid]] 253 | rawTin = rawTout.append(target) 254 | sanIn = San.sanitise(rawTin) 255 | 256 | for sa, Attack in attacks.items(): 257 | targetAux = target.loc[[tid], Attack.knownAttributes] 258 | targetSecret = target.loc[tid, Attack.sensitiveAttribute] 259 | 260 | 261 | Attack.train(sanIn) 262 | 263 | guess = Attack.attack(targetAux, attemptLinkage=True, data=sanIn) 264 | pCorrect = Attack.get_likelihood(targetAux, targetSecret, attemptLinkage=True, data=sanIn) 265 | 266 | resultsTargetPrivacy[tid][sa][San.__name__][nr]['AttackerGuess'].append(guess) 267 | resultsTargetPrivacy[tid][sa][San.__name__][nr]['ProbCorrect'].append(pCorrect) 268 | resultsTargetPrivacy[tid][sa][San.__name__][nr]['TargetPresence'].append(LABEL_IN) 269 | 270 | outfile = f"ResultsMLEAI_{dname}" 271 | LOGGER.info(f"Write results to {path.join(f'{args.outdir}', f'{outfile}')}") 272 | 273 | with open(path.join(f'{args.outdir}', f'{outfile}.json'), 'w') as f: 274 | json.dump(resultsTargetPrivacy, f, indent=2, default=json_numpy_serialzer) 275 | 276 | if __name__ == "__main__": 277 | main() 
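The results file written by this CLI maps each target ID to, per sensitive attribute and per generative model or sanitiser (plus the 'Raw' baseline), the attacker's guesses, correctness probabilities, and the target's presence label for every game iteration. Below is a minimal, hedged sketch of how such a ResultsMLEAI_*.json file could be reduced to a per-target advantage score; it is not the project's own analysis code (see utils/analyse_results.py and the notebook for that), the helper name summarise_inference_results and the file paths are hypothetical, and the exact privacy-gain definition used in the paper's analysis may differ.

import json
from statistics import mean

from utils.constants import LABEL_IN


def summarise_inference_results(results_path):
    """Reduce a ResultsMLEAI_*.json file to per-target, per-attribute advantage scores.

    The advantage is taken here as the gap in mean correctness probability between
    game iterations in which the target was part of the training data (LABEL_IN)
    and those in which it was not. This is a simplification, not the analysis
    implemented in utils/analyse_results.py.
    """
    with open(results_path) as f:
        results = json.load(f)

    summary = {}
    for tid, per_attribute in results.items():
        summary[tid] = {}
        for sa, per_model in per_attribute.items():
            summary[tid][sa] = {}
            for model, per_iteration in per_model.items():
                p_in, p_out = [], []
                for res in per_iteration.values():
                    # Assumes the presence labels survive the JSON round-trip unchanged.
                    for p, label in zip(res['ProbCorrect'], res['TargetPresence']):
                        (p_in if label == LABEL_IN else p_out).append(p)
                summary[tid][sa][model] = mean(p_in) - mean(p_out)
    return summary


# Hypothetical usage: privacy gain of each model relative to publishing raw data.
# advantage = summarise_inference_results('tests/ResultsMLEAI_texas.json')
# privacy_gain = {model: advantage['ID26241']['RACE']['Raw'] - adv
#                 for model, adv in advantage['ID26241']['RACE'].items()}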
-------------------------------------------------------------------------------- /linkage_cli.py: -------------------------------------------------------------------------------- 1 | """ 2 | Command-line interface for running privacy evaluation with respect to the risk of linkability 3 | """ 4 | 5 | import json 6 | 7 | from os import mkdir, path 8 | from numpy.random import choice, seed 9 | from argparse import ArgumentParser 10 | from pandas import DataFrame 11 | 12 | from utils.datagen import load_s3_data_as_df, load_local_data_as_df 13 | from utils.utils import json_numpy_serialzer 14 | from utils.logging import LOGGER 15 | from utils.constants import * 16 | 17 | from feature_sets.independent_histograms import HistogramFeatureSet 18 | from feature_sets.model_agnostic import NaiveFeatureSet, EnsembleFeatureSet 19 | from feature_sets.bayes import CorrelationsFeatureSet 20 | 21 | from sanitisation_techniques.sanitiser import SanitiserNHS 22 | 23 | from generative_models.ctgan import CTGAN 24 | from generative_models.pate_gan import PATEGAN 25 | from generative_models.data_synthesiser import (IndependentHistogram, 26 | BayesianNet, 27 | PrivBayes) 28 | 29 | from attack_models.mia_classifier import (MIAttackClassifierRandomForest, 30 | generate_mia_shadow_data, 31 | generate_mia_anon_data) 32 | 33 | from warnings import simplefilter 34 | simplefilter('ignore', category=FutureWarning) 35 | simplefilter('ignore', category=DeprecationWarning) 36 | 37 | cwd = path.dirname(__file__) 38 | 39 | 40 | SEED = 42 41 | 42 | 43 | def main(): 44 | argparser = ArgumentParser() 45 | datasource = argparser.add_mutually_exclusive_group() 46 | datasource.add_argument('--s3name', '-S3', type=str, choices=['adult', 'census', 'credit', 'alarm', 'insurance'], help='Name of the dataset to run on') 47 | datasource.add_argument('--datapath', '-D', type=str, help='Relative path to cwd of a local data file') 48 | argparser.add_argument('--runconfig', '-RC', default='runconfig_mia.json', type=str, help='Path relative to cwd of runconfig file') 49 | argparser.add_argument('--outdir', '-O', default='tests', type=str, help='Path relative to cwd for storing output files') 50 | args = argparser.parse_args() 51 | 52 | # Load runconfig 53 | with open(path.join(cwd, args.runconfig)) as f: 54 | runconfig = json.load(f) 55 | print('Runconfig:') 56 | print(runconfig) 57 | 58 | # Load data 59 | if args.s3name is not None: 60 | rawPop, metadata = load_s3_data_as_df(args.s3name) 61 | dname = args.s3name 62 | else: 63 | rawPop, metadata = load_local_data_as_df(path.join(cwd, args.datapath)) 64 | dname = args.datapath.split('/')[-1] 65 | 66 | print(f'Loaded data {dname}:') 67 | print(rawPop.info()) 68 | 69 | # Make sure outdir exists 70 | if not path.isdir(args.outdir): 71 | mkdir(args.outdir) 72 | 73 | seed(SEED) 74 | 75 | ######################## 76 | #### GAME INPUTS ####### 77 | ######################## 78 | # Pick targets 79 | targetIDs = choice(list(rawPop.index), size=runconfig['nTargets'], replace=False).tolist() 80 | 81 | # If specified: Add specific target records 82 | if runconfig['Targets'] is not None: 83 | targetIDs.extend(runconfig['Targets']) 84 | 85 | targets = rawPop.loc[targetIDs, :] 86 | 87 | # Drop targets from population 88 | rawPopDropTargets = rawPop.drop(targetIDs) 89 | 90 | # Init adversary's prior knowledge 91 | rawAidx = choice(list(rawPopDropTargets.index), size=runconfig['sizeRawA'], replace=False).tolist() 92 | rawA = rawPop.loc[rawAidx, :] 93 | 94 | # List of candidate generative models to evaluate 95 | 
gmList = [] 96 | if 'generativeModels' in runconfig.keys(): 97 | for gm, paramsList in runconfig['generativeModels'].items(): 98 | if gm == 'IndependentHistogram': 99 | for params in paramsList: 100 | gmList.append(IndependentHistogram(metadata, *params)) 101 | elif gm == 'BayesianNet': 102 | for params in paramsList: 103 | gmList.append(BayesianNet(metadata, *params)) 104 | elif gm == 'PrivBayes': 105 | for params in paramsList: 106 | gmList.append(PrivBayes(metadata, *params)) 107 | elif gm == 'CTGAN': 108 | for params in paramsList: 109 | gmList.append(CTGAN(metadata, *params)) 110 | elif gm == 'PATEGAN': 111 | for params in paramsList: 112 | gmList.append(PATEGAN(metadata, *params)) 113 | else: 114 | raise ValueError(f'Unknown GM {gm}') 115 | 116 | # List of candidate sanitisation techniques to evaluate 117 | sanList = [] 118 | if 'sanitisationTechniques' in runconfig.keys(): 119 | for name, paramsList in runconfig['sanitisationTechniques'].items(): 120 | if name == 'SanitiserNHS': 121 | for params in paramsList: 122 | sanList.append(SanitiserNHS(metadata, *params)) 123 | else: 124 | raise ValueError(f'Unknown sanitisation technique {name}') 125 | 126 | ################################### 127 | #### ATTACK TRAINING ############# 128 | ################################## 129 | print('\n---- Attack training ----') 130 | attacks = {} 131 | 132 | for tid in targetIDs: 133 | print(f'\n--- Adversary picks target {tid} ---') 134 | target = targets.loc[[tid]] 135 | attacks[tid] = {} 136 | 137 | for San in sanList: 138 | LOGGER.info(f'Start: Attack training for {San.__name__}...') 139 | 140 | attacks[tid][San.__name__] = {} 141 | 142 | # Generate example datasets for training attack classifier 143 | sanA, labelsA = generate_mia_anon_data(San, target, rawA, runconfig['sizeRawT'], runconfig['nShadows'] * runconfig['nSynA']) 144 | 145 | # Train attack on shadow data 146 | for Feature in [NaiveFeatureSet(DataFrame), 147 | HistogramFeatureSet(DataFrame, metadata, nbins=San.histogram_size, quids=San.quids), 148 | CorrelationsFeatureSet(DataFrame, metadata, quids=San.quids), 149 | EnsembleFeatureSet(DataFrame, metadata, nbins=San.histogram_size, quasi_id_cols=San.quids)]: 150 | 151 | Attack = MIAttackClassifierRandomForest(metadata=metadata, FeatureSet=Feature, quids=San.quids) 152 | Attack.train(sanA, labelsA) 153 | attacks[tid][San.__name__][f'{Feature.__name__}'] = Attack 154 | 155 | # Clean up 156 | del sanA, labelsA 157 | 158 | LOGGER.info(f'Finished: Attack training.') 159 | 160 | for GenModel in gmList: 161 | LOGGER.info(f'Start: Attack training for {GenModel.__name__}...') 162 | 163 | attacks[tid][GenModel.__name__] = {} 164 | 165 | # Generate shadow model data for training attacks on this target 166 | synA, labelsSA = generate_mia_shadow_data(GenModel, target, rawA, runconfig['sizeRawT'], runconfig['sizeSynT'], runconfig['nShadows'], runconfig['nSynA']) 167 | 168 | # Train attack on shadow data 169 | for Feature in [NaiveFeatureSet(GenModel.datatype), HistogramFeatureSet(GenModel.datatype, metadata), CorrelationsFeatureSet(GenModel.datatype, metadata)]: 170 | Attack = MIAttackClassifierRandomForest(metadata, Feature) 171 | Attack.train(synA, labelsSA) 172 | attacks[tid][GenModel.__name__][f'{Feature.__name__}'] = Attack 173 | 174 | # Clean up 175 | del synA, labelsSA 176 | 177 | LOGGER.info(f'Finished: Attack training.') 178 | 179 | ################################## 180 | ######### EVALUATION ############# 181 | ################################## 182 | resultsTargetPrivacy = {tid: 
{gm.__name__: {} for gm in gmList + sanList} for tid in targetIDs} 183 | 184 | print('\n---- Start the game ----') 185 | for nr in range(runconfig['nIter']): 186 | print(f'\n--- Game iteration {nr + 1} ---') 187 | # Draw a raw dataset 188 | rIdx = choice(list(rawPopDropTargets.index), size=runconfig['sizeRawT'], replace=False).tolist() 189 | rawTout = rawPopDropTargets.loc[rIdx] 190 | 191 | for GenModel in gmList: 192 | LOGGER.info(f'Start: Evaluation for model {GenModel.__name__}...') 193 | # Train a generative model 194 | GenModel.fit(rawTout) 195 | synTwithoutTarget = [GenModel.generate_samples(runconfig['sizeSynT']) for _ in range(runconfig['nSynT'])] 196 | synLabelsOut = [LABEL_OUT for _ in range(runconfig['nSynT'])] 197 | 198 | for tid in targetIDs: 199 | LOGGER.info(f'Target: {tid}') 200 | target = targets.loc[[tid]] 201 | resultsTargetPrivacy[tid][f'{GenModel.__name__}'][nr] = {} 202 | 203 | rawTin = rawTout.append(target) 204 | GenModel.fit(rawTin) 205 | synTwithTarget = [GenModel.generate_samples(runconfig['sizeSynT']) for _ in range(runconfig['nSynT'])] 206 | synLabelsIn = [LABEL_IN for _ in range(runconfig['nSynT'])] 207 | 208 | synT = synTwithoutTarget + synTwithTarget 209 | synTlabels = synLabelsOut + synLabelsIn 210 | 211 | # Run attacks 212 | for feature, Attack in attacks[tid][f'{GenModel.__name__}'].items(): 213 | # Produce a guess for each synthetic dataset 214 | attackerGuesses = Attack.attack(synT) 215 | 216 | resDict = { 217 | 'Secret': synTlabels, 218 | 'AttackerGuess': attackerGuesses 219 | } 220 | resultsTargetPrivacy[tid][f'{GenModel.__name__}'][nr][feature] = resDict 221 | 222 | del synT, synTwithoutTarget, synTwithTarget 223 | 224 | LOGGER.info(f'Finished: Evaluation for model {GenModel.__name__}.') 225 | 226 | for San in sanList: 227 | LOGGER.info(f'Start: Evaluation for sanitiser {San.__name__}...') 228 | sanOut = San.sanitise(rawTout) 229 | 230 | for tid in targetIDs: 231 | LOGGER.info(f'Target: {tid}') 232 | target = targets.loc[[tid]] 233 | resultsTargetPrivacy[tid][San.__name__][nr] = {} 234 | 235 | rawTin = rawTout.append(target) 236 | sanIn = San.sanitise(rawTin) 237 | 238 | sanT = [sanOut, sanIn] 239 | sanTLabels = [LABEL_OUT, LABEL_IN] 240 | 241 | # Run attacks 242 | for feature, Attack in attacks[tid][San.__name__].items(): 243 | # Produce a guess for each synthetic dataset 244 | attackerGuesses = Attack.attack(sanT, attemptLinkage=True, target=target) 245 | 246 | resDict = { 247 | 'Secret': sanTLabels, 248 | 'AttackerGuess': attackerGuesses 249 | } 250 | resultsTargetPrivacy[tid][San.__name__][nr][feature] = resDict 251 | 252 | del sanT, sanOut, sanIn 253 | 254 | LOGGER.info(f'Finished: Evaluation for model {San.__name__}.') 255 | 256 | outfile = f"ResultsMIA_{dname}" 257 | LOGGER.info(f"Write results to {path.join(f'{args.outdir}', f'{outfile}')}") 258 | 259 | with open(path.join(f'{args.outdir}', f'{outfile}.json'), 'w') as f: 260 | json.dump(resultsTargetPrivacy, f, indent=2, default=json_numpy_serialzer) 261 | 262 | 263 | if __name__ == "__main__": 264 | main() -------------------------------------------------------------------------------- /notebooks/Analyse Results.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "%load_ext autoreload\n", 10 | "%autoreload 2\n", 11 | "%matplotlib inline\n", 12 | "\n", 13 | "from warnings import filterwarnings\n", 14 | 
"filterwarnings('ignore')" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "import sys\n", 24 | "sys.path.append('../')\n", 25 | "\n", 26 | "from utils.analyse_results import *" 27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "metadata": {}, 32 | "source": [ 33 | "# Linkage" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "dirname = '../tests/linkage/'\n", 43 | "linkage_gain = load_results_linkage(dirname)" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "metadata": {}, 50 | "outputs": [], 51 | "source": [ 52 | "models = ['SanitiserNHSk10', 'BayesianNet', 'PrivBayesEps1.0']\n", 53 | "fig = plt_per_target_pg(linkage_gain, models, resFilter=('FeatureSet', 'Naive'))" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": null, 59 | "metadata": {}, 60 | "outputs": [], 61 | "source": [ 62 | "models = ['SanitiserNHSk10', 'BayesianNet', 'PrivBayesEps1.0']\n", 63 | "fig = plt_per_target_pg(linkage_gain, models, resFilter=('FeatureSet', 'Correlations'))" 64 | ] 65 | }, 66 | { 67 | "cell_type": "markdown", 68 | "metadata": {}, 69 | "source": [ 70 | "# Inference" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": null, 76 | "metadata": {}, 77 | "outputs": [], 78 | "source": [ 79 | "dirname = '../tests/inference/'\n", 80 | "dpath = '../data/texas'\n", 81 | "inference_gain = load_results_inference(dirname, dpath)" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "models = ['SanitiserNHSk10', 'BayesianNet', 'PrivBayesEps1.0']\n", 91 | "fig = plt_per_target_pg(inference_gain, models, resFilter=('SensitiveAttribute', 'RACE'))" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": null, 97 | "metadata": {}, 98 | "outputs": [], 99 | "source": [ 100 | "fig = plt_per_target_pg(inference_gain, models, resFilter=('SensitiveAttribute', 'LengthOfStay'))" 101 | ] 102 | }, 103 | { 104 | "cell_type": "markdown", 105 | "metadata": {}, 106 | "source": [ 107 | "# Aggregate Utility" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": null, 113 | "metadata": {}, 114 | "outputs": [], 115 | "source": [ 116 | "dirname = '../tests/utility/'\n", 117 | "utility_record, utility_agg = load_results_utility(dirname)" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": null, 123 | "metadata": {}, 124 | "outputs": [], 125 | "source": [ 126 | "labelVar = 'RiskMortality'\n", 127 | "models = ['Raw','SanitiserNHSk10', 'BayesianNet', 'PrivBayesEps1.0']\n", 128 | "fig = plt_avg_accuracy(utility_agg, models)" 129 | ] 130 | } 131 | ], 132 | "metadata": { 133 | "kernelspec": { 134 | "display_name": "venv_syn", 135 | "language": "python", 136 | "name": "venv_syn" 137 | }, 138 | "language_info": { 139 | "codemirror_mode": { 140 | "name": "ipython", 141 | "version": 3 142 | }, 143 | "file_extension": ".py", 144 | "mimetype": "text/x-python", 145 | "name": "python", 146 | "nbconvert_exporter": "python", 147 | "pygments_lexer": "ipython3", 148 | "version": "3.6.8" 149 | } 150 | }, 151 | "nbformat": 4, 152 | "nbformat_minor": 4 153 | } 154 | -------------------------------------------------------------------------------- /predictive_models/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/spring-epfl/synthetic_data_release/eba9a43c75a64110b63d79e58c83859634b7339c/predictive_models/__init__.py -------------------------------------------------------------------------------- /predictive_models/predictive_model.py: -------------------------------------------------------------------------------- 1 | """ Some predictive models to represent a simple analysis task. """ 2 | from sklearn.impute import SimpleImputer 3 | from sklearn.linear_model import LinearRegression 4 | from sklearn.linear_model import LogisticRegression 5 | from sklearn.ensemble import RandomForestClassifier 6 | from pandas import DataFrame 7 | from numpy import empty, true_divide, zeros, arange 8 | 9 | from utils.logging import LOGGER 10 | from utils.constants import * 11 | 12 | 13 | class PredictiveModel(object): 14 | """ A predictive model. """ 15 | def __init__(self, metadata, labelCol): 16 | """ 17 | :param metadata: dict: Metadata description 18 | :param labelCol: str: Name of the target variable 19 | """ 20 | self.metadata = metadata 21 | self.labelCol = labelCol 22 | self.nfeatures = self._get_num_features() 23 | 24 | self.ImputerCat = SimpleImputer(strategy='most_frequent') 25 | self.ImputerNum = SimpleImputer(strategy='median') 26 | 27 | self.datatype = DataFrame 28 | self.trained = False 29 | 30 | def train(self, data): 31 | return NotImplementedError("Method needs to be overwritten by a subclass") 32 | 33 | def predict(self, features): 34 | return NotImplementedError("Method needs to be overwritten by a subclass") 35 | 36 | def evalute(self, data): 37 | return NotImplementedError("Method needs to be overwritten by a subclass") 38 | 39 | def _encode_data(self, data): 40 | n_samples = len(data) 41 | features_encoded = empty((n_samples, self.nfeatures)) 42 | cidx = 0 43 | 44 | for cdict in self.metadata['columns']: 45 | data_type = cdict['type'] 46 | attr_name = cdict['name'] 47 | if attr_name != self.labelCol: 48 | col_data = data[attr_name].to_numpy() 49 | 50 | if data_type == FLOAT or data_type == INTEGER: 51 | col_max = cdict['max'] 52 | col_min = cdict['min'] 53 | features_encoded[:, cidx] = true_divide(col_data - col_min, col_max + ZERO_TOL) 54 | cidx += 1 55 | 56 | elif data_type == CATEGORICAL or data_type == ORDINAL: 57 | # One-hot encoded categorical columns 58 | col_cats = cdict['i2s'] 59 | col_data_onehot = self._one_hot(col_data, col_cats) 60 | features_encoded[:, cidx : cidx + len(col_cats)] = col_data_onehot 61 | cidx += len(col_cats) 62 | 63 | return features_encoded 64 | 65 | def _get_num_features(self): 66 | nfeatures = 0 67 | 68 | for cdict in self.metadata['columns']: 69 | data_type = cdict['type'] 70 | attr_name = cdict['name'] 71 | 72 | if attr_name != self.labelCol: 73 | if data_type == FLOAT or data_type == INTEGER: 74 | nfeatures += 1 75 | 76 | elif data_type == CATEGORICAL or data_type == ORDINAL: 77 | nfeatures += len(cdict['i2s']) 78 | 79 | else: 80 | raise ValueError(f'Unkown data type {data_type} for attribute {attr_name}') 81 | 82 | return nfeatures 83 | 84 | def _get_feature_names(self): 85 | featureNames = [] 86 | 87 | for i, cdict in enumerate(self.metadata['columns']): 88 | data_type = cdict['type'] 89 | attr_name = cdict['name'] 90 | 91 | if attr_name != self.labelCol: 92 | if data_type == FLOAT or data_type == INTEGER: 93 | featureNames.append(attr_name) 94 | 95 | elif data_type == CATEGORICAL or data_type == ORDINAL: 96 | col_cats = cdict['i2s'] 97 | 
featureNames.extend([f'{attr_name}_{c}' for c in col_cats]) 98 | 99 | return featureNames 100 | 101 | def _impute_missing_values(self, df): 102 | dfImpute = df.copy() 103 | 104 | catCols = [] 105 | numCols = [] 106 | 107 | for col in self.metadata['columns']: 108 | if col['name'] in list(dfImpute): 109 | if col['type'] in [CATEGORICAL, ORDINAL]: 110 | catCols.append(col['name']) 111 | elif col['type'] in NUMERICAL: 112 | numCols.append(col['name']) 113 | 114 | self.ImputerCat.fit(df[catCols]) 115 | dfImpute[catCols] = self.ImputerCat.transform(df[catCols]) 116 | 117 | self.ImputerNum.fit(df[numCols]) 118 | dfImpute[numCols] = self.ImputerNum.transform(df[numCols]) 119 | 120 | return dfImpute 121 | 122 | def _one_hot(self, col_data, categories): 123 | col_data_onehot = zeros((len(col_data), len(categories))) 124 | cidx = [categories.index(c) for c in col_data] 125 | col_data_onehot[arange(len(col_data)), cidx] = 1 126 | 127 | return col_data_onehot 128 | 129 | 130 | class ClassificationTask(PredictiveModel): 131 | """ A binary or multiclass classification model. """ 132 | 133 | def __init__(self, Distinguisher, metadata, labelCol): 134 | """ 135 | :param Distinguisher: sklearn.Classifier: A classification model 136 | :param metadata: dict: Metadata description 137 | :param labelCol: str: Name of the target variable 138 | """ 139 | super().__init__(metadata, labelCol) 140 | self.Distinguisher = Distinguisher 141 | 142 | labels = self._get_labels() 143 | self.labels = {l:i for i, l in enumerate(labels)} 144 | self.labelsInv = {i:l for l, i in self.labels.items()} 145 | 146 | self.__name__ = f'{self.Distinguisher.__class__.__name__}{self.labelCol}' 147 | 148 | def train(self, data): 149 | if not isinstance(data, self.datatype): 150 | raise ValueError(f"Model expects input as {self.datatype} but got {type(data)}") 151 | 152 | data = self._impute_missing_values(data) 153 | features = self._encode_data(data.drop(self.labelCol, axis=1)) 154 | labels = data[self.labelCol].apply(lambda x: self.labels[x]).values 155 | 156 | self.Distinguisher.fit(features, labels) 157 | 158 | LOGGER.debug('Finished training MIA distinguisher') 159 | self.trained = True 160 | 161 | def predict(self, data): 162 | if not isinstance(data, self.datatype): 163 | raise ValueError(f"Model expects input as {self.datatype} but got {type(data)}") 164 | 165 | features = self._encode_data(data.drop(self.labelCol, axis=1)) 166 | labels = self.Distinguisher.predict(features) 167 | 168 | return [self.labelsInv[i] for i in labels] 169 | 170 | def evaluate(self, data): 171 | if not isinstance(data, self.datatype): 172 | raise ValueError(f"Model expects input as {self.datatype} but got {type(data)}") 173 | 174 | features = self._encode_data(data.drop(self.labelCol, axis=1)) 175 | labelsTrue = data[self.labelCol].apply(lambda x: self.labels[x]).values 176 | labelsPred = self.Distinguisher.predict(features) 177 | 178 | return [int(l == p) for l, p in zip(labelsTrue, labelsPred)] 179 | 180 | def _get_accuracy(self, trueLabels, predLabels): 181 | return sum([g == l for g, l in zip(trueLabels, predLabels)])/len(trueLabels) 182 | 183 | def _get_labels(self): 184 | for cdict in self.metadata['columns']: 185 | if cdict['name'] == self.labelCol: 186 | if not cdict['type'] in [CATEGORICAL, ORDINAL]: 187 | raise ValueError('Label column must be discrete data type.') 188 | 189 | return cdict['i2s'] 190 | 191 | 192 | class RandForestClassTask(ClassificationTask): 193 | def __init__(self, metadata, labelCol): 194 | 
super().__init__(RandomForestClassifier(), metadata, labelCol) 195 | 196 | 197 | class LogRegClassTask(ClassificationTask): 198 | def __init__(self, metadata, labelCol): 199 | super().__init__(LogisticRegression(), metadata, labelCol) 200 | 201 | 202 | class RegressionTask(PredictiveModel): 203 | """ A binary or multiclass classification model. """ 204 | 205 | def __init__(self, Regressor, metadata, labelCol): 206 | """ 207 | 208 | :param Regressor: sklearn.Regressor: A regression model 209 | :param metadata: dict: Metadata description 210 | :param labels: list: Label names 211 | :param FeatureSet: object: Feature extraction object 212 | """ 213 | super().__init__(metadata, labelCol) 214 | self.Regressor = Regressor 215 | 216 | self.__name__ = f'{self.Regressor.__class__.__name__}{self.labelCol}' 217 | 218 | def train(self, data): 219 | if not isinstance(data, self.datatype): 220 | raise ValueError(f"Model expects input as {self.datatype} but got {type(data)}") 221 | 222 | data = self._impute_missing_values(data) 223 | features = self._encode_data(data.drop(self.labelCol, axis=1)) 224 | labels = data[self.labelCol].values 225 | 226 | self.Regressor.fit(features, labels) 227 | 228 | LOGGER.debug('Finished training regression model') 229 | self.trained = True 230 | 231 | def predict(self, features): 232 | if not isinstance(features, self.datatype): 233 | raise ValueError(f"Model expects input as {self.datatype} but got {type(features)}") 234 | 235 | features = self._encode_data(features) 236 | labels = self.Regressor.predict(features) 237 | 238 | return list(labels) 239 | 240 | def evaluate(self, data): 241 | if not isinstance(data, self.datatype): 242 | raise ValueError(f"Model expects input as {self.datatype} but got {type(data)}") 243 | 244 | features = self._encode_data(data.drop(self.labelCol, axis=1)) 245 | labelsTrue = data[self.labelCol].values 246 | labelsPred = self.Regressor.predict(features) 247 | 248 | return [true - pred for true, pred in zip(labelsTrue, labelsPred)] 249 | 250 | 251 | class LinRegTask(RegressionTask): 252 | def __init__(self, metadata, labelCol): 253 | super().__init__(LinearRegression(), metadata, labelCol) 254 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | husl==4.0.3 2 | loguru==0.5.3 3 | matplotlib==3.4.3 4 | palettable==3.3.0 5 | pandas==0.25.3 6 | scipy==1.7.1 7 | seaborn==0.11.2 8 | sklearn==0.0 9 | tensorflow==2.6.0 10 | torch==1.9.1 11 | -------------------------------------------------------------------------------- /sanitisation_techniques/sanitiser.py: -------------------------------------------------------------------------------- 1 | """ Parent class for sanitisers """ 2 | from pandas import DataFrame, cut 3 | from sklearn.impute import SimpleImputer 4 | from pandas.api.types import is_numeric_dtype 5 | 6 | from utils.constants import * 7 | 8 | class Sanitiser(object): 9 | 10 | def sanitise(self, data): 11 | """ Apply a privacy policy to the data. """ 12 | return NotImplementedError('Method needs to be overwritten by a subclass') 13 | 14 | 15 | class SanitiserNHS(Sanitiser): 16 | """ A sanitisation mechanism that follows the strategy described by NHS England. 
""" 17 | def __init__(self, metadata, 18 | nbins=10, thresh_rare=0, 19 | max_quantile = 1, anonymity_set_size=1, 20 | drop_cols=None, quids=None): 21 | 22 | self.metadata = self._read_meta(metadata, drop_cols, quids) 23 | self.datatype = DataFrame 24 | 25 | self.histogram_size = nbins 26 | self.unique_threshold = thresh_rare 27 | self.quids = quids 28 | self.max_quantile = max_quantile 29 | self.anonymity_set_size = anonymity_set_size 30 | 31 | self.ImputerCat = SimpleImputer(strategy='most_frequent') 32 | self.ImputerNum = SimpleImputer(strategy='median') 33 | 34 | self.trained = False 35 | 36 | self.__name__ = f'SanitiserNHSk{self.anonymity_set_size}' 37 | 38 | def sanitise(self, data): 39 | """ 40 | Sanitise a sensitive dataset 41 | 42 | :param data: DataFrame: Sensitive raw dataset 43 | :return: san_data: DataFrame: Sanitised dataset 44 | """ 45 | san_data = DataFrame(index=data.index) 46 | data = self._impute_missing_values(data) 47 | drop_records = [] 48 | 49 | for col, cdict in self.metadata.items(): 50 | coltype = cdict['type'] 51 | col_data = data[col].copy() 52 | 53 | if coltype == FLOAT or coltype == INTEGER: 54 | col_data = col_data.astype(int) 55 | 56 | # Cap numerical attributes 57 | cap = col_data.quantile(self.max_quantile) 58 | idx = col_data[col_data > cap].index 59 | col_data.loc[idx] = int(cap) 60 | 61 | elif coltype == CATEGORICAL or coltype == ORDINAL: 62 | if is_numeric_dtype(col_data): 63 | # Bins numerical cols marked as quid into specified bins 64 | col_data = cut(col_data, bins=cdict['bins'], labels=cdict['categories']) 65 | col_data = col_data.astype(str) 66 | 67 | # Remove any records with rare categories 68 | frequencies = col_data.value_counts() 69 | drop_cats = frequencies[frequencies <= self.unique_threshold].index 70 | 71 | for c in drop_cats: 72 | ridx = list(col_data[col_data == c].index) 73 | drop_records.extend(ridx) 74 | 75 | san_data[col] = col_data.values 76 | 77 | drop_records = list(set(drop_records)) 78 | san_data = san_data.drop(drop_records) 79 | 80 | # Enforce k-anonymity constraint 81 | if self.quids is not None: 82 | anonymity_sets = san_data.groupby(self.quids).size() 83 | groups = anonymity_sets[anonymity_sets < self.anonymity_set_size].index 84 | for g in groups: 85 | conditions = [f"{k} == '{v}'" for k,v in zip(self.quids, g)] 86 | query = " and ".join(conditions) 87 | didx = san_data.query(query).index 88 | san_data = san_data.drop(didx) 89 | 90 | return san_data 91 | 92 | def _read_meta(self, metadata, drop_cols, quids): 93 | """ Read metadata from metadata file.""" 94 | if quids is None: 95 | quids = [] 96 | 97 | if drop_cols is None: 98 | drop_cols = [] 99 | 100 | metadict = {} 101 | 102 | for cdict in metadata['columns']: 103 | col = cdict['name'] 104 | coltype = cdict['type'] 105 | 106 | if col not in drop_cols: 107 | if coltype == FLOAT or coltype == INTEGER: 108 | if col in quids: 109 | cbins = cdict['bins'] 110 | cats = [f'({cbins[i]},{cbins[i+1]}]' for i in range(len(cbins)-1)] 111 | 112 | metadict[col] = { 113 | 'type': CATEGORICAL, 114 | 'categories': cats, 115 | 'bins': cbins, 116 | 'size': len(cats) 117 | } 118 | 119 | else: 120 | metadict[col] = { 121 | 'type': coltype, 122 | 'min': cdict['min'], 123 | 'max': cdict['max'] 124 | } 125 | 126 | elif coltype == CATEGORICAL or coltype == ORDINAL: 127 | metadict[col] = { 128 | 'type': coltype, 129 | 'categories': cdict['i2s'], 130 | 'size': len(cdict['i2s']) 131 | } 132 | 133 | else: 134 | raise ValueError(f'Unknown data type {coltype} for attribute {col}') 135 | 136 | return 
metadict 137 | 138 | def _impute_missing_values(self, df): 139 | df_impute = df.copy() 140 | 141 | cat_cols = [] 142 | num_cols = [] 143 | 144 | for col, cdict in self.metadata.items(): 145 | if col in list(df_impute): 146 | if cdict['type'] in [CATEGORICAL, ORDINAL]: 147 | cat_cols.append(col) 148 | 149 | elif cdict['type'] in NUMERICAL: 150 | num_cols.append(col) 151 | 152 | self.ImputerCat.fit(df[cat_cols]) 153 | df_impute[cat_cols] = self.ImputerCat.transform(df[cat_cols]) 154 | 155 | self.ImputerNum.fit(df[num_cols]) 156 | df_impute[num_cols] = self.ImputerNum.transform(df[num_cols]) 157 | 158 | return df_impute -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | import sys, os 2 | path = os.path.dirname(__file__) 3 | print(path) 4 | if path not in sys.path: 5 | sys.path.append(path) -------------------------------------------------------------------------------- /tests/germancredit_test.csv: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:dac84a41b87e3f7b7ae067262b6ba9b23419339d5513aa75ca7b77b2eefb84c5 3 | size 18560 4 | -------------------------------------------------------------------------------- /tests/germancredit_test.json: -------------------------------------------------------------------------------- 1 | { 2 | "columns": [ 3 | { 4 | "name": "Age", 5 | "min": 19, 6 | "max": 75, 7 | "type": "Integer", 8 | "bins": [18, 25, 30, 40, 50, 60, 80] 9 | }, 10 | { 11 | "name": "Sex", 12 | "type": "Categorical", 13 | "size": 2, 14 | "i2s": [ 15 | "male", 16 | "female" 17 | ] 18 | }, 19 | { 20 | "name": "Job", 21 | "type": "Ordinal", 22 | "size": 4, 23 | "i2s": [ 24 | "unemployed", 25 | "unskilled", 26 | "skilled", 27 | "management" 28 | ] 29 | }, 30 | { 31 | "name": "Housing", 32 | "type": "Categorical", 33 | "size": 3, 34 | "i2s": [ 35 | "own", 36 | "free", 37 | "rent" 38 | ] 39 | }, 40 | { 41 | "name": "Saving accounts", 42 | "type": "Ordinal", 43 | "i2s": [ 44 | "no_info", 45 | "little", 46 | "moderate", 47 | "quite rich", 48 | "rich" 49 | ], 50 | "size": 5 51 | }, 52 | { 53 | "name": "Checking account", 54 | "type": "Ordinal", 55 | "size": 4, 56 | "i2s": [ 57 | "no_info", 58 | "little", 59 | "moderate", 60 | "rich" 61 | ] 62 | }, 63 | { 64 | "name": "Credit amount", 65 | "type": "Float", 66 | "min": 250.0, 67 | "max": 18424.0 68 | }, 69 | { 70 | "name": "Duration", 71 | "type": "Integer", 72 | "min": 4, 73 | "max": 72 74 | }, 75 | { 76 | "name": "Purpose", 77 | "type": "Categorical", 78 | "size": 8, 79 | "i2s": [ 80 | "radio/TV", 81 | "education", 82 | "furniture/equipment", 83 | "car", 84 | "business", 85 | "domestic appliances", 86 | "repairs", 87 | "vacation/others" 88 | ] 89 | }, 90 | { 91 | "name": "Risk", 92 | "type": "Categorical", 93 | "size": 2, 94 | "i2s": [ 95 | "good", 96 | "bad" 97 | ] 98 | } 99 | ] 100 | } -------------------------------------------------------------------------------- /tests/inference/runconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "sensitiveAttributes": {"LENGTH_OF_STAY": "LinReg", 3 | "RACE": "Classification"}, 4 | "nIter": 15, 5 | "sizeRawT": 1000, 6 | "sizeSynT": 1000, 7 | "nSynT": 10, 8 | "nTargets": 0, 9 | "Targets": ["ID26241", "ID31432", "ID27428", "ID29265", "ID14086"], 10 | "generativeModels": { 11 | "BayesianNet": [[25, 1]], 12 | "PrivBayes": [[25, 1, 1.0]] 
13 | }, 14 | "sanitisationTechniques": { 15 | "SanitiserNHS": [[10, 1, 0.99, 10, [], ["PAT_STATE", "SEX_CODE", "RACE", "ETHNICITY", "PAT_AGE"]]] 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /tests/linkage/runconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "nIter": 15, 3 | "sizeRawA": 10000, 4 | "nSynA": 10, 5 | "nShadows": 10, 6 | "sizeRawT": 1000, 7 | "sizeSynT": 1000, 8 | "nSynT": 5, 9 | "nTargets": 0, 10 | "Targets": ["ID26241", "ID31432", "ID27428", "ID29265", "ID14086"], 11 | "generativeModels": { 12 | "BayesianNet": [[25, 1]], 13 | "PrivBayes": [[25, 1, 1.0]] 14 | }, 15 | "sanitisationTechniques": { 16 | "SanitiserNHS": [[10, 1, 0.99, 10, [], ["PAT_STATE", "SEX_CODE", "RACE", "ETHNICITY", "PAT_AGE"]]] 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /tests/test_attacks.py: -------------------------------------------------------------------------------- 1 | """A template file for writing a simple test for a new attack model""" 2 | from unittest import TestCase 3 | from pandas import DataFrame 4 | 5 | from warnings import filterwarnings 6 | filterwarnings('ignore') 7 | 8 | from os import path 9 | cwd = path.dirname(__file__) 10 | 11 | from attack_models.mia_classifier import (MIAttackClassifierLinearSVC, 12 | MIAttackClassifierLogReg, 13 | MIAttackClassifierRandomForest, 14 | generate_mia_shadow_data, 15 | generate_mia_anon_data) 16 | 17 | from generative_models.data_synthesiser import IndependentHistogram 18 | from sanitisation_techniques.sanitiser import SanitiserNHS 19 | from feature_sets.independent_histograms import HistogramFeatureSet 20 | from utils.datagen import load_local_data_as_df 21 | 22 | class TestAttacks(TestCase): 23 | @classmethod 24 | def setUp(self) -> None: 25 | self.raw, self.metadata = load_local_data_as_df(path.join(cwd, 'germancredit_test')) 26 | self.sizeS = int(len(self.raw)/2) 27 | self.GenModel = IndependentHistogram(self.metadata) 28 | self.San = SanitiserNHS(self.metadata) 29 | self.FeatureSet = HistogramFeatureSet(DataFrame, metadata=self.metadata) 30 | 31 | self.target = self.raw.sample() 32 | self.shadowDataSyn = generate_mia_shadow_data(self.GenModel, self.target, self.raw, self.sizeS, self.sizeS, numModels=2, numCopies=2) 33 | self.shadowDataSan = generate_mia_anon_data(self.San, self.target, self.raw, self.sizeS, numSamples=2) 34 | 35 | self.GenModel.fit(self.raw) 36 | self.synthetic = [self.GenModel.generate_samples(self.sizeS) for _ in range(10)] 37 | self.sanitised = [self.San.sanitise(self.raw) for _ in range(10)] 38 | 39 | def test_mia_randforest(self): 40 | print('\nTest MIA RandForest') 41 | ## Default without feature extraction 42 | Attack = MIAttackClassifierRandomForest(metadata=self.metadata) 43 | Attack.train(*self.shadowDataSyn) 44 | 45 | guesses = Attack.attack(self.synthetic) 46 | self.assertEqual(len(guesses), len(self.synthetic)) 47 | 48 | ## With FeatureSet 49 | Attack = MIAttackClassifierRandomForest(metadata=self.metadata, FeatureSet=self.FeatureSet) 50 | Attack.train(*self.shadowDataSyn) 51 | 52 | guesses = Attack.attack(self.synthetic) 53 | self.assertEqual(len(guesses), len(self.synthetic)) 54 | 55 | ## Test linkage 56 | Attack.train(*self.shadowDataSan) 57 | guesses = Attack.attack(self.sanitised, attemptLinkage=True, target=self.target) 58 | self.assertEqual(len(guesses), len(self.sanitised)) 59 | 60 | 61 | def test_mia_logreg(self): 62 | print('\nTest MIA LogReg') 
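        # The steps below mirror the random-forest test above: train the logistic
        # regression attack classifier on the shadow data generated for the sampled
        # target, expect one membership guess per synthetic dataset, then retrain on
        # the sanitised shadow data and re-run the attack with linkage enabled.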
63 | Attack = MIAttackClassifierLogReg(metadata=self.metadata, FeatureSet=self.FeatureSet) 64 | Attack.train(*self.shadowDataSyn) 65 | 66 | guesses = Attack.attack(self.synthetic) 67 | self.assertEqual(len(guesses), len(self.synthetic)) 68 | 69 | ## Test linkage 70 | Attack.train(*self.shadowDataSan) 71 | guesses = Attack.attack(self.sanitised, attemptLinkage=True, target=self.target) 72 | self.assertEqual(len(guesses), len(self.sanitised)) 73 | 74 | def test_mia_svc(self): 75 | print('\nTest MIA SVC') 76 | Attack = MIAttackClassifierLinearSVC(metadata=self.metadata, FeatureSet=self.FeatureSet) 77 | Attack.train(*self.shadowDataSyn) 78 | 79 | guesses = Attack.attack(self.synthetic) 80 | self.assertEqual(len(guesses), len(self.synthetic)) 81 | 82 | ## Test linkage 83 | Attack.train(*self.shadowDataSan) 84 | guesses = Attack.attack(self.sanitised, attemptLinkage=True, target=self.target) 85 | self.assertEqual(len(guesses), len(self.sanitised)) 86 | 87 | 88 | 89 | 90 | 91 | 92 | -------------------------------------------------------------------------------- /tests/test_gms.py: -------------------------------------------------------------------------------- 1 | """A template file for writing a simple test for a new generative model""" 2 | from unittest import TestCase 3 | 4 | from warnings import filterwarnings 5 | filterwarnings('ignore') 6 | 7 | from os import path 8 | cwd = path.dirname(__file__) 9 | 10 | from generative_models.data_synthesiser import IndependentHistogram, BayesianNet, PrivBayes 11 | from generative_models.ctgan import CTGAN 12 | from generative_models.pate_gan import PATEGAN 13 | 14 | from utils.datagen import * 15 | 16 | SEED = 42 17 | 18 | class TestGenerativeModel(TestCase): 19 | 20 | @classmethod 21 | def setUp(self) -> None: 22 | self.raw, self.metadata = load_local_data_as_df(path.join(cwd, 'germancredit_test')) 23 | self.sizeS = len(self.raw) 24 | 25 | def test_independent_histogram(self): 26 | print('\nTest IndependentHistogram') 27 | ## Test default params 28 | gm = IndependentHistogram(self.metadata) 29 | gm.fit(self.raw) 30 | synthetic_data = gm.generate_samples(self.sizeS) 31 | 32 | self.assertListEqual(list(synthetic_data), list(self.raw)) 33 | 34 | ## Changing nbins 35 | gm = IndependentHistogram(self.metadata, histogram_bins=25) 36 | gm.fit(self.raw) 37 | synthetic_data = gm.generate_samples(self.sizeS) 38 | 39 | self.assertListEqual(list(synthetic_data), list(self.raw)) 40 | 41 | def test_bayesian_net(self): 42 | print('\nTest BayesianNet') 43 | ## Test default params 44 | gm = BayesianNet(self.metadata) 45 | gm.fit(self.raw) 46 | synthetic_data = gm.generate_samples(self.sizeS) 47 | 48 | self.assertListEqual(list(synthetic_data), list(self.raw)) 49 | 50 | ## Change network degree 51 | gm = BayesianNet(self.metadata, degree=2) 52 | gm.fit(self.raw) 53 | synthetic_data = gm.generate_samples(self.sizeS) 54 | 55 | self.assertListEqual(list(synthetic_data), list(self.raw)) 56 | 57 | ## Infer ranges 58 | gm = BayesianNet(self.metadata, infer_ranges=True) 59 | gm.fit(self.raw) 60 | synthetic_data = gm.generate_samples(self.sizeS) 61 | 62 | self.assertListEqual(list(synthetic_data), list(self.raw)) 63 | 64 | ## Fix seed 65 | gm = BayesianNet(self.metadata, seed=SEED) 66 | gm.fit(self.raw) 67 | synthetic_data = gm.generate_samples(self.sizeS) 68 | 69 | self.assertListEqual(list(synthetic_data), list(self.raw)) 70 | 71 | def test_priv_bayes(self): 72 | print('\nTest PrivBayes') 73 | ## Test default params 74 | gm = PrivBayes(self.metadata) 75 | gm.fit(self.raw) 76 
| synthetic_data = gm.generate_samples(self.sizeS) 77 | 78 | self.assertListEqual(list(synthetic_data), list(self.raw)) 79 | 80 | ## Change privacy param 81 | gm = PrivBayes(self.metadata, epsilon=1e-9) 82 | gm.fit(self.raw) 83 | synthetic_data = gm.generate_samples(self.sizeS) 84 | 85 | self.assertListEqual(list(synthetic_data), list(self.raw)) 86 | 87 | ## Fix seed 88 | gm = PrivBayes(self.metadata, seed=SEED) 89 | gm.fit(self.raw) 90 | synthetic_data = gm.generate_samples(self.sizeS) 91 | 92 | self.assertListEqual(list(synthetic_data), list(self.raw)) 93 | 94 | def test_ctgan(self): 95 | print('\nTest CTGAN') 96 | 97 | gm = CTGAN(self.metadata, batch_size=10, epochs=2) 98 | gm.fit(self.raw) 99 | synthetic_data = gm.generate_samples(self.sizeS) 100 | 101 | self.assertListEqual(list(synthetic_data), list(self.raw)) 102 | 103 | 104 | def test_pategan(self): 105 | # Default params 106 | gm = PATEGAN(self.metadata) 107 | gm.fit(self.raw) 108 | synthetic_data = gm.generate_samples(self.sizeS) 109 | 110 | self.assertTupleEqual(synthetic_data.shape, self.raw.shape) 111 | 112 | # Change privacy params 113 | gm = PATEGAN(self.metadata, eps=10, delta=1e-1) 114 | gm.fit(self.raw) 115 | synthetic_data = gm.generate_samples(self.sizeS) 116 | 117 | self.assertTupleEqual(synthetic_data.shape, self.raw.shape) 118 | 119 | # Infer ranges 120 | gm = PATEGAN(self.metadata, infer_ranges=True) 121 | gm.fit(self.raw) 122 | synthetic_data = gm.generate_samples(self.sizeS) 123 | 124 | self.assertTupleEqual(synthetic_data.shape, self.raw.shape) 125 | 126 | 127 | 128 | 129 | 130 | 131 | -------------------------------------------------------------------------------- /tests/test_sanitisation.py: -------------------------------------------------------------------------------- 1 | """A template file for writing a simple test for a sanitisation technique""" 2 | from unittest import TestCase 3 | 4 | from warnings import filterwarnings 5 | filterwarnings('ignore') 6 | 7 | from os import path 8 | cwd = path.dirname(__file__) 9 | 10 | from sanitisation_techniques.sanitiser import SanitiserNHS 11 | 12 | from utils.datagen import load_local_data_as_df 13 | from utils.constants import * 14 | 15 | 16 | class TestSanitisation(TestCase): 17 | 18 | @classmethod 19 | def setUp(self) -> None: 20 | self.raw, self.metadata = load_local_data_as_df(path.join(cwd, 'germancredit_test')) 21 | self.sizeS = len(self.raw) 22 | 23 | def test_sanitise_nhs(self): 24 | print('\nTest SanitiserNHS') 25 | 26 | ## Test default params 27 | sanitiser = SanitiserNHS(self.metadata) 28 | san = sanitiser.sanitise(self.raw) 29 | 30 | # Expect no columns to be dropped or rows removed 31 | self.assertTupleEqual(san.shape, self.raw.shape) 32 | 33 | ## Test dropping columns 34 | sanitiser = SanitiserNHS(self.metadata, drop_cols=['Purpose']) 35 | san = sanitiser.sanitise(self.raw) 36 | 37 | # Purpose should be dropped 38 | self.assertTrue('Purpose' not in list(san)) 39 | 40 | ## Test rare value threshold 41 | sanitiser = SanitiserNHS(self.metadata, thresh_rare=2) 42 | san = sanitiser.sanitise(self.raw) 43 | 44 | for cdict in self.metadata['columns']: 45 | if cdict['type'] == CATEGORICAL or cdict['type'] == ORDINAL: 46 | counts = san[cdict['name']].value_counts() 47 | self.assertTrue(len(counts[counts > 2]) == len(counts)) 48 | 49 | ## Test converting numerical into categorical attributes 50 | demographics = ['Age', 'Sex', 'Job', 'Housing'] 51 | sanitiser = SanitiserNHS(self.metadata, quids=demographics) 52 | san = sanitiser.sanitise(self.raw) 53 | 54 | 
self.assertListEqual([type(str) for _ in demographics], list(san[demographics].dtypes)) 55 | 56 | ## Test k-anonymity constraint 57 | sanitiser = SanitiserNHS(self.metadata, quids=demographics, anonymity_set_size=7) 58 | san = sanitiser.sanitise(self.raw) 59 | 60 | counts = san.groupby(demographics).size() 61 | self.assertTrue(len(counts[counts >= 7]) == len(counts)) 62 | 63 | 64 | def write_to_dict(nr, results): 65 | results[nr] = 'a' 66 | 67 | -------------------------------------------------------------------------------- /tests/utility/runconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "nIter": 15, 3 | "sizeRawT": 1000, 4 | "sizeSynT": 1000, 5 | "nSynT": 10, 6 | "nTargets": 0, 7 | "Targets": ["ID26241", "ID31432", "ID27428", "ID29265", "ID14086"], 8 | "TestRecords": ["ID71120", "ID84282", "ID88763", "ID79216", "ID92777"], 9 | "generativeModels": { 10 | "BayesianNet": [[25, 1]], 11 | "PrivBayes": [[25, 1, 1.0]] 12 | }, 13 | "sanitisationTechniques": { 14 | "SanitiserNHS": [[10, 1, 0.99, 10, [], ["PAT_STATE", "SEX_CODE", "RACE", "ETHNICITY", "PAT_AGE"]]] 15 | }, 16 | "utilityTasks": { 17 | "RandForestClass": [["RISK_MORTALITY"]] 18 | }, 19 | "dataFilter": { 20 | "train": "DISCHARGE in ['2013Q1', '2013Q2', '2013Q3', '2013Q4']", 21 | "test": "DISCHARGE in ['2014Q1', '2014Q2', '2014Q3', '2014Q4']" 22 | } 23 | } 24 | 25 | -------------------------------------------------------------------------------- /utility_cli.py: -------------------------------------------------------------------------------- 1 | """ 2 | Command-line interface for running utility evaluation 3 | """ 4 | 5 | import json 6 | 7 | from os import mkdir, path 8 | from numpy import mean 9 | from numpy.random import choice, seed 10 | from argparse import ArgumentParser 11 | 12 | from utils.datagen import load_s3_data_as_df, load_local_data_as_df 13 | from utils.utils import json_numpy_serialzer 14 | from utils.logging import LOGGER 15 | 16 | from sanitisation_techniques.sanitiser import SanitiserNHS 17 | from generative_models.data_synthesiser import BayesianNet, PrivBayes, IndependentHistogram 18 | from generative_models.ctgan import CTGAN 19 | from generative_models.pate_gan import PATEGAN 20 | from predictive_models.predictive_model import RandForestClassTask, LogRegClassTask, LinRegTask 21 | 22 | from warnings import simplefilter 23 | simplefilter('ignore', category=FutureWarning) 24 | simplefilter('ignore', category=DeprecationWarning) 25 | 26 | cwd = path.dirname(__file__) 27 | 28 | SEED = 42 29 | 30 | 31 | def main(): 32 | argparser = ArgumentParser() 33 | datasource = argparser.add_mutually_exclusive_group() 34 | datasource.add_argument('--s3name', '-S3', type=str, choices=['adult', 'census', 'credit', 'alarm', 'insurance'], help='Name of the dataset to run on') 35 | datasource.add_argument('--datapath', '-D', type=str, help='Relative path to cwd of a local data file') 36 | argparser.add_argument('--runconfig', '-RC', default='runconfig_mia.json', type=str, help='Path relative to cwd of runconfig file') 37 | argparser.add_argument('--outdir', '-O', default='outputs/test', type=str, help='Path relative to cwd for storing output files') 38 | args = argparser.parse_args() 39 | 40 | seed(SEED) 41 | # Load runconfig 42 | with open(path.join(cwd, args.runconfig)) as f: 43 | runconfig = json.load(f) 44 | print('Runconfig:') 45 | print(runconfig) 46 | 47 | # Load data 48 | if args.s3name is not None: 49 | rawPop, metadata = load_s3_data_as_df(args.s3name) 50 | dname = 
args.s3name 51 | else: 52 | rawPop, metadata = load_local_data_as_df(path.join(cwd, args.datapath)) 53 | dname = args.datapath.split('/')[-1] 54 | 55 | print(f'Loaded data {dname}:') 56 | print(rawPop.info()) 57 | 58 | # Make sure outdir exists 59 | if not path.isdir(args.outdir): 60 | mkdir(args.outdir) 61 | 62 | ######################## 63 | #### GAME INPUTS ####### 64 | ######################## 65 | # Train test split 66 | rawTrain = rawPop.query(runconfig['dataFilter']['train']) 67 | rawTest = rawPop.query(runconfig['dataFilter']['test']) 68 | 69 | # Pick targets 70 | targetIDs = choice(list(rawTrain.index), size=runconfig['nTargets'], replace=False).tolist() 71 | 72 | # If specified: Add specific target records 73 | if runconfig['Targets'] is not None: 74 | targetIDs.extend(runconfig['Targets']) 75 | 76 | targets = rawTrain.loc[targetIDs, :] 77 | 78 | # Drop targets from population 79 | rawTrainWoTargets = rawTrain.drop(targetIDs) 80 | 81 | # Get test target records 82 | testRecordIDs = choice(list(rawTest.index), size=runconfig['nTargets'], replace=False).tolist() 83 | 84 | # If specified: Add specific target records 85 | if runconfig['TestRecords'] is not None: 86 | testRecordIDs.extend(runconfig['TestRecords']) 87 | 88 | testRecords = rawTest.loc[testRecordIDs, :] 89 | 90 | # List of candidate generative models to evaluate 91 | gmList = [] 92 | if 'generativeModels' in runconfig.keys(): 93 | for gm, paramsList in runconfig['generativeModels'].items(): 94 | if gm == 'IndependentHistogram': 95 | for params in paramsList: 96 | gmList.append(IndependentHistogram(metadata, *params)) 97 | elif gm == 'BayesianNet': 98 | for params in paramsList: 99 | gmList.append(BayesianNet(metadata, *params)) 100 | elif gm == 'PrivBayes': 101 | for params in paramsList: 102 | gmList.append(PrivBayes(metadata, *params)) 103 | elif gm == 'CTGAN': 104 | for params in paramsList: 105 | gmList.append(CTGAN(metadata, *params)) 106 | elif gm == 'PATEGAN': 107 | for params in paramsList: 108 | gmList.append(PATEGAN(metadata, *params)) 109 | else: 110 | raise ValueError(f'Unknown GM {gm}') 111 | 112 | # List of candidate sanitisation techniques to evaluate 113 | sanList = [] 114 | if 'sanitisationTechniques' in runconfig.keys(): 115 | for name, paramsList in runconfig['sanitisationTechniques'].items(): 116 | if name == 'SanitiserNHS': 117 | for params in paramsList: 118 | sanList.append(SanitiserNHS(metadata, *params)) 119 | else: 120 | raise ValueError(f'Unknown sanitisation technique {name}') 121 | 122 | utilityTasks = [] 123 | for taskName, paramsList in runconfig['utilityTasks'].items(): 124 | if taskName == 'RandForestClass': 125 | for params in paramsList: 126 | utilityTasks.append(RandForestClassTask(metadata, *params)) 127 | elif taskName == 'LogRegClass': 128 | for params in paramsList: 129 | utilityTasks.append(LogRegClassTask(metadata, *params)) 130 | elif taskName == 'LinReg': 131 | for params in paramsList: 132 | utilityTasks.append(LinRegTask(metadata, *params)) 133 | 134 | ################################## 135 | ######### EVALUATION ############# 136 | ################################## 137 | resultsTargetUtility = {ut.__name__: {gm.__name__: {} for gm in gmList + sanList} for ut in utilityTasks} 138 | resultsAggUtility = {ut.__name__: {gm.__name__: {'TargetID': [], 139 | 'Accuracy': []} for gm in gmList + sanList} for ut in utilityTasks} 140 | 141 | # Add entry for raw 142 | for ut in utilityTasks: 143 | resultsTargetUtility[ut.__name__]['Raw'] = {} 144 | resultsAggUtility[ut.__name__]['Raw'] = 
{'TargetID': [], 145 | 'Accuracy': []} 146 | 147 | print('\n---- Start the game ----') 148 | for nr in range(runconfig['nIter']): 149 | print(f'\n--- Game iteration {nr + 1} ---') 150 | # Draw a raw dataset 151 | rIdx = choice(list(rawTrainWoTargets.index), size=runconfig['sizeRawT'], replace=False).tolist() 152 | rawTout = rawTrain.loc[rIdx] 153 | 154 | LOGGER.info('Start: Utility evaluation on Raw...') 155 | # Get utility from raw without targets 156 | for ut in utilityTasks: 157 | resultsTargetUtility[ut.__name__]['Raw'][nr] = {} 158 | 159 | predErrorTargets = [] 160 | predErrorAggr = [] 161 | for _ in range(runconfig['nSynT']): 162 | ut.train(rawTout) 163 | predErrorTargets.append(ut.evaluate(testRecords)) 164 | predErrorAggr.append(ut.evaluate(rawTest)) 165 | 166 | resultsTargetUtility[ut.__name__]['Raw'][nr]['OUT'] = { 167 | 'TestRecordID': testRecordIDs, 168 | 'Accuracy': list(mean(predErrorTargets, axis=0)) 169 | } 170 | 171 | resultsAggUtility[ut.__name__]['Raw']['TargetID'].append('OUT') 172 | resultsAggUtility[ut.__name__]['Raw']['Accuracy'].append(mean(predErrorAggr)) 173 | 174 | # Get utility from raw with each target 175 | for tid in targetIDs: 176 | target = targets.loc[[tid]] 177 | rawIn = rawTout.append(target) 178 | 179 | for ut in utilityTasks: 180 | predErrorTargets = [] 181 | predErrorAggr = [] 182 | for _ in range(runconfig['nSynT']): 183 | ut.train(rawIn) 184 | predErrorTargets.append(ut.evaluate(testRecords)) 185 | predErrorAggr.append(ut.evaluate(rawTest)) 186 | 187 | resultsTargetUtility[ut.__name__]['Raw'][nr][tid] = { 188 | 'TestRecordID': testRecordIDs, 189 | 'Accuracy': list(mean(predErrorTargets, axis=0)) 190 | } 191 | 192 | resultsAggUtility[ut.__name__]['Raw']['TargetID'].append(tid) 193 | resultsAggUtility[ut.__name__]['Raw']['Accuracy'].append(mean(predErrorAggr)) 194 | 195 | LOGGER.info('Finished: Utility evaluation on Raw.') 196 | 197 | for GenModel in gmList: 198 | LOGGER.info(f'Start: Evaluation for model {GenModel.__name__}...') 199 | GenModel.fit(rawTout) 200 | synTwithoutTarget = [GenModel.generate_samples(runconfig['sizeSynT']) for _ in range(runconfig['nSynT'])] 201 | 202 | # Util evaluation for synthetic without all targets 203 | for ut in utilityTasks: 204 | resultsTargetUtility[ut.__name__][GenModel.__name__][nr] = {} 205 | 206 | predErrorTargets = [] 207 | predErrorAggr = [] 208 | for syn in synTwithoutTarget: 209 | ut.train(syn) 210 | predErrorTargets.append(ut.evaluate(testRecords)) 211 | predErrorAggr.append(ut.evaluate(rawTest)) 212 | 213 | resultsTargetUtility[ut.__name__][GenModel.__name__][nr]['OUT'] = { 214 | 'TestRecordID': testRecordIDs, 215 | 'Accuracy': list(mean(predErrorTargets, axis=0)) 216 | } 217 | 218 | resultsAggUtility[ut.__name__][GenModel.__name__]['TargetID'].append('OUT') 219 | resultsAggUtility[ut.__name__][GenModel.__name__]['Accuracy'].append(mean(predErrorAggr)) 220 | 221 | for tid in targetIDs: 222 | LOGGER.info(f'Target: {tid}') 223 | target = targets.loc[[tid]] 224 | 225 | rawTin = rawTout.append(target) 226 | GenModel.fit(rawTin) 227 | synTwithTarget = [GenModel.generate_samples(runconfig['sizeSynT']) for _ in range(runconfig['nSynT'])] 228 | 229 | # Util evaluation for synthetic with this target 230 | for ut in utilityTasks: 231 | predErrorTargets = [] 232 | predErrorAggr = [] 233 | for syn in synTwithTarget: 234 | ut.train(syn) 235 | predErrorTargets.append(ut.evaluate(testRecords)) 236 | predErrorAggr.append(ut.evaluate(rawTest)) 237 | 238 | resultsTargetUtility[ut.__name__][GenModel.__name__][nr][tid] = { 
239 | 'TestRecordID': testRecordIDs, 240 | 'Accuracy': list(mean(predErrorTargets, axis=0)) 241 | } 242 | 243 | resultsAggUtility[ut.__name__][GenModel.__name__]['TargetID'].append(tid) 244 | resultsAggUtility[ut.__name__][GenModel.__name__]['Accuracy'].append(mean(predErrorAggr)) 245 | 246 | del synTwithoutTarget, synTwithTarget 247 | 248 | LOGGER.info(f'Finished: Evaluation for model {GenModel.__name__}.') 249 | 250 | for San in sanList: 251 | LOGGER.info(f'Start: Evaluation for sanitiser {San.__name__}...') 252 | sanOut = San.sanitise(rawTout) 253 | 254 | for ut in utilityTasks: 255 | resultsTargetUtility[ut.__name__][San.__name__][nr] = {} 256 | 257 | predErrorTargets = [] 258 | predErrorAggr = [] 259 | for _ in range(runconfig['nSynT']): 260 | ut.train(sanOut) 261 | predErrorTargets.append(ut.evaluate(testRecords)) 262 | predErrorAggr.append(ut.evaluate(rawTest)) 263 | 264 | resultsTargetUtility[ut.__name__][San.__name__][nr]['OUT'] = { 265 | 'TestRecordID': testRecordIDs, 266 | 'Accuracy': list(mean(predErrorTargets, axis=0)) 267 | } 268 | 269 | resultsAggUtility[ut.__name__][San.__name__]['TargetID'].append('OUT') 270 | resultsAggUtility[ut.__name__][San.__name__]['Accuracy'].append(mean(predErrorAggr)) 271 | 272 | for tid in targetIDs: 273 | LOGGER.info(f'Target: {tid}') 274 | target = targets.loc[[tid]] 275 | 276 | rawTin = rawTout.append(target) 277 | sanIn = San.sanitise(rawTin) 278 | 279 | for ut in utilityTasks: 280 | predErrorTargets = [] 281 | predErrorAggr = [] 282 | for _ in range(runconfig['nSynT']): 283 | ut.train(sanIn) 284 | predErrorTargets.append(ut.evaluate(testRecords)) 285 | predErrorAggr.append(ut.evaluate(rawTest)) 286 | 287 | resultsTargetUtility[ut.__name__][San.__name__][nr][tid] = { 288 | 'TestRecordID': testRecordIDs, 289 | 'Accuracy': list(mean(predErrorTargets, axis=0)) 290 | } 291 | 292 | resultsAggUtility[ut.__name__][San.__name__]['TargetID'].append(tid) 293 | resultsAggUtility[ut.__name__][San.__name__]['Accuracy'].append(mean(predErrorAggr)) 294 | 295 | del sanOut, sanIn 296 | 297 | LOGGER.info(f'Finished: Evaluation for model {San.__name__}.') 298 | 299 | outfile = f"ResultsUtilTargets_{dname}" 300 | LOGGER.info(f"Write results to {path.join(f'{args.outdir}', f'{outfile}')}") 301 | 302 | with open(path.join(f'{args.outdir}', f'{outfile}.json'), 'w') as f: 303 | json.dump(resultsTargetUtility, f, indent=2, default=json_numpy_serialzer) 304 | 305 | outfile = f"ResultsUtilAgg_{dname}" 306 | LOGGER.info(f"Write results to {path.join(f'{args.outdir}', f'{outfile}')}") 307 | 308 | with open(path.join(f'{args.outdir}', f'{outfile}.json'), 'w') as f: 309 | json.dump(resultsAggUtility, f, indent=2, default=json_numpy_serialzer) 310 | 311 | 312 | if __name__ == "__main__": 313 | main() -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spring-epfl/synthetic_data_release/eba9a43c75a64110b63d79e58c83859634b7339c/utils/__init__.py -------------------------------------------------------------------------------- /utils/analyse_results.py: -------------------------------------------------------------------------------- 1 | import json 2 | import matplotlib.pyplot as plt 3 | import seaborn as sns 4 | 5 | from glob import glob 6 | from pandas import DataFrame, concat 7 | from itertools import cycle 8 | from os import path 9 | 10 | from warnings import filterwarnings 11 | filterwarnings('ignore') 12 | 13 | 
from .datagen import load_local_data_as_df 14 | from .plot_setup import set_style, pltmarkers as MARKERS, fontsizelabels as FSIZELABELS, fontsizeticks as FSIZETICKS 15 | from .evaluation_framework import * 16 | set_style() 17 | 18 | PREDTASKS = ['RandomForestClassifier', 'LogisticRegression', 'LinearRegression'] 19 | 20 | MARKERCYCLE = cycle(MARKERS) 21 | HUEMARKERS = [next(MARKERCYCLE) for _ in range(20)] 22 | 23 | 24 | ###### Load results 25 | def load_results_linkage(dirname): 26 | """ 27 | Helper function to load results of privacy evaluation under risk of linkability 28 | :param dirname: str: Directory that contains results files 29 | :return: results: DataFrame: Results of privacy evaluation 30 | """ 31 | 32 | files = glob(path.join(dirname, f'ResultsMIA_*.json')) 33 | 34 | resList = [] 35 | for fpath in files: 36 | with open(fpath) as f: 37 | resDict = json.load(f) 38 | 39 | dataset = fpath.split('.json')[0].split('_')[-1] 40 | 41 | for tid, tres in resDict.items(): 42 | for gm, gmDict in tres.items(): 43 | for nr, nrDict in gmDict.items(): 44 | for fset, fsetDict in nrDict.items(): 45 | df = DataFrame(fsetDict) 46 | 47 | df['Run'] = nr 48 | df['FeatureSet'] = fset 49 | df['TargetModel'] = gm 50 | df['TargetID'] = tid 51 | df['Dataset'] = dataset 52 | 53 | resList.append(df) 54 | 55 | results = concat(resList) 56 | 57 | resAgg = [] 58 | 59 | games = results.groupby(['TargetID', 'TargetModel', 'FeatureSet', 'Run']) 60 | for gameParams, gameRes in games: 61 | tpSyn, fpSyn = get_tp_fp_rates(gameRes['AttackerGuess'], gameRes['Secret']) 62 | advantageSyn = get_mia_advantage(tpSyn, fpSyn) 63 | advantageRaw = 1 64 | 65 | resAgg.append(gameParams + (tpSyn, fpSyn, advantageSyn, advantageRaw)) 66 | 67 | resAgg = DataFrame(resAgg) 68 | 69 | resAgg.columns = ['TargetID','TargetModel', 'FeatureSet', 'Run', 'TPSyn', 'FPSyn', 'AdvantageSyn', 'AdvantageRaw'] 70 | 71 | resAgg['PrivacyGain'] = resAgg['AdvantageRaw'] - resAgg['AdvantageSyn'] 72 | 73 | return resAgg 74 | 75 | 76 | def load_results_inference(dirname, dpath): 77 | """ 78 | Helper function to load results of privacy evaluation under risk of inference 79 | :param dirname: str: Directory that contains results files 80 | :param dpath: str: Dataset path (needed to extract some metadata) 81 | :return: results: DataFrame: Results of privacy evaluation 82 | """ 83 | df, metadata = load_local_data_as_df(dpath) 84 | 85 | files = glob(path.join(dirname, f'ResultsMLEAI_*.json')) 86 | resList = [] 87 | for fpath in files: 88 | 89 | with open(fpath) as f: 90 | resDict = json.load(f) 91 | 92 | dataset = fpath.split('.json')[0].split('_')[-1] 93 | 94 | for tid, tdict in resDict.items(): 95 | for sa, sdict in tdict.items(): 96 | tsecret = df.loc[tid, sa] 97 | satype = None 98 | 99 | for cdict in metadata['columns']: 100 | if cdict['name'] == sa: 101 | satype = cdict['type'] 102 | 103 | if '_' in sa: 104 | sa = ''.join([s.capitalize() for s in sa.split('_')]) 105 | elif '-' in sa: 106 | sa = ''.join([s.capitalize() for s in sa.split('-')]) 107 | 108 | for gm, gdict in sdict.items(): 109 | for nr, res in gdict.items(): 110 | 111 | resDF = DataFrame(res) 112 | resDF['TargetID'] = tid 113 | resDF['TargetSecret'] = tsecret 114 | resDF['SensitiveType'] = satype 115 | resDF['TargetModel'] = gm 116 | resDF['Run'] = nr 117 | resDF['SensitiveAttribute'] = sa 118 | resDF['Dataset'] = dataset 119 | 120 | resList.append(resDF) 121 | 122 | results = concat(resList) 123 | 124 | resAdv = [] 125 | for gameParams, game in results.groupby(['Dataset', 'TargetID', 
'SensitiveAttribute', 'Run']): 126 | rawRes = game.groupby(['TargetModel']).get_group('Raw') 127 | if all(game['SensitiveType'].isin([INTEGER, FLOAT])): 128 | pCorrectRIn, pCorrectROut = get_probs_correct(rawRes['ProbCorrect'], rawRes['TargetPresence']) 129 | 130 | elif all(game['SensitiveType'].isin([CATEGORICAL, ORDINAL])): 131 | pCorrectRIn, pCorrectROut = get_accuracy(rawRes['AttackerGuess'], rawRes['TargetSecret'], rawRes['TargetPresence']) 132 | 133 | else: 134 | raise ValueError('Unknown sensitive attribute type.') 135 | 136 | advR = get_ai_advantage(pCorrectRIn, pCorrectROut) 137 | 138 | for gm, gmRes in game.groupby(['TargetModel']): 139 | if gm != 'Raw': 140 | if all(gmRes['SensitiveType'].isin([INTEGER, FLOAT])): 141 | pCorrectSIn, pCorrectSOut = get_probs_correct(gmRes['ProbCorrect'], gmRes['TargetPresence']) 142 | 143 | elif all(gmRes['SensitiveType'].isin([CATEGORICAL, ORDINAL])): 144 | pCorrectSIn, pCorrectSOut = get_accuracy(gmRes['AttackerGuess'], gmRes['TargetSecret'], gmRes['TargetPresence']) 145 | 146 | else: 147 | raise ValueError('Unknown sensitive attribute type.') 148 | 149 | advS = get_ai_advantage(pCorrectSIn, pCorrectSOut) 150 | 151 | 152 | resAdv.append(gameParams + (gm, pCorrectRIn, pCorrectROut, advR, pCorrectSIn, pCorrectSOut, advS)) 153 | 154 | 155 | resAdv = DataFrame(resAdv) 156 | resAdv.columns =['Dataset', 'TargetID', 'SensitiveAttribute','Run', 'TargetModel', 157 | 'ProbCorrectRawIn', 'ProbCorrectRawOut', 'AdvantageRaw', 158 | 'ProbCorrectSynIn', 'ProbCorrectSynOut', 'AdvantageSyn'] 159 | 160 | resAdv['PrivacyGain'] = resAdv['AdvantageRaw'] - resAdv['AdvantageSyn'] 161 | 162 | return resAdv 163 | 164 | 165 | def load_results_utility(dirname): 166 | """ 167 | Helper function to load results of utility evaluation 168 | :param dirname: str: Directory that contains results files 169 | :return: resultsTarget: DataFrame: Results of utility evaluation on individual records 170 | :return: resultsAgg: DataFrame: Results of average utility evaluation 171 | """ 172 | 173 | # Load individual target utility results 174 | files = glob(path.join(dirname, f'ResultsUtilTargets_*.json')) 175 | 176 | resList = [] 177 | for fpath in files: 178 | with open(fpath) as f: 179 | results = json.load(f) 180 | 181 | dataset = fpath.split('.json')[0].split('_')[-1] 182 | 183 | for ut, ures in results.items(): 184 | model = [m for m in PREDTASKS if m in ut][0] 185 | labelVar = ut.split(model)[-1] 186 | 187 | if '_' in labelVar: 188 | labelVar = ''.join([s.capitalize() for s in labelVar.split('_')]) 189 | 190 | if '-' in labelVar: 191 | labelVar = ''.join([s.capitalize() for s in labelVar.split('-')]) 192 | 193 | for gm, gmres in ures.items(): 194 | for n, nres in gmres.items(): 195 | for tid, tres in nres.items(): 196 | res = DataFrame(tres) 197 | 198 | res['TargetID'] = tid 199 | res['Run'] = f'Run {n}' 200 | res['TargetModel'] = gm 201 | res['PredictionModel'] = model 202 | res['LabelVar'] = labelVar 203 | res['Dataset'] = dataset 204 | 205 | resList.append(res) 206 | 207 | resultsTargets = concat(resList) 208 | 209 | # Load aggregate utility results 210 | files = glob(path.join(dirname, f'ResultsUtilAgg_*.json')) 211 | 212 | resList = [] 213 | for fpath in files: 214 | with open(fpath) as f: 215 | results = json.load(f) 216 | 217 | dataset = fpath.split('.json')[0].split('_')[-1] 218 | 219 | for ut, utres in results.items(): 220 | model = [m for m in PREDTASKS if m in ut][0] 221 | labelVar = ut.split(model)[-1] 222 | 223 | if '_' in labelVar: 224 | labelVar = 
''.join([s.capitalize() for s in labelVar.split('_')]) 225 | 226 | if '-' in labelVar: 227 | labelVar = ''.join([s.capitalize() for s in labelVar.split('-')]) 228 | 229 | for gm, gmres in utres.items(): 230 | resDF = DataFrame(gmres) 231 | resDF['PredictionModel'] = model 232 | resDF['LabelVar'] = labelVar 233 | resDF['TargetModel'] = gm 234 | resDF['Dataset'] = dataset 235 | 236 | resList.append(resDF) 237 | 238 | resultsAgg = concat(resList) 239 | 240 | return resultsTargets, resultsAgg 241 | 242 | 243 | ### Plotting 244 | def plt_per_target_pg(results, models, resFilter=('FeatureSet', 'Naive')): 245 | """ Plot per record average privacy gain. """ 246 | results = results[results[resFilter[0]] == resFilter[1]] 247 | 248 | fig, ax = plt.subplots(figsize=(10, 6)) 249 | pointplot(results, 'TargetModel', 'PrivacyGain', 'TargetID', ax, models) 250 | 251 | ax.set_title(f'Attack on {resFilter[0]}: {resFilter[1]}', fontsize=FSIZELABELS) 252 | ax.legend(loc='upper center', bbox_to_anchor=(.5, 1.3), ncol=5, title='TargetID') 253 | ax.set_ylabel('$\mathtt{PG}$', fontsize=FSIZELABELS) 254 | 255 | return fig 256 | 257 | 258 | def plt_avg_accuracy(results, models): 259 | fig, ax = plt.subplots(figsize=(12, 5)) 260 | 261 | pltdata = results[results['TargetID'] == 'OUT'] 262 | 263 | boxplot(pltdata, 'TargetModel', 'Accuracy', 'LabelVar', ax, models) 264 | 265 | ax.hlines(0.2, *ax.get_xlim(), 'grey', '--') 266 | ax.set_ylabel('$\mathtt{Accuracy}$', fontsize=FSIZELABELS) 267 | ax.set_xlabel('') 268 | 269 | return fig 270 | 271 | 272 | def pointplot(data, x, y, hue, ax, order): 273 | ncats = data[hue].nunique() 274 | huemarkers = HUEMARKERS[:ncats] 275 | 276 | sns.pointplot(data=data, y=y, 277 | x=x, hue=hue, 278 | order=order, 279 | ax=ax, dodge=True, 280 | join=False, markers=huemarkers, 281 | scale=1.2, errwidth=2, 282 | linestyles='--') 283 | 284 | # Remove legend 285 | ax.get_legend().remove() 286 | 287 | # Set x- and y-label 288 | ax.set_xlabel('') 289 | 290 | # Resize y-tick labels 291 | for tick in ax.yaxis.get_major_ticks(): 292 | tick.label.set_fontsize(FSIZETICKS) 293 | 294 | # Resize x-tick labels 295 | for tick in ax.xaxis.get_major_ticks(): 296 | tick.label.set_fontsize(FSIZETICKS) 297 | 298 | 299 | def boxplot(data, x, y, hue, ax, order, hue_order=None): 300 | sns.boxenplot(data=data, y=y, 301 | x=x, hue=hue, 302 | order=order, hue_order=hue_order, 303 | ax=ax, dodge=True) 304 | 305 | # Resize y-tick labels 306 | for tick in ax.yaxis.get_major_ticks(): 307 | tick.label.set_fontsize(FSIZETICKS) 308 | 309 | # Resize x-tick labels 310 | for tick in ax.xaxis.get_major_ticks(): 311 | tick.label.set_fontsize(FSIZETICKS) 312 | -------------------------------------------------------------------------------- /utils/constants.py: -------------------------------------------------------------------------------- 1 | # Data coding constants 2 | FILLNA_VALUE_CAT = "NaN" 3 | CATEGORICAL = "Categorical" 4 | ORDINAL = "Ordinal" 5 | INTEGER = "Integer" 6 | FLOAT = "Float" 7 | NUMERICAL = [INTEGER, FLOAT] 8 | STRINGS = [CATEGORICAL, ORDINAL] 9 | 10 | # Runtime constant 11 | PROCESSES = 16 12 | 13 | # Experiment constants 14 | LABEL_IN = 1 15 | LABEL_OUT = 0 16 | ZERO_TOL = 1e-12 -------------------------------------------------------------------------------- /utils/datagen.py: -------------------------------------------------------------------------------- 1 | """ 2 | Helper functions for loading, converting, reshaping data 3 | _load_file, _load_json, _get_columns are copies of utility functions in 4 | 
https://github.com/sdv-dev/SDGym published under MIT License Copyright (c) 2019, MIT Data To AI Lab 5 | """ 6 | import numpy as np 7 | import pandas as pd 8 | import json 9 | import urllib 10 | from os import path 11 | from os import makedirs 12 | from pandas.api.types import CategoricalDtype 13 | 14 | from utils.constants import * 15 | 16 | BASE_URL = 'http://sdgym.s3.amazonaws.com/datasets/' 17 | DATA_PATH = path.join(path.dirname(__file__), 'data') 18 | MNIST_IMAGE_SZIE = (28, 28, 1) 19 | 20 | 21 | def load_mnist(filename): 22 | """Load and prepare MNIST dataset""" 23 | train = pd.read_csv(filename, sep=" ") 24 | y_train = np.array(train.values[:, -1], dtype=np.float32) 25 | X_train = np.array(train.values[:, :-1], dtype=np.float32) 26 | X_train = X_train.astype("float32") 27 | y_train = y_train.astype("float32") 28 | X_train /= 255 29 | 30 | X_train = X_train.reshape(len(X_train), *MNIST_IMAGE_SZIE) 31 | 32 | return (X_train, y_train) 33 | 34 | 35 | def load_local_data_as_df(filename): 36 | with open(f'{filename}.json') as f: 37 | metadata = json.load(f) 38 | dtypes = {cd['name']:_get_dtype(cd) for cd in metadata['columns']} 39 | df = pd.read_csv(f'{filename}.csv', dtype=dtypes) 40 | metadata['categorical_columns'], metadata['ordinal_columns'], metadata['continuous_columns'] = _get_columns(metadata) 41 | 42 | df['ID'] = [f'ID{i}' for i in np.arange(len(df))] 43 | df = df.set_index('ID') 44 | 45 | return df, metadata 46 | 47 | 48 | def load_local_data_as_array(filename): 49 | df = pd.read_csv(f'{filename}.csv') 50 | with open(f'{filename}.json') as f: 51 | metadata = json.load(f) 52 | metadata['categorical_columns'], metadata['ordinal_columns'], metadata['continuous_columns'] = _get_columns(metadata) 53 | 54 | data = convert_df_to_array(df, metadata) 55 | 56 | return data, metadata 57 | 58 | 59 | def load_s3_data_as_array(filename): 60 | data = _load_file(filename + '.npz', np.load) 61 | metadata = _load_file(filename + '.json', _load_json) 62 | metadata['categorical_columns'], metadata['ordinal_columns'], metadata['continuous_columns'] = _get_columns(metadata) 63 | 64 | return np.concatenate([data['train'], data['test']]), metadata 65 | 66 | 67 | def load_s3_data_as_df(filename): 68 | data = _load_file(filename + '.npz', np.load) 69 | metadata = _load_file(filename + '.json', _load_json) 70 | metadata['categorical_columns'], metadata['ordinal_columns'], metadata['continuous_columns'] = _get_columns(metadata) 71 | 72 | df = convert_array_to_df(np.concatenate([data['train'], data['test']]), metadata) 73 | 74 | df['ID'] = [f'ID{i}' for i in np.arange(len(df))] 75 | df = df.set_index('ID') 76 | 77 | return df, metadata 78 | 79 | 80 | def _get_dtype(cd): 81 | if cd['type'] == FLOAT: 82 | return np.float 83 | elif cd['type'] == INTEGER: 84 | return np.int 85 | else: 86 | return np.object 87 | 88 | 89 | def _get_columns(metadata): 90 | categorical_columns = list() 91 | ordinal_columns = list() 92 | continuous_columns = list() 93 | for column_idx, column in enumerate(metadata['columns']): 94 | if column['type'] == CATEGORICAL: 95 | categorical_columns.append(column_idx) 96 | elif column['type'] == ORDINAL: 97 | ordinal_columns.append(column_idx) 98 | elif column['type'] in NUMERICAL: 99 | continuous_columns.append(column_idx) 100 | 101 | return categorical_columns, ordinal_columns, continuous_columns 102 | 103 | 104 | def _load_json(path): 105 | with open(path) as json_file: 106 | return json.load(json_file) 107 | 108 | 109 | def _load_file(filename, loader): 110 | local_path = 
path.join(DATA_PATH, filename) 111 | if not path.exists(local_path): 112 | makedirs(DATA_PATH, exist_ok=True) 113 | urllib.request.urlretrieve(BASE_URL + filename, local_path) 114 | 115 | return loader(local_path) 116 | 117 | 118 | def convert_array_to_df(data, metadata): 119 | df = pd.DataFrame(data) 120 | column_names = [] 121 | for i, col in enumerate(metadata['columns']): 122 | column_names.append(col['name']) 123 | if col['type'] in [CATEGORICAL, ORDINAL]: 124 | df.iloc[:, i] = df.iloc[:, i].astype('object') 125 | df.iloc[:, i] = df.iloc[:, i].map(pd.Series(col['i2s'])) 126 | 127 | df.columns = column_names 128 | return df 129 | 130 | 131 | def convert_df_to_array(df, metadata): 132 | dfcopy = df.copy() 133 | for col in metadata['columns']: 134 | if col['name'] in list(dfcopy): 135 | col_data = dfcopy[col['name']] 136 | if col['type'] in [CATEGORICAL, ORDINAL]: 137 | if len(col_data) > len(col_data.dropna()): 138 | col_data = col_data.fillna(FILLNA_VALUE_CAT) 139 | if FILLNA_VALUE_CAT not in col['i2s']: 140 | col['i2s'].append(FILLNA_VALUE_CAT) 141 | col['size'] += 1 142 | cat = CategoricalDtype(categories=col['i2s'], ordered=True) 143 | col_data = col_data.astype(cat) 144 | dfcopy[col['name']] = col_data.cat.codes 145 | 146 | return dfcopy.values 147 | 148 | 149 | def convert_series_to_array(scopy, metadata): 150 | scopy = scopy.copy() 151 | for col in metadata['columns']: 152 | if col['name'] == scopy.name: 153 | if col['type'] in [CATEGORICAL, ORDINAL]: 154 | if len(scopy) > len(scopy.dropna()): 155 | scopy = scopy.fillna(FILLNA_VALUE_CAT) 156 | if FILLNA_VALUE_CAT not in col['i2s']: 157 | col['i2s'].append(FILLNA_VALUE_CAT) 158 | col['size'] += 1 159 | cat = CategoricalDtype(categories=col['i2s'], ordered=True) 160 | scopy = scopy.astype(cat) 161 | scopy = scopy.cat.codes 162 | 163 | return scopy.values 164 | 165 | 166 | -------------------------------------------------------------------------------- /utils/evaluation_framework.py: -------------------------------------------------------------------------------- 1 | """ 2 | Procedures for running a privacy evaluation on a generative model 3 | """ 4 | 5 | from numpy import where, mean 6 | 7 | from utils.constants import * 8 | 9 | def get_accuracy(guesses, labels, targetPresence): 10 | idxIn = where(targetPresence == LABEL_IN)[0] 11 | idxOut = where(targetPresence == LABEL_OUT)[0] 12 | 13 | pIn = sum([g == l for g,l in zip(guesses[idxIn], labels[idxIn])])/len(idxIn) 14 | pOut = sum([g == l for g,l in zip(guesses[idxOut], labels[idxOut])])/len(idxOut) 15 | return pIn, pOut 16 | 17 | 18 | def get_tp_fp_rates(guesses, labels): 19 | targetIn = where(labels == LABEL_IN)[0] 20 | targetOut = where(labels == LABEL_OUT)[0] 21 | return sum(guesses[targetIn] == LABEL_IN)/len(targetIn), sum(guesses[targetOut] == LABEL_IN)/len(targetOut) 22 | 23 | 24 | def get_probs_correct(pdf, targetPresence): 25 | idxIn = where(targetPresence == LABEL_IN)[0] 26 | idxOut = where(targetPresence == LABEL_OUT)[0] 27 | 28 | pdf[pdf > 1.] = 1. 
29 | return mean(pdf[idxIn]), mean(pdf[idxOut]) 30 | 31 | 32 | def get_mia_advantage(tp_rate, fp_rate): 33 | return tp_rate - fp_rate 34 | 35 | 36 | def get_ai_advantage(pCorrectIn, pCorrectOut): 37 | return pCorrectIn - pCorrectOut 38 | 39 | 40 | def get_util_advantage(pCorrectIn, pCorrectOut): 41 | return pCorrectIn - pCorrectOut 42 | 43 | 44 | def get_prob_removed(before, after): 45 | idxIn = where(before == LABEL_IN)[0] 46 | return 1.0 - sum(after[idxIn]/len(idxIn)) 47 | 48 | 49 | 50 | 51 | -------------------------------------------------------------------------------- /utils/logging.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import sys 3 | 4 | 5 | def setup_logger(stream=sys.stderr): 6 | """Setup a logger.""" 7 | logger = logging.getLogger() 8 | logger.setLevel(logging.INFO) 9 | handler = logging.StreamHandler(stream=stream) 10 | handler.setLevel(logging.INFO) 11 | formatter = logging.Formatter("%(asctime)s:%(name)s:%(levelname)s:%(message)s") 12 | handler.setFormatter(formatter) 13 | logger.addHandler(handler) 14 | 15 | return logger 16 | 17 | 18 | LOGGER = setup_logger() 19 | -------------------------------------------------------------------------------- /utils/plot_setup.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | from matplotlib.colors import to_rgb 3 | import seaborn as sns 4 | from palettable import cartocolors 5 | from husl import hex_to_husl 6 | 7 | colours = cartocolors.qualitative.Safe_10.hex_colors 8 | cpalette = sns.color_palette(colours) 9 | cpalette_light = sns.light_palette(hex_to_husl(colours[1]), input="husl") 10 | colours_rgb = [to_rgb(c) for c in colours] 11 | 12 | cmap_qualitative = cartocolors.qualitative.Safe_10.mpl_colormap 13 | cmap_light = sns.light_palette(hex_to_husl(colours[1]), input="husl", as_cmap=True) 14 | 15 | pltmarkers = ['o', 'X', 'D', 'P', '^'] 16 | 17 | fontsizelabels = 26 18 | fontsizeticks = 24 19 | 20 | def set_style(): 21 | sns.set_palette(cpalette) 22 | sns.set_style('whitegrid', {'axes.spines.right': True, 23 | 'axes.spines.top': True, 24 | 'axes.edgecolor': 'k', 25 | 'xtick.color': 'k', 26 | 'ytick.color': 'k', 27 | 'grid.color':'0.7', 28 | 'font.family': 'serif', 29 | 'font.sans-serif': 'cm', 30 | 'text.usetex': True}) 31 | 32 | plt.rcParams.update({ 33 | 'font.family': 'serif', 34 | 'font.sans-serif': 'cm', 35 | 'text.usetex': True, 36 | 'font.size': 14, 37 | 38 | 'xtick.labelsize': 14, 39 | 'ytick.labelsize': 14, 40 | 'axes.labelsize': 16, 41 | 'axes.titlesize': 18, 42 | 43 | 'savefig.dpi': 75, 44 | 45 | 'figure.autolayout': False, 46 | 'figure.figsize': (13, 7), 47 | 'figure.titlesize': 20, 48 | 49 | 'lines.linewidth': 2.0, 50 | 'lines.markersize': 8, 51 | 'legend.fontsize': 14 52 | }) -------------------------------------------------------------------------------- /utils/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | import random 3 | 4 | import numpy as np 5 | import multiprocessing as mp 6 | 7 | from warnings import simplefilter 8 | simplefilter('ignore', category=FutureWarning) 9 | simplefilter('ignore', category=DeprecationWarning) 10 | 11 | 12 | def json_numpy_serialzer(o): 13 | """ Serialize numpy types for json 14 | 15 | Parameters: 16 | o (object): any python object which fails to be serialized by json 17 | 18 | Example: 19 | 20 | >>> import json 21 | >>> a = np.array([1, 2, 3]) 22 | >>> json.dumps(a, 
default=json_numpy_serialzer) 23 | 24 | """ 25 | numpy_types = ( 26 | np.bool_, 27 | np.float16, 28 | np.float32, 29 | np.float64, 30 | # np.float128, -- special handling below 31 | np.int8, 32 | np.int16, 33 | np.int32, 34 | np.int64, 35 | np.str_, 36 | np.timedelta64, 37 | np.uint8, 38 | np.uint16, 39 | np.uint32, 40 | np.uint64, 41 | np.void, 42 | ) 43 | 44 | if isinstance(o, np.ndarray): 45 | return o.tolist() 46 | elif isinstance(o, numpy_types): 47 | return o.item() 48 | elif isinstance(o, np.float128): 49 | return o.astype(np.float64).item() 50 | else: 51 | raise TypeError("{} of type {} is not JSON serializable".format(repr(o), type(o))) 52 | 53 | 54 | def set_random_seed(seed=0): 55 | random.seed(seed) 56 | np.random.seed(seed) 57 | 58 | 59 | def read_json_file(json_file): 60 | with open(json_file, 'r') as file: 61 | return json.load(file) 62 | 63 | 64 | def get_mia_gain(pCorrectSyn): 65 | # return min(1, 2*(1 - pCorrectSyn)) 66 | return 2 * (1 - pCorrectSyn) 67 | 68 | 69 | def get_accuracy(guesses, labels): 70 | return sum([g == l for g, l in zip(guesses, labels)])/len(labels) 71 | 72 | 73 | class CustomProcess(mp.Process): 74 | def run(self, *args, **kwargs): 75 | import warnings 76 | with warnings.catch_warnings(): 77 | warnings.simplefilter('ignore', category=FutureWarning) 78 | warnings.simplefilter('ignore', category=DeprecationWarning) 79 | return mp.Process.run(self, *args, **kwargs) 80 | 81 | 82 | 83 | 84 | 85 | 86 | --------------------------------------------------------------------------------
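The snippet below is a usage sketch, not a file from the repository: it shows how the helpers above could be combined to serialise numpy-typed results and to aggregate and plot linkage-attack results. The directory 'outputs' and the output filenames are hypothetical placeholders; the sketch assumes the project's requirements (seaborn, palettable, husl, and the LaTeX setup used by utils/plot_setup.py) are installed and that linkage_cli.py has already written ResultsMIA_*.json files.

    # Usage sketch only; paths and filenames below are placeholders, not repository defaults.
    import json
    import numpy as np

    from utils.utils import json_numpy_serialzer, set_random_seed
    from utils.analyse_results import load_results_linkage, plt_per_target_pg

    set_random_seed(0)

    # Any intermediate result containing numpy scalars or arrays can be dumped
    # with the serializer defined in utils/utils.py.
    example = {'TPSyn': np.float64(0.8), 'Guesses': np.array([1, 0, 1])}
    with open('example.json', 'w') as f:
        json.dump(example, f, indent=2, default=json_numpy_serialzer)

    # Aggregate linkage-attack results (ResultsMIA_*.json files produced by
    # linkage_cli.py) and plot per-target privacy gain for the 'Naive' feature set.
    resAgg = load_results_linkage('outputs')
    models = sorted(resAgg['TargetModel'].unique())
    fig = plt_per_target_pg(resAgg, models, resFilter=('FeatureSet', 'Naive'))
    fig.savefig('privacy_gain.png', bbox_inches='tight')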