├── .gitignore ├── LICENSE.txt ├── README.md ├── TorchOCC.ipynb ├── audit.py ├── data ├── __init__.py ├── amsterdam │ ├── __init__.py │ ├── data_preprocess.py │ ├── data_scripts.py │ └── data_utils.py ├── googlestock │ ├── __init__.py │ └── data_preprocess.py └── snp500 │ └── all_stocks_5yr.csv ├── environment.yml ├── generative_models ├── __init__.py ├── adsgan.py ├── dpgan │ ├── __init__.py │ ├── impl │ │ ├── __init__.py │ │ ├── data.py │ │ └── fc.py │ ├── main.py │ └── utilize.py ├── gan.py ├── pategan.py ├── pategan_from_bitbucket.py ├── rgan │ └── experiments │ │ └── settings │ │ ├── rgan-dp.txt │ │ ├── rgan.txt │ │ ├── rgan_dp.txt │ │ ├── sine.txt │ │ └── test_modified.txt ├── timegan │ ├── __init__.py │ ├── timegan.py │ └── utils.py └── vae.py ├── main_image.py ├── main_tabular.py ├── main_timeseries.py ├── main_timeseries_embedding.py ├── metrics ├── __init__.py ├── combined.py ├── compute_identifiability.py ├── compute_wd.py ├── evaluation.py ├── evaluation_old.py ├── feature_distribution.py ├── fid.py ├── improved_precision_recall.py ├── parzen.py ├── prd_score.py ├── prdc.py └── precision_recall.py ├── predictive_models └── __init__.py ├── representations ├── OneClass.py ├── __init__.py ├── networks.py └── ts_embedding │ ├── __init__.py │ ├── seq2seq_autoencoder.py │ ├── training.py │ └── utils.py ├── requirements.txt ├── requirements_dpgan.txt ├── requirements_timegan.txt ├── toy_metric_evaluation.ipynb └── utils ├── __init__.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Standard gitignore for python projects, from: 2 | # https://github.com/DonJayamanne/vscode-python-samples/blob/master/.gitignore 3 | 4 | # Byte-compiled / optimized / DLL files 5 | __pycache__/ 6 | *.py[cod] 7 | *$py.class 8 | 9 | # C extensions 10 | *.so 11 | 12 | # Distribution / packaging 13 | .Python 14 | env/ 15 | build/ 16 | develop-eggs/ 17 | dist/ 18 | downloads/ 19 | eggs/ 20 | .eggs/ 21 | lib/ 22 | lib64/ 23 | parts/ 24 | sdist/ 25 | var/ 26 | wheels/ 27 | *.egg-info/ 28 | .installed.cfg 29 | *.egg 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | .hypothesis/ 51 | 52 | # Translations 53 | *.mo 54 | *.pot 55 | 56 | # Django stuff: 57 | *.log 58 | local_settings.py 59 | 60 | # Flask stuff: 61 | instance/ 62 | .webassets-cache 63 | 64 | # Scrapy stuff: 65 | .scrapy 66 | 67 | # Sphinx documentation 68 | docs/_build/ 69 | 70 | # PyBuilder 71 | target/ 72 | 73 | # Jupyter Notebook 74 | .ipynb_checkpoints 75 | 76 | # pyenv 77 | .python-version 78 | 79 | # celery beat schedule file 80 | celerybeat-schedule 81 | 82 | # SageMath parsed files 83 | *.sage.py 84 | 85 | # dotenv 86 | .env 87 | 88 | # virtualenv 89 | .venv 90 | venv/ 91 | ENV/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | 106 | # --- 107 | 108 | # Custom: 109 | *.pkl 110 | .tmp* 111 | .vscode/ 112 | .pylintrc 113 | visualisations/ 114 | jupyter/ 115 | models/ 116 | 117 | # Data dir content 118 | data/* 119 | !data/**/ 120 | !data/__init__.py 121 | 122 | data/amsterdam/* 123 | !data/amsterdam/*.py 124 | data/googlestock/* 125 | !data/googlestock/*.py 126 | data/snp500/* 127 | !data/snp500/*.csv 128 | !data/snp500/*.py 129 | 130 | data/ts_embedding/* 131 | !data/ts_embedding/*.py 132 | data/ts_generated/* 133 | !data/ts_generated/*.py 134 | 135 | data/mnist/* 136 | !data/mnist/*.py 137 | 138 | data/tabular/original/* 139 | !data/tabular/original/*.py 140 | data/tabular/synth/* 141 | !data/tabular/synth/*.py 142 | 143 | generative_models/rgan/experiments/data 144 | generative_models/rgan/experiments/parameters 145 | generative_models/rgan/experiments/plots 146 | generative_models/rgan/experiments/traces 147 | 148 | /training_log.png 149 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) [year] [fullname] 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | 23 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # evaluating-generative-models 2 | 3 | 🚧 This codebase is still a work in progress - expect some updates as we finalize and tidy up the code. 4 | -------------------------------------------------------------------------------- /audit.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | """ 4 | Author: Boris van Breugel (bv292@cam.ac.uk) 5 | ----------------------------------------- 6 | Auditing implementation 7 | ----------------------------------------- 8 | 9 | """ 10 | 11 | import numpy as np 12 | from sklearn.neighbors import NearestNeighbors 13 | 14 | import logging 15 | import torch 16 | import scipy 17 | from generative_models.adsgan import adsgan 18 | from metrics.evaluation import compute_alpha_precision 19 | from metrics.evaluation_old import compute_alpha_precision_old 20 | 21 | device = 'cuda' # matrices are too big for gpu 22 | 23 | 24 | def audit(real_data, params, OC_model): 25 | 26 | 27 | n_steps = 30 28 | n_orig = real_data.shape[0] 29 | nn_size = 2 30 | alphas = np.linspace(0, 1, n_steps) 31 | 32 | emb_center = torch.tensor(OC_model.c, device='cpu') 33 | 34 | with torch.no_grad(): 35 | X = OC_model(torch.tensor(real_data.to_numpy(), device=OC_model.device).float().to(device)).cpu().detach().numpy() 36 | 37 | Radii = np.quantile(torch.sqrt(torch.sum((torch.tensor(X).float() - emb_center) ** 2, dim=1)), alphas) 38 | alpha_precision_curve = [] 39 | beta_coverage_curve = [] 40 | nbrs_real = NearestNeighbors(n_neighbors = 2, n_jobs=-1, p=2).fit(X) 41 | real_to_real, real_to_real_args = nbrs_real.kneighbors(X) 42 | real_to_real = torch.from_numpy(real_to_real[:,1].squeeze()) 43 | 44 | print('Difference a;lf', (real_to_real_args[:,0]==np.arange(n_orig)).mean()) 45 | real_to_real_args = real_to_real_args[:,1].squeeze() 46 | 47 | 48 | number_per_quantile = np.round(np.quantile(np.arange(n_orig),alphas)) 49 | number_per_quantile = number_per_quantile[1:] - number_per_quantile[:-1] 50 | 51 | r2r = scipy.spatial.distance_matrix(X,X) 52 | r2r[np.eye(n_orig, dtype='bool')] = np.max(r2r)+1 #just set it large so it's not chosen 53 | min_r2r = np.min(r2r,axis=1) 54 | min_r2r_args = np.argmin(r2r,axis=1) 55 | print('min_r2r', (min_r2r==0).mean()) 56 | 57 | print('Difference abs', np.max(np.abs(min_r2r-real_to_real.numpy()))) 58 | print('Difference arguments') 59 | 60 | 61 | synthetic_data = [] 62 | 63 | generate_more = True 64 | iteration = 0 65 | 66 | while generate_more: 67 | print('Iteration:',iteration) 68 | iteration+=1 69 | synth_data = adsgan(real_data, params) 70 | with torch.no_grad(): 71 | Y = OC_model(torch.tensor(synth_data, device=OC_model.device).float().to(device)).cpu().detach().numpy() 72 | 73 | 74 | 75 | nbrs_synth = NearestNeighbors(n_neighbors = 1, n_jobs=-1, p=2).fit(Y) 76 | real_to_synth, real_to_synth_args = nbrs_synth.kneighbors(X) 77 | real_to_synth = torch.from_numpy(real_to_synth.squeeze()) 78 | real_to_synth_args = real_to_synth_args.squeeze() 79 | print('Mean real to synth' , torch.mean(real_to_synth)) 80 | print('mean real to real', torch.mean(real_to_real[real_to_synth_args])) 81 | # Audit 82 | #authen = np.ones(len(real_to_synth),dtype='bool')# 83 | authen = real_to_real[real_to_synth_args] < real_to_synth 84 | indices_to_use_authen = np.arange(len(authen), dtype = 'int')[authen] 85 | synth_data = synth_data[indices_to_use_authen] 
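# (Editor's note, added; not part of the original audit.py.) The boolean mask `authen`
# computed above is the authenticity test: roughly, a synthetic sample is rejected when a
# real record lies closer to that sample than to its own nearest real neighbour, i.e. the
# sample looks like a near-copy of a training point. Only the rows of `synth_data` flagged
# as authentic are kept; the prints below report how many samples survive and then re-check
# authenticity after re-fitting the nearest-neighbour index on the filtered embeddings.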
86 | print('After auditing out unauthentic points, points remain:',synth_data.shape[0]) 87 | 88 | Y = Y[indices_to_use_authen] 89 | 90 | nbrs_synth = NearestNeighbors(n_neighbors = 1, n_jobs=-1, p=2).fit(Y) 91 | 92 | real_to_synth, real_to_synth_args = nbrs_synth.kneighbors(X) 93 | 94 | real_to_synth = torch.from_numpy(real_to_synth.squeeze()) 95 | real_to_synth_args = real_to_synth_args.squeeze() 96 | 97 | print('After which the authenticity is', np.mean(np.array(real_to_real[real_to_synth_args] < real_to_synth,dtype='bool'))) 98 | 99 | 100 | 101 | # Precisions 102 | synth_center = torch.tensor(np.mean(Y, axis=0)).float() 103 | synth_to_center = torch.sqrt(torch.sum((torch.tensor(Y).float() - emb_center) ** 2, dim=1)) 104 | 105 | real_synth_closest = Y[real_to_synth_args] 106 | real_synth_closest_d = torch.sqrt(torch.sum((torch.tensor(real_synth_closest).float()- synth_center) ** 2, dim=1)) 107 | closest_synth_Radii = np.quantile(real_synth_closest_d, alphas) 108 | 109 | n_synth = Y.shape[0] 110 | indices_available = np.ones(n_synth) 111 | indices_use = np.zeros(n_synth, dtype = 'bool') 112 | 113 | 114 | generate_more = False 115 | 116 | for k in range(n_steps-1): 117 | if number_per_quantile[k] != 0: 118 | 119 | precision_mask = (synth_to_center <= Radii[k+1]).detach().float().numpy() 120 | indices_close_enough = np.arange(n_synth,dtype='int')[np.logical_and(precision_mask, indices_available)] 121 | indices_available = np.logical_not(precision_mask) 122 | number_to_add = int(min(number_per_quantile[k], len(indices_close_enough))) 123 | indices_close_enough = indices_close_enough[:number_to_add] 124 | indices_use[indices_close_enough] = True 125 | number_per_quantile[k] -= number_to_add 126 | if number_per_quantile[k] != 0: 127 | generate_more = True 128 | 129 | 130 | synthetic_data.append(synth_data[indices_use]) 131 | 132 | synthetic_data = np.concatenate(synthetic_data,axis=0) 133 | with torch.no_grad(): 134 | Y = OC_model(torch.tensor(synthetic_data, device=OC_model.device).float().to(device)).cpu().detach().numpy() 135 | 136 | print('new results', compute_alpha_precision(X,Y, emb_center)[3:]) 137 | print('old_results', compute_alpha_precision_old(X,Y, emb_center)[3:-1]) 138 | 139 | return synthetic_data -------------------------------------------------------------------------------- /data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedmalaa/evaluating-generative-models/093910e487d07959db7d87b54698da60aaeb50c0/data/__init__.py -------------------------------------------------------------------------------- /data/amsterdam/__init__.py: -------------------------------------------------------------------------------- 1 | """Hide-and-Seek Privacy Challenge Codebase. 2 | 3 | Reference: James Jordon, Daniel Jarrett, Jinsung Yoon, Ari Ercole, Cheng Zhang, Danielle Belgrave, Mihaela van der Schaar, 4 | "Hide-and-Seek Privacy Challenge: Synthetic Data Generation vs. Patient Re-identification with Clinical Time-series Data," 5 | Neural Information Processing Systems (NeurIPS) Competition, 2020. 
6 | 7 | Link: https://www.vanderschaar-lab.com/announcing-the-neurips-2020-hide-and-seek-privacy-challenge/ 8 | 9 | Last updated Date: Jan 19th 2021 10 | Code author: Jinsung Yoon, Evgeny Saveliev 11 | Contact: jsyoon0823@gmail.com, e.s.saveliev@gmail.com 12 | """ 13 | 14 | from .data_preprocess import ( 15 | AmsterdamLoader, 16 | preprocess_data, 17 | padding_mask_to_seq_lens, 18 | convert_front_padding_to_back_padding 19 | ) 20 | from .data_utils import data_division 21 | -------------------------------------------------------------------------------- /data/amsterdam/data_preprocess.py: -------------------------------------------------------------------------------- 1 | """Amsterdam UMCdb data preprocessing. 2 | 3 | The source data files required are those prepared for Hide-and-Seek NeurIPS 2020 competition: 4 | ``` 5 | train_longitudinal_data.csv 6 | test_longitudinal_data.csv 7 | ``` 8 | 9 | Author: Evgeny Saveliev (e.s.saveliev@gmail.com) 10 | """ 11 | 12 | import os 13 | from typing import Union, Tuple 14 | 15 | import numpy as np 16 | import pandas as pd 17 | from tqdm import tqdm 18 | from sklearn.preprocessing import MinMaxScaler 19 | 20 | from .data_utils import data_division 21 | 22 | 23 | # ---------------------------------------------------------------------------------------------------------------------- 24 | # General helpers. 25 | 26 | def _to_3d(arr: np.ndarray, max_seq_len: int) -> np.ndarray: 27 | n_patients = arr.shape[0] // max_seq_len 28 | dim = arr.shape[1] 29 | return np.reshape(arr, [n_patients, max_seq_len, dim]) 30 | 31 | 32 | def _to_2d(arr: np.ndarray) -> np.ndarray: 33 | n_patients = arr.shape[0] 34 | max_seq_len = arr.shape[1] 35 | dim = arr.shape[2] 36 | return np.reshape(arr, [n_patients * max_seq_len, dim]) 37 | 38 | 39 | # ---------------------------------------------------------------------------------------------------------------------- 40 | # Helpers for Seq2Seq autoencoder. 41 | 42 | def combine_csvs(path_train, path_test, path_combined): 43 | df_train = pd.read_csv(os.path.abspath(path_train)) 44 | df_test = pd.read_csv(os.path.abspath(path_test)) 45 | df_combined = df_train.append(df_test, ignore_index=True) 46 | df_combined.sort_values(by=["admissionid", "Unnamed: 0"], ignore_index=True, inplace=True) 47 | df_combined.to_csv(os.path.abspath(path_combined), index=False) 48 | 49 | 50 | def downsample_csv_by_admissionids(path, path_downsampled, downsample_n_ids, seed): 51 | df = pd.read_csv(os.path.abspath(path)) 52 | ids = df["admissionid"].unique() 53 | np.random.seed(seed) 54 | np.random.shuffle(ids) 55 | ds_ids = ids[:downsample_n_ids] 56 | df_ds = df[df["admissionid"].isin(ds_ids)] 57 | df_ds.to_csv(os.path.abspath(path_downsampled), index=False) 58 | 59 | 60 | def padding_mask_to_seq_lens(padding_mask): 61 | padding_mask_inverted = -1 * (padding_mask.astype(int) - 1) 62 | padding_mask_as_seq_lens = padding_mask_inverted.sum(axis=1)[:, 0] # Sum 1s along sequence dimension. 63 | # ^ As identical length for each feature, take 0th. 64 | return padding_mask_as_seq_lens 65 | 66 | 67 | def convert_front_padding_to_back_padding(data, seq_lens, pad_val): 68 | if 0 in seq_lens: 69 | raise ValueError("0 encountered in seq_lens.") 70 | data_ = np.full_like(data, pad_val) 71 | for idx, l in enumerate(seq_lens): 72 | data_[idx, :l, :] = data[idx, -l:, :] 73 | return data_ 74 | 75 | 76 | # ---------------------------------------------------------------------------------------------------------------------- 77 | # Data loader. 
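# (Editor's sketch, added; not part of the original file.) A minimal, hypothetical example of
# how the loader defined below can be wired up with preprocess_data() further down in this
# module; the file path and parameter values are illustrative only:
#
#   loader = AmsterdamLoader(
#       data_path="data/amsterdam/combined_downsampled_longitudinal_data.csv",
#       max_seq_len=100, seed=12345, train_rate=0.4, val_rate=0.2,
#       include_time=False, debug_data=False, pad_before=False, padding_fill=-1.,
#   )
#   raw_data, padding_mask, (train_idx, val_idx, test_idx) = loader.load_reshape_split_data(force_refresh=True)
#   processed, imputed = preprocess_data(raw_data, padding_mask, padding_fill=-1., time_feature_included=False)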
78 | class AmsterdamLoader(object): 79 | 80 | def __init__( 81 | self, 82 | data_path: str, 83 | max_seq_len: int, 84 | seed: int, 85 | train_rate: float, 86 | val_rate: float, 87 | include_time: bool, 88 | debug_data: Union[int, bool] = False, 89 | pad_before: bool = False, 90 | padding_fill: float = -1., 91 | ) -> None: 92 | """Initialise Amsterdam data loader. Here, the Amsterdam data refers to the Hide-and-Seek competition subset 93 | ot the Amsterdam UMCdb dataset, specifically `train_longitudinal_data.csv` or `test_longitudinal_data.csv`. 94 | 95 | Args: 96 | data_path (str): Data CSV file path. 97 | max_seq_len (int): Maximum sequence length of the time series dimension - for reshaping. 98 | seed (int): Random seed for data split. 99 | train_rate (float): The fraction of the data to allocate to training set. 100 | val_rate (float): The fraction of the data to allocate to validation set. 101 | include_time (bool): Whether to include time as the 0th feature in each example. 102 | debug_data (Union[int, bool], optional): If int, read only top debug_data-many rows, if True, 103 | read only top 10000 rows, if False read whole dataset. Defaults to False. 104 | pad_before (bool, optional): If True, padding will be added at the beginning of time dimension, 105 | else padding added at the end. Defaults to False. 106 | padding_fill (float, optional): Pad timeseries vectors shorter than max_seq_len with this value. 107 | Defaults to -1. 108 | """ 109 | assert train_rate > 0. and val_rate >= 0. and (train_rate + val_rate) < 1. 110 | self.data_path = os.path.abspath(data_path) 111 | self.max_seq_len = max_seq_len 112 | self.seed = seed 113 | self.train_rate = train_rate 114 | self.val_rate = val_rate 115 | self.include_time = include_time 116 | self.debug_data = debug_data 117 | self.pad_before = pad_before 118 | self.padding_fill = padding_fill 119 | 120 | def load_reshape_split_data(self, force_refresh: bool) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]: 121 | """Load prepared data, reshape to a 3D array of shape [num_examples, max_seq_len, num_features], 122 | split into train, validation sets. Preprocessing of the data is done separately using `preprocess_data()`. 123 | 124 | Args: 125 | force_refresh (bool): If True, will rerun this from scratch, rather than using results cached in npz file. 126 | 127 | Returns: 128 | Tuple[np.ndarray, np.ndarray, Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]]: 129 | raw_data, padding_mask, (train_idx, val_idx, test_idx) 130 | """ 131 | 132 | npz_path = self.data_path.replace(".csv", ".npz") 133 | 134 | if os.path.exists(npz_path) and not force_refresh: 135 | 136 | print(f"Found existing cached .npz file ({npz_path}), using cached data. 
Set force_refresh=True to refresh.") 137 | with np.load(npz_path) as data: 138 | raw_data = data["raw_data"] 139 | padding_mask = data["padding_mask"] 140 | train_idx = data["train_idx"] 141 | val_idx = data["val_idx"] 142 | test_idx = data["test_idx"] 143 | 144 | else: 145 | 146 | raw_data, padding_mask = self._load_and_reshape(self.data_path) 147 | _, (train_idx, val_idx, test_idx) = data_division( 148 | raw_data, 149 | seed=self.seed, 150 | divide_rates=[self.train_rate, self.val_rate, 1 - self.train_rate - self.val_rate] 151 | ) 152 | 153 | np.savez(npz_path, raw_data=raw_data, padding_mask=padding_mask, train_idx=train_idx, val_idx=val_idx, test_idx=test_idx) 154 | 155 | return raw_data, padding_mask, (train_idx, val_idx, test_idx) 156 | 157 | def _load_and_reshape(self, file_name: str) -> Tuple[np.ndarray, np.ndarray]: 158 | """Load data from `file_name` and reshape into a 3D array of shape [num_examples, max_seq_len, num_features]. 159 | A padding mask of data will also be produced (same shape), having elements True where time series were padded 160 | (due to being shorter than max_seq_len). 161 | 162 | Note: 163 | The 0th feature is time. 164 | 165 | Args: 166 | file_name (str): Original data CSV file. 167 | 168 | Returns: 169 | Tuple[np.ndarray, np.ndarray]: [0] loaded and reshaped data, [1] corresponding padding. 170 | """ 171 | padding_indicator = -999.0 # This value avoids clashing with any actual data. 172 | 173 | # Load data 174 | if self.debug_data is not False: 175 | if isinstance(self.debug_data, bool): 176 | nrows: Union[int, None] = 10000 177 | else: 178 | assert isinstance(self.debug_data, int), "debug_data argument must be bool or int." 179 | nrows = self.debug_data 180 | else: 181 | nrows = None 182 | ori_data = pd.read_csv(file_name, nrows=nrows) 183 | if ori_data.columns[0] == "Unnamed: 0": # Remove spurious column, so that column 0 is now 'admissionid'. 184 | ori_data = ori_data.drop(["Unnamed: 0"], axis=1) 185 | 186 | # Drop time column if requested. 
187 | if not self.include_time: 188 | ori_data = ori_data.drop(["time"], axis=1) 189 | 190 | # Parameters 191 | uniq_id = np.unique(ori_data["admissionid"]) 192 | no = len(uniq_id) 193 | dim = len(ori_data.columns) - 1 194 | 195 | # Output initialization 196 | assert np.any(ori_data == padding_indicator) == False, f"Padding indicator value {padding_indicator} found in data" 197 | loaded_data = np.empty([no, self.max_seq_len, dim]) # Shape: [no, max_seq_len, dim] 198 | loaded_data.fill(padding_indicator) 199 | 200 | # For each unique id 201 | print("Reshaping data...") 202 | for i in tqdm(range(no)): 203 | 204 | # Extract the time-series data with a certain admissionid 205 | idx = ori_data.index[ori_data["admissionid"] == uniq_id[i]] 206 | curr_data = ori_data.iloc[idx].to_numpy() # Shape: [curr_no, dim + 1] 207 | 208 | # Assign to the preprocessed data (Excluding ID) 209 | curr_no = len(curr_data) 210 | if curr_no >= self.max_seq_len: 211 | loaded_data[i, :, :] = curr_data[:self.max_seq_len, 1:] # Shape: [1, max_seq_len, dim] 212 | else: 213 | if self.pad_before: 214 | loaded_data[i, -curr_no:, :] = curr_data[:, 1:] # Shape: [1, max_seq_len, dim] 215 | else: 216 | loaded_data[i, :curr_no, :] = curr_data[:, 1:] # Shape: [1, max_seq_len, dim] 217 | 218 | padding_mask = loaded_data == padding_indicator 219 | loaded_data = np.where(padding_mask, self.padding_fill, loaded_data) 220 | 221 | return loaded_data, padding_mask 222 | 223 | 224 | # ---------------------------------------------------------------------------------------------------------------------- 225 | # Data preprocessing. 226 | 227 | def preprocess_data( 228 | data: np.ndarray, 229 | padding_mask: np.ndarray, 230 | padding_fill: float, 231 | time_feature_included: bool 232 | ) -> Tuple[np.ndarray, np.ndarray]: 233 | """Preprocess and impute `data`. 234 | 235 | Note: 236 | If `time_feature_included=True`, the 0th feature is time, and it is preprocessed differently to the other 237 | features: not normalized to [0, 1] but shifted by -max_time_for_example. 238 | 239 | Args: 240 | data (np.ndarray of float): 241 | Data as loaded (and reshaped to 3D). Shape [num_examples, max_seq_len, num_features]. 242 | padding_mask (np.ndarray of bool): 243 | Padding mask of data, indicating True where time series were shorter than max_seq_len and were padded. 244 | Same shape as data. 245 | padding_fill (float): 246 | Pad timeseries vectors shorter than max_seq_len with this value. 247 | time_feature_included (bool): 248 | Whether to include time as the 0th feature in each example. 249 | 250 | Returns: 251 | Tuple[np.ndarray, np.ndarray]: [0] preprocessed data, [1] preprocessed and imputed data. 252 | """ 253 | print("Preprocessing data...") 254 | 255 | median_vals = _get_medians(data, padding_mask) 256 | imputed_data = _impute(data, padding_mask, median_vals, padding_fill) 257 | 258 | scaler_imputed = _get_scaler(imputed_data, padding_mask) 259 | imputed_processed_data = \ 260 | _preprocess(imputed_data, padding_mask, scaler_imputed, padding_fill, time_feature_included) 261 | 262 | scaler_original = _get_scaler(data, padding_mask) 263 | processed_data = \ 264 | _preprocess(data, padding_mask, scaler_original, padding_fill, time_feature_included) 265 | 266 | return processed_data, imputed_processed_data 267 | 268 | def _imputation(curr_data: np.ndarray, median_vals: np.ndarray, zero_fill: bool = True) -> np.ndarray: 269 | """Impute missing data using bfill, ffill and median imputation. 
270 | 271 | Args: 272 | curr_data (np.ndarray): Data before imputation. 273 | median_vals (np.ndarray): Median values for each column. 274 | zero_fill (bool, optional): Whather to Fill with zeros the cases where median_val is nan. Defaults to True. 275 | 276 | Returns: 277 | np.ndarray: Imputed data. 278 | """ 279 | 280 | curr_data = pd.DataFrame(data=curr_data) 281 | median_vals = pd.Series(median_vals) 282 | 283 | # Backward fill 284 | imputed_data = curr_data.bfill(axis="rows") 285 | # Forward fill 286 | imputed_data = imputed_data.ffill(axis="rows") 287 | # Median fill 288 | imputed_data = imputed_data.fillna(median_vals) 289 | 290 | # Zero-fill, in case the `median_vals` for a particular feature is `nan`. 291 | if zero_fill: 292 | imputed_data = imputed_data.fillna(0.0) 293 | 294 | if imputed_data.isnull().any().any(): 295 | raise ValueError("NaN values remain after imputation") 296 | 297 | return imputed_data.to_numpy() 298 | 299 | def _get_medians(data: np.ndarray, padding_mask: np.ndarray): 300 | assert len(data.shape) == 3 301 | 302 | data = _to_2d(data) 303 | if padding_mask is not None: 304 | padding_mask = _to_2d(padding_mask) 305 | data_temp = np.where(padding_mask, np.nan, data) # To avoid PADDING_INDICATOR affecting results. 306 | else: 307 | data_temp = data 308 | 309 | # Medians 310 | median_vals = np.nanmedian(data_temp, axis=0) # Shape: [dim + 1] 311 | 312 | return median_vals 313 | 314 | def _get_scaler(data: np.ndarray, padding_mask: np.ndarray): 315 | assert len(data.shape) == 3 316 | 317 | data = _to_2d(data) 318 | if padding_mask is not None: 319 | padding_mask = _to_2d(padding_mask) 320 | data_temp = np.where(padding_mask, np.nan, data) # To avoid PADDING_INDICATOR affecting results. 321 | else: 322 | data_temp = data 323 | 324 | # Scaler 325 | scaler = MinMaxScaler() 326 | scaler.fit(data_temp) # Note that np.nan's will be left untouched. 327 | 328 | return scaler 329 | 330 | def _impute( 331 | data: np.ndarray, 332 | padding_mask: np.ndarray, 333 | median_vals: np.ndarray, 334 | padding_fill: float 335 | ) -> Tuple[np.ndarray, np.ndarray]: 336 | 337 | assert len(data.shape) == 3 338 | 339 | data_imputed_ = np.zeros_like(data) 340 | 341 | for i in range(data.shape[0]): 342 | cur_data = data[i, :, :] 343 | if padding_mask is not None: 344 | cur_data = np.where(padding_mask[i, :, :], np.nan, cur_data) 345 | 346 | # Scale and impute (excluding time) 347 | cur_data_imputed = _imputation(cur_data, median_vals) 348 | 349 | # Update 350 | data_imputed_[i, :, :] = cur_data_imputed 351 | 352 | # Set padding 353 | if padding_mask is not None: 354 | data_imputed_ = np.where(padding_mask, padding_fill, data_imputed_) 355 | 356 | return data_imputed_ 357 | 358 | def _preprocess( 359 | data: np.ndarray, 360 | padding_mask: np.ndarray, 361 | scaler: MinMaxScaler, 362 | padding_fill: float, 363 | time_feature_included: bool, 364 | ) -> Tuple[np.ndarray, np.ndarray]: 365 | 366 | assert len(data.shape) == 3 367 | 368 | data_ = np.zeros_like(data) 369 | 370 | for i in range(data.shape[0]): 371 | cur_data = data[i, :, :] 372 | if padding_mask is not None: 373 | cur_data = np.where(padding_mask[i, :, :], np.nan, cur_data) 374 | 375 | # Preprocess time (0th element of dim. 
2): 376 | if time_feature_included: 377 | preprocessed_time = cur_data[:, 0] - np.nanmin(cur_data[:, 0]) 378 | 379 | # Scale and impute (excluding time) 380 | cur_data = scaler.transform(cur_data) 381 | 382 | # Set time 383 | if time_feature_included: 384 | cur_data[:, 0] = preprocessed_time 385 | 386 | # Update 387 | data_[i, :, :] = cur_data 388 | 389 | # Set padding 390 | if padding_mask is not None: 391 | data_ = np.where(padding_mask, padding_fill, data_) 392 | 393 | return data_ 394 | -------------------------------------------------------------------------------- /data/amsterdam/data_scripts.py: -------------------------------------------------------------------------------- 1 | """Amsterdam UMCdb data preprocessing: scripts. 2 | 3 | Author: Evgeny Saveliev (e.s.saveliev@gmail.com) 4 | """ 5 | from .data_preprocess import combine_csvs, downsample_csv_by_admissionids 6 | 7 | 8 | # Script settings: 9 | run_script = "combine_downsample" 10 | filepaths = { 11 | "source": { 12 | "train_data_filepath": "./train_longitudinal_data.csv", 13 | "test_data_filepath": "./test_longitudinal_data.csv" 14 | }, 15 | "output": { 16 | "out_combined_filepath": "./combined_longitudinal_data.csv", 17 | "out_combined_downsampled_filepath": "./combined_downsampled_longitudinal_data.csv" 18 | } 19 | } 20 | downsample_n_ids = 1000 21 | downsample_seed = 12345 22 | 23 | 24 | def main(): 25 | 26 | if run_script == "combine_downsample": 27 | # Note: requires between 64 and 128 GB of memory. 28 | combine_csvs( 29 | path_train=filepaths["source"]["train_data_filepath"], 30 | path_test=filepaths["source"]["test_data_filepath"], 31 | path_combined=filepaths["output"]["out_combined_filepath"] 32 | ) 33 | downsample_csv_by_admissionids( 34 | path=filepaths["output"]["out_combined_filepath"], 35 | path_downsampled=filepaths["output"]["out_combined_downsampled_filepath"], 36 | downsample_n_ids=downsample_n_ids, 37 | seed=downsample_seed 38 | ) 39 | 40 | 41 | if __name__ == "__main__": 42 | main() 43 | -------------------------------------------------------------------------------- /data/amsterdam/data_utils.py: -------------------------------------------------------------------------------- 1 | """Amsterdam UMCdb data preprocessing: utilities. 2 | 3 | Author: Jinsung Yoon (jsyoon0823@gmail.com) 4 | """ 5 | 6 | import numpy as np 7 | import random 8 | 9 | 10 | def data_division(data: np.ndarray, seed: int, divide_rates: list): 11 | """Divide the dataset into sub datasets. 12 | 13 | Args: 14 | data (np.ndarray): Data. 15 | seed (int): Random seed for data division. 16 | divide_rates (list of float): Ratio for each division. 17 | 18 | Returns: 19 | divided_data: Divided data (list format). 20 | divided_index: Divided data index (list format). 
21 | """ 22 | # sum of the division rates should be 1 23 | assert sum(divide_rates) == 1 24 | 25 | # Output initialization 26 | divided_data = list() 27 | divided_index = list() 28 | 29 | # Set index 30 | no = len(data) 31 | random.seed(seed) 32 | np.random.seed(seed) 33 | index = np.random.permutation(no) 34 | 35 | # Set divided index & data 36 | for i in range(len(divide_rates)): 37 | temp_idx = index[int(no * sum(divide_rates[:i])) : int(no * sum(divide_rates[: (i + 1)]))] 38 | divided_index.append(temp_idx) 39 | 40 | temp_data = [data[j] for j in temp_idx] 41 | divided_data.append(temp_data) 42 | 43 | return divided_data, divided_index 44 | -------------------------------------------------------------------------------- /data/googlestock/__init__.py: -------------------------------------------------------------------------------- 1 | """Loading and preprocessing of Google Stock data from: https://www.kaggle.com/thevirusx3/google-stock-market-data 2 | 3 | Author: Evgeny Saveliev (e.s.saveliev@gmail.com) 4 | """ 5 | 6 | from .data_preprocess import load_stock_data, split_stock_data 7 | -------------------------------------------------------------------------------- /data/googlestock/data_preprocess.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pandas as pd 4 | import numpy as np 5 | from sklearn.model_selection import train_test_split 6 | from sklearn.preprocessing import MinMaxScaler 7 | 8 | 9 | def _read_stock_csv(path): 10 | df = pd.read_csv(path, parse_dates=["Date"], thousands=",") 11 | df["Volume"] = df["Volume"].astype("float") 12 | return df 13 | 14 | 15 | def load_stock_data(train_path, test_path, normalize=True, time=False): 16 | train_path = os.path.abspath(train_path) 17 | test_path = os.path.abspath(test_path) 18 | 19 | df_train = _read_stock_csv(train_path) 20 | df_test = _read_stock_csv(test_path) 21 | 22 | df = df_train.append(df_test, ignore_index=True) # Combine so that can do custom train/val/test split. 23 | 24 | df["Date"] = (df["Date"] - df["Date"].min()) / np.timedelta64(1, "D") # Days since start. 25 | 26 | data = df.to_numpy() 27 | 28 | if normalize: 29 | scaler = MinMaxScaler() 30 | data_no_time = data[:, 1:] 31 | scaler.fit(data_no_time) 32 | data[:, 1:] = scaler.transform(data_no_time) 33 | 34 | if not time: 35 | data = data[:, 1:] 36 | 37 | return data 38 | 39 | 40 | DEFAULT_SPLIT_ORDER = { 41 | "train": 1, 42 | "val": 2, 43 | "test": 3, 44 | } 45 | 46 | 47 | def split_stock_data(data, frac_train, frac_val, split_order=None): 48 | 49 | assert frac_train > 0. and frac_train < 1. 50 | assert frac_val >= 0. and frac_val < 1. 51 | 52 | frac_test = 1. - frac_train - frac_val 53 | assert frac_test + frac_val + frac_train == 1. 54 | 55 | frac_dict = dict() 56 | for k, v in split_order.items(): 57 | if k == "train": 58 | frac_dict[v] = frac_train 59 | elif k == "val": 60 | frac_dict[v] = frac_val 61 | else: 62 | frac_dict[v] = frac_test 63 | 64 | #print(frac_dict) 65 | frac_1_2_of_all = frac_dict[1] + frac_dict[2] 66 | frac_1_of_1_2 = frac_dict[1] / frac_1_2_of_all 67 | #print("frac_1_of_1_2", frac_1_of_1_2) 68 | 69 | if split_order is None: 70 | split_order = DEFAULT_SPLIT_ORDER 71 | assert tuple(sorted(list(split_order.keys()))) == ("test", "train", "val") 72 | assert tuple(sorted(list(split_order.values()))) == (1, 2, 3) 73 | 74 | # Note that shuffle=False. 
75 | data_1_2, data_3 = train_test_split(data, train_size=frac_1_2_of_all, shuffle=False) 76 | data_1, data_2 = train_test_split(data_1_2, train_size=frac_1_of_1_2, shuffle=False) 77 | 78 | split_content = dict() 79 | for k, v in split_order.items(): 80 | if v == 1: 81 | split_content[k] = data_1 82 | elif v == 2: 83 | split_content[k] = data_2 84 | else: 85 | split_content[k] = data_3 86 | 87 | print("Split Google Stock data over time in fractions:\n" 88 | f"'train'={frac_train:.3f}, 'val'={frac_val:.3f}, 'test'={frac_test:.3f}\n" 89 | f"and the subsets are in the following chronological order: {split_order}") 90 | 91 | return split_content["train"], split_content["val"], split_content["test"] 92 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: py36_egm 2 | channels: 3 | - pytorch 4 | - defaults 5 | dependencies: 6 | - cudatoolkit=11.0.221 7 | - jupyterlab=2.2.6 8 | - matplotlib=3.3.2 9 | - notebook=6.1.6 10 | - numpy=1.19.2 11 | - pandas=1.1.3 12 | - pip=20.3.3 13 | - python=3.6.12 14 | - pytorch=1.7.1 15 | - scikit-learn=0.23.2 16 | - scipy=1.5.2 17 | - tqdm=4.55.1 18 | - pip: 19 | - tensorflow-gpu==1.15.0 20 | -------------------------------------------------------------------------------- /generative_models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedmalaa/evaluating-generative-models/093910e487d07959db7d87b54698da60aaeb50c0/generative_models/__init__.py -------------------------------------------------------------------------------- /generative_models/adsgan.py: -------------------------------------------------------------------------------- 1 | """ 2 | Reference: Jinsung Yoon, Lydia N. Drumright, Mihaela van der Schaar, 3 | "Anonymization through Data Synthesis using Generative Adversarial Networks (ADS-GAN): 4 | A harmonizing advancement for AI in medicine," 5 | IEEE Journal of Biomedical and Health Informatics (JBHI), 2019. 6 | Paper link: https://ieeexplore.ieee.org/document/9034117 7 | Last updated Date: December 22th 2020 8 | Code author: Jinsung Yoon (jsyoon0823@gmail.com) 9 | 10 | Minor modifications made by Boris van Breugel (bv292@cam.ac.uk) and Evgeny Saveliev (e.s.saveliev@gmail.com). 11 | ----------------------------- 12 | adsgan.py 13 | - Generate synthetic data for GAN framework 14 | (1) Use original data to generate synthetic data 15 | """ 16 | 17 | #%% Import necessary packages 18 | import tensorflow as tf 19 | import numpy as np 20 | 21 | from tqdm import tqdm 22 | 23 | tf.compat.v1.disable_eager_execution() 24 | 25 | def adsgan(orig_data, params): 26 | """Generate synthetic data for ADSGAN framework. 
27 | 28 | Args: 29 | orig_data: original data 30 | params: Network parameters 31 | mb_size: mini-batch size 32 | z_dim: random state dimension 33 | h_dim: hidden state dimension 34 | lambda: identifiability parameter 35 | iterations: training iterations 36 | 37 | Returns: 38 | synth_data: synthetically generated data 39 | """ 40 | 41 | # Reset the tensorflow graph 42 | tf.compat.v1.reset_default_graph() 43 | 44 | ## Parameters 45 | # Feature no 46 | x_dim = len(orig_data.columns) 47 | 48 | try: 49 | no = params['sample_no'] 50 | except KeyError: 51 | no = len(orig_data) 52 | 53 | # Batch size 54 | mb_size = params['mb_size'] 55 | # Random variable dimension 56 | z_dim = params['z_dim'] 57 | # Hidden unit dimensions 58 | h_dim = params['h_dim'] 59 | # Identifiability parameter 60 | lambda_ = params['lambda'] 61 | # Training iterations 62 | iterations = params['iterations'] 63 | # WGAN-GP parameters 64 | lam = 10 65 | lr = 1e-4 66 | 67 | # Adam optimization 68 | beta_1 = 0.5 69 | 70 | try: 71 | lambda_tester = params['lambda_tester'] 72 | except KeyError: 73 | lambda_tester = False 74 | #%% Data Preprocessing 75 | orig_data = np.asarray(orig_data) 76 | 77 | def data_normalization(orig_data, epsilon = 1e-8): 78 | 79 | min_val = np.min(orig_data, axis=0) 80 | 81 | normalized_data = orig_data - min_val 82 | 83 | max_val = np.max(normalized_data, axis=0) 84 | normalized_data = normalized_data / (max_val + epsilon) 85 | 86 | normalization_params = {"min_val": min_val, "max_val": max_val} 87 | 88 | return normalized_data, normalization_params 89 | 90 | def data_renormalization(normalized_data, normalization_params, epsilon = 1e-8): 91 | 92 | renormalized_data = normalized_data * (normalization_params['max_val'] + epsilon) 93 | renormalized_data = renormalized_data + normalization_params['min_val'] 94 | 95 | return renormalized_data 96 | 97 | orig_data, normalization_params = data_normalization(orig_data) 98 | 99 | #%% Necessary Functions 100 | 101 | # Xavier Initialization Definition 102 | def xavier_init(size): 103 | in_dim = size[0] 104 | xavier_stddev = 1. / tf.sqrt(in_dim / 2.) 
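# (Editor's note, added.) This is Xavier/He-style scaling: 1. / tf.sqrt(in_dim / 2.) equals
# sqrt(2 / in_dim), so the weight standard deviation shrinks with the layer's fan-in and keeps
# the variance of activations roughly constant from layer to layer.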
105 | return tf.random.normal(shape = size, stddev = xavier_stddev) 106 | 107 | def xavier_init_I(size): 108 | if lambda_tester: 109 | 110 | return tf.eye(size[0],size[1]) + xavier_init(size)/10 111 | else: 112 | return xavier_init(size) 113 | 114 | # Sample from uniform distribution 115 | def sample_Z(m, n): 116 | return np.random.uniform(-1., 1., size = [m, n]) 117 | 118 | # Sample from the real data 119 | def sample_X(m, n): 120 | return np.random.permutation(m)[:n] 121 | 122 | #%% Placeholder 123 | # Feature 124 | X = tf.compat.v1.placeholder(tf.float32, shape = [None, x_dim]) 125 | # Random Variable 126 | Z = tf.compat.v1.placeholder(tf.float32, shape = [None, z_dim]) 127 | 128 | #%% Discriminator 129 | # Discriminator 130 | D_W1 = tf.Variable(xavier_init([x_dim, h_dim])) 131 | D_b1 = tf.Variable(tf.zeros(shape=[h_dim])) 132 | 133 | D_W2 = tf.Variable(xavier_init([h_dim,h_dim])) 134 | D_b2 = tf.Variable(tf.zeros(shape=[h_dim])) 135 | 136 | D_W3 = tf.Variable(xavier_init([h_dim,1])) 137 | D_b3 = tf.Variable(tf.zeros(shape=[1])) 138 | 139 | theta_D = [D_W1, D_W2, D_W3, D_b1, D_b2, D_b3] 140 | 141 | if lambda_tester: 142 | D_W4 = tf.Variable(xavier_init([h_dim,h_dim])) 143 | D_b4 = tf.Variable(tf.zeros(shape=[h_dim])) 144 | theta_D+= [D_W4, D_b4] 145 | 146 | 147 | #%% Generator 148 | 149 | G_W1 = tf.Variable(xavier_init_I([z_dim + x_dim, h_dim])) 150 | G_b1 = tf.Variable(tf.zeros(shape=[h_dim])) 151 | 152 | G_W2 = tf.Variable(xavier_init_I([h_dim,h_dim])) 153 | G_b2 = tf.Variable(tf.zeros(shape=[h_dim])) 154 | 155 | G_W3 = tf.Variable(xavier_init_I([h_dim,h_dim])) 156 | G_b3 = tf.Variable(tf.zeros(shape=[h_dim])) 157 | 158 | G_W4 = tf.Variable(xavier_init_I([h_dim, x_dim])) 159 | G_b4 = tf.Variable(tf.zeros(shape=[x_dim])) 160 | theta_G = [G_W1, G_W2, G_W3, G_W4, G_b1, G_b2, G_b3, G_b4] 161 | 162 | #%% Generator and discriminator functions 163 | def generator(z, x): 164 | inputs = tf.concat([x, z], axis = 1) 165 | G_h1 = tf.nn.relu(tf.matmul(inputs, G_W1) + G_b1) 166 | G_h2 = tf.nn.relu(tf.matmul(G_h1, G_W2) + G_b2) 167 | G_h3 = tf.nn.relu(tf.matmul(G_h2, G_W3) + G_b3) 168 | G_log_prob = tf.nn.sigmoid(tf.matmul(G_h3, G_W4) + G_b4) 169 | 170 | return G_log_prob 171 | 172 | def discriminator(x): 173 | D_h1 = tf.nn.relu(tf.matmul(x, D_W1) + D_b1) 174 | D_h2 = tf.nn.relu(tf.matmul(D_h1, D_W2) + D_b2) 175 | if lambda_tester: 176 | D_h4 = tf.nn.relu(tf.matmul(D_h2, D_W4) + D_b4) 177 | else: 178 | D_h4 = D_h2 179 | 180 | out = tf.matmul(D_h4, D_W3) + D_b3 181 | return out 182 | 183 | #%% Structure 184 | G_sample = generator(Z,X) 185 | D_real = discriminator(X) 186 | D_fake = discriminator(G_sample) 187 | 188 | 189 | 190 | # Replacement of Clipping algorithm to Penalty term 191 | # 1. Line 6 in Algorithm 1 192 | eps = tf.random.uniform([mb_size, 1], minval = 0., maxval = 1.) 193 | X_inter = eps*X + (1. - eps) * G_sample 194 | 195 | # 2. 
Line 7 in Algorithm 1 196 | grad = tf.gradients(ys=discriminator(X_inter), xs=[X_inter])[0] 197 | grad_norm = tf.sqrt(tf.reduce_sum(input_tensor=(grad)**2 + 1e-8, axis = 1)) 198 | grad_pen = lam * tf.reduce_mean(input_tensor=(grad_norm - 1)**2) 199 | 200 | # Loss function 201 | D_loss = tf.reduce_mean(input_tensor=D_fake) - tf.reduce_mean(input_tensor=D_real) + grad_pen 202 | 203 | 204 | 205 | 206 | G_loss1 = -tf.sqrt(tf.reduce_mean(input_tensor=tf.square(X - G_sample))) 207 | G_loss2 = -tf.reduce_mean(input_tensor=D_fake) 208 | 209 | G_loss = G_loss2 + lambda_ * G_loss1 210 | 211 | # Solver 212 | D_solver = (tf.compat.v1.train.AdamOptimizer(learning_rate = lr, beta1 = beta_1).minimize(D_loss, var_list = theta_D)) 213 | G_solver = (tf.compat.v1.train.AdamOptimizer(learning_rate = lr, beta1 = beta_1).minimize(G_loss, var_list = theta_G)) 214 | 215 | #%% Iterations 216 | sess = tf.compat.v1.Session() 217 | sess.run(tf.compat.v1.global_variables_initializer()) 218 | 219 | # Iterations 220 | for it in tqdm(range(iterations)): 221 | # Discriminator training 222 | for _ in range(5): 223 | Z_mb = sample_Z(mb_size, z_dim) 224 | 225 | X_idx = sample_X(no, mb_size) 226 | X_mb = orig_data[X_idx,:] 227 | 228 | _, D_loss_curr = sess.run([D_solver, D_loss], feed_dict = {X: X_mb, Z: Z_mb}) 229 | 230 | # Generator Training 231 | Z_mb = sample_Z(mb_size, z_dim) 232 | 233 | X_idx = sample_X(no, mb_size) 234 | X_mb = orig_data[X_idx,:] 235 | 236 | _, G_loss1_curr, G_loss2_curr = sess.run([G_solver, G_loss1, G_loss2], feed_dict = {X: X_mb, Z: Z_mb}) 237 | #if it%10==0: 238 | # print(G_loss1_curr, G_loss2_curr) 239 | 240 | #%% Output Generation 241 | synth_data = sess.run([G_sample], feed_dict = {Z: sample_Z(no, z_dim), X: orig_data}) 242 | synth_data = synth_data[0] 243 | 244 | # Renormalization 245 | synth_data = data_renormalization(synth_data, normalization_params) 246 | 247 | # Binary features 248 | for i in range(x_dim): 249 | if len(np.unique(orig_data[:, i])) == 2: 250 | synth_data[:, i] = np.array(np.round(synth_data[:, i]),dtype='int') 251 | 252 | return synth_data -------------------------------------------------------------------------------- /generative_models/dpgan/__init__.py: -------------------------------------------------------------------------------- 1 | """DPGAN baseline. 
2 | 3 | Source: https://github.com/illidanlab/dpgan 4 | Authors: Liyang Xie, Kaixiang Lin, Shu Wang, Fei Wang, Jiayu Zhou 5 | Paper link: https://arxiv.org/abs/1802.06739 6 | 7 | Modified by: Evgeny Saveliev (e.s.saveliev@gmail.com) 8 | """ 9 | 10 | from .main import DPGAN, dpgan 11 | -------------------------------------------------------------------------------- /generative_models/dpgan/impl/__init__.py: -------------------------------------------------------------------------------- 1 | """Based on the implementation: dpgan/MIMIC-III/ 2 | 3 | Modified by: Evgeny Saveliev (e.s.saveliev@gmail.com) 4 | """ 5 | -------------------------------------------------------------------------------- /generative_models/dpgan/impl/data.py: -------------------------------------------------------------------------------- 1 | from numpy import random 2 | 3 | 4 | class NoiseSampler(object): 5 | def __call__(self, batch_size, z_dim): 6 | return random.normal(size=(batch_size, z_dim)) 7 | # the shape of return is: batch_size*z_dim 8 | # see Medgan line 209, use np.random.normal(), which has defauld std = 1.0 9 | -------------------------------------------------------------------------------- /generative_models/dpgan/impl/fc.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import tensorflow.contrib.layers as tcl # pylint: disable=import-error 3 | from tensorflow.contrib.layers import batch_norm # pylint: disable=import-error 4 | 5 | 6 | class Autoencoder(object): 7 | def __init__(self, inputDim, l2scale, compressDims, aeActivation, decompressDims, dataType): 8 | self.x_dim = inputDim 9 | self.l2scale = l2scale 10 | self.compressDims = compressDims 11 | self.aeActivation = aeActivation 12 | self.decompressDims = decompressDims 13 | self.dataType = dataType 14 | self.name = 'dpgan/fc/autoencoder' 15 | 16 | def __call__(self, x_input): 17 | decodeVariables = {} 18 | with tf.variable_scope(self.name, regularizer=tcl.l2_regularizer(self.l2scale)): 19 | tempVec = x_input 20 | tempDim = self.x_dim 21 | i = 0 22 | for compressDim in self.compressDims: 23 | W = tf.get_variable('aee_W_' + str(i), shape=[tempDim, compressDim]) 24 | b = tf.get_variable('aee_b_' + str(i), shape=[compressDim]) 25 | tempVec = self.aeActivation(tf.add(tf.matmul(tempVec, W), b)) 26 | tempDim = compressDim 27 | i += 1 28 | 29 | i = 0 30 | for decompressDim in self.decompressDims[:-1]: 31 | W = tf.get_variable('aed_W_' + str(i), shape=[tempDim, decompressDim]) 32 | b = tf.get_variable('aed_b_' + str(i), shape=[decompressDim]) 33 | tempVec = self.aeActivation(tf.add(tf.matmul(tempVec, W), b)) 34 | tempDim = decompressDim 35 | decodeVariables['aed_W_' + str(i)] = W 36 | decodeVariables['aed_b_' + str(i)] = b 37 | i += 1 38 | W = tf.get_variable('aed_W_' + str(i), shape=[tempDim, self.decompressDims[-1]]) 39 | b = tf.get_variable('aed_b_' + str(i), shape=[self.decompressDims[-1]]) 40 | decodeVariables['aed_W_' + str(i)] = W 41 | decodeVariables['aed_b_' + str(i)] = b 42 | 43 | if self.dataType == 'binary': 44 | x_reconst = tf.nn.sigmoid(tf.add(tf.matmul(tempVec, W), b)) 45 | loss = tf.reduce_mean(-tf.reduce_sum(x_input * tf.log(x_reconst + 1e-12) + (1. - x_input) * tf.log(1. 
- x_reconst + 1e-12), 1), 0) 46 | else: 47 | x_reconst = tf.nn.relu(tf.add(tf.matmul(tempVec, W), b)) 48 | loss = tf.reduce_mean((x_input - x_reconst) ** 2) 49 | 50 | return loss, decodeVariables 51 | 52 | @property 53 | def vars(self): 54 | return [var for var in tf.trainable_variables() if self.name in var.name] 55 | 56 | 57 | class Generator(object): 58 | def __init__(self, randomDim, l2scale, generatorDims, generatorActivation, dataType): 59 | self.randomDim = randomDim 60 | self.l2scale = l2scale 61 | self.generatorDims = generatorDims 62 | # self.bn_train = bn_train 63 | self.generatorActivation = generatorActivation 64 | # self.bnDecay = bnDecay 65 | self.dataType = dataType 66 | self.name = 'dpgan/fc/g_net' 67 | 68 | def __call__(self, z): 69 | tempVec = z 70 | tempDim = self.randomDim 71 | with tf.variable_scope(self.name, regularizer=tcl.l2_regularizer(self.l2scale)): 72 | for i, genDim in enumerate(self.generatorDims[:-1]): 73 | W = tf.get_variable('W_' + str(i), shape=[tempDim, genDim]) 74 | h = tf.matmul(tempVec, W) 75 | h2 = batch_norm(h) # GAN: batch_norm(h, decay=self.bnDecay, scale=True, is_training=self.bn_train, updates_collections=None) 76 | h3 = self.generatorActivation(h2) 77 | tempVec = h3 # GAN: + tempVec 78 | tempDim = genDim 79 | W = tf.get_variable('W' + str(i), shape=[tempDim, self.generatorDims[-1]]) # pylint: disable=undefined-loop-variable 80 | h = tf.matmul(tempVec, W) 81 | h2 = h # GAN: batch_norm(h, decay=self.bnDecay, scale=True, is_training=self.bn_train, updates_collections=None) 82 | 83 | if self.dataType == 'binary': 84 | h3 = tf.nn.sigmoid(h2) # GAN: tf.nn.tanh(h2) 85 | else: 86 | h3 = tf.nn.relu(h2) 87 | 88 | output = h3 # GAN: + tempVec 89 | return output 90 | 91 | @property 92 | def vars(self): 93 | return [var for var in tf.trainable_variables() if self.name in var.name] 94 | 95 | 96 | class Discriminator(object): 97 | def __init__(self, inputDim, discriminatorDims, discriminatorActivation, l2scale): 98 | self.inputDim = inputDim 99 | self.discriminatorDims = discriminatorDims 100 | self.discriminatorActivation = discriminatorActivation 101 | self.l2scale = l2scale 102 | self.name = 'dpgan/fc/d_net' 103 | 104 | def __call__(self, x_input, keepRate, reuse=False): 105 | # batchSize = tf.shape(x_input)[0] 106 | # inputMean = tf.reshape(tf.tile(tf.reduce_mean(x_input, 0), [batchSize]), (batchSize, self.inputDim)) 107 | # tempVec = tf.concat(axis = 1, values = [x_input, inputMean]) # https://stackoverflow.com/questions/41813665/tensorflow-slim-typeerror-expected-int32-got-list-containing-tensors-of-type 108 | # tempDim = self.inputDim * 2 # need in GAN 109 | tempVec = x_input 110 | tempDim = self.inputDim # remove in GAN 111 | with tf.variable_scope(self.name, reuse=reuse): # GAN: regularizer=tcl.l2_regularizer(self.l2scale) 112 | for i, discDim in enumerate(self.discriminatorDims[:-1]): 113 | W = tf.get_variable('W_' + str(i), shape=[tempDim, discDim]) 114 | b = tf.get_variable('b_' + str(i), shape=[discDim]) 115 | h = self.discriminatorActivation(tf.add(tf.matmul(tempVec, W), b)) 116 | # h = tf.nn.dropout(h, keepRate) # need in GAN 117 | tempVec = h 118 | tempDim = discDim 119 | W = tf.get_variable('W', shape=[tempDim, 1]) 120 | b = tf.get_variable('b', shape=[1]) 121 | y_hat = tf.squeeze(tf.add(tf.matmul(tempVec, W), b)) # need sigmoid in GAN 122 | 123 | return y_hat, self.name 124 | 125 | 126 | class buildDiscriminator(object): 127 | '''Generated data need to go through a decoder before enter discriminator, real data enter discriminator 
directly''' 128 | def __init__(self, inputDim, discriminatorDims, discriminatorActivation, decompressDims, aeActivation, dataType, l2scale): 129 | self.d = Discriminator(inputDim, discriminatorDims, discriminatorActivation, l2scale) # it contains a discriminator 130 | self.inputDim = inputDim 131 | self.decompressDims = decompressDims 132 | self.aeActivation = aeActivation 133 | self.dataType = dataType 134 | self.name = 'dpgan/fc/build_d_net' 135 | 136 | def __call__(self, x_real, x_fake, keepRate, decodeVariables, reuse=True): 137 | y_hat_real, self.name = self.d(x_real, keepRate, reuse=False) 138 | tempVec = x_fake 139 | i = 0 140 | for _ in self.decompressDims[:-1]: 141 | tempVec = self.aeActivation(tf.add(tf.matmul(tempVec, decodeVariables['aed_W_' + str(i)]), decodeVariables['aed_b_' + str(i)])) 142 | i += 1 143 | if self.dataType == 'binary': 144 | x_decoded = tf.nn.sigmoid(tf.add(tf.matmul(tempVec, decodeVariables['aed_W_' + str(i)]), decodeVariables['aed_b_' + str(i)])) 145 | else: 146 | x_decoded = tf.nn.relu(tf.add(tf.matmul(tempVec, decodeVariables['aed_W_' + str(i)]), decodeVariables['aed_b_' + str(i)])) 147 | y_hat_fake, self.name = self.d(x_decoded, keepRate, reuse=True) 148 | d_loss = -tf.reduce_mean(y_hat_real) + tf.reduce_mean(y_hat_fake) 149 | g_loss = -tf.reduce_mean(y_hat_fake) 150 | 151 | return d_loss, g_loss, y_hat_real, y_hat_fake, x_decoded 152 | 153 | @property 154 | def vars(self): 155 | return [var for var in tf.trainable_variables() if self.name in var.name] 156 | -------------------------------------------------------------------------------- /generative_models/dpgan/utilize.py: -------------------------------------------------------------------------------- 1 | # pylint: disable=unbalanced-tuple-unpacking 2 | import pickle 3 | 4 | from matplotlib.pylab import ( 5 | mean, array, nonzero, count_nonzero, putmask, around, split, clip, unique, where, concatenate, random 6 | ) 7 | 8 | from sklearn import linear_model 9 | from sklearn.model_selection import train_test_split 10 | from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score, roc_auc_score 11 | 12 | 13 | def data_readf(top): # pylint: disable=unused-argument 14 | '''Read MIMIC-III data''' 15 | with open('/home/xieliyan/Dropbox/GPU/Data/MIMIC-III/patient_vectors_1071.pickle', 'rb') as f_: # Original MIMIC-III data is in GPU1 16 | MIMIC_ICD9 = pickle.load(f_) # dictionary, each one is a list 17 | MIMIC_data = [] 18 | for value in MIMIC_ICD9: # dictionary to numpy array 19 | if mean(value) == 0.0: # skip all zero vectors, each patients should have as least one disease of course 20 | continue 21 | MIMIC_data.append(value) # amax(MIMIC_data): 540 22 | # MIMIC_data = age_filter(MIMIC_data) # remove those patients with age 18 or younger 23 | # MIMIC_data = binarize(array(MIMIC_data)) # binarize, non zero -> 1, average(MIMIC_data): , type(MIMIC_data[][]): 24 | # index, MIMIC_data = select_code(MIMIC_data, top) # should be done after binarize because we consider the frequency among different patients, select top codes and remove the patients that don't have at least one of these codes, see "applying deep learning to icd-9 multi-label classification from medical records" 25 | # MIMIC_data = MIMIC_data[:, index] # keep only those coordinates (features) correspondent to top ICD9 codes 26 | num_data = (array(MIMIC_data).shape)[0] # data number 27 | dim_data = (array(MIMIC_data).shape)[1] # data dimension 28 | return array(MIMIC_data), num_data, dim_data # (46520, 942) 46520 942 for whole 
dataset 29 | 30 | 31 | def c2b(train, generated, adj): 32 | '''Set the number of 1 in generated data as multiple time of in training data, the rest is set to 0 (or not)''' 33 | 34 | if count_nonzero(generated) <= count_nonzero(train): # special case: number of 1 in generated is <= train, all nonzero in train = 1 35 | putmask(generated, generated > 0, 1.0) 36 | return generated 37 | 38 | p = float(count_nonzero(train))/train.size # percentage of nonzero elements 39 | g = sorted(generated.flatten(), reverse=True) 40 | idx = int(around(adj*p*len(g))) # with adjustment 41 | v = g[idx] # any value large than this set to 1, o.w. to 0 42 | putmask(generated, generated<=v, 0.0) # due to the property of putmask, must first set 0 then set 1 43 | putmask(generated, generated>v, 1.0) 44 | print("Nonzero element portion in training data and adjustment value are:") 45 | print(p, adj) 46 | print("Nonzero element portion in generated data after adjustment of c2b function:") 47 | print(float(count_nonzero(generated))/generated.size) 48 | return generated 49 | 50 | 51 | def c2bcolwise(train, generated, adj): 52 | '''Set the number of 1 in each column in generated data the same as the same column in training data, the rest is set to 0. 53 | Network learn the joint distribution p(x1,...xd), then it should also learn the marginal distribution p(x1),...,p(xd), which 54 | is approximately the frequent of 1 (and 0) in each feature (coordinate) x1...xd, hence it make sense to do so. But 55 | by doing so we "force" the generated data have the same portion of 1 in each feature (coordinate) no matter how the network 56 | is trained (even not trained at all), this doesn't matters since features (coordinates) are dependent, p(x1,...xd) != p(x1)*...*p(xd) 57 | only setting the frequency of 1 in each feature (coordinate) is not enough, it also relies on the training of NN to learn the 58 | dependency among features (coordinates), i.e. conditional probability of x1...xd''' 59 | generated_new = [] # store new one 60 | s = train.sum(axis=0) 61 | print('Nonzero element in each feature (coordinate) in training data: ') 62 | print(list(map(int, s))) # not in scientific notation 63 | print("Adjustment value is: " + str(adj)) 64 | for col in range(len(s)): 65 | col_train = train[:,col] 66 | col_generated = generated[:,col] 67 | if count_nonzero(col_generated) <= count_nonzero(col_train): # special case: number of 1 in generated is <= train, all nonzero in train = 1 68 | putmask(col_generated, generated > 0, 1.0) 69 | generated_new.append(col_generated) 70 | continue 71 | g = sorted(col_generated, reverse=True) 72 | idx = int(adj*s[col]) # with adjustment 73 | v = g[idx] 74 | putmask(col_generated, col_generated<=v, 0.0) 75 | putmask(col_generated, col_generated>v, 1.0) 76 | generated_new.append(col_generated) 77 | generated_new = array(generated_new).T 78 | print('Nonzero element in each feature (coordinate) in generated data: ') 79 | print(list(map(int, generated_new.sum(axis=0)))) 80 | print('Portion of element that is match between training data and generated data') 81 | print(float(sum(train == generated_new))/(train.shape[0]*train.shape[1])) 82 | return generated_new 83 | 84 | 85 | def splitbycol(dataType, _VALIDATION_RATIO, col, MIMIC_data): 86 | '''Separate training and testing for each dimension (col), if we fix column col as label, 87 | we need to take _VALIDATION_RATIO of data with label 1 and _VALIDATION_RATIO of data with label 0 88 | and merge them together as testing set and leave the rest. 
Then balance the rest as training set 89 | by keeping whomever (0 or 1) is smaller and random select same number from the other one. 90 | Finally return training and testing set''' 91 | if dataType == 'binary': 92 | MIMIC_data = clip(MIMIC_data, 0, 1) 93 | _, c = split(MIMIC_data, col) # get column col 94 | if (unique(c).size == 1): # skip column: only one class 95 | return [], [] 96 | MIMIC_data_1 = MIMIC_data[nonzero(c), :][0] # Separate data matrix by label, label==1 97 | MIMIC_data_0 = MIMIC_data[where(c == 0)[0], :] 98 | trainX_1, testX_1 = train_test_split(MIMIC_data_1, test_size=_VALIDATION_RATIO, random_state=0) 99 | trainX_0, testX_0 = train_test_split(MIMIC_data_0, test_size=_VALIDATION_RATIO, random_state=0) 100 | testX = concatenate((testX_1, testX_0), axis=0) 101 | if len(trainX_1) == len(trainX_0): 102 | trainX = concatenate((trainX_1, trainX_0), axis=0) 103 | elif len(trainX_1) < len(trainX_0): 104 | temp_train, temp_test = train_test_split(trainX_0, test_size=len(trainX_1), random_state=0) 105 | trainX = concatenate((trainX_1, temp_test), axis=0) 106 | # testX = concatenate((testX, temp_train), axis=0) # can't merge, test set is already done 107 | else: 108 | temp_train, temp_test = train_test_split(trainX_1, test_size=len(trainX_0), random_state=0) 109 | trainX = concatenate((trainX_0, temp_test), axis=0) 110 | # testX = concatenate((testX, temp_train), axis=0) 111 | if ((array(trainX).shape)[0] == 0 or (array(testX).shape)[0] == 0): # skip column: no data point in training or testing set 112 | return [], [] 113 | return trainX, testX # 114 | 115 | 116 | 117 | def gene_check(col, x_gene): 118 | '''check if each column (coordinate) has one class or not, balance the data set then output''' 119 | _, c = split(x_gene, col) # get column col 120 | if (unique(c).size == 1): # skip column: only one class 121 | return [] 122 | x_gene_1 = x_gene[nonzero(c), :][0] 123 | x_gene_0 = x_gene[where(c == 0)[0], :] 124 | if len(x_gene_1) == len(x_gene_0): 125 | geneX = x_gene 126 | elif len(x_gene_1) < len(x_gene_0): 127 | temp_train, temp_test = train_test_split(x_gene_0, test_size=len(x_gene_1), random_state=0) 128 | geneX = concatenate((x_gene_1, temp_test), axis=0) 129 | else: 130 | temp_train, temp_test = train_test_split(x_gene_1, test_size=len(x_gene_0), random_state=0) 131 | geneX = concatenate((x_gene_0, temp_test), axis=0) 132 | if (array(geneX).shape)[0] == 0: 133 | return [] 134 | return x_gene 135 | 136 | 137 | def statistics(r, g, te, col): 138 | '''Column specific statistics (precision, recall(Sensitivity), f1-score, AUC)''' 139 | f_r, t_r = split(r, col) # separate feature and target 140 | f_g, t_g = split(g, col) 141 | f_te, t_te = split(te, col) # these 6 parts are all numpy array 142 | # t_g[t_g < 1.0] = 0 # hard decision boundary 143 | # t_g[t_g >= 0.5] = 1 144 | if (unique(t_r).size == 1) or (unique(t_g).size == 1): # if only those coordinates correspondent to top codes are kept, no coordinate should be skipped, if those patients that doesn't contain top ICD9 codes were removed, more coordinates will be skipped 145 | return [], [], [], [], [], [], [], [], [], [] 146 | model_r = linear_model.LogisticRegression() # logistic regression, if labels are all 0, this will cause: ValueError: This solver needs samples of at least 2 classes in the data, but the data contains only one class: 0 147 | model_r.fit(f_r, t_r) 148 | label_r = model_r.predict(f_te) # decision boundary is 0 149 | model_g = linear_model.LogisticRegression() 150 | model_g.fit(f_g, t_g) 151 | label_g = 
model_g.predict(f_te) 152 | precision_r = precision_score(t_te, label_r) # precision 153 | precision_g = precision_score(t_te, label_g) 154 | recall_r = recall_score(t_te, label_r) # recall 155 | recall_g = recall_score(t_te, label_g) 156 | acc_r = accuracy_score(t_te, label_r) # accuracy 157 | acc_g = accuracy_score(t_te, label_g) 158 | f1score_r = f1_score(t_te, label_r) # f1-score 159 | f1score_g = f1_score(t_te, label_g) 160 | auc_r = roc_auc_score(t_te, label_r) # AUC 161 | auc_g = roc_auc_score(t_te, label_g) 162 | 163 | return precision_r, precision_g, recall_r, recall_g, acc_r, acc_g, f1score_r, f1score_g, auc_r, auc_g 164 | 165 | 166 | def dwp(r, g, te, db=0.5, C=1.0): 167 | '''Dimension-wise prediction & dimension-wise probability; r for real, g for generated, te for test, all without separated feature and target, all are numpy arrays''' 168 | rv_pre = [] 169 | gv_pre = [] 170 | rv_pro = [] 171 | gv_pro = [] 172 | for i in range(len(r[0])): 173 | print(i) 174 | f_r, t_r = split(r, i) # separate feature and target 175 | f_g, t_g = split(g, i) 176 | f_te, t_te = split(te, i) # these six are all numpy arrays 177 | t_g[t_g < db ] = 0 # hard decision boundary 178 | t_g[t_g >= db ] = 1 179 | if (unique(t_r).size == 1) or (unique(t_g).size == 1): # if only the coordinates corresponding to the top codes are kept, no coordinate should be skipped; if the patients without any top ICD9 code were removed, more coordinates will be skipped 180 | print("skip this coordinate") 181 | continue 182 | model_r = linear_model.LogisticRegression(C=C) # logistic regression; if labels are all 0, this will cause: ValueError: This solver needs samples of at least 2 classes in the data, but the data contains only one class: 0 183 | model_r.fit(f_r, t_r) 184 | label_r = model_r.predict(f_te) 185 | model_g = linear_model.LogisticRegression(C=C) 186 | model_g.fit(f_g, t_g) 187 | label_g = model_g.predict(f_te) 188 | # print(label_r) 189 | # print(mean(model_r.coef_), count_nonzero(model_r.coef_), mean(model_g.coef_), count_nonzero(model_g.coef_)) # statistics of classifiers 190 | # rv.append(match(label_r, t_te)/(len(t_te)+10**(-10))) # simply match 191 | # gv.append(match(label_g, t_te)/(len(t_te)+10**(-10))) 192 | rv_pre.append(f1_score(t_te, label_r)) # F1 score 193 | gv_pre.append(f1_score(t_te, label_g)) 194 | # reg = linear_model.LinearRegression() # least square error 195 | # reg.fit(f_r, t_r) 196 | # target_r = reg.predict(f_te) 197 | # reg = linear_model.LinearRegression() 198 | # reg.fit(f_g, t_g) 199 | # target_g = reg.predict(f_te) 200 | # rv.append(square(linalg.norm(target_r-t_te))) 201 | # gv.append(square(linalg.norm(target_g-t_te))) 202 | rv_pro.append(float(count_nonzero(t_r))/len(t_r)) # dimension-wise probability, see "https://onlinecourses.science.psu.edu/stat504/node/28" 203 | gv_pro.append(float(count_nonzero(t_g))/len(t_g)) 204 | 205 | return rv_pre, gv_pre, rv_pro, gv_pro 206 | 207 | 208 | def load_MIMICIII(dataType, _VALIDATION_RATIO, top): 209 | MIMIC_data, num_data, dim_data = data_readf(top) 210 | if dataType == 'binary': 211 | MIMIC_data = clip(MIMIC_data, 0, 1) 212 | trainX, testX = train_test_split(MIMIC_data, test_size=_VALIDATION_RATIO, random_state=0) 213 | return trainX, testX, dim_data 214 | 215 | 216 | def fig_add_noise(List): 217 | '''add a small amount of noise to results so they are distinguishable on the figure''' 218 | print(len(List)) 219 | print(0.0001*random.randn(len(List))) 220 | List_new = List + 0.0001*random.randn(len(List)) 221 | return List_new 222 | 
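A minimal usage sketch of the c2b post-processing helper defined above (illustrative only, not part of the repository): it uses random stand-in arrays in place of the MIMIC-III patient vectors and assumes the repository root is on PYTHONPATH (with matplotlib and scikit-learn installed, since utilize.py imports them at module load).

import numpy as np
from generative_models.dpgan.utilize import c2b  # assumed import path; adjust if the package layout differs

rng = np.random.RandomState(0)
train = rng.binomial(1, 0.3, size=(200, 50)).astype(float)  # stand-in for binarized patient vectors
generated = rng.uniform(size=(200, 50))                     # raw (continuous) generator output
generated_bin = c2b(train, generated, adj=1.0)              # threshold so roughly 30% of entries become 1
print(np.unique(generated_bin))                             # array([0., 1.])

The adj argument scales the target fraction of ones relative to the training data; adj=1.0 matches it exactly (up to ties at the threshold). Note that c2b modifies the generated array in place before returning it.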
-------------------------------------------------------------------------------- /generative_models/gan.py: -------------------------------------------------------------------------------- 1 | """ 2 | Reference: Jinsung Yoon, Lydia N. Drumright, Mihaela van der Schaar, 3 | "Anonymization through Data Synthesis using Generative Adversarial Networks (ADS-GAN): 4 | A harmonizing advancement for AI in medicine," 5 | IEEE Journal of Biomedical and Health Informatics (JBHI), 2019. 6 | Paper link: https://ieeexplore.ieee.org/document/9034117 7 | Last updated Date: December 22th 2020 8 | Code author: Jinsung Yoon (jsyoon0823@gmail.com) 9 | 10 | Minor modifications made by Boris van Breugel (bv292@cam.ac.uk) and Evgeny Saveliev (e.s.saveliev@gmail.com). 11 | ----------------------------- 12 | adsgan.py 13 | - Generate synthetic data for GAN framework 14 | (1) Use original data to generate synthetic data 15 | """ 16 | 17 | #%% Import necessary packages 18 | import tensorflow as tf 19 | import numpy as np 20 | 21 | from tqdm import tqdm 22 | 23 | tf.compat.v1.disable_eager_execution() 24 | 25 | def gan(orig_data, params): 26 | """Generate synthetic data for ADSGAN framework. 27 | 28 | Args: 29 | orig_data: original data 30 | params: Network parameters 31 | mb_size: mini-batch size 32 | z_dim: random state dimension 33 | h_dim: hidden state dimension 34 | lambda: identifiability parameter 35 | iterations: training iterations 36 | 37 | Returns: 38 | synth_data: synthetically generated data 39 | """ 40 | 41 | # Reset the tensorflow graph 42 | tf.compat.v1.reset_default_graph() 43 | 44 | ## Parameters 45 | # Feature no 46 | x_dim = len(orig_data.columns) 47 | # Sample no 48 | no = len(orig_data) 49 | 50 | # Batch size 51 | mb_size = params['mb_size'] 52 | # Random variable dimension 53 | z_dim = params['z_dim'] 54 | # Hidden unit dimensions 55 | h_dim = params['h_dim'] 56 | # Training iterations 57 | iterations = params['iterations'] 58 | # GAN type 59 | gen_model_name = params['gen_model_name'] 60 | # WGAN-GP parameters 61 | lam = 10 62 | lr = 1e-4 63 | 64 | #%% Data Preprocessing 65 | orig_data = np.asarray(orig_data) 66 | 67 | def data_normalization(orig_data, epsilon = 1e-8): 68 | 69 | min_val = np.min(orig_data, axis=0) 70 | 71 | normalized_data = orig_data - min_val 72 | 73 | max_val = np.max(normalized_data, axis=0) 74 | normalized_data = normalized_data / (max_val + epsilon) 75 | 76 | normalization_params = {"min_val": min_val, "max_val": max_val} 77 | 78 | return normalized_data, normalization_params 79 | 80 | def data_renormalization(normalized_data, normalization_params, epsilon = 1e-8): 81 | 82 | renormalized_data = normalized_data * (normalization_params['max_val'] + epsilon) 83 | renormalized_data = renormalized_data + normalization_params['min_val'] 84 | 85 | return renormalized_data 86 | 87 | orig_data, normalization_params = data_normalization(orig_data) 88 | 89 | #%% Necessary Functions 90 | 91 | # Xavier Initialization Definition 92 | def xavier_init(size): 93 | in_dim = size[0] 94 | xavier_stddev = 1. / tf.sqrt(in_dim / 2.) 
95 | return tf.random.normal(shape = size, stddev = xavier_stddev) 96 | 97 | # Sample from uniform distribution 98 | def sample_Z(m, n): 99 | return np.random.uniform(-1., 1., size = [m, n]) 100 | 101 | # Sample from the real data 102 | def sample_X(m, n): 103 | return np.random.permutation(m)[:n] 104 | 105 | #%% Placeholder 106 | # Feature 107 | X = tf.compat.v1.placeholder(tf.float32, shape = [None, x_dim]) 108 | # Random Variable 109 | Z = tf.compat.v1.placeholder(tf.float32, shape = [None, z_dim]) 110 | 111 | #%% Discriminator 112 | # Discriminator 113 | D_W1 = tf.Variable(xavier_init([x_dim, h_dim])) 114 | D_b1 = tf.Variable(tf.zeros(shape=[h_dim])) 115 | 116 | D_W2 = tf.Variable(xavier_init([h_dim,h_dim])) 117 | D_b2 = tf.Variable(tf.zeros(shape=[h_dim])) 118 | 119 | D_W3 = tf.Variable(xavier_init([h_dim,1])) 120 | D_b3 = tf.Variable(tf.zeros(shape=[1])) 121 | 122 | theta_D = [D_W1, D_W2, D_W3, D_b1, D_b2, D_b3] 123 | 124 | #%% Generator 125 | G_W1 = tf.Variable(xavier_init([z_dim, h_dim])) 126 | G_b1 = tf.Variable(tf.zeros(shape=[h_dim])) 127 | 128 | G_W2 = tf.Variable(xavier_init([h_dim,h_dim])) 129 | G_b2 = tf.Variable(tf.zeros(shape=[h_dim])) 130 | 131 | G_W3 = tf.Variable(xavier_init([h_dim,h_dim])) 132 | G_b3 = tf.Variable(tf.zeros(shape=[h_dim])) 133 | 134 | G_W4 = tf.Variable(xavier_init([h_dim, x_dim])) 135 | G_b4 = tf.Variable(tf.zeros(shape=[x_dim])) 136 | 137 | theta_G = [G_W1, G_W2, G_W3, G_W4, G_b1, G_b2, G_b3, G_b4] 138 | 139 | #%% Generator and discriminator functions 140 | def generator(z): 141 | G_h1 = tf.nn.tanh(tf.matmul(z, G_W1) + G_b1) 142 | G_h2 = tf.nn.tanh(tf.matmul(G_h1, G_W2) + G_b2) 143 | G_h3 = tf.nn.tanh(tf.matmul(G_h2, G_W3) + G_b3) 144 | G_log_prob = tf.nn.sigmoid(tf.matmul(G_h3, G_W4) + G_b4) 145 | 146 | return G_log_prob 147 | 148 | def discriminator(x): 149 | D_h1 = tf.nn.relu(tf.matmul(x, D_W1) + D_b1) 150 | D_h2 = tf.nn.relu(tf.matmul(D_h1, D_W2) + D_b2) 151 | out = (tf.matmul(D_h2, D_W3) + D_b3) 152 | 153 | return out 154 | 155 | #%% Structure 156 | G_sample = generator(Z) 157 | D_real = discriminator(X) 158 | D_fake = discriminator(G_sample) 159 | 160 | if gen_model_name=='wgan': 161 | 162 | # Replacement of Clipping algorithm to Penalty term 163 | # 1. Line 6 in Algorithm 1 164 | eps = tf.random.uniform([mb_size, 1], minval = 0., maxval = 1.) 165 | X_inter = eps*X + (1. - eps) * G_sample 166 | 167 | # 2. 
Line 7 in Algorithm 1 168 | grad = tf.gradients(ys=discriminator(X_inter), xs=[X_inter])[0] 169 | grad_norm = tf.sqrt(tf.reduce_sum(input_tensor=(grad)**2 + 1e-8, axis = 1)) 170 | grad_pen = lam * tf.reduce_mean(input_tensor=(grad_norm - 1)**2) 171 | 172 | # Loss function 173 | D_loss = tf.reduce_mean(input_tensor=D_fake) - tf.reduce_mean(input_tensor=D_real) + grad_pen 174 | 175 | 176 | 177 | elif gen_model_name == 'gan': 178 | D_loss = tf.reduce_mean(input_tensor=D_fake) - tf.reduce_mean(input_tensor=D_real) 179 | 180 | G_loss = -tf.reduce_mean(input_tensor=D_fake) 181 | 182 | 183 | # Solver 184 | D_solver = (tf.compat.v1.train.AdamOptimizer(learning_rate = lr, beta1 = 0.5).minimize(D_loss, var_list = theta_D)) 185 | G_solver = (tf.compat.v1.train.AdamOptimizer(learning_rate = lr, beta1 = 0.5).minimize(G_loss, var_list = theta_G)) 186 | 187 | #%% Iterations 188 | sess = tf.compat.v1.Session() 189 | sess.run(tf.compat.v1.global_variables_initializer()) 190 | 191 | # Iterations 192 | for it in tqdm(range(iterations)): 193 | # Discriminator training 194 | for _ in range(5): 195 | Z_mb = sample_Z(mb_size, z_dim) 196 | 197 | X_idx = sample_X(no,mb_size) 198 | X_mb = orig_data[X_idx,:] 199 | 200 | _, D_loss_curr = sess.run([D_solver, D_loss], feed_dict = {X: X_mb, Z: Z_mb}) 201 | 202 | # Generator Training 203 | Z_mb = sample_Z(mb_size, z_dim) 204 | 205 | X_idx = sample_X(no,mb_size) 206 | X_mb = orig_data[X_idx,:] 207 | 208 | _, G_loss_curr = sess.run([G_solver, G_loss], feed_dict = {X: X_mb, Z: Z_mb}) 209 | 210 | #%% Output Generation 211 | synth_data = sess.run([G_sample], feed_dict = {Z: sample_Z(no, z_dim)}) 212 | synth_data = synth_data[0] 213 | 214 | # Renormalization 215 | synth_data = data_renormalization(synth_data, normalization_params) 216 | 217 | # Binary features 218 | for i in range(x_dim): 219 | if len(np.unique(orig_data[:, i])) == 2: 220 | synth_data[:, i] = np.round(synth_data[:, i]) 221 | 222 | return synth_data -------------------------------------------------------------------------------- /generative_models/pategan.py: -------------------------------------------------------------------------------- 1 | '''PATE-GAN function''' 2 | 3 | # Necessary packages 4 | import tensorflow as tf 5 | import numpy as np 6 | import warnings 7 | #warnings.filterwarnings("ignore") 8 | 9 | tf.compat.v1.disable_eager_execution() 10 | 11 | from sklearn.linear_model import LogisticRegression 12 | 13 | 14 | def pate_lambda (x, teacher_models, lambda_): 15 | '''Returns PATE_lambda(x). 16 | 17 | Args: 18 | - x: feature vector 19 | - teacher_models: a list of teacher models 20 | - lambda_: parameter 21 | 22 | Returns: 23 | - n0, n1: the number of label 0 and 1, respectively 24 | - out: label after adding laplace noise. 25 | ''' 26 | 27 | y_hat = list() 28 | 29 | for teacher in teacher_models: 30 | temp_y = teacher.predict(np.reshape(x, [1,-1])) 31 | y_hat = y_hat + [temp_y] 32 | 33 | y_hat = np.asarray(y_hat) 34 | n0 = sum(y_hat == 0) 35 | n1 = sum(y_hat == 1) 36 | 37 | lap_noise = np.random.laplace(loc=0.0, scale=lambda_) 38 | 39 | out = (n1+lap_noise) / float(n0+n1) 40 | out = int(out>0.5) 41 | 42 | return n0, n1, out 43 | 44 | 45 | def pategan(x_train, parameters): 46 | '''Basic PATE-GAN framework. 
47 | 48 | Args: 49 | - x_train: training data 50 | - parameters: PATE-GAN parameters 51 | - n_s: the number of student training iterations 52 | - batch_size: the batch size for training the student and generator 53 | - k: the number of teachers 54 | - epsilon, delta: Differential privacy parameters 55 | - lambda_: noise size 56 | 57 | Returns: 58 | - x_train_hat: training data generated by the differentially private generator 59 | ''' 60 | 61 | # Reset the graph 62 | tf.compat.v1.reset_default_graph() 63 | 64 | # PATE-GAN parameters 65 | # number of student training iterations 66 | n_s = parameters['n_s'] 67 | # batch size for student and generator training 68 | batch_size = parameters['batch_size'] 69 | # number of teachers 70 | k = parameters['k'] 71 | # epsilon 72 | epsilon = parameters['epsilon'] 73 | # delta 74 | delta = parameters['delta'] 75 | # lambda_ 76 | lambda_ = parameters['lambda'] 77 | 78 | # Other parameters 79 | # initialize alpha 80 | L = 20 81 | alpha = np.zeros([L]) 82 | # initialize epsilon_hat 83 | epsilon_hat = 0 84 | 85 | # Network parameters 86 | no, dim = x_train.shape 87 | # Random sample dimensions 88 | z_dim = int(dim) 89 | # Student hidden dimension 90 | student_h_dim = int(dim) 91 | # Generator hidden dimension 92 | generator_h_dim = int(4*dim) 93 | 94 | ## Partitioning the data into k subsets 95 | x_partition = list() 96 | partition_data_no = int(no/k) 97 | 98 | idx = np.random.permutation(no) 99 | 100 | for i in range(k): 101 | temp_idx = idx[int(i*partition_data_no):int((i+1)*partition_data_no)] 102 | temp_x = x_train[temp_idx, :] 103 | x_partition = x_partition + [temp_x] 104 | 105 | ## Necessary Functions for building NN models 106 | # Xavier Initialization Definition 107 | def xavier_init(size): 108 | in_dim = size[0] 109 | xavier_stddev = 1. / tf.sqrt(in_dim / 2.)
110 | return tf.random.normal(shape = size, stddev = xavier_stddev) 111 | 112 | # Sample from uniform distribution 113 | def sample_Z(m, n): 114 | return np.random.uniform(0., 1., size = [m, n]) 115 | 116 | ## Placeholder 117 | # PATE labels 118 | Y = tf.compat.v1.placeholder(tf.float32, shape = [None, 1]) 119 | # Random Variable 120 | Z = tf.compat.v1.placeholder(tf.float32, shape = [None, z_dim]) 121 | 122 | ## NN variables 123 | # Student 124 | S_W1 = tf.Variable(xavier_init([dim, student_h_dim])) 125 | S_b1 = tf.Variable(tf.zeros(shape=[student_h_dim])) 126 | 127 | S_W2 = tf.Variable(xavier_init([student_h_dim,1])) 128 | S_b2 = tf.Variable(tf.zeros(shape=[1])) 129 | 130 | theta_S = [S_W1, S_W2, S_b1, S_b2] 131 | 132 | # Generator 133 | 134 | G_W1 = tf.Variable(xavier_init([z_dim, generator_h_dim])) 135 | G_b1 = tf.Variable(tf.zeros(shape=[generator_h_dim])) 136 | 137 | G_W2 = tf.Variable(xavier_init([generator_h_dim,generator_h_dim])) 138 | G_b2 = tf.Variable(tf.zeros(shape=[generator_h_dim])) 139 | 140 | G_W3 = tf.Variable(xavier_init([generator_h_dim,dim])) 141 | G_b3 = tf.Variable(tf.zeros(shape=[dim])) 142 | 143 | theta_G = [G_W1, G_W2, G_W3, G_b1, G_b2, G_b3] 144 | 145 | ## Models 146 | def generator(z): 147 | G_h1 = tf.nn.tanh(tf.matmul(z, G_W1) + G_b1) 148 | G_h2 = tf.nn.tanh(tf.matmul(G_h1, G_W2) + G_b2) 149 | G_out = tf.nn.sigmoid(tf.matmul(G_h2, G_W3) + G_b3) 150 | 151 | return G_out 152 | 153 | def student(x): 154 | S_h1 = tf.nn.relu(tf.matmul(x, S_W1) + S_b1) 155 | S_out = tf.matmul(S_h1, S_W2) + S_b2 156 | 157 | return S_out 158 | 159 | ## Loss 160 | G_sample = generator(Z) 161 | S_fake = student(G_sample) 162 | 163 | S_loss = tf.reduce_mean(input_tensor=Y * S_fake) - tf.reduce_mean(input_tensor=(1-Y) * S_fake) 164 | G_loss = -tf.reduce_mean(input_tensor=S_fake) 165 | 166 | # Optimizer 167 | S_solver = (tf.compat.v1.train.RMSPropOptimizer(learning_rate=1e-4) 168 | .minimize(-S_loss, var_list=theta_S)) 169 | G_solver = (tf.compat.v1.train.RMSPropOptimizer(learning_rate=1e-4) 170 | .minimize(G_loss, var_list=theta_G)) 171 | 172 | clip_S = [p.assign(tf.clip_by_value(p, -0.01, 0.01)) for p in theta_S] 173 | 174 | ## Sessions 175 | sess = tf.compat.v1.Session() 176 | sess.run(tf.compat.v1.global_variables_initializer()) 177 | 178 | 179 | min_iterations = 1 180 | iteration=0 181 | ## Iterations 182 | while epsilon_hat < epsilon or iteration < min_iterations: 183 | iteration+=1 184 | 185 | # 1. Train teacher models 186 | teacher_models = list() 187 | 188 | for i in range(k): 189 | 190 | Z_mb = sample_Z(partition_data_no, z_dim) 191 | G_mb = sess.run(G_sample, feed_dict = {Z: Z_mb}) 192 | 193 | temp_x = x_partition[i] # teacher i trains only on its own data partition 194 | idx = np.random.permutation(len(temp_x[:, 0])) 195 | X_mb = temp_x[idx[:partition_data_no], :] 196 | 197 | X_comb = np.concatenate((X_mb, G_mb), axis = 0) 198 | Y_comb = np.concatenate((np.ones([partition_data_no,]), 199 | np.zeros([partition_data_no,])), axis = 0) 200 | 201 | model = LogisticRegression() 202 | model.fit(X_comb, Y_comb) 203 | teacher_models = teacher_models + [model] 204 | 205 | # 2.
Student training 206 | for _ in range(n_s): 207 | 208 | Z_mb = sample_Z(batch_size, z_dim) 209 | G_mb = sess.run(G_sample, feed_dict = {Z: Z_mb}) 210 | Y_mb = list() 211 | 212 | for j in range(batch_size): 213 | n0, n1, r_j = pate_lambda(G_mb[j, :], teacher_models, lambda_) 214 | Y_mb = Y_mb + [r_j] 215 | 216 | # Update moments accountant 217 | q = np.log(2 + lambda_ * abs(n0 - n1)) - np.log(4.0) - \ 218 | (lambda_ * abs(n0 - n1)) 219 | q = np.exp(q) 220 | 221 | # Compute alpha 222 | for l in range(L): 223 | temp1 = 2 * (lambda_**2) * (l+1) * (l+2) 224 | temp2 = (1-q) * ( ((1-q)/(1-q*np.exp(2*lambda_)))**(l+1) ) + \ 225 | q * np.exp(2*lambda_ * (l+1)) 226 | alpha[l] = alpha[l] + np.min([temp1, np.log(temp2)]) 227 | 228 | # PATE labels for G_mb 229 | Y_mb = np.reshape(np.asarray(Y_mb), [-1,1]) 230 | 231 | # Update student 232 | _, D_loss_curr, _ = sess.run([S_solver, S_loss, clip_S], 233 | feed_dict = {Z: Z_mb, Y: Y_mb}) 234 | 235 | # Generator Update 236 | Z_mb = sample_Z(batch_size, z_dim) 237 | _, G_loss_curr = sess.run([G_solver, G_loss], feed_dict = {Z: Z_mb}) 238 | print('G loss',G_loss_curr) 239 | print('D_loss', D_loss_curr) 240 | print(np.mean(Y_mb)) 241 | 242 | # epsilon_hat computation 243 | curr_list = list() 244 | for l in range(L): 245 | temp_alpha = (alpha[l] + np.log(1/delta)) / float(l+1) 246 | curr_list = curr_list + [temp_alpha] 247 | 248 | epsilon_hat = np.min(curr_list) 249 | print(epsilon_hat) 250 | 251 | ## Outputs 252 | x_train_hat = sess.run([G_sample], feed_dict = {Z: sample_Z(no, z_dim)})[0] 253 | 254 | for i in range(dim): 255 | if len(np.unique(x_train[:, i])) == 2: 256 | x_train_hat[:, i] = np.round(x_train_hat[:, i]) 257 | 258 | return x_train_hat 259 | 260 | 261 | ## Main 262 | if __name__ == '__main__': 263 | 264 | x_train = np.random.normal(0, 1, [10000,5]) 265 | 266 | # Normalization 267 | for i in range(len(x_train[0, :])): 268 | x_train[:, i] = x_train[:, i] - np.min(x_train[:, i]) 269 | x_train[:, i] = x_train[:, i] / (np.max(x_train[:, i]) + 1e-8) 270 | 271 | 272 | parameters = {'n_s': 1, 'batch_size': 1000, 273 | 'k': 100, 'epsilon': 100, 'delta': 0.0001, 'lambda': 1} 274 | 275 | x_train_new = pategan(x_train, parameters) 276 | -------------------------------------------------------------------------------- /generative_models/pategan_from_bitbucket.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Jinsung Yoon (0*/13/2018) 3 | PATEGAN 4 | ''' 5 | 6 | #%% Packages 7 | import tensorflow as tf 8 | import numpy as np 9 | from tqdm import tqdm 10 | 11 | #%% Function Start 12 | 13 | def pategan(X_train, Y_train, X_test, Y_test, params): 14 | epsilon = params['epsilon'] 15 | delta = params['delta'] 16 | niter = params['iterations'] 17 | num_teachers = params['k'] 18 | 19 | #%% Parameters 20 | # Batch size 21 | mb_size = 128 22 | 23 | # Feature no 24 | X_dim = len(X_train[0,:]) 25 | 26 | # Sample no 27 | no = len(X_train[:,0]) 28 | 29 | # Random variable dimension 30 | z_dim = int(X_dim/4) 31 | 32 | # Hidden unit dimensions 33 | h_dim = int(X_dim) 34 | 35 | C_dim = 1 36 | 37 | # WGAN-GP Parameters 38 | lam = 10 39 | lr = 1e-4 40 | 41 | lamda =np.sqrt(2*np.log(1.25*(10**(delta))))/epsilon 42 | 43 | #%% Data Preprocessing 44 | X_train = np.asarray(X_train) 45 | 46 | #%% Data Normalization 47 | Min_Val = np.min(X_train,0) 48 | 49 | X_train = X_train - Min_Val 50 | 51 | Max_Val = np.max(X_train,0) 52 | 53 | X_train = X_train / (Max_Val + 1e-8) 54 | 55 | #%% Algorithm Start 56 | 57 | #%% Necessary Functions 58 | 59 | 
# Xavier Initialization Definition 60 | def xavier_init(size): 61 | in_dim = size[0] 62 | xavier_stddev = 1. / tf.sqrt(in_dim / 2.) 63 | return tf.random_normal(shape = size, stddev = xavier_stddev) 64 | 65 | # Sample from uniform distribution 66 | def sample_Z(m, n): 67 | return np.random.uniform(-1., 1., size = [m, n]) 68 | 69 | # Sample from the real data 70 | def sample_X(m, n): 71 | return np.random.permutation(m)[:n] 72 | 73 | #%% Placeholder 74 | 75 | # Feature 76 | X = tf.placeholder(tf.float32, shape = [None, X_dim]) 77 | # Label 78 | Y = tf.placeholder(tf.float32, shape = [None, C_dim]) 79 | # Random Variable 80 | Z = tf.placeholder(tf.float32, shape = [None, z_dim]) 81 | # Conditional Variable 82 | M = tf.placeholder(tf.float32, shape = [None, C_dim]) 83 | 84 | #%% Discriminator 85 | # Discriminator 86 | D_W1 = tf.Variable(xavier_init([X_dim + C_dim, h_dim])) 87 | D_b1 = tf.Variable(tf.zeros(shape=[h_dim])) 88 | 89 | D_W2 = tf.Variable(xavier_init([h_dim,h_dim])) 90 | D_b2 = tf.Variable(tf.zeros(shape=[h_dim])) 91 | 92 | D_W3 = tf.Variable(xavier_init([h_dim,1])) 93 | D_b3 = tf.Variable(tf.zeros(shape=[1])) 94 | 95 | theta_D = [D_W1, D_W2, D_W3, D_b1, D_b2, D_b3] 96 | 97 | #%% Generator 98 | 99 | G_W1 = tf.Variable(xavier_init([z_dim + C_dim, h_dim])) 100 | G_b1 = tf.Variable(tf.zeros(shape=[h_dim])) 101 | 102 | G_W2 = tf.Variable(xavier_init([h_dim,h_dim])) 103 | G_b2 = tf.Variable(tf.zeros(shape=[h_dim])) 104 | 105 | G_W3 = tf.Variable(xavier_init([h_dim,X_dim])) 106 | G_b3 = tf.Variable(tf.zeros(shape=[X_dim])) 107 | 108 | theta_G = [G_W1, G_W2, G_W3, G_b1, G_b2, G_b3] 109 | 110 | #%% Functions 111 | def generator(z, y): 112 | inputs = tf.concat([z,y], axis = 1) 113 | G_h1 = tf.nn.tanh(tf.matmul(inputs, G_W1) + G_b1) 114 | G_h2 = tf.nn.tanh(tf.matmul(G_h1, G_W2) + G_b2) 115 | G_log_prob = tf.nn.sigmoid(tf.matmul(G_h2, G_W3) + G_b3) 116 | 117 | return G_log_prob 118 | 119 | def discriminator(x, y): 120 | inputs = tf.concat([x,y], axis = 1) 121 | D_h1 = tf.nn.relu(tf.matmul(inputs, D_W1) + D_b1) 122 | D_h2 = tf.nn.relu(tf.matmul(D_h1, D_W2) + D_b2) 123 | out = (tf.matmul(D_h2, D_W3) + D_b3) 124 | 125 | return out 126 | 127 | #%% 128 | # Structure 129 | G_sample = generator(Z, Y) 130 | D_real = discriminator(X, Y) 131 | D_fake = discriminator(G_sample, Y) 132 | 133 | #%% 134 | D_entire = tf.concat(axis = 0, values = [D_real, D_fake]) 135 | 136 | #%% 137 | 138 | # Replacement of Clipping algorithm to Penalty term 139 | # 1. Line 6 in Algorithm 1 140 | eps = tf.random_uniform([mb_size, 1], minval = 0., maxval = 1.) 141 | X_inter = eps*X + (1. - eps) * G_sample 142 | 143 | # 2. 
Line 7 in Algorithm 1 144 | grad = tf.gradients(discriminator(X_inter, Y), [X_inter, Y])[0] 145 | grad_norm = tf.sqrt(tf.reduce_sum((grad)**2 + 1e-8, axis = 1)) 146 | grad_pen = lam * tf.reduce_mean((grad_norm - 1)**2) 147 | 148 | # Loss function 149 | D_loss = tf.reduce_mean((1-M) * D_entire) - tf.reduce_mean(M * D_entire) + grad_pen 150 | G_loss = -tf.reduce_mean(D_fake) 151 | 152 | # Solver 153 | D_solver = (tf.train.AdamOptimizer(learning_rate = lr, beta1 = 0.5).minimize(D_loss, var_list = theta_D)) 154 | G_solver = (tf.train.AdamOptimizer(learning_rate = lr, beta1 = 0.5).minimize(G_loss, var_list = theta_G)) 155 | 156 | #%% 157 | # Sessions 158 | sess = tf.Session() 159 | sess.run(tf.global_variables_initializer()) 160 | 161 | #%% 162 | # Iterations 163 | for it in tqdm(range(niter)): 164 | 165 | for _ in range(num_teachers): 166 | #%% Teacher Training 167 | Z_mb = sample_Z(mb_size, z_dim) 168 | 169 | # Teacher 1 170 | X_idx = sample_X(no,mb_size) 171 | X_mb = X_train[X_idx,:] 172 | 173 | Y_mb = np.reshape(Y_train[X_idx], [mb_size,1]) 174 | 175 | #%% 176 | 177 | M_real = np.ones([mb_size,]) 178 | M_fake = np.zeros([mb_size,]) 179 | 180 | M_entire = np.concatenate((M_real, M_fake),0) 181 | 182 | Normal_Add = np.random.normal(loc=0.0, scale=lamda, size = mb_size*2) 183 | 184 | M_entire = M_entire + Normal_Add 185 | 186 | M_entire = (M_entire > 0.5) 187 | 188 | M_mb = np.reshape(M_entire.astype(float), (2*mb_size,1)) 189 | 190 | _, D_loss_curr = sess.run([D_solver, D_loss], feed_dict = {X: X_mb, Z: Z_mb, M: M_mb, Y: Y_mb}) 191 | 192 | 193 | #%% Generator Training 194 | 195 | Z_mb = sample_Z(mb_size, z_dim) 196 | 197 | X_idx = sample_X(no,mb_size) 198 | X_mb = X_train[X_idx,:] 199 | 200 | Y_mb = np.reshape(Y_train[X_idx], [mb_size,1]) 201 | 202 | _, G_loss_curr = sess.run([G_solver, G_loss], feed_dict = {Z: Z_mb, Y: Y_mb}) 203 | print(G_loss_curr) 204 | #%% 205 | 206 | #%% Output Generation 207 | 208 | New_X_train = sess.run([G_sample], feed_dict = {Z: sample_Z(len(X_train[:,0]), z_dim), Y: np.reshape(Y_train, [len(Y_train),1])}) 209 | 210 | New_X_train = New_X_train[0] 211 | 212 | #### Renormalization 213 | 214 | New_X_train = New_X_train * (Max_Val + 1e-8) 215 | 216 | New_X_train = New_X_train + Min_Val 217 | 218 | ## Testing 219 | 220 | New_X_test = sess.run([G_sample], feed_dict = {Z: sample_Z(len(X_test[:,0]), z_dim), Y: np.reshape(Y_test, [len(Y_test),1])}) 221 | 222 | New_X_test = New_X_test[0] 223 | 224 | #### Renormalization 225 | 226 | New_X_test = New_X_test * (Max_Val + 1e-8) 227 | 228 | New_X_test = New_X_test + Min_Val 229 | 230 | return New_X_train, Y_train, New_X_test, Y_test 231 | -------------------------------------------------------------------------------- /generative_models/rgan/experiments/settings/rgan-dp.txt: -------------------------------------------------------------------------------- 1 | { 2 | "custom_experiment": true, 3 | "settings_file": "", 4 | "data": "snp500", 5 | "num_samples": 505, 6 | "seq_length": 1259, 7 | "num_signals": 5, 8 | "normalise": false, 9 | "cond_dim": 0, 10 | "max_val": 1, 11 | "one_hot": false, 12 | "predict_labels": false, 13 | "scale": 0.1, 14 | "freq_low": 1.0, 15 | "freq_high": 5.0, 16 | "amplitude_low": 0.1, 17 | "amplitude_high": 0.9, 18 | "multivariate_mnist": false, 19 | "full_mnist": false, 20 | "data_load_from": "", 21 | "resample_rate_in_min": 15, 22 | "hidden_units_g": 100, 23 | "hidden_units_d": 100, 24 | "kappa": 1, 25 | "latent_dim": 10, 26 | "batch_mean": false, 27 | "learn_scale": false, 28 | "learning_rate": 0.1, 29 | 
"batch_size": 16, 30 | "num_epochs": 100, 31 | "D_rounds": 4, 32 | "G_rounds": 1, 33 | "use_time": false, 34 | "WGAN": false, 35 | "WGAN_clip": false, 36 | "shuffle": true, 37 | "wrong_labels": false, 38 | "identifier": "rgan-dp", 39 | "dp": true, 40 | "l2norm_bound": 1e-05, 41 | "batches_per_lot": 1, 42 | "dp_sigma": 1e-05, 43 | "num_generated_features": 5 44 | } -------------------------------------------------------------------------------- /generative_models/rgan/experiments/settings/rgan.txt: -------------------------------------------------------------------------------- 1 | { 2 | "custom_experiment": true, 3 | "settings_file": "", 4 | "data": "snp500", 5 | "num_samples": 505, 6 | "seq_length": 1259, 7 | "num_signals": 5, 8 | "normalise": false, 9 | "cond_dim": 0, 10 | "max_val": 1, 11 | "one_hot": false, 12 | "predict_labels": false, 13 | "scale": 0.1, 14 | "freq_low": 1.0, 15 | "freq_high": 5.0, 16 | "amplitude_low": 0.1, 17 | "amplitude_high": 0.9, 18 | "multivariate_mnist": false, 19 | "full_mnist": false, 20 | "data_load_from": "", 21 | "resample_rate_in_min": 15, 22 | "hidden_units_g": 100, 23 | "hidden_units_d": 100, 24 | "kappa": 1, 25 | "latent_dim": 10, 26 | "batch_mean": false, 27 | "learn_scale": false, 28 | "learning_rate": 0.1, 29 | "batch_size": 64, 30 | "num_epochs": 500, 31 | "D_rounds": 1, 32 | "G_rounds": 6, 33 | "use_time": false, 34 | "WGAN": false, 35 | "WGAN_clip": false, 36 | "shuffle": true, 37 | "wrong_labels": false, 38 | "identifier": "rgan", 39 | "dp": false, 40 | "l2norm_bound": 1e-05, 41 | "batches_per_lot": 1, 42 | "dp_sigma": null, 43 | "num_generated_features": 5 44 | } -------------------------------------------------------------------------------- /generative_models/rgan/experiments/settings/rgan_dp.txt: -------------------------------------------------------------------------------- 1 | { 2 | "custom_experiment": true, 3 | "settings_file": "", 4 | "data": "amsterdam:combds", 5 | "num_samples": 1000, 6 | "seq_length": 100, 7 | "num_signals": 70, 8 | "normalise": false, 9 | "cond_dim": 0, 10 | "max_val": 1, 11 | "one_hot": false, 12 | "predict_labels": false, 13 | "scale": 0.1, 14 | "freq_low": 1.0, 15 | "freq_high": 5.0, 16 | "amplitude_low": 0.1, 17 | "amplitude_high": 0.9, 18 | "multivariate_mnist": false, 19 | "full_mnist": false, 20 | "data_load_from": "", 21 | "resample_rate_in_min": 15, 22 | "hidden_units_g": 100, 23 | "hidden_units_d": 100, 24 | "kappa": 1, 25 | "latent_dim": 10, 26 | "batch_mean": false, 27 | "learn_scale": false, 28 | "learning_rate": 0.1, 29 | "batch_size": 128, 30 | "num_epochs": 500, 31 | "D_rounds": 3, 32 | "G_rounds": 1, 33 | "use_time": false, 34 | "WGAN": false, 35 | "WGAN_clip": false, 36 | "shuffle": true, 37 | "wrong_labels": false, 38 | "identifier": "rgan_dp", 39 | "dp": true, 40 | "l2norm_bound": 1e-05, 41 | "batches_per_lot": 1, 42 | "dp_sigma": 0.001, 43 | "num_generated_features": 70 44 | } -------------------------------------------------------------------------------- /generative_models/rgan/experiments/settings/sine.txt: -------------------------------------------------------------------------------- 1 | { 2 | "settings_file": "", 3 | "data": "sine", 4 | "num_samples": 14000, 5 | "seq_length": 30, 6 | "num_signals": 4, 7 | "normalise": false, 8 | "cond_dim": 0, 9 | "max_val": 1, 10 | "one_hot": false, 11 | "predict_labels": false, 12 | "scale": 0.1, 13 | "freq_low": 1.0, 14 | "freq_high": 5.0, 15 | "amplitude_low": 0.1, 16 | "amplitude_high": 0.9, 17 | "multivariate_mnist": false, 18 | "full_mnist": 
false, 19 | "data_load_from": "", 20 | "resample_rate_in_min": 15, 21 | "hidden_units_g": 100, 22 | "hidden_units_d": 100, 23 | "kappa": 1, 24 | "latent_dim": 5, 25 | "batch_mean": false, 26 | "learn_scale": false, 27 | "learning_rate": 0.1, 28 | "batch_size": 28, 29 | "num_epochs": 2, 30 | "D_rounds": 5, 31 | "G_rounds": 1, 32 | "use_time": false, 33 | "WGAN": false, 34 | "WGAN_clip": false, 35 | "shuffle": true, 36 | "wrong_labels": false, 37 | "identifier": "sine", 38 | "dp": false, 39 | "l2norm_bound": 1e-05, 40 | "batches_per_lot": 1, 41 | "dp_sigma": 1e-05, 42 | "num_generated_features": 4 43 | } -------------------------------------------------------------------------------- /generative_models/rgan/experiments/settings/test_modified.txt: -------------------------------------------------------------------------------- 1 | { 2 | "settings_file": "", 3 | "data": "sine", 4 | "num_samples": 14000, 5 | "seq_length": 30, 6 | "num_signals": 4, 7 | "normalise": false, 8 | "cond_dim": 0, 9 | "max_val": 1, 10 | "one_hot": false, 11 | "predict_labels": false, 12 | "scale": 0.1, 13 | "freq_low": 1.0, 14 | "freq_high": 5.0, 15 | "amplitude_low": 0.1, 16 | "amplitude_high": 0.9, 17 | "multivariate_mnist": false, 18 | "full_mnist": false, 19 | "data_load_from": "", 20 | "resample_rate_in_min": 15, 21 | "hidden_units_g": 100, 22 | "hidden_units_d": 100, 23 | "kappa": 1, 24 | "latent_dim": 5, 25 | "batch_mean": false, 26 | "learn_scale": false, 27 | "learning_rate": 0.1, 28 | "batch_size": 28, 29 | "num_epochs": 2, 30 | "D_rounds": 5, 31 | "G_rounds": 1, 32 | "use_time": false, 33 | "WGAN": false, 34 | "WGAN_clip": false, 35 | "shuffle": true, 36 | "wrong_labels": false, 37 | "identifier": "test", 38 | "dp": false, 39 | "l2norm_bound": 1e-05, 40 | "batches_per_lot": 1, 41 | "dp_sigma": 1e-05, 42 | "num_generated_features": 4 43 | } -------------------------------------------------------------------------------- /generative_models/timegan/__init__.py: -------------------------------------------------------------------------------- 1 | """Time-series Generative Adversarial Networks (TimeGAN) Codebase. 2 | 3 | Reference: Jinsung Yoon, Daniel Jarrett, Mihaela van der Schaar, 4 | "Time-series Generative Adversarial Networks," 5 | Neural Information Processing Systems (NeurIPS), 2019. 6 | 7 | Paper link: https://papers.nips.cc/paper/8789-time-series-generative-adversarial-networks 8 | 9 | Last updated Date: Jan 19th 2021 10 | Code author: Jinsung Yoon (jsyoon0823@gmail.com) 11 | Code updated by: Evgeny Saveliev (e.s.saveliev@gmail.com) 12 | 13 | ----------------------------- 14 | 15 | timegan.py 16 | utils.py 17 | 18 | """ 19 | 20 | from .timegan import timegan 21 | -------------------------------------------------------------------------------- /generative_models/timegan/timegan.py: -------------------------------------------------------------------------------- 1 | """ 2 | timegan.py 3 | 4 | Note: Use original data as training set to generator synthetic data (time-series) 5 | """ 6 | 7 | # Necessary Packages 8 | import numpy as np 9 | from .utils import ( # pylint: disable=relative-beyond-top-level 10 | extract_time, 11 | rnn_cell, 12 | random_generator, 13 | batch_generator, 14 | ) 15 | 16 | import tensorflow as tf 17 | 18 | 19 | def timegan(ori_data, parameters): 20 | """TimeGAN function. 
21 | 22 | Use original data as training set to generator synthetic data (time-series) 23 | 24 | Args: 25 | - ori_data: original time-series data 26 | - parameters: TimeGAN network parameters 27 | 28 | Returns: 29 | - generated_data: generated time-series data 30 | """ 31 | if parameters is None: 32 | parameters = dict() 33 | parameters["module"] = "gru" 34 | parameters["hidden_dim"] = 10 35 | parameters["num_layer"] = 3 36 | parameters["iterations"] = 20000 37 | parameters["batch_size"] = 128 38 | parameters["print_every_n_iters"] = 1000 39 | 40 | # Initialization on the Graph 41 | tf.reset_default_graph() 42 | 43 | # Basic Parameters 44 | no, seq_len, dim = np.asarray(ori_data).shape 45 | 46 | # Maximum sequence length and each sequence length 47 | ori_time, max_seq_len = extract_time(ori_data) 48 | 49 | def MinMaxScaler(data): 50 | """Min-Max Normalizer. 51 | 52 | Args: 53 | - data: raw data 54 | 55 | Returns: 56 | - norm_data: normalized data 57 | - min_val: minimum values (for renormalization) 58 | - max_val: maximum values (for renormalization) 59 | """ 60 | min_val = np.min(np.min(data, axis=0), axis=0) 61 | data = data - min_val 62 | 63 | max_val = np.max(np.max(data, axis=0), axis=0) 64 | norm_data = data / (max_val + 1e-7) 65 | 66 | return norm_data, min_val, max_val 67 | 68 | # Normalization 69 | ori_data, min_val, max_val = MinMaxScaler(ori_data) 70 | 71 | ## Build a RNN networks 72 | 73 | # Network Parameters 74 | hidden_dim = parameters["hidden_dim"] 75 | num_layers = parameters["num_layer"] 76 | iterations = parameters["iterations"] 77 | batch_size = parameters["batch_size"] 78 | module_name = parameters["module"] 79 | z_dim = dim 80 | gamma = 1 81 | 82 | batch_size = ori_data.shape[0] if ori_data.shape[0] < batch_size else batch_size 83 | 84 | # Input place holders 85 | X = tf.placeholder(tf.float32, [None, max_seq_len, dim], name="myinput_x") 86 | Z = tf.placeholder(tf.float32, [None, max_seq_len, z_dim], name="myinput_z") 87 | T = tf.placeholder(tf.int32, [None], name="myinput_t") 88 | 89 | def embedder(X, T): 90 | """Embedding network between original feature space to latent space. 91 | 92 | Args: 93 | - X: input time-series features 94 | - T: input time information 95 | 96 | Returns: 97 | - H: embeddings 98 | """ 99 | with tf.variable_scope("embedder", reuse=tf.AUTO_REUSE): 100 | e_cell = tf.nn.rnn_cell.MultiRNNCell([rnn_cell(module_name, hidden_dim) for _ in range(num_layers)]) 101 | e_outputs, e_last_states = tf.nn.dynamic_rnn(e_cell, X, dtype=tf.float32, sequence_length=T) 102 | H = tf.contrib.layers.fully_connected(e_outputs, hidden_dim, activation_fn=tf.nn.sigmoid) 103 | return H 104 | 105 | def recovery(H, T): 106 | """Recovery network from latent space to original space. 107 | 108 | Args: 109 | - H: latent representation 110 | - T: input time information 111 | 112 | Returns: 113 | - X_tilde: recovered data 114 | """ 115 | with tf.variable_scope("recovery", reuse=tf.AUTO_REUSE): 116 | r_cell = tf.nn.rnn_cell.MultiRNNCell([rnn_cell(module_name, hidden_dim) for _ in range(num_layers)]) 117 | r_outputs, r_last_states = tf.nn.dynamic_rnn(r_cell, H, dtype=tf.float32, sequence_length=T) 118 | X_tilde = tf.contrib.layers.fully_connected(r_outputs, dim, activation_fn=None) 119 | return X_tilde 120 | 121 | def generator(Z, T): 122 | """Generator function: Generate time-series data in latent space. 
123 | 124 | Args: 125 | - Z: random variables 126 | - T: input time information 127 | 128 | Returns: 129 | - E: generated embedding 130 | """ 131 | with tf.variable_scope("generator", reuse=tf.AUTO_REUSE): 132 | e_cell = tf.nn.rnn_cell.MultiRNNCell([rnn_cell(module_name, hidden_dim) for _ in range(num_layers)]) 133 | e_outputs, e_last_states = tf.nn.dynamic_rnn(e_cell, Z, dtype=tf.float32, sequence_length=T) 134 | E = tf.contrib.layers.fully_connected(e_outputs, hidden_dim, activation_fn=tf.nn.sigmoid) 135 | return E 136 | 137 | def supervisor(H, T): 138 | """Generate next sequence using the previous sequence. 139 | 140 | Args: 141 | - H: latent representation 142 | - T: input time information 143 | 144 | Returns: 145 | - S: generated sequence based on the latent representations generated by the generator 146 | """ 147 | with tf.variable_scope("supervisor", reuse=tf.AUTO_REUSE): 148 | e_cell = tf.nn.rnn_cell.MultiRNNCell([rnn_cell(module_name, hidden_dim) for _ in range(num_layers - 1)]) 149 | e_outputs, e_last_states = tf.nn.dynamic_rnn(e_cell, H, dtype=tf.float32, sequence_length=T) 150 | S = tf.contrib.layers.fully_connected(e_outputs, hidden_dim, activation_fn=tf.nn.sigmoid) 151 | return S 152 | 153 | def discriminator(H, T): 154 | """Discriminate the original and synthetic time-series data. 155 | 156 | Args: 157 | - H: latent representation 158 | - T: input time information 159 | 160 | Returns: 161 | - Y_hat: classification results between original and synthetic time-series 162 | """ 163 | with tf.variable_scope("discriminator", reuse=tf.AUTO_REUSE): 164 | d_cell = tf.nn.rnn_cell.MultiRNNCell([rnn_cell(module_name, hidden_dim) for _ in range(num_layers)]) 165 | d_outputs, d_last_states = tf.nn.dynamic_rnn(d_cell, H, dtype=tf.float32, sequence_length=T) 166 | Y_hat = tf.contrib.layers.fully_connected(d_outputs, 1, activation_fn=None) 167 | return Y_hat 168 | 169 | # Embedder & Recovery 170 | H = embedder(X, T) 171 | X_tilde = recovery(H, T) 172 | 173 | # Generator 174 | E_hat = generator(Z, T) 175 | H_hat = supervisor(E_hat, T) 176 | H_hat_supervise = supervisor(H, T) 177 | 178 | # Synthetic data 179 | X_hat = recovery(H_hat, T) 180 | 181 | # Discriminator 182 | Y_fake = discriminator(H_hat, T) 183 | Y_real = discriminator(H, T) 184 | Y_fake_e = discriminator(E_hat, T) 185 | 186 | # Variables 187 | e_vars = [v for v in tf.trainable_variables() if v.name.startswith("embedder")] 188 | r_vars = [v for v in tf.trainable_variables() if v.name.startswith("recovery")] 189 | g_vars = [v for v in tf.trainable_variables() if v.name.startswith("generator")] 190 | s_vars = [v for v in tf.trainable_variables() if v.name.startswith("supervisor")] 191 | d_vars = [v for v in tf.trainable_variables() if v.name.startswith("discriminator")] 192 | 193 | # Discriminator loss 194 | D_loss_real = tf.losses.sigmoid_cross_entropy(tf.ones_like(Y_real), Y_real) 195 | D_loss_fake = tf.losses.sigmoid_cross_entropy(tf.zeros_like(Y_fake), Y_fake) 196 | D_loss_fake_e = tf.losses.sigmoid_cross_entropy(tf.zeros_like(Y_fake_e), Y_fake_e) 197 | D_loss = D_loss_real + D_loss_fake + gamma * D_loss_fake_e 198 | 199 | # Generator loss 200 | # 1. Adversarial loss 201 | G_loss_U = tf.losses.sigmoid_cross_entropy(tf.ones_like(Y_fake), Y_fake) 202 | G_loss_U_e = tf.losses.sigmoid_cross_entropy(tf.ones_like(Y_fake_e), Y_fake_e) 203 | 204 | # 2. Supervised loss 205 | G_loss_S = tf.losses.mean_squared_error(H[:, 1:, :], H_hat_supervise[:, :-1, :]) 206 | 207 | # 3. 
Two Moments 208 | G_loss_V1 = tf.reduce_mean( 209 | tf.abs(tf.sqrt(tf.nn.moments(X_hat, [0])[1] + 1e-6) - tf.sqrt(tf.nn.moments(X, [0])[1] + 1e-6)) 210 | ) 211 | G_loss_V2 = tf.reduce_mean(tf.abs((tf.nn.moments(X_hat, [0])[0]) - (tf.nn.moments(X, [0])[0]))) 212 | 213 | G_loss_V = G_loss_V1 + G_loss_V2 214 | 215 | # 4. Summation 216 | G_loss = G_loss_U + gamma * G_loss_U_e + 100 * tf.sqrt(G_loss_S) + 100 * G_loss_V 217 | 218 | # Embedder network loss 219 | E_loss_T0 = tf.losses.mean_squared_error(X, X_tilde) 220 | E_loss0 = 10 * tf.sqrt(E_loss_T0) 221 | E_loss = E_loss0 + 0.1 * G_loss_S 222 | 223 | # optimizer 224 | E0_solver = tf.train.AdamOptimizer().minimize(E_loss0, var_list=e_vars + r_vars) 225 | E_solver = tf.train.AdamOptimizer().minimize(E_loss, var_list=e_vars + r_vars) 226 | D_solver = tf.train.AdamOptimizer().minimize(D_loss, var_list=d_vars) 227 | G_solver = tf.train.AdamOptimizer().minimize(G_loss, var_list=g_vars + s_vars) 228 | GS_solver = tf.train.AdamOptimizer().minimize(G_loss_S, var_list=g_vars + s_vars) 229 | 230 | ## TimeGAN training 231 | sess = tf.Session() 232 | sess.run(tf.global_variables_initializer()) 233 | 234 | # 1. Embedding network training 235 | print("Start Embedding Network Training") 236 | 237 | for itt in range(iterations): 238 | # Set mini-batch 239 | X_mb, T_mb = batch_generator(ori_data, ori_time, batch_size) 240 | # Train embedder 241 | _, step_e_loss = sess.run([E0_solver, E_loss_T0], feed_dict={X: X_mb, T: T_mb}) 242 | # Checkpoint 243 | if itt % parameters["print_every_n_iters"] == 0: 244 | print("step: " + str(itt) + "/" + str(iterations) + ", e_loss: " + str(np.round(np.sqrt(step_e_loss), 4))) 245 | 246 | print("Finish Embedding Network Training") 247 | 248 | # 2. Training only with supervised loss 249 | print("Start Training with Supervised Loss Only") 250 | 251 | for itt in range(iterations): 252 | # Set mini-batch 253 | X_mb, T_mb = batch_generator(ori_data, ori_time, batch_size) 254 | # Random vector generation 255 | Z_mb = random_generator(batch_size, z_dim, T_mb, max_seq_len) 256 | # Train generator 257 | _, step_g_loss_s = sess.run([GS_solver, G_loss_S], feed_dict={Z: Z_mb, X: X_mb, T: T_mb}) 258 | # Checkpoint 259 | if itt % parameters["print_every_n_iters"] == 0: 260 | print("step: " + str(itt) + "/" + str(iterations) + ", s_loss: " + str(np.round(np.sqrt(step_g_loss_s), 4))) 261 | 262 | print("Finish Training with Supervised Loss Only") 263 | 264 | # 3. 
Joint Training 265 | print("Start Joint Training") 266 | 267 | for itt in range(iterations): 268 | # Generator training (twice more than discriminator training) 269 | for kk in range(2): 270 | # Set mini-batch 271 | X_mb, T_mb = batch_generator(ori_data, ori_time, batch_size) 272 | # Random vector generation 273 | Z_mb = random_generator(batch_size, z_dim, T_mb, max_seq_len) 274 | # Train generator 275 | _, step_g_loss_u, step_g_loss_s, step_g_loss_v = sess.run( 276 | [G_solver, G_loss_U, G_loss_S, G_loss_V], feed_dict={Z: Z_mb, X: X_mb, T: T_mb} 277 | ) 278 | # Train embedder 279 | _, step_e_loss_t0 = sess.run([E_solver, E_loss_T0], feed_dict={Z: Z_mb, X: X_mb, T: T_mb}) 280 | 281 | # Discriminator training 282 | # Set mini-batch 283 | X_mb, T_mb = batch_generator(ori_data, ori_time, batch_size) 284 | # Random vector generation 285 | Z_mb = random_generator(batch_size, z_dim, T_mb, max_seq_len) 286 | # Check discriminator loss before updating 287 | check_d_loss = sess.run(D_loss, feed_dict={X: X_mb, T: T_mb, Z: Z_mb}) 288 | # Train discriminator (only when the discriminator does not work well) 289 | if check_d_loss > 0.15: 290 | _, step_d_loss = sess.run([D_solver, D_loss], feed_dict={X: X_mb, T: T_mb, Z: Z_mb}) 291 | 292 | # Print multiple checkpoints 293 | if itt % parameters["print_every_n_iters"] == 0: 294 | print( 295 | "step: " 296 | + str(itt) 297 | + "/" 298 | + str(iterations) 299 | + ", d_loss: " 300 | + str(np.round(step_d_loss, 4)) 301 | + ", g_loss_u: " 302 | + str(np.round(step_g_loss_u, 4)) 303 | + ", g_loss_s: " 304 | + str(np.round(np.sqrt(step_g_loss_s), 4)) 305 | + ", g_loss_v: " 306 | + str(np.round(step_g_loss_v, 4)) 307 | + ", e_loss_t0: " 308 | + str(np.round(np.sqrt(step_e_loss_t0), 4)) 309 | ) 310 | print("Finish Joint Training") 311 | 312 | ## Synthetic data generation 313 | Z_mb = random_generator(no, z_dim, ori_time, max_seq_len) 314 | generated_data_curr = sess.run(X_hat, feed_dict={Z: Z_mb, X: ori_data, T: ori_time}) 315 | 316 | generated_data = list() 317 | 318 | for i in range(no): 319 | temp = generated_data_curr[i, : ori_time[i], :] 320 | generated_data.append(temp) 321 | 322 | # Renormalization 323 | generated_data = generated_data * max_val 324 | generated_data = generated_data + min_val 325 | 326 | return generated_data 327 | -------------------------------------------------------------------------------- /generative_models/timegan/utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | utils.py 3 | 4 | (1) train_test_divide: Divide train and test data for both original and synthetic data. 5 | (2) extract_time: Returns Maximum sequence length and each sequence length. 6 | (3) rnn_cell: Basic RNN Cell. 7 | (4) random_generator: random vector generator 8 | (5) batch_generator: mini-batch generator 9 | """ 10 | 11 | ## Necessary Packages 12 | import numpy as np 13 | 14 | import tensorflow.compat.v1 as tf 15 | tf.disable_v2_behavior() 16 | 17 | 18 | def train_test_divide(data_x, data_x_hat, data_t, data_t_hat, train_rate=0.8): 19 | """Divide train and test data for both original and synthetic data. 
20 | 21 | Args: 22 | - data_x: original data 23 | - data_x_hat: generated data 24 | - data_t: original time 25 | - data_t_hat: generated time 26 | - train_rate: ratio of training data from the original data 27 | """ 28 | # Divide train/test index (original data) 29 | no = len(data_x) 30 | idx = np.random.permutation(no) 31 | train_idx = idx[: int(no * train_rate)] 32 | test_idx = idx[int(no * train_rate) :] 33 | 34 | train_x = [data_x[i] for i in train_idx] 35 | test_x = [data_x[i] for i in test_idx] 36 | train_t = [data_t[i] for i in train_idx] 37 | test_t = [data_t[i] for i in test_idx] 38 | 39 | # Divide train/test index (synthetic data) 40 | no = len(data_x_hat) 41 | idx = np.random.permutation(no) 42 | train_idx = idx[: int(no * train_rate)] 43 | test_idx = idx[int(no * train_rate) :] 44 | 45 | train_x_hat = [data_x_hat[i] for i in train_idx] 46 | test_x_hat = [data_x_hat[i] for i in test_idx] 47 | train_t_hat = [data_t_hat[i] for i in train_idx] 48 | test_t_hat = [data_t_hat[i] for i in test_idx] 49 | 50 | return train_x, train_x_hat, test_x, test_x_hat, train_t, train_t_hat, test_t, test_t_hat 51 | 52 | 53 | def extract_time(data): 54 | """Returns Maximum sequence length and each sequence length. 55 | 56 | Args: 57 | - data: original data 58 | 59 | Returns: 60 | - time: extracted time information 61 | - max_seq_len: maximum sequence length 62 | """ 63 | time = list() 64 | max_seq_len = 0 65 | for i in range(len(data)): 66 | max_seq_len = max(max_seq_len, len(data[i][:, 0])) 67 | time.append(len(data[i][:, 0])) 68 | 69 | return time, max_seq_len 70 | 71 | 72 | def rnn_cell(module_name, hidden_dim): 73 | """Basic RNN Cell. 74 | 75 | Args: 76 | - module_name: gru, lstm, or lstmLN 77 | 78 | Returns: 79 | - rnn_cell: RNN Cell 80 | """ 81 | assert module_name in ["gru", "lstm", "lstmLN"] 82 | 83 | # GRU 84 | if module_name == "gru": 85 | rnn_cell = tf.nn.rnn_cell.GRUCell(num_units=hidden_dim, activation=tf.nn.tanh) 86 | # LSTM 87 | elif module_name == "lstm": 88 | rnn_cell = tf.contrib.rnn.BasicLSTMCell(num_units=hidden_dim, activation=tf.nn.tanh) 89 | # LSTM Layer Normalization 90 | elif module_name == "lstmLN": 91 | rnn_cell = tf.contrib.rnn.LayerNormBasicLSTMCell(num_units=hidden_dim, activation=tf.nn.tanh) 92 | return rnn_cell 93 | 94 | 95 | def random_generator(batch_size, z_dim, T_mb, max_seq_len): 96 | """Random vector generation. 97 | 98 | Args: 99 | - batch_size: size of the random vector 100 | - z_dim: dimension of random vector 101 | - T_mb: time information for the random vector 102 | - max_seq_len: maximum sequence length 103 | 104 | Returns: 105 | - Z_mb: generated random vector 106 | """ 107 | Z_mb = list() 108 | for i in range(batch_size): 109 | temp = np.zeros([max_seq_len, z_dim]) 110 | temp_Z = np.random.uniform(0.0, 1, [T_mb[i], z_dim]) 111 | temp[: T_mb[i], :] = temp_Z 112 | Z_mb.append(temp_Z) 113 | return Z_mb 114 | 115 | 116 | def batch_generator(data, time, batch_size): 117 | """Mini-batch generator. 
118 | 119 | Args: 120 | - data: time-series data 121 | - time: time information 122 | - batch_size: the number of samples in each batch 123 | 124 | Returns: 125 | - X_mb: time-series data in each batch 126 | - T_mb: time information in each batch 127 | """ 128 | no = len(data) 129 | idx = np.random.permutation(no) 130 | train_idx = idx[:batch_size] 131 | 132 | X_mb = list(data[i] for i in train_idx) 133 | T_mb = list(time[i] for i in train_idx) 134 | 135 | return X_mb, T_mb 136 | -------------------------------------------------------------------------------- /generative_models/vae.py: -------------------------------------------------------------------------------- 1 | """ 2 | Code author: Boris van Breugel (bv292@cam.ac.uk) 3 | 4 | Based on code by Jinsung Yoon (jsyoon0823@gmail.com) 5 | 6 | ----------------------------- 7 | 8 | Generate synthetic data with VAE framework 9 | (1) Use original data to generate synthetic data 10 | """ 11 | 12 | #%% Import necessary packages 13 | import tensorflow as tf 14 | import numpy as np 15 | 16 | from tqdm import tqdm 17 | 18 | 19 | def vae(orig_data, params): 20 | """Generate synthetic data for VAE framework. 21 | 22 | Args: 23 | orig_data: original data 24 | params: Network parameters 25 | mb_size: mini-batch size 26 | z_dim: random state dimension 27 | h_dim: hidden state dimension 28 | lambda: identifiability parameter 29 | iterations: training iterations 30 | 31 | Returns: 32 | synth_data: synthetically generated data 33 | """ 34 | 35 | # Reset the tensorflow graph 36 | tf.compat.v1.reset_default_graph() 37 | 38 | ## Parameters 39 | # Feature no 40 | x_dim = len(orig_data.columns) 41 | # X_recon no 42 | no = len(orig_data) 43 | 44 | # Batch size 45 | mb_size = params['mb_size'] 46 | # Latent representation dimension 47 | z_dim = params['z_dim'] 48 | # Hidden unit dimensions 49 | h_dim = params['h_dim'] 50 | # Identifiability parameter 51 | 52 | # Training iterations 53 | iterations = params['iterations'] 54 | # VAE type 55 | lr = 1e-4 56 | 57 | #%% Data Preprocessing 58 | orig_data = np.asarray(orig_data) 59 | 60 | def data_normalization(orig_data, epsilon = 1e-8): 61 | 62 | min_val = np.min(orig_data, axis=0) 63 | 64 | normalized_data = orig_data - min_val 65 | 66 | max_val = np.max(normalized_data, axis=0) 67 | normalized_data = normalized_data / (max_val + epsilon) 68 | 69 | normalization_params = {"min_val": min_val, "max_val": max_val} 70 | 71 | return normalized_data, normalization_params 72 | 73 | def data_renormalization(normalized_data, normalization_params, epsilon = 1e-8): 74 | 75 | renormalized_data = normalized_data * (normalization_params['max_val'] + epsilon) 76 | renormalized_data = renormalized_data + normalization_params['min_val'] 77 | 78 | return renormalized_data 79 | 80 | orig_data, normalization_params = data_normalization(orig_data) 81 | 82 | #%% Necessary Functions 83 | 84 | # Xavier Initialization Definition 85 | def xavier_init(size): 86 | in_dim = size[0] 87 | xavier_stddev = 1. / tf.sqrt(in_dim / 2.) 
88 | return tf.random.normal(shape = size, stddev = xavier_stddev) 89 | 90 | # X_recon from uniform distribution 91 | def X_recon_Z(m, n): 92 | return np.random.randn(m, n) 93 | 94 | # X_recon from the real data 95 | def X_recon_X(m, n): 96 | return np.random.permutation(m)[:n] 97 | 98 | def sample_Z(m,n): 99 | return tf.random.normal((m,n), 0, 1, dtype=tf.float32) 100 | 101 | #%% Placeholder 102 | # Feature 103 | X = tf.compat.v1.placeholder(tf.float32, shape = [None, x_dim]) 104 | X_recon = tf.compat.v1.placeholder(tf.float32, shape = [None, x_dim]) 105 | # Random Variable 106 | Z = tf.compat.v1.placeholder(tf.float32, shape = [None, z_dim]) 107 | mu = tf.compat.v1.placeholder(tf.float32, shape = [None, z_dim]) 108 | logvar = tf.compat.v1.placeholder(tf.float32, shape = [None, z_dim]) 109 | 110 | 111 | #%% Encoder 112 | E_W1 = tf.Variable(xavier_init([x_dim, h_dim])) 113 | E_b1 = tf.Variable(tf.zeros(shape=[h_dim])) 114 | 115 | E_W2e = tf.Variable(xavier_init([h_dim, h_dim])) 116 | E_b2e = tf.Variable(tf.zeros(shape=[h_dim])) 117 | 118 | 119 | E_W_sigma = tf.Variable(xavier_init([h_dim,z_dim])) 120 | E_b_sigma = tf.Variable(tf.zeros(shape=[z_dim])) 121 | 122 | E_W_mu = tf.Variable(xavier_init([h_dim,z_dim])) 123 | E_b_mu = tf.Variable(tf.zeros(shape=[z_dim])) 124 | 125 | 126 | # Decoder 127 | 128 | 129 | D_W3 = tf.Variable(xavier_init([z_dim,h_dim])) 130 | D_b3 = tf.Variable(tf.zeros(shape=[h_dim])) 131 | 132 | D_W2d = tf.Variable(xavier_init([h_dim, h_dim])) 133 | D_b2d = tf.Variable(tf.zeros(shape=[h_dim])) 134 | 135 | 136 | D_W4 = tf.Variable(xavier_init([h_dim, x_dim])) 137 | D_b4 = tf.Variable(tf.zeros(shape=[x_dim])) 138 | 139 | theta = [E_W1, E_W_sigma, E_W_mu, D_W3, D_W4, E_b1, 140 | E_b_mu, E_b_sigma, D_b3, D_b4, 141 | E_W2e, E_b2e, D_W2d, D_b2d] 142 | 143 | #%% Generator and discriminator functions 144 | def encoder(x): 145 | E_h1 = tf.nn.tanh(tf.matmul(x, E_W1) + E_b1) 146 | E_h2 = tf.nn.tanh(tf.matmul(E_h1, E_W2e) + E_b2e) 147 | E_hmu = tf.nn.tanh(tf.matmul(E_h2, E_W_mu) + E_b_mu) 148 | E_hsigma = tf.matmul(E_h1, E_W_sigma) + E_b_sigma 149 | return E_hmu, E_hsigma 150 | 151 | def decoder(z): 152 | D_h3 = tf.nn.tanh(tf.matmul(z, D_W3) + D_b3) 153 | D_h4 = tf.nn.tanh(tf.matmul(D_h3, D_W2d) + D_b2d) 154 | x_recon = tf.nn.sigmoid(tf.matmul(D_h4, D_W4) + D_b4) 155 | return x_recon 156 | 157 | 158 | 159 | #%% Structure 160 | mu, logvar = encoder(X) 161 | Z = mu + tf.exp(logvar/2) * tf.random.normal(tf.shape(input=mu), 0, 1, dtype=tf.float32) 162 | 163 | X_recon = decoder(Z) 164 | 165 | 166 | 167 | 168 | loss1 = tf.reduce_mean(input_tensor=tf.square(X_recon-X)) 169 | loss2 = 0.5 * tf.reduce_mean(input_tensor=tf.square(mu) + tf.exp(logvar) - logvar - 1, axis=1) 170 | 171 | loss = loss1 + loss2 172 | # Solver 173 | 174 | solver = (tf.compat.v1.train.AdamOptimizer(learning_rate = lr, beta1 = 0.5).minimize(loss, var_list = theta)) 175 | 176 | #%% Iterations 177 | sess = tf.compat.v1.Session() 178 | sess.run(tf.compat.v1.global_variables_initializer()) 179 | 180 | # Iterations 181 | for it in tqdm(range(iterations)): 182 | # Discriminator training 183 | 184 | X_idx = X_recon_X(no,mb_size) 185 | X_mb = orig_data[X_idx,:] 186 | 187 | _, E_loss1_curr, E_loss2_curr = sess.run([solver, loss1, loss2], feed_dict = {X: X_mb}) 188 | 189 | #%% Output Generation 190 | synth_data = sess.run([X_recon], feed_dict = {Z: np.random.randn(no, z_dim)}) 191 | synth_data = synth_data[0] 192 | print(synth_data.shape) 193 | 194 | # Renormalization 195 | synth_data = data_renormalization(synth_data, 
normalization_params) 196 | 197 | # Binary features 198 | for i in range(x_dim): 199 | if len(np.unique(orig_data[:, i])) == 2: 200 | synth_data[:, i] = np.round(synth_data[:, i]) 201 | 202 | return synth_data -------------------------------------------------------------------------------- /main_timeseries.py: -------------------------------------------------------------------------------- 1 | """Time series data generation. 2 | 3 | Author: Evgeny Saveliev (e.s.saveliev@gmail.com) 4 | """ 5 | import os 6 | 7 | import numpy as np 8 | 9 | from generative_models.timegan import timegan 10 | from data.amsterdam import AmsterdamLoader, preprocess_data, padding_mask_to_seq_lens 11 | 12 | # ---------------------------------------------------------------------------------------------------------------------- 13 | # Set experiment settings here: 14 | 15 | use_data = "amsterdam" 16 | use_model = "timegan" 17 | 18 | generated_data_dir = "./data/ts_generated/" 19 | 20 | amsterdam_data_settings = { 21 | "train_frac": 0.4, 22 | "val_frac": 0.2, 23 | "n_features": 70, 24 | "include_time": False, 25 | "max_timesteps": 100, 26 | "pad_val": -999., 27 | "data_split_seed": 12345, 28 | "data_loading_force_refresh": True, 29 | # -------------------- 30 | "data_path": "data/amsterdam/combined_downsampled_longitudinal_data.csv", 31 | } 32 | 33 | timegan_experiment_settings = { 34 | "model_params": { 35 | "module": "gru", 36 | "hidden_dim": 10, 37 | "num_layer": 3, 38 | "iterations": 1000, 39 | "batch_size": 128, 40 | "print_every_n_iters": 100, 41 | }, 42 | "generated_data_filename": "_timegan.npy" # NOTE: will be replaced with `use_data` value. 43 | } 44 | 45 | # ---------------------------------------------------------------------------------------------------------------------- 46 | # Utilities. 47 | 48 | def prepare_amsterdam(amsterdam_loader, settings): 49 | raw_data, padding_mask, (train_idx, val_idx, test_idx) = \ 50 | amsterdam_loader.load_reshape_split_data(force_refresh=settings["data_loading_force_refresh"]) 51 | processed_data, imputed_processed_data = preprocess_data( 52 | raw_data, 53 | padding_mask, 54 | padding_fill=settings["pad_val"], 55 | time_feature_included=settings["include_time"], 56 | ) 57 | seq_lens = padding_mask_to_seq_lens(padding_mask) 58 | return imputed_processed_data, seq_lens 59 | 60 | # ---------------------------------------------------------------------------------------------------------------------- 61 | 62 | def main(): 63 | 64 | if use_data == "amsterdam": 65 | active_data_settings = amsterdam_data_settings 66 | amsterdam_loader = AmsterdamLoader( 67 | data_path=os.path.abspath(active_data_settings["data_path"]), 68 | max_seq_len=active_data_settings["max_timesteps"], 69 | seed=active_data_settings["data_split_seed"], 70 | train_rate=active_data_settings["train_frac"], 71 | val_rate=active_data_settings["val_frac"], 72 | include_time=active_data_settings["include_time"], 73 | debug_data=False, 74 | pad_before=False, 75 | padding_fill=active_data_settings["pad_val"], 76 | ) 77 | if use_model == "timegan": 78 | # Timegan doesn't take variable-length sequences, use padding value of 0. 79 | amsterdam_loader.padding_fill = 0. 
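        # TimeGAN is trained on fixed-length, zero-padded arrays, hence the override of the
        # -999. sentinel above. A sketch of what the next call is assumed to return (based on
        # prepare_amsterdam above, not verified against the loader internals):
        #   original_data: np.ndarray of shape (n_samples, max_timesteps, n_features)
        #   seq_lens:      per-sample sequence lengths derived from the padding mask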
80 | original_data, seq_lens = prepare_amsterdam(amsterdam_loader=amsterdam_loader, settings=active_data_settings) 81 | 82 | if use_model == "timegan": 83 | active_experiment_settings = timegan_experiment_settings 84 | generated_data = timegan(ori_data=original_data, parameters=active_experiment_settings["model_params"]) 85 | 86 | generated_data_filepath = os.path.join( 87 | generated_data_dir, 88 | active_experiment_settings["generated_data_filename"].replace("", use_data)) 89 | np.save(generated_data_filepath, generated_data) 90 | print(f"Generative model: {use_model}, data: {use_data}\n" 91 | f"Generated and saved timeseries data of shape: {generated_data.shape}. File: {generated_data_filepath}.") 92 | 93 | 94 | if __name__ == "__main__": 95 | main() 96 | -------------------------------------------------------------------------------- /metrics/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedmalaa/evaluating-generative-models/093910e487d07959db7d87b54698da60aaeb50c0/metrics/__init__.py -------------------------------------------------------------------------------- /metrics/combined.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | 4 | Created on Fri Jan 15 15:52:59 2021 5 | 6 | @author: boris 7 | 8 | """ 9 | 10 | 11 | from metrics.feature_distribution import feature_distribution 12 | from metrics.compute_wd import compute_wd 13 | from metrics.compute_identifiability import compute_identifiability 14 | from metrics.fid import compute_frechet_distance 15 | from metrics.parzen import compute_parzen 16 | from metrics.precision_recall import compute_prc 17 | from metrics.prdc import compute_prdc 18 | from metrics.evaluation import compute_alpha_precision 19 | 20 | import torch 21 | import numpy as np 22 | 23 | if torch.cuda.is_available(): 24 | device = 'cuda' 25 | else: 26 | device = 'cpu' 27 | 28 | def compute_metrics(X, Y, which_metric=None, wd_params=None, model=None): 29 | results = {} 30 | emb_types = [''] 31 | 32 | if model is not None: 33 | emb_types.append('_OC') 34 | else: 35 | print('#####################!OC model not defined !##################') 36 | 37 | if wd_params is None: 38 | wd_params = dict() 39 | wd_params['iterations'] = 500 40 | wd_params['h_dim'] = 30 41 | wd_params['z_dim'] = 10 42 | wd_params['mb_size'] = 128 43 | 44 | if which_metric is None: 45 | which_metric = [['WD','FD', 'PRDC', 'OC'], # normal 46 | ['OC']] # additional OneClass 47 | 48 | for emb_index, emb in enumerate(emb_types): 49 | 50 | if emb_index == 1 and len(which_metric[1])>0: 51 | print('Computing metrics for OC embedding') 52 | print('Embedding data into OC representation') 53 | model.to(device) 54 | with torch.no_grad(): 55 | X = model(torch.tensor(X).float().to(device)).cpu().detach().numpy() 56 | Y = model(torch.tensor(Y).float().to(device)).cpu().detach().numpy() 57 | print('Done embedding') 58 | print('X, std X', np.mean(X), np.std(X)) 59 | print('Y, std Y', np.mean(Y), np.std(Y)) 60 | 61 | else: 62 | print('Computing metrics for no additional OneClass embedding') 63 | 64 | 65 | 66 | # (1) Marginal distributions 67 | if 'marg' in which_metric[emb_index]: 68 | 69 | print('Start computing marginal feature distributions') 70 | results[f'feat_dist{emb}'] = feature_distribution(X, Y) 71 | print('Finish computing feature distributions') 72 | print(results[f'feat_dist{emb}']) 73 | 74 | 75 | # (2) Wasserstein Distance (WD) 76 | if 'WD' in 
which_metric[emb_index]: 77 | print('Start computing Wasserstein Distance') 78 | results[f'wd_measure{emb}'] = compute_wd(X, Y, wd_params) 79 | print('WD measure: ',results[f'wd_measure{emb}']) 80 | 81 | 82 | # (3) Identifiability 83 | if 'ID' in which_metric[emb_index]: 84 | print('Start computing identifiability') 85 | results[f'identifiability{emb}'] = compute_identifiability(X, Y) 86 | print('Identifiability measure: ',results[f'identifiability{emb}']) 87 | 88 | 89 | # (4) Frechet distance 90 | if 'FD' in which_metric[emb_index] or 'FID' in which_metric[emb_index]: 91 | results[f'fid_value{emb}'] = compute_frechet_distance(X, Y) 92 | print('Frechet distance', results[f'fid_value{emb}']) 93 | print('Frechet distance/dim', results[f'fid_value{emb}']/Y.shape[-1]) 94 | 95 | 96 | # (5) Parzen 97 | if 'parzen' in which_metric[emb_index]: 98 | results[f'parzen_ll{emb}'], results[f'parzen_std{emb}'] = compute_parzen(X, Y, sigma=0.408) 99 | print(f'Parzen Log-Likelihood of test set = {results["parzen_ll"]}, se: {results["parzen_std"]}') 100 | 101 | 102 | # (6) Precision/Recall 103 | if 'PR' in which_metric[emb_index]: 104 | results[f'PR{emb}'] = compute_prc(X,Y) 105 | elif 'PRDC' in which_metric[emb_index]: 106 | print('Start computing P&R and D&C') 107 | prdc_res = compute_prdc(X,Y) 108 | for key in prdc_res: 109 | print('PRDC:', key, prdc_res[key]) 110 | results[key+emb] = prdc_res[key] 111 | 112 | # (7) OneClass 113 | if 'OC' in which_metric[emb_index]: 114 | if emb_index==1: 115 | emb_center = model.c 116 | else: 117 | emb_center = np.mean(X,axis=0) 118 | print('Start computing OC metrics') 119 | OC_res = compute_alpha_precision(X, Y, emb_center) 120 | alphas, alpha_precision_curve, beta_coverage_curve, Delta_precision_alpha, Delta_coverage_beta, authen = OC_res 121 | results[f'alphas{emb}'] = alphas 122 | results[f'alpha_pc{emb}'] = alpha_precision_curve 123 | results[f'beta_cv{emb}'] = beta_coverage_curve 124 | results[f'auten{emb}'] = authen 125 | results[f'Dpa{emb}'] = Delta_precision_alpha 126 | results[f'Dcb{emb}'] = Delta_coverage_beta 127 | results[f'Daut{emb}'] = np.mean(authen) 128 | print('OneClass: Delta_precision_alpha', results[f'Dpa{emb}']) 129 | print('OneClass: Delta_coverage_beta ', results[f'Dcb{emb}']) 130 | print('OneClass: Delta_autenticity ', results[f'Daut{emb}']) 131 | 132 | 133 | return results -------------------------------------------------------------------------------- /metrics/compute_identifiability.py: -------------------------------------------------------------------------------- 1 | """Anonymization through Data Synthesis using Generative Adversarial Networks: 2 | A harmonizing advancement for AI in medicine (ADS-GAN) Codebase. 3 | 4 | Reference: Jinsung Yoon, Lydia N. Drumright, Mihaela van der Schaar, 5 | "Anonymization through Data Synthesis using Generative Adversarial Networks (ADS-GAN): 6 | A harmonizing advancement for AI in medicine," 7 | IEEE Journal of Biomedical and Health Informatics (JBHI), 2019. 
8 | Paper link: https://ieeexplore.ieee.org/document/9034117 9 | Last updated Date: December 22nd 2020 10 | Code author: Jinsung Yoon (jsyoon0823@gmail.com) 11 | ----------------------------- 12 | compute_identifiability.py 13 | - Compare Identifiability between original data and synthetic data 14 | """ 15 | 16 | # Necessary packages 17 | import numpy as np 18 | from sklearn.neighbors import NearestNeighbors 19 | from scipy.stats import entropy 20 | 21 | # Function start 22 | def compute_identifiability (orig_data, synth_data): 23 | """Compute identifiability between original data and synthetic data. 24 | 25 | Args: 26 | orig_data: original data 27 | synth_data: synthetically generated data 28 | 29 | Returns: 30 | identifiability_value: fraction of original records that lie closer to a synthetic record than to their nearest other original record 31 | """ 32 | 33 | # Entropy computation 34 | def compute_entropy(labels): 35 | value,counts = np.unique(np.round(labels), return_counts=True) 36 | return entropy(counts) 37 | 38 | # Original data 39 | orig_data = np.asarray(orig_data) 40 | 41 | # Parameters 42 | no, x_dim = np.shape(orig_data) 43 | 44 | #%% Weights 45 | W = np.zeros([x_dim,]) 46 | 47 | for i in range(x_dim): 48 | W[i] = compute_entropy(orig_data[:,i]) 49 | 50 | # Normalization (NB: the entropy weights W are overridden with uniform weights below) 51 | orig_data_hat = orig_data.copy() 52 | synth_data_hat = synth_data.copy() 53 | 54 | eps = 0 #1e-16 55 | W = np.ones_like(W) 56 | 57 | for i in range(x_dim): 58 | orig_data_hat[:,i] = orig_data[:,i] * 1./(W[i]+eps) 59 | synth_data_hat[:,i] = synth_data[:,i] * 1./(W[i]+eps) 60 | 61 | #%% r_i computation 62 | nbrs = NearestNeighbors(n_neighbors = 2).fit(orig_data_hat) 63 | distance, _ = nbrs.kneighbors(orig_data_hat) 64 | 65 | # hat{r_i} computation 66 | nbrs_hat = NearestNeighbors(n_neighbors = 1).fit(synth_data_hat) 67 | distance_hat, _ = nbrs_hat.kneighbors(orig_data_hat) 68 | 69 | # See which one is bigger 70 | R_Diff = distance_hat[:,0] - distance[:,1] 71 | identifiability_value = np.sum(R_Diff<0) / float(no) 72 | 73 | return identifiability_value -------------------------------------------------------------------------------- /metrics/compute_wd.py: -------------------------------------------------------------------------------- 1 | """Anonymization through Data Synthesis using Generative Adversarial Networks: 2 | A harmonizing advancement for AI in medicine (ADS-GAN) Codebase. 3 | 4 | Reference: Jinsung Yoon, Lydia N. Drumright, Mihaela van der Schaar, 5 | "Anonymization through Data Synthesis using Generative Adversarial Networks (ADS-GAN): 6 | A harmonizing advancement for AI in medicine," 7 | IEEE Journal of Biomedical and Health Informatics (JBHI), 2019. 8 | Paper link: https://ieeexplore.ieee.org/document/9034117 9 | Last updated Date: December 22nd 2020 10 | Code author: Jinsung Yoon (jsyoon0823@gmail.com) 11 | Updated by: Boris van Breugel (bv292@cam.ac.uk) 12 | 13 | ----------------------------- 14 | compute_wd.py 15 | - Compare Wasserstein distance between original data and synthetic data 16 | """ 17 | 18 | import numpy as np 19 | import tensorflow as tf 20 | from tqdm import tqdm 21 | 22 | tf.compat.v1.disable_eager_execution() 23 | 24 | def compute_wd (orig_data, synth_data, params): 25 | """Compare Wasserstein distance between original data and synthetic data.
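    A small weight-clipped critic (WGAN-style discriminator) is trained on the first
    half of both datasets; the returned value is the critic's mean score gap on the
    held-out second halves, i.e. an estimate of the Wasserstein-1 distance.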
26 | 27 | Args: 28 | orig_data: original data 29 | synth_data: synthetically generated data 30 | params: Network parameters 31 | mb_size: mini-batch size 32 | h_dim: hidden state dimension 33 | iterations: training iterations 34 | 35 | Returns: 36 | WD_value: Wasserstein distance 37 | """ 38 | 39 | # Preprocess the data 40 | orig_data = np.asarray(orig_data) 41 | synth_data = np.asarray(synth_data) 42 | 43 | no, x_dim = np.shape(orig_data) 44 | 45 | # Divide train / test 46 | orig_data_train = orig_data[:int(no/2),:] 47 | orig_data_test = orig_data[int(no/2):,:] 48 | 49 | synth_data_train = synth_data[:int(no/2),:] 50 | synth_data_test = synth_data[int(no/2):,:] 51 | 52 | #%% Parameters 53 | # Batch size 54 | mb_size = params['mb_size'] 55 | # Hidden unit dimensions 56 | h_dim = int(params['h_dim']/2) 57 | # Train iterations 58 | iterations = params['iterations'] 59 | 60 | #%% Necessary Functions 61 | 62 | # Xavier Initialization Definition 63 | def xavier_init(size): 64 | in_dim = size[0] 65 | xavier_stddev = 1. / tf.sqrt(in_dim / 2.) 66 | return tf.random.normal(shape = size, stddev = xavier_stddev) 67 | 68 | # Sample from the real data 69 | def sample_X(m, n): 70 | return np.random.permutation(m)[:n] 71 | 72 | #%% Placeholder 73 | X = tf.compat.v1.placeholder(tf.float32, shape = [None, x_dim]) 74 | X_hat = tf.compat.v1.placeholder(tf.float32, shape = [None, x_dim]) 75 | 76 | #%% Discriminator 77 | # Discriminator 78 | D_W1 = tf.Variable(xavier_init([x_dim, h_dim])) 79 | D_b1 = tf.Variable(tf.zeros(shape=[h_dim])) 80 | 81 | D_W2 = tf.Variable(xavier_init([h_dim,1])) 82 | D_b2 = tf.Variable(tf.zeros(shape=[1])) 83 | 84 | theta_D = [D_W1, D_W2, D_b1, D_b2] 85 | 86 | def discriminator(x): 87 | D_h1 = tf.nn.relu(tf.matmul(x, D_W1) + D_b1) 88 | out = (tf.matmul(D_h1, D_W2) + D_b2) 89 | return out 90 | 91 | # Structure 92 | D_real = discriminator(X) 93 | D_fake = discriminator(X_hat) 94 | 95 | D_loss = tf.reduce_mean(input_tensor=D_real) - tf.reduce_mean(input_tensor=D_fake) 96 | 97 | D_solver = (tf.compat.v1.train.RMSPropOptimizer(learning_rate=1e-4) 98 | .minimize(-D_loss, var_list=theta_D)) 99 | 100 | clip_D = [p.assign(tf.clip_by_value(p, -0.1, 0.1)) for p in theta_D] 101 | 102 | #%% 103 | sess = tf.compat.v1.Session() 104 | sess.run(tf.compat.v1.global_variables_initializer()) 105 | 106 | # Iterations 107 | for it in tqdm(range(iterations)): 108 | 109 | X_idx = sample_X(int(no/2),mb_size) 110 | X_mb = orig_data_train[X_idx,:] 111 | X_hat_mb = synth_data_train[X_idx,:] 112 | 113 | _, D_loss_curr, _ = sess.run([D_solver, D_loss, clip_D], feed_dict = {X: X_mb, X_hat: X_hat_mb}) 114 | 115 | #%% Test 116 | WD_value = sess.run([D_loss], feed_dict = {X: orig_data_test, X_hat: synth_data_test}) 117 | 118 | return WD_value[0] -------------------------------------------------------------------------------- /metrics/evaluation.py: -------------------------------------------------------------------------------- 1 | 2 | # Copyright (c) 2021, Ahmed M. 
Alaa, Boris van Breugel 3 | # Licensed under the BSD 3-clause license (see LICENSE.txt) 4 | 5 | """ 6 | 7 | ----------------------------------------- 8 | Metrics implementation 9 | ----------------------------------------- 10 | 11 | """ 12 | 13 | from __future__ import absolute_import, division, print_function 14 | 15 | import numpy as np 16 | import sys 17 | from sklearn.neighbors import NearestNeighbors 18 | 19 | import logging 20 | import torch 21 | import scipy 22 | 23 | if not sys.warnoptions: 24 | import warnings 25 | warnings.simplefilter("ignore") 26 | 27 | device = 'cpu' # matrices are too big for gpu 28 | 29 | 30 | def compute_alpha_precision(real_data, synthetic_data, emb_center): 31 | 32 | 33 | emb_center = torch.tensor(emb_center, device=device) 34 | 35 | n_steps = 30 36 | nn_size = 2 37 | alphas = np.linspace(0, 1, n_steps) 38 | 39 | 40 | Radii = np.quantile(torch.sqrt(torch.sum((torch.tensor(real_data).float() - emb_center) ** 2, dim=1)), alphas) 41 | 42 | synth_center = torch.tensor(np.mean(synthetic_data, axis=0)).float() 43 | 44 | alpha_precision_curve = [] 45 | beta_coverage_curve = [] 46 | 47 | synth_to_center = torch.sqrt(torch.sum((torch.tensor(synthetic_data).float() - emb_center) ** 2, dim=1)) 48 | 49 | 50 | nbrs_real = NearestNeighbors(n_neighbors = 2, n_jobs=-1, p=2).fit(real_data) 51 | real_to_real, _ = nbrs_real.kneighbors(real_data) 52 | 53 | nbrs_synth = NearestNeighbors(n_neighbors = 1, n_jobs=-1, p=2).fit(synthetic_data) 54 | real_to_synth, real_to_synth_args = nbrs_synth.kneighbors(real_data) 55 | 56 | # Let us find closest real point to any real point, excluding itself (therefore 1 instead of 0) 57 | real_to_real = torch.from_numpy(real_to_real[:,1].squeeze()) 58 | real_to_synth = torch.from_numpy(real_to_synth.squeeze()) 59 | real_to_synth_args = real_to_synth_args.squeeze() 60 | 61 | real_synth_closest = synthetic_data[real_to_synth_args] 62 | 63 | real_synth_closest_d = torch.sqrt(torch.sum((torch.tensor(real_synth_closest).float()- synth_center) ** 2, dim=1)) 64 | closest_synth_Radii = np.quantile(real_synth_closest_d, alphas) 65 | 66 | 67 | 68 | for k in range(len(Radii)): 69 | precision_audit_mask = (synth_to_center <= Radii[k]).detach().float().numpy() 70 | alpha_precision = np.mean(precision_audit_mask) 71 | 72 | beta_coverage = np.mean(((real_to_synth <= real_to_real) * (real_synth_closest_d <= closest_synth_Radii[k])).detach().float().numpy()) 73 | 74 | alpha_precision_curve.append(alpha_precision) 75 | beta_coverage_curve.append(beta_coverage) 76 | 77 | 78 | # See which one is bigger 79 | 80 | authen = real_to_real[real_to_synth_args] < real_to_synth 81 | authenticity = np.mean(authen.numpy()) 82 | 83 | Delta_precision_alpha = 1 - 2 * np.sum(np.abs(np.array(alphas) - np.array(alpha_precision_curve))) * (alphas[1] - alphas[0]) 84 | Delta_coverage_beta = 1 - 2 * np.sum(np.abs(np.array(alphas) - np.array(beta_coverage_curve))) * (alphas[1] - alphas[0]) 85 | 86 | return alphas, alpha_precision_curve, beta_coverage_curve, Delta_precision_alpha, Delta_coverage_beta, authenticity 87 | -------------------------------------------------------------------------------- /metrics/evaluation_old.py: -------------------------------------------------------------------------------- 1 | 2 | # Copyright (c) 2021, Ahmed M. 
Alaa, Boris van Breugel 3 | # Licensed under the BSD 3-clause license (see LICENSE.txt) 4 | 5 | """ 6 | 7 | ----------------------------------------- 8 | Metrics implementation 9 | ----------------------------------------- 10 | 11 | """ 12 | 13 | from __future__ import absolute_import, division, print_function 14 | 15 | import numpy as np 16 | import sys 17 | from sklearn.neighbors import NearestNeighbors 18 | 19 | import logging 20 | import torch 21 | import scipy 22 | 23 | if not sys.warnoptions: 24 | import warnings 25 | warnings.simplefilter("ignore") 26 | 27 | device = 'cpu' # matrices are too big for gpu 28 | 29 | 30 | def compute_alpha_precision_old(real_data, synthetic_data, emb_center): 31 | n_steps = 30 32 | nn_size = 2 33 | alphas = np.linspace(0, 1, 30) 34 | Radii = [np.quantile(torch.sqrt(torch.sum((torch.tensor(real_data).float() - emb_center) ** 2, dim=1)), alphas[k]) for k in range(len(alphas))] 35 | 36 | synth_center = torch.tensor(np.mean(synthetic_data, axis=0)).float() 37 | synth_Radii = [np.quantile(torch.sqrt(torch.sum((torch.tensor(synthetic_data).float() - synth_center) ** 2, dim=1)), alphas[k]) for k in range(len(alphas))] 38 | 39 | alpha_precision_curve = [] 40 | beta_coverage_curve = [] 41 | 42 | synth_to_center = torch.sqrt(torch.sum((torch.tensor(synthetic_data).float() - emb_center) ** 2, dim=1)) 43 | synth_to_synth_center = torch.sqrt(torch.sum((torch.tensor(synthetic_data).float() - synth_center) ** 2, dim=1)) 44 | real_to_center = torch.sqrt(torch.sum((torch.tensor(real_data).float() - emb_center) ** 2, dim=1)) 45 | 46 | real_to_synth = [np.min(np.sum(np.abs(real_data[k, :] - synthetic_data), axis=1)) for k in range(real_data.shape[0])] 47 | real_to_synth_args = [np.argmin(np.sum(np.abs(real_data[k, :] - synthetic_data), axis=1)) for k in range(real_data.shape[0])] 48 | real_to_synth = torch.tensor(np.array(real_to_synth)).float() 49 | real_synth_closest = np.array([synthetic_data[real_to_synth_args[k], :] for k in range(len(real_to_synth_args))]) 50 | 51 | closest_synth_Radii = [np.quantile(torch.sqrt(torch.sum((torch.tensor(real_synth_closest).float() - synth_center) ** 2, dim=1)), alphas[k]) for k in range(len(alphas))] 52 | real_synth_closest_d = torch.sqrt(torch.sum((torch.tensor(real_synth_closest).float()- synth_center) ** 2, dim=1)) 53 | 54 | real_to_real = [np.partition(np.sum(np.abs(real_data[k, :] - real_data), axis=1), nn_size)[nn_size-1] for k in range(real_data.shape[0])] 55 | real_to_real = torch.tensor(np.array(real_to_real)).float() 56 | 57 | real_to_synth_all = [np.min(np.sum(np.abs(real_data[k, :] - synthetic_data), axis=1)) for k in range(real_data.shape[0])] 58 | real_to_real_all = np.array([np.sum(np.abs(real_data[k, :] - real_data), axis=1) for k in range(real_data.shape[0])]) 59 | dist_probs = [1/np.mean(real_to_synth_all[k] <= real_to_real_all[k, :]) for k in range(real_data.shape[0])] 60 | 61 | for k in range(len(Radii)): 62 | 63 | precision_audit_mask = (synth_to_center <= Radii[k]).detach().float().numpy() 64 | alpha_precision = np.mean(precision_audit_mask) 65 | 66 | beta_coverage = np.mean(((real_to_synth <= real_to_real) * (real_synth_closest_d <= closest_synth_Radii[k])).detach().float().numpy()) 67 | 68 | alpha_precision_curve.append(alpha_precision) 69 | beta_coverage_curve.append(beta_coverage) 70 | 71 | 72 | Delta_precision_alpha = 1 - 2 * np.sum(np.abs(np.array(alphas) - np.array(alpha_precision_curve))) * (alphas[1] - alphas[0]) 73 | Delta_coverage_beta = 1 - 2 * np.sum(np.abs(np.array(alphas) - 
np.array(beta_coverage_curve))) * (alphas[1] - alphas[0]) 74 | 75 | dist_ps = np.array(dist_probs) 76 | dist_min = np.min(dist_ps) 77 | dist_max = np.max(dist_ps) 78 | 79 | thresholds = np.linspace(dist_min, dist_max, 1000) 80 | authen = np.array([np.mean(dist_ps >= thresholds[k]) for k in range(len(thresholds))]) 81 | 82 | return alphas, alpha_precision_curve, beta_coverage_curve, Delta_precision_alpha, Delta_coverage_beta, (thresholds, authen) -------------------------------------------------------------------------------- /metrics/feature_distribution.py: -------------------------------------------------------------------------------- 1 | """Anonymization through Data Synthesis using Generative Adversarial Networks: 2 | A harmonizing advancement for AI in medicine (ADS-GAN) Codebase. 3 | 4 | Reference: Jinsung Yoon, Lydia N. Drumright, Mihaela van der Schaar, 5 | "Anonymization through Data Synthesis using Generative Adversarial Networks (ADS-GAN): 6 | A harmonizing advancement for AI in medicine," 7 | IEEE Journal of Biomedical and Health Informatics (JBHI), 2019. 8 | Paper link: https://ieeexplore.ieee.org/document/9034117 9 | Last updated Date: December 22th 2020 10 | Code author: Jinsung Yoon (jsyoon0823@gmail.com) 11 | ----------------------------- 12 | feature_distribution.py 13 | - Compare feature distribution between original data and synthetic data 14 | """ 15 | 16 | # Import necessary packages 17 | import numpy as np 18 | 19 | def feature_distribution (orig_data, synth_data): 20 | """Compare feature distribution between orig data and synth data 21 | 22 | Args: 23 | orig_data: original data 24 | synth_data: synthetically generated data 25 | 26 | Returns: 27 | dist_comp_table: distribution comparison table 28 | """ 29 | 30 | orig_data = np.asarray(orig_data) 31 | 32 | # Parameters 33 | no, dim = np.shape(orig_data) 34 | 35 | # Output initialization 36 | dist_comp_table = np.zeros([dim, 4]) 37 | 38 | for i in range(dim): 39 | 40 | if len(np.unique(orig_data[:, i])) > 2: 41 | dist_comp_table[i,0] = np.mean(synth_data[:,i]) 42 | dist_comp_table[i,1] = np.std(synth_data[:,i]) 43 | 44 | dist_comp_table[i,2] = np.mean(orig_data[:,i]) 45 | dist_comp_table[i,3] = np.std(orig_data[:,i]) 46 | 47 | else: 48 | dist_comp_table[i,0] = np.sum(synth_data[:,i]==1) 49 | dist_comp_table[i,1] = np.sum(synth_data[:,i]==1) / float(no) 50 | 51 | dist_comp_table[i,2] = np.sum(orig_data[:,i]==1) 52 | dist_comp_table[i,3] = np.sum(orig_data[:,i]==1) / float(no) 53 | 54 | return dist_comp_table 55 | -------------------------------------------------------------------------------- /metrics/fid.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | ''' Calculates the Frechet Inception Distance (FID) to evalulate GANs. 3 | 4 | Paper: GANs Trained by a Two Time-Scale Update Rule Converge to a Local Nash Equilibrium. 5 | Code derived from https://github.com/bioinf-jku/TTUR 6 | 7 | The FID metric calculates the distance between two distributions of images. 8 | Typically, we have summary statistics (mean & covariance matrix) of one 9 | of these distributions, while the 2nd distribution is given by a GAN. 10 | 11 | When run as a stand-alone program, it compares the distribution of 12 | images that are stored as PNG/JPEG at a specified location with a 13 | distribution given by summary statistics (in pickle format). 
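    In this repository the same computation is applied to tabular feature matrices and
    one-class embeddings rather than Inception activations: compute_frechet_distance(X1, X2)
    below fits a Gaussian (mean and covariance) to each matrix and evaluates the
    closed-form Frechet distance between the two fits.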
14 | 15 | The FID is calculated by assuming that X_1 and X_2 are the activations of 16 | the pool_3 layer of the inception net for generated samples and real world 17 | samples respectivly. 18 | 19 | See --help to see further details. 20 | ''' 21 | 22 | from __future__ import absolute_import, division, print_function 23 | import numpy as np 24 | from scipy import linalg 25 | import warnings 26 | 27 | def compute_frechet_distance(X1, X2): 28 | """ 29 | Frechet distance between two datasets that are both assumed Gaussian 30 | 31 | 32 | """ 33 | mu1, cov1 = fit_gaussian(X1) 34 | mu2, cov2 = fit_gaussian(X2) 35 | return calculate_frechet_distance(mu1,cov1,mu2,cov2) 36 | 37 | def fit_gaussian(act): 38 | """Calculation of the statistics used by the FID. 39 | Params: 40 | -- act : activations 41 | Returns: 42 | -- mu : The mean over samples of the activations 43 | -- sigma : The covariance matrix of the activations 44 | """ 45 | mu = np.mean(act, axis=0) 46 | sigma = np.cov(act.T) 47 | return mu, sigma 48 | 49 | 50 | def calculate_frechet_distance(mu1, sigma1, mu2, sigma2, eps=1e-6): 51 | """Numpy implementation of the Frechet Distance. 52 | The Frechet distance between two multivariate Gaussians X_1 ~ N(mu_1, C_1) 53 | and X_2 ~ N(mu_2, C_2) is 54 | d^2 = ||mu_1 - mu_2||^2 + Tr(C_1 + C_2 - 2*sqrt(C_1*C_2)). 55 | 56 | Stable version by Dougal J. Sutherland. 57 | 58 | Params: 59 | -- mu1 : Numpy array containing the activations of the pool_3 layer of the 60 | inception net ( like returned by the function 'get_predictions') 61 | for generated samples. 62 | -- mu2 : The sample mean over activations of the pool_3 layer, precalcualted 63 | on an representive data set. 64 | -- sigma1: The covariance matrix over activations of the pool_3 layer for 65 | generated samples. 66 | -- sigma2: The covariance matrix over activations of the pool_3 layer, 67 | precalcualted on an representive data set. 68 | 69 | Returns: 70 | -- : The Frechet Distance. 71 | """ 72 | 73 | mu1 = np.atleast_1d(mu1) 74 | mu2 = np.atleast_1d(mu2) 75 | 76 | sigma1 = np.atleast_2d(sigma1) 77 | sigma2 = np.atleast_2d(sigma2) 78 | 79 | assert mu1.shape == mu2.shape, "Training and test mean vectors have different lengths" 80 | assert sigma1.shape == sigma2.shape, "Training and test covariances have different dimensions" 81 | 82 | diff = mu1 - mu2 83 | 84 | # product might be almost singular 85 | covmean, _ = linalg.sqrtm(sigma1.dot(sigma2), disp=False) 86 | if not np.isfinite(covmean).all(): 87 | msg = "fid calculation produces singular product; adding %s to diagonal of cov estimates" % eps 88 | warnings.warn(msg) 89 | offset = np.eye(sigma1.shape[0]) * eps 90 | covmean = linalg.sqrtm((sigma1 + offset).dot(sigma2 + offset)) 91 | 92 | # numerical error might give slight imaginary component 93 | if np.iscomplexobj(covmean): 94 | if not np.allclose(np.diagonal(covmean).imag, 0, atol=2e-3): 95 | m = np.max(np.abs(covmean.imag)) 96 | raise ValueError("Imaginary component {}".format(m)) 97 | covmean = covmean.real 98 | 99 | tr_covmean = np.trace(covmean) 100 | 101 | return diff.dot(diff) + np.trace(sigma1) + np.trace(sigma2) - 2 * tr_covmean 102 | -------------------------------------------------------------------------------- /metrics/improved_precision_recall.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # This work is licensed under the Creative Commons Attribution-NonCommercial 4 | # 4.0 International License. 
To view a copy of this license, visit 5 | # http://creativecommons.org/licenses/by-nc/4.0/ or send a letter to 6 | # Creative Commons, PO Box 1866, Mountain View, CA 94042, USA. 7 | 8 | """k-NN precision and recall. 9 | Taken from https://github.com/kynkaat/improved-precision-and-recall-metric/ 10 | Paper: https://arxiv.org/pdf/1904.06991.pdf 11 | 12 | 13 | """ 14 | 15 | import numpy as np 16 | import tensorflow as tf 17 | from time import time 18 | 19 | #---------------------------------------------------------------------------- 20 | 21 | def batch_pairwise_distances(U, V): 22 | """Compute pairwise distances between two batches of feature vectors.""" 23 | with tf.compat.v1.variable_scope('pairwise_dist_block'): 24 | # Squared norms of each row in U and V. 25 | norm_u = tf.reduce_sum(tf.square(U), 1) 26 | norm_v = tf.reduce_sum(tf.square(V), 1) 27 | 28 | # norm_u as a column and norm_v as a row vectors. 29 | norm_u = tf.reshape(norm_u, [-1, 1]) 30 | norm_v = tf.reshape(norm_v, [1, -1]) 31 | 32 | # Pairwise squared Euclidean distances. 33 | D = tf.maximum(norm_u - 2*tf.matmul(U, V, False, True) + norm_v, 0.0) 34 | 35 | return D 36 | 37 | #---------------------------------------------------------------------------- 38 | 39 | class DistanceBlock(): 40 | """Provides multi-GPU support to calculate pairwise distances between two batches of feature vectors.""" 41 | def __init__(self, num_features, num_gpus): 42 | self.num_features = num_features 43 | self.num_gpus = num_gpus 44 | 45 | # Initialize TF graph to calculate pairwise distances. 46 | with tf.device('/cpu:0'): 47 | self._features_batch1 = tf.compat.v1.placeholder(tf.float16, shape=[None, self.num_features]) 48 | self._features_batch2 = tf.compat.v1.placeholder(tf.float16, shape=[None, self.num_features]) 49 | features_split2 = tf.split(self._features_batch2, self.num_gpus, axis=0) 50 | distances_split = [] 51 | for gpu_idx in range(self.num_gpus): 52 | with tf.device('/gpu:%d' % gpu_idx): 53 | distances_split.append(batch_pairwise_distances(self._features_batch1, features_split2[gpu_idx])) 54 | self._distance_block = tf.concat(distances_split, axis=1) 55 | 56 | def pairwise_distances(self, U, V): 57 | """Evaluate pairwise distances between two batches of feature vectors.""" 58 | return self._distance_block.eval(feed_dict={self._features_batch1: U, self._features_batch2: V}) 59 | 60 | #---------------------------------------------------------------------------- 61 | 62 | class ManifoldEstimator(): 63 | """Estimates the manifold of given feature vectors.""" 64 | 65 | def __init__(self, distance_block, features, row_batch_size=25000, col_batch_size=50000, 66 | nhood_sizes=[3], clamp_to_percentile=None, eps=1e-5): 67 | """Estimate the manifold of given feature vectors. 68 | 69 | Args: 70 | distance_block: DistanceBlock object that distributes pairwise distance 71 | calculation to multiple GPUs. 72 | features (np.array/tf.Tensor): Matrix of feature vectors to estimate their manifold. 73 | row_batch_size (int): Row batch size to compute pairwise distances 74 | (parameter to trade-off between memory usage and performance). 75 | col_batch_size (int): Column batch size to compute pairwise distances. 76 | nhood_sizes (list): Number of neighbors used to estimate the manifold. 77 | clamp_to_percentile (float): Prune hyperspheres that have radius larger than 78 | the given percentile. 79 | eps (float): Small number for numerical stability. 
80 | """ 81 | num_images = features.shape[0] 82 | self.nhood_sizes = nhood_sizes 83 | self.num_nhoods = len(nhood_sizes) 84 | self.eps = eps 85 | self.row_batch_size = row_batch_size 86 | self.col_batch_size = col_batch_size 87 | self._ref_features = features 88 | self._distance_block = distance_block 89 | 90 | # Estimate manifold of features by calculating distances to k-NN of each sample. 91 | self.D = np.zeros([num_images, self.num_nhoods], dtype=np.float16) 92 | distance_batch = np.zeros([row_batch_size, num_images], dtype=np.float16) 93 | seq = np.arange(max(self.nhood_sizes) + 1, dtype=np.int32) 94 | 95 | for begin1 in range(0, num_images, row_batch_size): 96 | end1 = min(begin1 + row_batch_size, num_images) 97 | row_batch = features[begin1:end1] 98 | 99 | for begin2 in range(0, num_images, col_batch_size): 100 | end2 = min(begin2 + col_batch_size, num_images) 101 | col_batch = features[begin2:end2] 102 | 103 | # Compute distances between batches. 104 | distance_batch[0:end1-begin1, begin2:end2] = self._distance_block.pairwise_distances(row_batch, col_batch) 105 | 106 | # Find the k-nearest neighbor from the current batch. 107 | self.D[begin1:end1, :] = np.partition(distance_batch[0:end1-begin1, :], seq, axis=1)[:, self.nhood_sizes] 108 | 109 | if clamp_to_percentile is not None: 110 | max_distances = np.percentile(self.D, clamp_to_percentile, axis=0) 111 | self.D[self.D > max_distances] = 0 112 | 113 | def evaluate(self, eval_features, return_realism=False, return_neighbors=False): 114 | """Evaluate if new feature vectors are at the manifold.""" 115 | num_eval_images = eval_features.shape[0] 116 | num_ref_images = self.D.shape[0] 117 | distance_batch = np.zeros([self.row_batch_size, num_ref_images], dtype=np.float32) 118 | batch_predictions = np.zeros([num_eval_images, self.num_nhoods], dtype=np.int32) 119 | max_realism_score = np.zeros([num_eval_images,], dtype=np.float32) 120 | nearest_indices = np.zeros([num_eval_images,], dtype=np.int32) 121 | 122 | for begin1 in range(0, num_eval_images, self.row_batch_size): 123 | end1 = min(begin1 + self.row_batch_size, num_eval_images) 124 | feature_batch = eval_features[begin1:end1] 125 | 126 | for begin2 in range(0, num_ref_images, self.col_batch_size): 127 | end2 = min(begin2 + self.col_batch_size, num_ref_images) 128 | ref_batch = self._ref_features[begin2:end2] 129 | 130 | distance_batch[0:end1-begin1, begin2:end2] = self._distance_block.pairwise_distances(feature_batch, ref_batch) 131 | 132 | # From the minibatch of new feature vectors, determine if they are in the estimated manifold. 133 | # If a feature vector is inside a hypersphere of some reference sample, then 134 | # the new sample lies at the estimated manifold. 135 | # The radii of the hyperspheres are determined from distances of neighborhood size k. 
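                # Shape sketch of the check below: distance_batch[0:end1-begin1, :, None] has
                # shape (batch, num_ref_images, 1) and self.D has shape (num_ref_images, num_nhoods),
                # so the comparison broadcasts to (batch, num_ref_images, num_nhoods); np.any over
                # axis=1 then flags an eval sample as "in the manifold" if it falls inside at least
                # one reference hypersphere for the given neighbourhood size.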
136 | samples_in_manifold = distance_batch[0:end1-begin1, :, None] <= self.D 137 | batch_predictions[begin1:end1] = np.any(samples_in_manifold, axis=1).astype(np.int32) 138 | 139 | max_realism_score[begin1:end1] = np.max(self.D[:, 0] / (distance_batch[0:end1-begin1, :] + self.eps), axis=1) 140 | nearest_indices[begin1:end1] = np.argmin(distance_batch[0:end1-begin1, :], axis=1) 141 | 142 | if return_realism and return_neighbors: 143 | return batch_predictions, max_realism_score, nearest_indices 144 | elif return_realism: 145 | return batch_predictions, max_realism_score 146 | elif return_neighbors: 147 | return batch_predictions, nearest_indices 148 | 149 | return batch_predictions 150 | 151 | #---------------------------------------------------------------------------- 152 | 153 | def knn_precision_recall_features(ref_features, eval_features, nhood_sizes=[3], 154 | row_batch_size=25000, col_batch_size=50000, num_gpus=1): 155 | """Calculates k-NN precision and recall for two sets of feature vectors. 156 | 157 | Args: 158 | ref_features (np.array/tf.Tensor): Feature vectors of reference images. 159 | eval_features (np.array/tf.Tensor): Feature vectors of generated images. 160 | nhood_sizes (list): Number of neighbors used to estimate the manifold. 161 | row_batch_size (int): Row batch size to compute pairwise distances 162 | (parameter to trade-off between memory usage and performance). 163 | col_batch_size (int): Column batch size to compute pairwise distances. 164 | num_gpus (int): Number of GPUs used to evaluate precision and recall. 165 | 166 | Returns: 167 | State (dict): Dict that contains precision and recall calculated from 168 | ref_features and eval_features. 169 | """ 170 | state = dict() 171 | num_images = ref_features.shape[0] 172 | num_features = ref_features.shape[1] 173 | 174 | # Initialize DistanceBlock and ManifoldEstimators. 175 | distance_block = DistanceBlock(num_features, num_gpus) 176 | ref_manifold = ManifoldEstimator(distance_block, ref_features, row_batch_size, col_batch_size, nhood_sizes) 177 | eval_manifold = ManifoldEstimator(distance_block, eval_features, row_batch_size, col_batch_size, nhood_sizes) 178 | 179 | # Evaluate precision and recall using k-nearest neighbors. 180 | print('Evaluating k-NN precision and recall with %i samples...' % num_images) 181 | start = time() 182 | 183 | # Precision: How many points from eval_features are in ref_features manifold. 184 | precision = ref_manifold.evaluate(eval_features) 185 | state['precision'] = precision.mean(axis=0) 186 | 187 | # Recall: How many points from ref_features are in eval_features manifold. 188 | recall = eval_manifold.evaluate(ref_features) 189 | state['recall'] = recall.mean(axis=0) 190 | 191 | print('Evaluated k-NN precision and recall in: %gs' % (time() - start)) 192 | 193 | return state['precision'], state['recall'] 194 | 195 | #---------------------------------------------------------------------------- -------------------------------------------------------------------------------- /metrics/parzen.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Parzen window loglikelihood estimate, 4 | Breuleux, O., Bengio, Y., and Vincent, P. (2011). Quickly generating representative samples from an 5 | RBM-derived process. Neural Computation, 23(8), 2053–2073. 
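    In compute_parzen below, a Gaussian kernel density estimate is fitted to the synthetic
    samples and the reported score is the mean log-likelihood (and its standard error) of the
    original samples under that estimate; the bandwidth sigma is cross-validated on a held-out
    split when it is not supplied.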
6 | 7 | 8 | Original code author: Yann N.Dauphin and Ian Goodfellow 9 | https://github.com/goodfeli/adversarial/blob/master/parzen_ll.py 10 | Modified by Boris van Breugel (bv292@cam.ac.uk) 11 | 12 | """ 13 | 14 | import numpy as np 15 | import theano.tensor as T 16 | import theano 17 | from tqdm import tqdm 18 | 19 | 20 | def get_nll(x, parzen, batch_size=10): 21 | """ 22 | Credit: Yann N. Dauphin 23 | """ 24 | 25 | inds = range(x.shape[0]) 26 | n_batches = int(np.ceil(float(len(inds)) / batch_size)) 27 | 28 | nlls = [] 29 | for i in range(n_batches): 30 | nll = parzen(x[inds[i::n_batches]]) 31 | nlls.extend(nll) 32 | 33 | return np.array(nlls) 34 | 35 | 36 | def log_mean_exp(a): 37 | """ 38 | Credit: Yann N. Dauphin 39 | """ 40 | 41 | max_ = a.max(1) 42 | 43 | return max_ + T.log(T.exp(a - max_.dimshuffle(0, 'x')).mean(1)) 44 | 45 | 46 | def theano_parzen(mu, sigma): 47 | """ 48 | Credit: Yann N. Dauphin 49 | """ 50 | 51 | x = T.matrix() 52 | mu = theano.shared(mu) 53 | a = ( x.dimshuffle(0, 'x', 1) - mu.dimshuffle('x', 0, 1) ) / sigma 54 | E = log_mean_exp(-0.5*(a**2).sum(2)) 55 | Z = mu.shape[1] * T.log(sigma * np.sqrt(np.pi * 2)) 56 | 57 | return theano.function([x], E - Z) 58 | 59 | 60 | def cross_validate_sigma(samples, data, sigmas, batch_size): 61 | 62 | lls = [] 63 | for sigma in tqdm(sigmas): 64 | print(sigma) 65 | parzen = theano_parzen(samples, sigma) 66 | tmp = get_nll(data, parzen, batch_size = batch_size) 67 | lls.append(np.asarray(tmp).mean()) 68 | del parzen 69 | 70 | ind = np.argmax(lls) 71 | return sigmas[ind] 72 | 73 | 74 | def compute_parzen(orig_data, synth_data, sigma=None, start_sigma=-0.5, end_sigma=0.5, num_cv_evals=10, batch_size = 10): 75 | # Preprocess the data 76 | orig_data = np.asarray(orig_data) 77 | synth_data = np.asarray(synth_data) 78 | 79 | no, x_dim = np.shape(orig_data) 80 | 81 | 82 | 83 | if sigma is None: 84 | # Divide train / test 85 | orig_data_valid = orig_data[:int(no/5),:] 86 | orig_data_test = orig_data[int(no/5):,:] 87 | 88 | synth_data_valid = synth_data[:int(no/5),:] 89 | synth_data_test = synth_data[int(no/5):,:] 90 | sigma_range = np.logspace(start_sigma, end_sigma, num=num_cv_evals) 91 | sigma = cross_validate_sigma(synth_data_valid, orig_data_valid, sigma_range, batch_size) 92 | else: 93 | orig_data_test = orig_data 94 | synth_data_test = synth_data 95 | # fit and evaluate 96 | print('Using Sigma:', sigma) 97 | parzen = theano_parzen(synth_data_test, sigma) 98 | ll = get_nll(orig_data_test, parzen, batch_size = batch_size) 99 | se = ll.std() / np.sqrt(orig_data_test.shape[0]) 100 | 101 | return ll.mean(), se -------------------------------------------------------------------------------- /metrics/prd_score.py: -------------------------------------------------------------------------------- 1 | # - 2 | # coding=utf-8 3 | # Taken from: 4 | # https://github.com/google/compare_gan/blob/master/compare_gan/src/prd_score.py 5 | # 6 | # Changes: 7 | # - default dpi changed from 150 to 300 8 | # - added handling of cases where P = Q, where precision/recall may be 9 | # just above 1, leading to errors for the f_beta computation 10 | # 11 | # Copyright 2018 Google LLC & Hwalsuk Lee. 12 | # 13 | # Licensed under the Apache License, Version 2.0 (the "License"); 14 | # you may not use this file except in compliance with the License. 
15 | # You may obtain a copy of the License at 16 | # 17 | # http://www.apache.org/licenses/LICENSE-2.0 18 | # 19 | # Unless required by applicable law or agreed to in writing, software 20 | # distributed under the License is distributed on an "AS IS" BASIS, 21 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 22 | # See the License for the specific language governing permissions and 23 | # limitations under the License. 24 | 25 | """Precision and recall computation based on samples from two distributions. 26 | 27 | Given a sample from the true and the fake distribution embedded in some feature 28 | space (say, Inception), it computes the precision and recall via the algorithm 29 | presented in [arxiv.org/abs/1806.00035]. Finally, one can plot the resulting 30 | curves for different models. 31 | 32 | Typical usage example: 33 | 34 | import prd 35 | prd_data_1 = prd.compute_prd_from_embedding(eval_feats_1, ref_feats_1) 36 | prd_data_2 = prd.compute_prd_from_embedding(eval_feats_2, ref_feats_2) 37 | prd.plot([prd_data_1, prd_data_2], ['GAN_1', 'GAN_2']) 38 | """ 39 | 40 | from __future__ import absolute_import 41 | from __future__ import division 42 | from __future__ import print_function 43 | 44 | from matplotlib import pyplot as plt 45 | import numpy as np 46 | import sklearn.cluster 47 | 48 | 49 | def compute_prd(eval_dist, ref_dist, num_angles=1001, epsilon=1e-10): 50 | """Computes the PRD curve for discrete distributions. 51 | 52 | This function computes the PRD curve for the discrete distribution eval_dist 53 | with respect to the reference distribution ref_dist. This implements the 54 | algorithm in [arxiv.org/abs/1806.2281349]. The PRD will be computed for an 55 | equiangular grid of num_angles values between [0, pi/2]. 56 | 57 | Args: 58 | eval_dist: 1D NumPy array or list of floats with the probabilities of the 59 | different states under the distribution to be evaluated. 60 | ref_dist: 1D NumPy array or list of floats with the probabilities of the 61 | different states under the reference distribution. 62 | num_angles: Number of angles for which to compute PRD. Must be in [3, 1e6]. 63 | The default value is 1001. 64 | epsilon: Angle for PRD computation in the edge cases 0 and pi/2. The PRD 65 | will be computes for epsilon and pi/2-epsilon, respectively. 66 | The default value is 1e-10. 67 | 68 | Returns: 69 | precision: NumPy array of shape [num_angles] with the precision for the 70 | different ratios. 71 | recall: NumPy array of shape [num_angles] with the recall for the different 72 | ratios. 73 | 74 | Raises: 75 | ValueError: If not 0 < epsilon <= 0.1. 76 | ValueError: If num_angles < 3. 77 | """ 78 | 79 | if not (epsilon > 0 and epsilon < 0.1): 80 | raise ValueError('epsilon must be in (0, 0.1] but is %s.' % str(epsilon)) 81 | if not (num_angles >= 3 and num_angles <= 1e6): 82 | raise ValueError('num_angles must be in [3, 1e6] but is %d.' 
% num_angles) 83 | 84 | # Compute slopes for linearly spaced angles between [0, pi/2] 85 | angles = np.linspace(epsilon, np.pi/2 - epsilon, num=num_angles) 86 | slopes = np.tan(angles) 87 | 88 | # Broadcast slopes so that second dimension will be states of the distribution 89 | slopes_2d = np.expand_dims(slopes, 1) 90 | 91 | # Broadcast distributions so that first dimension represents the angles 92 | ref_dist_2d = np.expand_dims(ref_dist, 0) 93 | eval_dist_2d = np.expand_dims(eval_dist, 0) 94 | 95 | # Compute precision and recall for all angles in one step via broadcasting 96 | precision = np.minimum(ref_dist_2d*slopes_2d, eval_dist_2d).sum(axis=1) 97 | recall = precision / slopes 98 | 99 | # handle numerical instabilities leaing to precision/recall just above 1 100 | max_val = max(np.max(precision), np.max(recall)) 101 | if max_val > 1.001: 102 | raise ValueError('Detected value > 1.001, this should not happen.') 103 | precision = np.clip(precision, 0, 1) 104 | recall = np.clip(recall, 0, 1) 105 | 106 | return precision, recall 107 | 108 | 109 | def _cluster_into_bins(eval_data, ref_data, num_clusters): 110 | """Clusters the union of the data points and returns the cluster distribution. 111 | 112 | Clusters the union of eval_data and ref_data into num_clusters using minibatch 113 | k-means. Then, for each cluster, it computes the number of points from 114 | eval_data and ref_data. 115 | 116 | Args: 117 | eval_data: NumPy array of data points from the distribution to be evaluated. 118 | ref_data: NumPy array of data points from the reference distribution. 119 | num_clusters: Number of cluster centers to fit. 120 | 121 | Returns: 122 | Two NumPy arrays, each of size num_clusters, where i-th entry represents the 123 | number of points assigned to the i-th cluster. 124 | """ 125 | 126 | cluster_data = np.vstack([eval_data, ref_data]) 127 | kmeans = sklearn.cluster.MiniBatchKMeans(n_clusters=num_clusters, n_init=10) 128 | labels = kmeans.fit(cluster_data).labels_ 129 | 130 | eval_labels = labels[:len(eval_data)] 131 | ref_labels = labels[len(eval_data):] 132 | 133 | eval_bins = np.histogram(eval_labels, bins=num_clusters, 134 | range=[0, num_clusters], density=True)[0] 135 | ref_bins = np.histogram(ref_labels, bins=num_clusters, 136 | range=[0, num_clusters], density=True)[0] 137 | return eval_bins, ref_bins 138 | 139 | 140 | def compute_prd_from_embedding(eval_data, ref_data, num_clusters=20, 141 | num_angles=1001, num_runs=10, 142 | enforce_balance=True): 143 | """Computes PRD data from sample embeddings. 144 | 145 | The points from both distributions are mixed and then clustered. This leads 146 | to a pair of histograms of discrete distributions over the cluster centers 147 | on which the PRD algorithm is executed. 148 | 149 | The number of points in eval_data and ref_data must be equal since 150 | unbalanced distributions bias the clustering towards the larger dataset. The 151 | check can be disabled by setting the enforce_balance flag to False (not 152 | recommended). 153 | 154 | Args: 155 | eval_data: NumPy array of data points from the distribution to be evaluated. 156 | ref_data: NumPy array of data points from the reference distribution. 157 | num_clusters: Number of cluster centers to fit. The default value is 20. 158 | num_angles: Number of angles for which to compute PRD. Must be in [3, 1e6]. 159 | The default value is 1001. 160 | num_runs: Number of independent runs over which to average the PRD data. 
161 | enforce_balance: If enabled, throws exception if eval_data and ref_data do 162 | not have the same length. The default value is True. 163 | 164 | Returns: 165 | precision: NumPy array of shape [num_angles] with the precision for the 166 | different ratios. 167 | recall: NumPy array of shape [num_angles] with the recall for the different 168 | ratios. 169 | 170 | Raises: 171 | ValueError: If len(eval_data) != len(ref_data) and enforce_balance is set to 172 | True. 173 | """ 174 | 175 | if enforce_balance and len(eval_data) != len(ref_data): 176 | raise ValueError( 177 | 'The number of points in eval_data %d is not equal to the number of ' 178 | 'points in ref_data %d. To disable this exception, set enforce_balance ' 179 | 'to False (not recommended).' % (len(eval_data), len(ref_data))) 180 | 181 | eval_data = np.array(eval_data, dtype=np.float64) 182 | ref_data = np.array(ref_data, dtype=np.float64) 183 | precisions = [] 184 | recalls = [] 185 | for _ in range(num_runs): 186 | eval_dist, ref_dist = _cluster_into_bins(eval_data, ref_data, num_clusters) 187 | precision, recall = compute_prd(eval_dist, ref_dist, num_angles) 188 | precisions.append(precision) 189 | recalls.append(recall) 190 | precision = np.mean(precisions, axis=0) 191 | recall = np.mean(recalls, axis=0) 192 | return precision, recall 193 | 194 | 195 | def _prd_to_f_beta(precision, recall, beta=1, epsilon=1e-10): 196 | """Computes F_beta scores for the given precision/recall values. 197 | 198 | The F_beta scores for all precision/recall pairs will be computed and 199 | returned. 200 | 201 | For precision p and recall r, the F_beta score is defined as: 202 | F_beta = (1 + beta^2) * (p * r) / ((beta^2 * p) + r) 203 | 204 | Args: 205 | precision: 1D NumPy array of precision values in [0, 1]. 206 | recall: 1D NumPy array of precision values in [0, 1]. 207 | beta: Beta parameter. Must be positive. The default value is 1. 208 | epsilon: Small constant to avoid numerical instability caused by division 209 | by 0 when precision and recall are close to zero. 210 | 211 | Returns: 212 | NumPy array of same shape as precision and recall with the F_beta scores for 213 | each pair of precision/recall. 214 | 215 | Raises: 216 | ValueError: If any value in precision or recall is outside of [0, 1]. 217 | ValueError: If beta is not positive. 218 | """ 219 | 220 | if not ((precision >= 0).all() and (precision <= 1).all()): 221 | raise ValueError('All values in precision must be in [0, 1].') 222 | if not ((recall >= 0).all() and (recall <= 1).all()): 223 | raise ValueError('All values in recall must be in [0, 1].') 224 | if beta <= 0: 225 | raise ValueError('Given parameter beta %s must be positive.' % str(beta)) 226 | 227 | return (1 + beta**2) * (precision * recall) / ( 228 | (beta**2 * precision) + recall + epsilon) 229 | 230 | 231 | def prd_to_max_f_beta_pair(precision, recall, beta=8): 232 | """Computes max. F_beta and max. F_{1/beta} for precision/recall pairs. 233 | 234 | Computes the maximum F_beta and maximum F_{1/beta} score over all pairs of 235 | precision/recall values. This is useful to compress a PRD plot into a single 236 | pair of values which correlate with precision and recall. 237 | 238 | For precision p and recall r, the F_beta score is defined as: 239 | F_beta = (1 + beta^2) * (p * r) / ((beta^2 * p) + r) 240 | 241 | Args: 242 | precision: 1D NumPy array or list of precision values in [0, 1]. 243 | recall: 1D NumPy array or list of precision values in [0, 1]. 244 | beta: Beta parameter. Must be positive. 
The default value is 8. 245 | 246 | Returns: 247 | f_beta: Maximum F_beta score. 248 | f_beta_inv: Maximum F_{1/beta} score. 249 | 250 | Raises: 251 | ValueError: If beta is not positive. 252 | """ 253 | 254 | if not ((precision >= 0).all() and (precision <= 1).all()): 255 | raise ValueError('All values in precision must be in [0, 1].') 256 | if not ((recall >= 0).all() and (recall <= 1).all()): 257 | raise ValueError('All values in recall must be in [0, 1].') 258 | if beta <= 0: 259 | raise ValueError('Given parameter beta %s must be positive.' % str(beta)) 260 | 261 | f_beta = np.max(_prd_to_f_beta(precision, recall, beta)) 262 | f_beta_inv = np.max(_prd_to_f_beta(precision, recall, 1/beta)) 263 | return f_beta, f_beta_inv 264 | 265 | 266 | def plot(precision_recall_pairs, labels=None, out_path=None, 267 | legend_loc='lower left', dpi=300): 268 | """Plots precision recall curves for distributions. 269 | 270 | Creates the PRD plot for the given data and stores the plot in a given path. 271 | 272 | Args: 273 | precision_recall_pairs: List of prd_data to plot. Each item in this list is 274 | a 2D array of precision and recall values for the 275 | same number of ratios. 276 | labels: Optional list of labels of same length as list_of_prd_data. The 277 | default value is None. 278 | out_path: Output path for the resulting plot. If None, the plot will be 279 | opened via plt.show(). The default value is None. 280 | legend_loc: Location of the legend. The default value is 'lower left'. 281 | dpi: Dots per inch (DPI) for the figure. The default value is 150. 282 | 283 | Raises: 284 | ValueError: If labels is a list of different length than list_of_prd_data. 285 | """ 286 | 287 | 288 | if labels is not None and len(labels) != len(precision_recall_pairs): 289 | raise ValueError( 290 | 'Length of labels %d must be identical to length of ' 291 | 'precision_recall_pairs %d.' 292 | % (len(labels), len(precision_recall_pairs))) 293 | 294 | fig = plt.figure(figsize=(3.5, 3.5), dpi=dpi) 295 | plot_handle = fig.add_subplot(111) 296 | plot_handle.tick_params(axis='both', which='major', labelsize=12) 297 | 298 | for i in range(len(precision_recall_pairs)): 299 | precision, recall = precision_recall_pairs[i] 300 | label = labels[i] if labels is not None else None 301 | plt.plot(recall, precision, label=label, alpha=0.5, linewidth=3) 302 | 303 | if labels is not None: 304 | plt.legend(loc=legend_loc) 305 | 306 | plt.xlim([0, 1]) 307 | plt.ylim([0, 1]) 308 | plt.xlabel('Recall', fontsize=12) 309 | plt.ylabel('Precision', fontsize=12) 310 | plt.tight_layout() 311 | if out_path is None: 312 | plt.show() 313 | else: 314 | plt.savefig(out_path, bbox_inches='tight', dpi=dpi) 315 | plt.close() -------------------------------------------------------------------------------- /metrics/prdc.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | Taken from https://github.com/clovaai/generative-evaluation-prdc 5 | 6 | prdc 7 | Copyright (c) 2020-present NAVER Corp. 8 | MIT license 9 | 10 | """ 11 | 12 | import numpy as np 13 | import sklearn.metrics 14 | 15 | __all__ = ['compute_prdc'] 16 | 17 | 18 | def compute_pairwise_distance(data_x, data_y=None): 19 | """ 20 | Args: 21 | data_x: numpy.ndarray([N, feature_dim], dtype=np.float32) 22 | data_y: numpy.ndarray([N, feature_dim], dtype=np.float32) 23 | Returns: 24 | numpy.ndarray([N, N], dtype=np.float32) of pairwise distances. 
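        For example, data_x of shape (1000, 64) and data_y of shape (500, 64) would give a
        (1000, 500) matrix of Euclidean distances; when data_y is None the matrix is square.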
25 | """ 26 | if data_y is None: 27 | data_y = data_x 28 | dists = sklearn.metrics.pairwise_distances( 29 | data_x, data_y, metric='euclidean', n_jobs=8) 30 | return dists 31 | 32 | 33 | def get_kth_value(unsorted, k, axis=-1): 34 | """ 35 | Args: 36 | unsorted: numpy.ndarray of any dimensionality. 37 | k: int 38 | Returns: 39 | kth values along the designated axis. 40 | """ 41 | indices = np.argpartition(unsorted, k, axis=axis)[..., :k] 42 | k_smallests = np.take_along_axis(unsorted, indices, axis=axis) 43 | kth_values = k_smallests.max(axis=axis) 44 | return kth_values 45 | 46 | 47 | def compute_nearest_neighbour_distances(input_features, nearest_k): 48 | """ 49 | Args: 50 | input_features: numpy.ndarray([N, feature_dim], dtype=np.float32) 51 | nearest_k: int 52 | Returns: 53 | Distances to kth nearest neighbours. 54 | """ 55 | distances = compute_pairwise_distance(input_features) 56 | radii = get_kth_value(distances, k=nearest_k + 1, axis=-1) 57 | return radii 58 | 59 | 60 | def compute_prdc(real_features, fake_features, nearest_k=5): 61 | """ 62 | Computes precision, recall, density, and coverage given two manifolds. 63 | Args: 64 | real_features: numpy.ndarray([N, feature_dim], dtype=np.float32) 65 | fake_features: numpy.ndarray([N, feature_dim], dtype=np.float32) 66 | nearest_k: int. 67 | Returns: 68 | dict of precision, recall, density, and coverage. 69 | """ 70 | 71 | real_nearest_neighbour_distances = compute_nearest_neighbour_distances( 72 | real_features, nearest_k) 73 | fake_nearest_neighbour_distances = compute_nearest_neighbour_distances( 74 | fake_features, nearest_k) 75 | distance_real_fake = compute_pairwise_distance( 76 | real_features, fake_features) 77 | 78 | precision = ( 79 | distance_real_fake < 80 | np.expand_dims(real_nearest_neighbour_distances, axis=1) 81 | ).any(axis=0).mean() 82 | 83 | recall = ( 84 | distance_real_fake < 85 | np.expand_dims(fake_nearest_neighbour_distances, axis=0) 86 | ).any(axis=1).mean() 87 | 88 | density = (1. / float(nearest_k)) * ( 89 | distance_real_fake < 90 | np.expand_dims(real_nearest_neighbour_distances, axis=1) 91 | ).sum(axis=0).mean() 92 | 93 | coverage = ( 94 | distance_real_fake.min(axis=1) < 95 | real_nearest_neighbour_distances 96 | ).mean() 97 | 98 | return dict(precision=precision, recall=recall, 99 | density=density, coverage=coverage) -------------------------------------------------------------------------------- /metrics/precision_recall.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | 4 | From 5 | https://github.com/msmsajjadi/precision-recall-distributions/blob/master/prd_from_image_folders.py 6 | """ 7 | 8 | # coding=utf-8 9 | # Copyright: Mehdi S. M. 
Sajjadi (msajjadi.com) 10 | 11 | import metrics.prd_score as prd 12 | from metrics.improved_precision_recall import knn_precision_recall_features 13 | 14 | def compute_prc(orig_data,synth_data, params=None, plot_path=None, improved_version=True, verbose=True): 15 | if verbose: 16 | print('computing PRD') 17 | if improved_version: 18 | prd_data = knn_precision_recall_features(orig_data,synth_data) 19 | else: 20 | if params is None: 21 | params = {} 22 | params['num_clusters'] = 20 23 | params['num_angles'] = 1001 24 | params['num_runs'] = 10 25 | prd_data = prd.compute_prd_from_embedding( 26 | eval_data=synth_data, 27 | ref_data=orig_data, 28 | num_clusters=params['num_clusters'], 29 | num_angles=params['num_angles'], 30 | num_runs=params['num_runs']) 31 | 32 | precision, recall = prd_data 33 | 34 | if verbose: 35 | print('plotting results') 36 | 37 | f_beta = prd.prd_to_max_f_beta_pair(precision, recall, beta=8) 38 | print('%.3f %.3f' % (f_beta[0], f_beta[1])) 39 | 40 | return prd_data 41 | -------------------------------------------------------------------------------- /predictive_models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedmalaa/evaluating-generative-models/093910e487d07959db7d87b54698da60aaeb50c0/predictive_models/__init__.py -------------------------------------------------------------------------------- /representations/OneClass.py: -------------------------------------------------------------------------------- 1 | 2 | # Copyright (c) 2021, Ahmed M. Alaa 3 | # Licensed under the BSD 3-clause license (see LICENSE.txt) 4 | 5 | """ 6 | 7 | ----------------------------------------- 8 | One-class representations 9 | ----------------------------------------- 10 | 11 | """ 12 | 13 | from __future__ import absolute_import, division, print_function 14 | 15 | import numpy as np 16 | import sys 17 | 18 | import logging 19 | import torch 20 | import torch.nn as nn 21 | 22 | if not sys.warnoptions: 23 | import warnings 24 | warnings.simplefilter("ignore") 25 | 26 | from representations.networks import * 27 | 28 | from torch.autograd import Variable 29 | 30 | # One-class loss functions 31 | # ------------------------ 32 | 33 | 34 | def OneClassLoss(outputs, c): 35 | 36 | dist = torch.sum((outputs - c) ** 2, dim=1) 37 | loss = torch.mean(dist) 38 | 39 | return loss 40 | 41 | 42 | def SoftBoundaryLoss(outputs, R, c, nu): 43 | 44 | dist = torch.sum((outputs - c) ** 2, dim=1) 45 | scores = dist - R ** 2 46 | loss = R ** 2 + (1 / nu) * torch.mean(torch.max(torch.zeros_like(scores), scores)) 47 | 48 | scores = dist 49 | loss = (1 / nu) * torch.mean(torch.max(torch.zeros_like(scores), scores)) 50 | 51 | return loss 52 | 53 | 54 | LossFns = dict({"OneClass": OneClassLoss, "SoftBoundary": SoftBoundaryLoss}) 55 | 56 | # Base network 57 | # --------------------- 58 | 59 | class BaseNet(nn.Module): 60 | 61 | """Base class for all neural networks.""" 62 | 63 | def __init__(self): 64 | 65 | super().__init__() 66 | 67 | self.logger = logging.getLogger(self.__class__.__name__) 68 | self.rep_dim = None # representation dimensionality, i.e. 
dim of the last layer 69 | 70 | def forward(self, *input): 71 | 72 | """Forward pass logic 73 | 74 | :return: Network output 75 | """ 76 | raise NotImplementedError 77 | 78 | def summary(self): 79 | 80 | """Network summary.""" 81 | 82 | net_parameters = filter(lambda p: p.requires_grad, self.parameters()) 83 | params = sum([np.prod(p.size()) for p in net_parameters]) 84 | 85 | self.logger.info('Trainable parameters: {}'.format(params)) 86 | self.logger.info(self) 87 | 88 | 89 | def get_radius(dist:torch.Tensor, nu:float): 90 | 91 | """Optimally solve for radius R via the (1-nu)-quantile of distances.""" 92 | 93 | return np.quantile(np.sqrt(dist.clone().data.float().numpy()), 1 - nu) 94 | 95 | class OneClassLayer(BaseNet): 96 | 97 | def __init__(self, params=None, hyperparams=None): 98 | 99 | super().__init__() 100 | 101 | # set all representation parameters - remove these lines 102 | 103 | self.rep_dim = params["rep_dim"] 104 | self.input_dim = params["input_dim"] 105 | self.num_layers = params["num_layers"] 106 | self.num_hidden = params["num_hidden"] 107 | self.activation = params["activation"] 108 | self.dropout_prob = params["dropout_prob"] 109 | self.dropout_active = params["dropout_active"] 110 | self.loss_type = params["LossFn"] 111 | self.train_prop = params['train_prop'] 112 | self.learningRate = params['lr'] 113 | self.epochs = params['epochs'] 114 | self.warm_up_epochs = params['warm_up_epochs'] 115 | self.weight_decay = params['weight_decay'] 116 | if torch.cuda.is_available(): 117 | self.device = torch.device('cuda') # Make this an option 118 | else: 119 | self.device = torch.device('cpu') 120 | # set up the network 121 | 122 | self.model = build_network(network_name="feedforward", params=params).to(self.device) 123 | 124 | # create the loss function 125 | 126 | self.c = hyperparams["center"].to(self.device) 127 | self.R = hyperparams["Radius"] 128 | self.nu = hyperparams["nu"] 129 | 130 | self.loss_fn = LossFns[self.loss_type] 131 | 132 | 133 | def forward(self, x): 134 | 135 | x = self.model(x) 136 | 137 | return x 138 | 139 | 140 | def fit(self, x_train, verbosity=True): 141 | 142 | 143 | self.optimizer = torch.optim.AdamW(self.model.parameters(), lr=self.learningRate, weight_decay = self.weight_decay) 144 | self.X = torch.tensor(x_train.reshape((-1, self.input_dim))).float() 145 | 146 | if self.train_prop != 1: 147 | x_train, x_val = x_train[:int(self.train_prop*len(x_train))], x_train[int(self.train_prop*len(x_train)):] 148 | inputs_val = Variable(torch.from_numpy(x_val).to(self.device)).float() 149 | 150 | self.losses = [] 151 | self.loss_vals = [] 152 | 153 | 154 | for epoch in range(self.epochs): 155 | 156 | # Converting inputs and labels to Variable 157 | 158 | inputs = Variable(torch.from_numpy(x_train)).to(self.device).float() 159 | 160 | self.model.zero_grad() 161 | 162 | self.optimizer.zero_grad() 163 | 164 | # get output from the model, given the inputs 165 | outputs = self.model(inputs) 166 | 167 | # get loss for the predicted output 168 | 169 | if self.loss_type=="SoftBoundary": 170 | 171 | self.loss = self.loss_fn(outputs=outputs, R=self.R, c=self.c, nu=self.nu) 172 | 173 | elif self.loss_type=="OneClass": 174 | 175 | self.loss = self.loss_fn(outputs=outputs, c=self.c) 176 | 177 | 178 | #self.c = torch.mean(torch.tensor(outputs).float(), dim=0) 179 | 180 | # get gradients w.r.t to parameters 181 | self.loss.backward(retain_graph=True) 182 | self.losses.append(self.loss.detach().cpu().numpy()) 183 | 184 | # update parameters 185 | self.optimizer.step() 186 | 
187 | if (epoch >= self.warm_up_epochs) and (self.loss_type=="SoftBoundary"): 188 | 189 | dist = torch.sum((outputs - self.c) ** 2, dim=1) 190 | #self.R = torch.tensor(get_radius(dist, self.nu)) 191 | 192 | if self.train_prop != 1.0: 193 | with torch.no_grad(): 194 | 195 | # get output from the model, given the inputs 196 | outputs = self.model(inputs_val) 197 | 198 | # get loss for the predicted output 199 | 200 | if self.loss_type=="SoftBoundary": 201 | 202 | loss_val = self.loss_fn(outputs=outputs, R=self.R, c=self.c, nu=self.nu) 203 | 204 | elif self.loss_type=="OneClass": 205 | 206 | loss_val = self.loss_fn(outputs=outputs, c=self.c).detach().cpu().numpy() 207 | 208 | self.loss_vals.append(loss_val) 209 | 210 | 211 | 212 | 213 | if verbosity: 214 | if self.train_prop == 1: 215 | print('epoch {}, loss {}'.format(epoch, self.loss.item())) 216 | else: 217 | print('epoch {:4}, train loss {:.4e}, val loss {:.4e}'.format(epoch, self.loss.item(),loss_val)) 218 | 219 | 220 | -------------------------------------------------------------------------------- /representations/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedmalaa/evaluating-generative-models/093910e487d07959db7d87b54698da60aaeb50c0/representations/__init__.py -------------------------------------------------------------------------------- /representations/networks.py: -------------------------------------------------------------------------------- 1 | 2 | # Copyright (c) 2021, Ahmed M. Alaa 3 | # Licensed under the BSD 3-clause license (see LICENSE.txt) 4 | 5 | """ 6 | 7 | ----------------------------------------- 8 | Construction of feature representations 9 | ----------------------------------------- 10 | 11 | + build_network: 12 | -------------- 13 | | 14 | +--------> feedforward_network: 15 | | 16 | +--------> recurrent_network: 17 | | 18 | +--------> MNIST_network: 19 | 20 | """ 21 | 22 | # TODO: add arguments details 23 | 24 | 25 | from __future__ import absolute_import, division, print_function 26 | 27 | # import numpy as np 28 | # import pandas as pd 29 | import sys 30 | 31 | if not sys.warnoptions: 32 | import warnings 33 | warnings.simplefilter("ignore") 34 | 35 | import torch 36 | # from torch.autograd import Variable 37 | # import torch.nn.functional as nnf 38 | # from torch.utils.data import random_split 39 | # from torch.optim import SGD 40 | from torch import nn 41 | 42 | 43 | # from copy import deepcopy 44 | # import time 45 | 46 | torch.manual_seed(1) 47 | 48 | # Global variables 49 | 50 | ACTIVATION_DICT = {"ReLU": torch.nn.ReLU(), 51 | "Hardtanh": torch.nn.Hardtanh(), 52 | "ReLU6": torch.nn.ReLU6(), 53 | "Sigmoid": torch.nn.Sigmoid(), 54 | "Tanh": torch.nn.Tanh(), 55 | "ELU": torch.nn.ELU(), 56 | "CELU": torch.nn.CELU(), 57 | "SELU": torch.nn.SELU(), 58 | "GLU": torch.nn.GLU(), 59 | "LeakyReLU": torch.nn.LeakyReLU(), 60 | "LogSigmoid": torch.nn.LogSigmoid(), 61 | "Softplus": torch.nn.Softplus()} 62 | 63 | 64 | def build_network(network_name, params): 65 | 66 | if network_name=="feedforward": 67 | 68 | net = feedforward_network(params) 69 | 70 | return net 71 | 72 | 73 | def feedforward_network(params): 74 | 75 | """Architecture for a Feedforward Neural Network 76 | 77 | Args: 78 | 79 | ::params:: 80 | 81 | ::params["input_dim"]:: 82 | ::params["rep_dim"]:: 83 | ::params["num_hidden"]:: 84 | ::params["activation"]:: 85 | ::params["num_layers"]:: 86 | ::params["dropout_prob"]:: 87 | ::params["dropout_active"]:: 88 |
::params["LossFn"]:: 89 | 90 | Returns: 91 | 92 | ::_architecture:: 93 | 94 | """ 95 | 96 | modules = [] 97 | 98 | if params["dropout_active"]: 99 | 100 | modules.append(torch.nn.Dropout(p=params["dropout_prob"])) 101 | 102 | # Input layer 103 | 104 | modules.append(torch.nn.Linear(params["input_dim"], params["num_hidden"],bias=False)) 105 | modules.append(ACTIVATION_DICT[params["activation"]]) 106 | 107 | # Intermediate layers 108 | 109 | for u in range(params["num_layers"] - 1): 110 | 111 | if params["dropout_active"]: 112 | 113 | modules.append(torch.nn.Dropout(p=params["dropout_prob"])) 114 | 115 | modules.append(torch.nn.Linear(params["num_hidden"], params["num_hidden"], 116 | bias=False)) 117 | modules.append(ACTIVATION_DICT[params["activation"]]) 118 | 119 | 120 | # Output layer 121 | 122 | modules.append(torch.nn.Linear(params["num_hidden"], params["rep_dim"],bias=False)) 123 | 124 | _architecture = nn.Sequential(*modules) 125 | 126 | return _architecture 127 | -------------------------------------------------------------------------------- /representations/ts_embedding/__init__.py: -------------------------------------------------------------------------------- 1 | """Timeseries encoding to a fixed size vector representation. 2 | 3 | Author: Evgeny Saveliev (e.s.saveliev@gmail.com) 4 | """ 5 | 6 | from .seq2seq_autoencoder import Encoder, Decoder, Seq2Seq, init_hidden, compute_loss 7 | from .training import train_seq2seq_autoencoder, iterate_eval_set 8 | -------------------------------------------------------------------------------- /representations/ts_embedding/seq2seq_autoencoder.py: -------------------------------------------------------------------------------- 1 | """Seq-2-Seq autoencoder. 2 | """ 3 | import torch 4 | import torch.nn as nn 5 | from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence 6 | 7 | 8 | class Encoder(nn.Module): 9 | def __init__(self, input_size, hidden_size, num_rnn_layers): 10 | super(Encoder, self).__init__() 11 | 12 | self.input_size = input_size 13 | self.hidden_size = hidden_size 14 | self.num_rnn_layers = num_rnn_layers 15 | 16 | self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size, num_layers=num_rnn_layers, batch_first=True) 17 | 18 | def forward(self, x, x_seq_lengths, hc, padding_value, max_seq_len): 19 | x = pack_padded_sequence(x, x_seq_lengths, batch_first=True, enforce_sorted=False) 20 | x, hc = self.lstm(x, hc) 21 | x, x_seq_lens = pad_packed_sequence(x, batch_first=True, padding_value=padding_value, total_length=max_seq_len) 22 | return x, x_seq_lens, hc 23 | 24 | 25 | class Decoder(nn.Module): 26 | def __init__(self, input_size, hidden_size, num_rnn_layers): 27 | super(Decoder, self).__init__() 28 | 29 | self.input_size = input_size 30 | self.hidden_size = hidden_size 31 | self.num_rnn_layers = num_rnn_layers 32 | 33 | self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size, num_layers=num_rnn_layers, batch_first=True) 34 | self.linear = nn.Linear(hidden_size, input_size) 35 | 36 | def forward(self, x, x_seq_lengths, hc, padding_value, max_seq_len): 37 | batch_size = x.shape[0] 38 | x = pack_padded_sequence(x, x_seq_lengths, batch_first=True, enforce_sorted=False) 39 | x, hc = self.lstm(x, hc) 40 | x, x_seq_lens = pad_packed_sequence(x, batch_first=True, padding_value=padding_value, total_length=max_seq_len) 41 | # x = x.contiguous() 42 | x = x.view(-1, self.hidden_size) 43 | x = self.linear(x) 44 | x = x.view(batch_size, -1, self.input_size) 45 | return x, x_seq_lens, hc 46 | 47 | 48 | class 
Seq2Seq(nn.Module): 49 | def __init__(self, encoder, decoder): 50 | super(Seq2Seq, self).__init__() 51 | assert encoder.input_size == decoder.input_size 52 | assert encoder.hidden_size == decoder.hidden_size 53 | self.encoder = encoder 54 | self.decoder = decoder 55 | def forward(self, x_enc, x_dec, x_seq_lengths, hc_init, padding_value, max_seq_len): 56 | # print(x_enc.dtype, x_dec.dtype, x_seq_lengths.dtype, hc_init[0].dtype, hc_init[1].dtype) 57 | x_enc_out, _, hc_enc = self.encoder(x_enc, x_seq_lengths, hc_init, padding_value, max_seq_len) 58 | # print("x_enc.shape", x_enc.shape) 59 | # print("x_enc_out.shape", x_enc_out.shape) 60 | x_dec_out, _, hc_dec = self.decoder(x_dec, x_seq_lengths, hc_enc, padding_value, max_seq_len) 61 | return x_dec_out, hc_enc 62 | def get_embeddings_only(self, x_enc, x_seq_lengths, hc_init, padding_value, max_seq_len): 63 | _, _, hc_enc = self.encoder(x_enc, x_seq_lengths, hc_init, padding_value, max_seq_len) 64 | return hc_enc 65 | 66 | 67 | def init_hidden(batch_size, hidden_size, num_rnn_layers, device): 68 | h = torch.zeros(num_rnn_layers, batch_size, hidden_size, device=device, dtype=torch.float32) 69 | c = torch.zeros(num_rnn_layers, batch_size, hidden_size, device=device, dtype=torch.float32) 70 | return (h, c) 71 | 72 | 73 | def compute_loss(loss_function, x_pred, x_targ, x_seq_len): 74 | assert x_pred.shape == x_targ.shape 75 | 76 | mask = torch.ones_like(x_pred, dtype=int).to(x_pred.device) 77 | mask_seq_len = x_seq_len - 1 # As target sequence is one shorter. 78 | for idx, l in enumerate(mask_seq_len): 79 | mask[idx, l.item():, :] = 0. 80 | 81 | x_pred *= mask 82 | x_targ *= mask 83 | 84 | loss = loss_function(x_pred, x_targ) 85 | return loss 86 | -------------------------------------------------------------------------------- /representations/ts_embedding/training.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | try: 4 | import IPython.display 5 | except ImportError: 6 | print("IPython not found, ts_embedding > training live plot will not work.") 7 | 8 | import torch 9 | import torch.nn as nn 10 | # import torch.optim as optim 11 | 12 | from .seq2seq_autoencoder import init_hidden, compute_loss 13 | 14 | 15 | loss_function = nn.MSELoss(reduction="none") 16 | 17 | 18 | def iterate_eval_set(seq2seq, dataloader, padding_value, max_seq_len): 19 | epoch_test_loss = 0. 
20 | 21 | seq2seq.eval() 22 | n_samples_test = 0 23 | with torch.no_grad(): 24 | for iter_, (x, x_len, x_rev, x_rev_shift) in enumerate(dataloader): 25 | batch_size = x.shape[0] 26 | n_samples_test += batch_size 27 | 28 | hc_init = init_hidden( 29 | batch_size=batch_size, 30 | hidden_size=seq2seq.encoder.hidden_size, 31 | num_rnn_layers=seq2seq.encoder.num_rnn_layers, 32 | device=x.device) 33 | 34 | x_dec_out, hc_repr = seq2seq( 35 | x_enc=x, 36 | x_dec=x_rev, 37 | x_seq_lengths=x_len, 38 | hc_init=hc_init, 39 | padding_value=padding_value, 40 | max_seq_len=max_seq_len 41 | ) 42 | 43 | loss_tensor = compute_loss( 44 | loss_function=loss_function, x_pred=x_dec_out, x_targ=x_rev_shift, x_seq_len=x_len) 45 | loss = loss_tensor.mean() 46 | epoch_test_loss += loss.item() * batch_size 47 | 48 | epoch_test_loss /= n_samples_test 49 | 50 | return epoch_test_loss 51 | 52 | 53 | def train_seq2seq_autoencoder( 54 | seq2seq, 55 | optimizer, 56 | train_dataloader, 57 | val_dataloader, 58 | n_epochs, 59 | batch_size, 60 | padding_value, 61 | max_seq_len, 62 | jupyter_live_plot_enabled=False 63 | ): 64 | 65 | train_losses, val_losses = np.full([n_epochs], np.nan), np.full([n_epochs], np.nan) 66 | x_axis = list(range(1, n_epochs + 1)) 67 | 68 | for epoch in range(n_epochs): 69 | epoch_train_loss = 0. 70 | epoch_val_loss = 0. 71 | # print(f"Epoch {epoch}") 72 | 73 | seq2seq.train() 74 | n_samples_train = 0 75 | for iter_, (x, x_len, x_rev, x_rev_shift) in enumerate(train_dataloader): 76 | batch_size = x.shape[0] 77 | n_samples_train += batch_size 78 | 79 | optimizer.zero_grad() 80 | hc_init = init_hidden( 81 | batch_size=batch_size, 82 | hidden_size=seq2seq.encoder.hidden_size, 83 | num_rnn_layers=seq2seq.encoder.num_rnn_layers, 84 | device=x.device) 85 | 86 | x_dec_out, hc_repr = seq2seq( 87 | x_enc=x, 88 | x_dec=x_rev, 89 | x_seq_lengths=x_len, 90 | hc_init=hc_init, 91 | padding_value=padding_value, 92 | max_seq_len=max_seq_len 93 | ) 94 | 95 | loss_tensor = compute_loss( 96 | loss_function=loss_function, x_pred=x_dec_out, x_targ=x_rev_shift, x_seq_len=x_len) 97 | loss = loss_tensor.mean() 98 | epoch_train_loss += loss.item() * batch_size 99 | 100 | loss.backward() 101 | optimizer.step() 102 | 103 | epoch_train_loss /= n_samples_train 104 | 105 | epoch_val_loss = iterate_eval_set( 106 | seq2seq=seq2seq, dataloader=val_dataloader, padding_value=padding_value, max_seq_len=max_seq_len) 107 | 108 | train_losses[epoch] = epoch_train_loss 109 | val_losses[epoch] = epoch_val_loss 110 | 111 | if jupyter_live_plot_enabled or (not jupyter_live_plot_enabled and epoch == n_epochs-1): 112 | # A live updating plot showing the training and validation over time (i.e. over epochs). 
113 | plt.plot(x_axis, train_losses, label = "training loss") 114 | plt.plot(x_axis, val_losses, label = "validation loss") 115 | plt.title("Training Tracker") 116 | plt.legend() 117 | x_max = n_epochs 118 | y_max = np.nanmax(train_losses) 119 | plt.xlim(1, x_max) 120 | plt.ylim(0, y_max) 121 | if jupyter_live_plot_enabled: 122 | IPython.display.clear_output(wait=True) 123 | plt.show() 124 | plt.savefig("./training_log.png", dpi=300) 125 | 126 | print(f"Epoch {epoch}: Tr.Ls.={epoch_train_loss:.3f} Vl.Ls.={epoch_val_loss:.3f}") 127 | -------------------------------------------------------------------------------- /representations/ts_embedding/utils.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | import numpy as np 4 | 5 | import torch 6 | from torch.utils.data import TensorDataset, DataLoader 7 | 8 | from .seq2seq_autoencoder import init_hidden 9 | 10 | 11 | def rearrange_data(x, x_len, pad_val, eos_val): 12 | """Take in sequence `x` [dims `(n_samples, max_seq_len, n_features)`, data type `float`] and an array of 13 | sequence lengths `x_len` [dims `(n_samples,)`, data type `int`] and return: 14 | * a reversed sequence `x_rev`, same dims as `x`, and padded at the same indices as `x`. 15 | * a reversed and shifted (forward by one) sequence `x_rev_shifted`, same dims as `x`, and padded at the same 16 | indices as `x`. Like `x_rev` but sequence elements at x_{t} become x_{t-1}, so element at `t=0` is lost 17 | and the element at `t=t_end_of_sequence` is assigned `eos_val`. 18 | Note that `x` is expected to be padded at the end along the sequence dimension, rather than at the beginning. 19 | 20 | Args: 21 | x (np.ndarray): sequence data [dims `(n_samples, max_seq_len, n_features)`, data type `float`]. 22 | x_len (np.ndarray): array of sequence lengths [dims `(n_samples,)`, data type `int`]. 23 | pad_val (float): padding value to use in output arrays. 24 | eos_val (float): end-of-sequence indicator value to use in the output `x_rev_shifted`. 25 | 26 | Returns: 27 | Tuple[np.ndarray, np.ndarray]: x_rev, x_rev_shifted 28 | """ 29 | x_rev = np.full_like(x, pad_val) 30 | x_rev_shifted = np.full_like(x, pad_val) 31 | for idx, l in enumerate(x_len): 32 | x_rev[idx][:l] = x[idx][:l][::-1].copy() 33 | x_rev_shifted[idx][:l-1] = x_rev[idx][1:l] 34 | x_rev_shifted[idx][l-1] = eos_val 35 | return x_rev, x_rev_shifted 36 | 37 | 38 | def data_to_tensors(x, x_len, x_rev, x_rev_shifted, float_type, device): 39 | X = torch.tensor(x, device=device, dtype=float_type) 40 | X_rev = torch.tensor(x_rev, device=device, dtype=float_type) 41 | X_rev_shifted = torch.tensor(x_rev_shifted, device=device, dtype=float_type) 42 | X_len = torch.tensor(x_len, dtype=int) # CPU by requirement of packing. 43 | return X, X_len, X_rev, X_rev_shifted 44 | 45 | 46 | def inference_data_to_tensors(x, x_len, float_type, device): 47 | X = torch.tensor(x, device=device, dtype=float_type) 48 | X_len = torch.tensor(x_len, dtype=int) # CPU by requirement of packing. 49 | return X, X_len 50 | 51 | 52 | def _generate_dummy_data(n_samples, min_timesteps, max_timesteps, n_features, pad_val, seed): 53 | np.random.seed(seed) 54 | 55 | seq_lengths = np.random.randint(low=min_timesteps, high=max_timesteps+1, size=n_samples) 56 | # ^ We assume all features for the same example have same seq length. 
57 | 58 | data = np.full((n_samples, max_timesteps, n_features), pad_val) 59 | for i, length in enumerate(seq_lengths): 60 | generated_data = np.random.randn(length, n_features) 61 | data[i, 0:length, :] = generated_data 62 | 63 | return data, seq_lengths 64 | 65 | 66 | def generate_dummy_data( 67 | n_samples: int, 68 | min_timesteps: int, 69 | max_timesteps: int, 70 | n_features: int, 71 | pad_val: float, 72 | eos_val: float, 73 | seed: int, 74 | to_tensors: bool, 75 | float_type: Optional[torch.dtype] = None, 76 | device: Optional[torch.device] = None): 77 | 78 | x, x_len = _generate_dummy_data(n_samples, min_timesteps, max_timesteps, n_features, pad_val, seed) 79 | x_rev, x_rev_shifted = rearrange_data(x, x_len, pad_val, eos_val) 80 | 81 | if to_tensors: 82 | x, x_len, x_rev, x_rev_shifted = data_to_tensors( 83 | x, x_len, x_rev, x_rev_shifted, float_type=float_type, device=device) 84 | 85 | return x, x_len, x_rev, x_rev_shifted 86 | 87 | 88 | def make_dataloader(data_tensors, **dataloader_kwargs): 89 | dataset = TensorDataset(*data_tensors) 90 | dataloader = DataLoader(dataset, **dataloader_kwargs) 91 | return dataset, dataloader 92 | 93 | 94 | def _hc_repr_to_np(hc_repr): 95 | h, c = hc_repr 96 | batch_size = h.shape[1] 97 | h, c = h.view(batch_size, -1), c.view(batch_size, -1) 98 | h, c = h.detach().cpu().numpy(), c.detach().cpu().numpy() 99 | hc = np.hstack([h, c]) 100 | return hc 101 | 102 | 103 | def get_embeddings(seq2seq, dataloaders, padding_value, max_seq_len): 104 | """Put together the embeddings: stack horizontally the arrays of h and c; stack vertically these arrays. 105 | """ 106 | hc_np_list = [] 107 | for dataloader in dataloaders: 108 | seq2seq.eval() 109 | with torch.no_grad(): 110 | for iter_, dataloader_items in enumerate(dataloader): 111 | x, x_len = dataloader_items[0], dataloader_items[1] 112 | batch_size = x.shape[0] 113 | hc_init = init_hidden( 114 | batch_size=batch_size, 115 | hidden_size=seq2seq.encoder.hidden_size, 116 | num_rnn_layers=seq2seq.encoder.num_rnn_layers, 117 | device=x.device) 118 | hc_repr = seq2seq.get_embeddings_only( 119 | x_enc=x, 120 | x_seq_lengths=x_len, 121 | hc_init=hc_init, 122 | padding_value=padding_value, 123 | max_seq_len=max_seq_len) 124 | hc_np = _hc_repr_to_np(hc_repr) 125 | hc_np_list.append(hc_np) 126 | hc_all = np.vstack(hc_np_list) 127 | return hc_all 128 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | keras==2.4.3 2 | scikit-learn==0.23.2 3 | pillow==8.1.0 4 | pandas==1.2.0 5 | matplotlib==3.3.2 6 | tqdm==4.55.1 7 | theano 8 | torch==1.7.1 -------------------------------------------------------------------------------- /requirements_dpgan.txt: -------------------------------------------------------------------------------- 1 | # Requirements for running main_tabular.py with 'dpgan' option 2 | # cudatoolkit: 10.0 3 | # cudnn: 7.6.5 4 | matplotlib==3.3.2 5 | numpy==1.19.2 6 | pandas==1.2.1 7 | torch==1.4.0 8 | scikit-learn==0.23.2 9 | scipy==1.5.2 10 | tqdm==4.56.0 11 | tensorflow-gpu==1.15.0 12 | theano==1.0.5 -------------------------------------------------------------------------------- /requirements_timegan.txt: -------------------------------------------------------------------------------- 1 | numpy==1.19.2 2 | pandas==1.1.3 3 | tqdm==4.55.1 4 | scikit-learn==0.23.2 5 | tensorflow==1.15.0 -------------------------------------------------------------------------------- 
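A note on the data layout used by representations/ts_embedding/utils.py above: rearrange_data builds the decoder targets by reversing each valid (unpadded) prefix and then shifting it forward by one step, placing an end-of-sequence marker in the freed slot. A tiny worked example, with values, pad_val and eos_val chosen arbitrarily for illustration:

import numpy as np

from representations.ts_embedding.utils import rearrange_data

# One sample with 3 valid steps and 1 feature, padded to max_seq_len = 4 with pad_val = 0.
x = np.array([[[1.], [2.], [3.], [0.]]])
x_len = np.array([3])

x_rev, x_rev_shifted = rearrange_data(x, x_len, pad_val=0., eos_val=-1.)

# x_rev[0]         -> [[3.], [2.], [1.], [0.]]   valid steps reversed, padding left in place
# x_rev_shifted[0] -> [[2.], [1.], [-1.], [0.]]  x_rev moved forward one step; -1. marks end of sequence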
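Putting the seq2seq_autoencoder.py, training.py and utils.py pieces together, the intended flow is: build padded/reversed tensors, wrap them in dataloaders, train the Seq2Seq autoencoder, then read off the encoder's final hidden states as fixed-size embeddings. The sketch below is a minimal, hypothetical example of that flow, using the dummy-data helper in place of a real dataset; all sizes and hyperparameter values are illustrative, not the settings used elsewhere in this repository.

import torch

from representations.ts_embedding import Encoder, Decoder, Seq2Seq, train_seq2seq_autoencoder
from representations.ts_embedding.utils import generate_dummy_data, make_dataloader, get_embeddings

PAD, EOS = -1.0, -2.0          # padding and end-of-sequence values (illustrative)
MAX_LEN, N_FEATURES = 10, 5
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Each call returns (x, x_len, x_rev, x_rev_shifted) tensors, already on the chosen device.
train_tensors = generate_dummy_data(
    n_samples=1000, min_timesteps=3, max_timesteps=MAX_LEN, n_features=N_FEATURES,
    pad_val=PAD, eos_val=EOS, seed=0, to_tensors=True, float_type=torch.float32, device=device)
val_tensors = generate_dummy_data(
    n_samples=200, min_timesteps=3, max_timesteps=MAX_LEN, n_features=N_FEATURES,
    pad_val=PAD, eos_val=EOS, seed=1, to_tensors=True, float_type=torch.float32, device=device)

_, train_dl = make_dataloader(train_tensors, batch_size=64, shuffle=True)
_, val_dl = make_dataloader(val_tensors, batch_size=64, shuffle=False)

encoder = Encoder(input_size=N_FEATURES, hidden_size=32, num_rnn_layers=1)
decoder = Decoder(input_size=N_FEATURES, hidden_size=32, num_rnn_layers=1)
seq2seq = Seq2Seq(encoder, decoder).to(device)
optimizer = torch.optim.Adam(seq2seq.parameters(), lr=1e-3)

train_seq2seq_autoencoder(
    seq2seq, optimizer, train_dl, val_dl,
    n_epochs=20, batch_size=64, padding_value=PAD, max_seq_len=MAX_LEN)

# Embeddings are the flattened final (h, c) encoder states, stacked across the given loaders;
# pass shuffle=False dataloaders if the row order must match the original data order.
embeddings = get_embeddings(seq2seq, [train_dl, val_dl], padding_value=PAD, max_seq_len=MAX_LEN)
print(embeddings.shape)        # (1200, 2 * num_rnn_layers * hidden_size) = (1200, 64)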
/utils/__init__.py: -------------------------------------------------------------------------------- 1 | """Common utilities. 2 | """ 3 | 4 | from .utils import * 5 | -------------------------------------------------------------------------------- /utils/utils.py: -------------------------------------------------------------------------------- 1 | def check_tf2(): 2 | found = False 3 | message = "Note: TensorFlow 2.x not found, some functionality may not be available." 4 | try: 5 | import tensorflow as tf 6 | if str(tf.__version__).split(".")[0] == "2": # pylint: disable=no-member 7 | found = True 8 | except ImportError: 9 | pass 10 | if not found: 11 | print(message) 12 | return found 13 | --------------------------------------------------------------------------------
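For completeness, representations/OneClass.py and representations/networks.py above are configured through two dictionaries whose keys are read in OneClassLayer.__init__ and feedforward_network. The following is a minimal, hypothetical sketch of fitting the one-class representation to an array of feature vectors; every numeric value is a placeholder rather than a recommended setting.

import numpy as np
import torch

from representations.OneClass import OneClassLayer

X = np.random.randn(500, 32).astype(np.float32)     # e.g. embeddings or tabular features

params = {
    "rep_dim": 16, "input_dim": X.shape[1],
    "num_layers": 2, "num_hidden": 64,
    "activation": "ReLU",                  # any key of ACTIVATION_DICT in networks.py
    "dropout_prob": 0.2, "dropout_active": False,
    "LossFn": "OneClass",                  # or "SoftBoundary"
    "train_prop": 1,                       # 1 = train on everything, no validation split
    "lr": 1e-3, "epochs": 100, "warm_up_epochs": 10, "weight_decay": 1e-2,
}
hyperparams = {
    "center": torch.ones(params["rep_dim"]) * 10,   # fixed centre c; non-zero to avoid a trivial collapse with bias-free layers
    "Radius": 1.0,                                   # only used by the SoftBoundary loss
    "nu": 0.1,
}

model = OneClassLayer(params=params, hyperparams=hyperparams)
model.fit(X, verbosity=False)

with torch.no_grad():
    Z = model(torch.from_numpy(X).float().to(model.device))    # one-class representations of X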
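Finally, the metric helpers shown earlier (metrics/prdc.py, and metrics/prd_score.py as used by metrics/precision_recall.py) all operate on two arrays of feature vectors, one real and one synthetic. Below is a short, hypothetical sketch of calling them directly; the random features and the nearest_k value are placeholders.

import numpy as np

import metrics.prd_score as prd
from metrics.prdc import compute_prdc

real_features = np.random.rand(1000, 64).astype(np.float32)   # e.g. one-class or seq2seq embeddings of real data
fake_features = np.random.rand(1000, 64).astype(np.float32)   # embeddings of the synthetic data

# Precision / recall / density / coverage from k-NN manifold estimates.
prdc_scores = compute_prdc(real_features, fake_features, nearest_k=5)
print(prdc_scores)   # dict with keys: precision, recall, density, coverage

# Clustering-based PRD curve and its max F_beta / F_{1/beta} summary, as in compute_prc above.
precision, recall = prd.compute_prd_from_embedding(
    eval_data=fake_features, ref_data=real_features,
    num_clusters=20, num_angles=1001, num_runs=10)
f_beta, f_beta_inv = prd.prd_to_max_f_beta_pair(precision, recall, beta=8)
prd.plot([(precision, recall)], labels=['synthetic'], out_path='prd_curve.png')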