├── .gitignore ├── LICENSE.txt ├── README.md ├── TorchOCC.ipynb ├── audit.py ├── data ├── __init__.py ├── amsterdam │ ├── __init__.py │ ├── data_preprocess.py │ ├── data_scripts.py │ └── data_utils.py ├── googlestock │ ├── __init__.py │ └── data_preprocess.py └── snp500 │ └── all_stocks_5yr.csv ├── environment.yml ├── generative_models ├── __init__.py ├── adsgan.py ├── dpgan │ ├── __init__.py │ ├── impl │ │ ├── __init__.py │ │ ├── data.py │ │ └── fc.py │ ├── main.py │ └── utilize.py ├── gan.py ├── pategan.py ├── pategan_from_bitbucket.py ├── rgan │ └── experiments │ │ └── settings │ │ ├── rgan-dp.txt │ │ ├── rgan.txt │ │ ├── rgan_dp.txt │ │ ├── sine.txt │ │ └── test_modified.txt ├── timegan │ ├── __init__.py │ ├── timegan.py │ └── utils.py └── vae.py ├── main_image.py ├── main_tabular.py ├── main_timeseries.py ├── main_timeseries_embedding.py ├── metrics ├── __init__.py ├── combined.py ├── compute_identifiability.py ├── compute_wd.py ├── evaluation.py ├── evaluation_old.py ├── feature_distribution.py ├── fid.py ├── improved_precision_recall.py ├── parzen.py ├── prd_score.py ├── prdc.py └── precision_recall.py ├── predictive_models └── __init__.py ├── representations ├── OneClass.py ├── __init__.py ├── networks.py └── ts_embedding │ ├── __init__.py │ ├── seq2seq_autoencoder.py │ ├── training.py │ └── utils.py ├── requirements.txt ├── requirements_dpgan.txt ├── requirements_timegan.txt ├── toy_metric_evaluation.ipynb └── utils ├── __init__.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Standard gitignore for python projects, from: 2 | # https://github.com/DonJayamanne/vscode-python-samples/blob/master/.gitignore 3 | 4 | # Byte-compiled / optimized / DLL files 5 | __pycache__/ 6 | *.py[cod] 7 | *$py.class 8 | 9 | # C extensions 10 | *.so 11 | 12 | # Distribution / packaging 13 | .Python 14 | env/ 15 | build/ 16 | develop-eggs/ 17 | dist/ 18 | downloads/ 19 | eggs/ 20 | .eggs/ 21 | lib/ 22 | lib64/ 23 | parts/ 24 | sdist/ 25 | var/ 26 | wheels/ 27 | *.egg-info/ 28 | .installed.cfg 29 | *.egg 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | .hypothesis/ 51 | 52 | # Translations 53 | *.mo 54 | *.pot 55 | 56 | # Django stuff: 57 | *.log 58 | local_settings.py 59 | 60 | # Flask stuff: 61 | instance/ 62 | .webassets-cache 63 | 64 | # Scrapy stuff: 65 | .scrapy 66 | 67 | # Sphinx documentation 68 | docs/_build/ 69 | 70 | # PyBuilder 71 | target/ 72 | 73 | # Jupyter Notebook 74 | .ipynb_checkpoints 75 | 76 | # pyenv 77 | .python-version 78 | 79 | # celery beat schedule file 80 | celerybeat-schedule 81 | 82 | # SageMath parsed files 83 | *.sage.py 84 | 85 | # dotenv 86 | .env 87 | 88 | # virtualenv 89 | .venv 90 | venv/ 91 | ENV/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | 106 | # --- 107 | 108 | # Custom: 109 | *.pkl 110 | .tmp* 111 | .vscode/ 112 | .pylintrc 113 | visualisations/ 114 | jupyter/ 115 | models/ 116 | 117 | # Data dir content 118 | data/* 119 | !data/**/ 120 | !data/__init__.py 121 | 122 | data/amsterdam/* 123 | !data/amsterdam/*.py 124 | data/googlestock/* 125 | !data/googlestock/*.py 126 | data/snp500/* 127 | !data/snp500/*.csv 128 | !data/snp500/*.py 129 | 130 | data/ts_embedding/* 131 | !data/ts_embedding/*.py 132 | data/ts_generated/* 133 | !data/ts_generated/*.py 134 | 135 | data/mnist/* 136 | !data/mnist/*.py 137 | 138 | data/tabular/original/* 139 | !data/tabular/original/*.py 140 | data/tabular/synth/* 141 | !data/tabular/synth/*.py 142 | 143 | generative_models/rgan/experiments/data 144 | generative_models/rgan/experiments/parameters 145 | generative_models/rgan/experiments/plots 146 | generative_models/rgan/experiments/traces 147 | 148 | /training_log.png 149 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) [year] [fullname] 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | 23 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # evaluating-generative-models 2 | 3 | 🚧 This codebase is still a work in progress - expect some updates as we finalize and tidy up the code. 4 | -------------------------------------------------------------------------------- /audit.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | """ 4 | Author: Boris van Breugel (bv292@cam.ac.uk) 5 | ----------------------------------------- 6 | Auditing implementation 7 | ----------------------------------------- 8 | 9 | """ 10 | 11 | import numpy as np 12 | from sklearn.neighbors import NearestNeighbors 13 | 14 | import logging 15 | import torch 16 | import scipy 17 | from generative_models.adsgan import adsgan 18 | from metrics.evaluation import compute_alpha_precision 19 | from metrics.evaluation_old import compute_alpha_precision_old 20 | 21 | device = 'cuda' # matrices are too big for gpu 22 | 23 | 24 | def audit(real_data, params, OC_model): 25 | 26 | 27 | n_steps = 30 28 | n_orig = real_data.shape[0] 29 | nn_size = 2 30 | alphas = np.linspace(0, 1, n_steps) 31 | 32 | emb_center = torch.tensor(OC_model.c, device='cpu') 33 | 34 | with torch.no_grad(): 35 | X = OC_model(torch.tensor(real_data.to_numpy(), device=OC_model.device).float().to(device)).cpu().detach().numpy() 36 | 37 | Radii = np.quantile(torch.sqrt(torch.sum((torch.tensor(X).float() - emb_center) ** 2, dim=1)), alphas) 38 | alpha_precision_curve = [] 39 | beta_coverage_curve = [] 40 | nbrs_real = NearestNeighbors(n_neighbors = 2, n_jobs=-1, p=2).fit(X) 41 | real_to_real, real_to_real_args = nbrs_real.kneighbors(X) 42 | real_to_real = torch.from_numpy(real_to_real[:,1].squeeze()) 43 | 44 | print('Difference a;lf', (real_to_real_args[:,0]==np.arange(n_orig)).mean()) 45 | real_to_real_args = real_to_real_args[:,1].squeeze() 46 | 47 | 48 | number_per_quantile = np.round(np.quantile(np.arange(n_orig),alphas)) 49 | number_per_quantile = number_per_quantile[1:] - number_per_quantile[:-1] 50 | 51 | r2r = scipy.spatial.distance_matrix(X,X) 52 | r2r[np.eye(n_orig, dtype='bool')] = np.max(r2r)+1 #just set it large so it's not chosen 53 | min_r2r = np.min(r2r,axis=1) 54 | min_r2r_args = np.argmin(r2r,axis=1) 55 | print('min_r2r', (min_r2r==0).mean()) 56 | 57 | print('Difference abs', np.max(np.abs(min_r2r-real_to_real.numpy()))) 58 | print('Difference arguments') 59 | 60 | 61 | synthetic_data = [] 62 | 63 | generate_more = True 64 | iteration = 0 65 | 66 | while generate_more: 67 | print('Iteration:',iteration) 68 | iteration+=1 69 | synth_data = adsgan(real_data, params) 70 | with torch.no_grad(): 71 | Y = OC_model(torch.tensor(synth_data, device=OC_model.device).float().to(device)).cpu().detach().numpy() 72 | 73 | 74 | 75 | nbrs_synth = NearestNeighbors(n_neighbors = 1, n_jobs=-1, p=2).fit(Y) 76 | real_to_synth, real_to_synth_args = nbrs_synth.kneighbors(X) 77 | real_to_synth = torch.from_numpy(real_to_synth.squeeze()) 78 | real_to_synth_args = real_to_synth_args.squeeze() 79 | print('Mean real to synth' , torch.mean(real_to_synth)) 80 | print('mean real to real', torch.mean(real_to_real[real_to_synth_args])) 81 | # Audit 82 | #authen = np.ones(len(real_to_synth),dtype='bool')# 83 | authen = real_to_real[real_to_synth_args] < real_to_synth 84 | indices_to_use_authen = np.arange(len(authen), dtype = 'int')[authen] 85 | synth_data = synth_data[indices_to_use_authen] 
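# (Editor's note, added; not part of the original audit.py.) The boolean mask `authen`
# computed above is the authenticity test: roughly, a synthetic sample is rejected when a
# real record lies closer to that sample than to its own nearest real neighbour, i.e. the
# sample looks like a near-copy of a training point. Only the rows of `synth_data` flagged
# as authentic are kept; the prints below report how many samples survive and then re-check
# authenticity after re-fitting the nearest-neighbour index on the filtered embeddings.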
86 | print('After auditing out unauthentic points, points remain:',synth_data.shape[0]) 87 | 88 | Y = Y[indices_to_use_authen] 89 | 90 | nbrs_synth = NearestNeighbors(n_neighbors = 1, n_jobs=-1, p=2).fit(Y) 91 | 92 | real_to_synth, real_to_synth_args = nbrs_synth.kneighbors(X) 93 | 94 | real_to_synth = torch.from_numpy(real_to_synth.squeeze()) 95 | real_to_synth_args = real_to_synth_args.squeeze() 96 | 97 | print('After which the authenticity is', np.mean(np.array(real_to_real[real_to_synth_args] < real_to_synth,dtype='bool'))) 98 | 99 | 100 | 101 | # Precisions 102 | synth_center = torch.tensor(np.mean(Y, axis=0)).float() 103 | synth_to_center = torch.sqrt(torch.sum((torch.tensor(Y).float() - emb_center) ** 2, dim=1)) 104 | 105 | real_synth_closest = Y[real_to_synth_args] 106 | real_synth_closest_d = torch.sqrt(torch.sum((torch.tensor(real_synth_closest).float()- synth_center) ** 2, dim=1)) 107 | closest_synth_Radii = np.quantile(real_synth_closest_d, alphas) 108 | 109 | n_synth = Y.shape[0] 110 | indices_available = np.ones(n_synth) 111 | indices_use = np.zeros(n_synth, dtype = 'bool') 112 | 113 | 114 | generate_more = False 115 | 116 | for k in range(n_steps-1): 117 | if number_per_quantile[k] != 0: 118 | 119 | precision_mask = (synth_to_center <= Radii[k+1]).detach().float().numpy() 120 | indices_close_enough = np.arange(n_synth,dtype='int')[np.logical_and(precision_mask, indices_available)] 121 | indices_available = np.logical_not(precision_mask) 122 | number_to_add = int(min(number_per_quantile[k], len(indices_close_enough))) 123 | indices_close_enough = indices_close_enough[:number_to_add] 124 | indices_use[indices_close_enough] = True 125 | number_per_quantile[k] -= number_to_add 126 | if number_per_quantile[k] != 0: 127 | generate_more = True 128 | 129 | 130 | synthetic_data.append(synth_data[indices_use]) 131 | 132 | synthetic_data = np.concatenate(synthetic_data,axis=0) 133 | with torch.no_grad(): 134 | Y = OC_model(torch.tensor(synthetic_data, device=OC_model.device).float().to(device)).cpu().detach().numpy() 135 | 136 | print('new results', compute_alpha_precision(X,Y, emb_center)[3:]) 137 | print('old_results', compute_alpha_precision_old(X,Y, emb_center)[3:-1]) 138 | 139 | return synthetic_data -------------------------------------------------------------------------------- /data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedmalaa/evaluating-generative-models/093910e487d07959db7d87b54698da60aaeb50c0/data/__init__.py -------------------------------------------------------------------------------- /data/amsterdam/__init__.py: -------------------------------------------------------------------------------- 1 | """Hide-and-Seek Privacy Challenge Codebase. 2 | 3 | Reference: James Jordon, Daniel Jarrett, Jinsung Yoon, Ari Ercole, Cheng Zhang, Danielle Belgrave, Mihaela van der Schaar, 4 | "Hide-and-Seek Privacy Challenge: Synthetic Data Generation vs. Patient Re-identification with Clinical Time-series Data," 5 | Neural Information Processing Systems (NeurIPS) Competition, 2020. 
6 | 7 | Link: https://www.vanderschaar-lab.com/announcing-the-neurips-2020-hide-and-seek-privacy-challenge/ 8 | 9 | Last updated Date: Jan 19th 2021 10 | Code author: Jinsung Yoon, Evgeny Saveliev 11 | Contact: jsyoon0823@gmail.com, e.s.saveliev@gmail.com 12 | """ 13 | 14 | from .data_preprocess import ( 15 | AmsterdamLoader, 16 | preprocess_data, 17 | padding_mask_to_seq_lens, 18 | convert_front_padding_to_back_padding 19 | ) 20 | from .data_utils import data_division 21 | -------------------------------------------------------------------------------- /data/amsterdam/data_preprocess.py: -------------------------------------------------------------------------------- 1 | """Amsterdam UMCdb data preprocessing. 2 | 3 | The source data files required are those prepared for Hide-and-Seek NeurIPS 2020 competition: 4 | ``` 5 | train_longitudinal_data.csv 6 | test_longitudinal_data.csv 7 | ``` 8 | 9 | Author: Evgeny Saveliev (e.s.saveliev@gmail.com) 10 | """ 11 | 12 | import os 13 | from typing import Union, Tuple 14 | 15 | import numpy as np 16 | import pandas as pd 17 | from tqdm import tqdm 18 | from sklearn.preprocessing import MinMaxScaler 19 | 20 | from .data_utils import data_division 21 | 22 | 23 | # ---------------------------------------------------------------------------------------------------------------------- 24 | # General helpers. 25 | 26 | def _to_3d(arr: np.ndarray, max_seq_len: int) -> np.ndarray: 27 | n_patients = arr.shape[0] // max_seq_len 28 | dim = arr.shape[1] 29 | return np.reshape(arr, [n_patients, max_seq_len, dim]) 30 | 31 | 32 | def _to_2d(arr: np.ndarray) -> np.ndarray: 33 | n_patients = arr.shape[0] 34 | max_seq_len = arr.shape[1] 35 | dim = arr.shape[2] 36 | return np.reshape(arr, [n_patients * max_seq_len, dim]) 37 | 38 | 39 | # ---------------------------------------------------------------------------------------------------------------------- 40 | # Helpers for Seq2Seq autoencoder. 41 | 42 | def combine_csvs(path_train, path_test, path_combined): 43 | df_train = pd.read_csv(os.path.abspath(path_train)) 44 | df_test = pd.read_csv(os.path.abspath(path_test)) 45 | df_combined = df_train.append(df_test, ignore_index=True) 46 | df_combined.sort_values(by=["admissionid", "Unnamed: 0"], ignore_index=True, inplace=True) 47 | df_combined.to_csv(os.path.abspath(path_combined), index=False) 48 | 49 | 50 | def downsample_csv_by_admissionids(path, path_downsampled, downsample_n_ids, seed): 51 | df = pd.read_csv(os.path.abspath(path)) 52 | ids = df["admissionid"].unique() 53 | np.random.seed(seed) 54 | np.random.shuffle(ids) 55 | ds_ids = ids[:downsample_n_ids] 56 | df_ds = df[df["admissionid"].isin(ds_ids)] 57 | df_ds.to_csv(os.path.abspath(path_downsampled), index=False) 58 | 59 | 60 | def padding_mask_to_seq_lens(padding_mask): 61 | padding_mask_inverted = -1 * (padding_mask.astype(int) - 1) 62 | padding_mask_as_seq_lens = padding_mask_inverted.sum(axis=1)[:, 0] # Sum 1s along sequence dimension. 63 | # ^ As identical length for each feature, take 0th. 64 | return padding_mask_as_seq_lens 65 | 66 | 67 | def convert_front_padding_to_back_padding(data, seq_lens, pad_val): 68 | if 0 in seq_lens: 69 | raise ValueError("0 encountered in seq_lens.") 70 | data_ = np.full_like(data, pad_val) 71 | for idx, l in enumerate(seq_lens): 72 | data_[idx, :l, :] = data[idx, -l:, :] 73 | return data_ 74 | 75 | 76 | # ---------------------------------------------------------------------------------------------------------------------- 77 | # Data loader. 
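# (Editor's sketch, added; not part of the original file.) A minimal, hypothetical example of
# how the loader defined below can be wired up with preprocess_data() further down in this
# module; the file path and parameter values are illustrative only:
#
#   loader = AmsterdamLoader(
#       data_path="data/amsterdam/combined_downsampled_longitudinal_data.csv",
#       max_seq_len=100, seed=12345, train_rate=0.4, val_rate=0.2,
#       include_time=False, debug_data=False, pad_before=False, padding_fill=-1.,
#   )
#   raw_data, padding_mask, (train_idx, val_idx, test_idx) = loader.load_reshape_split_data(force_refresh=True)
#   processed, imputed = preprocess_data(raw_data, padding_mask, padding_fill=-1., time_feature_included=False)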
78 | class AmsterdamLoader(object): 79 | 80 | def __init__( 81 | self, 82 | data_path: str, 83 | max_seq_len: int, 84 | seed: int, 85 | train_rate: float, 86 | val_rate: float, 87 | include_time: bool, 88 | debug_data: Union[int, bool] = False, 89 | pad_before: bool = False, 90 | padding_fill: float = -1., 91 | ) -> None: 92 | """Initialise Amsterdam data loader. Here, the Amsterdam data refers to the Hide-and-Seek competition subset 93 | ot the Amsterdam UMCdb dataset, specifically `train_longitudinal_data.csv` or `test_longitudinal_data.csv`. 94 | 95 | Args: 96 | data_path (str): Data CSV file path. 97 | max_seq_len (int): Maximum sequence length of the time series dimension - for reshaping. 98 | seed (int): Random seed for data split. 99 | train_rate (float): The fraction of the data to allocate to training set. 100 | val_rate (float): The fraction of the data to allocate to validation set. 101 | include_time (bool): Whether to include time as the 0th feature in each example. 102 | debug_data (Union[int, bool], optional): If int, read only top debug_data-many rows, if True, 103 | read only top 10000 rows, if False read whole dataset. Defaults to False. 104 | pad_before (bool, optional): If True, padding will be added at the beginning of time dimension, 105 | else padding added at the end. Defaults to False. 106 | padding_fill (float, optional): Pad timeseries vectors shorter than max_seq_len with this value. 107 | Defaults to -1. 108 | """ 109 | assert train_rate > 0. and val_rate >= 0. and (train_rate + val_rate) < 1. 110 | self.data_path = os.path.abspath(data_path) 111 | self.max_seq_len = max_seq_len 112 | self.seed = seed 113 | self.train_rate = train_rate 114 | self.val_rate = val_rate 115 | self.include_time = include_time 116 | self.debug_data = debug_data 117 | self.pad_before = pad_before 118 | self.padding_fill = padding_fill 119 | 120 | def load_reshape_split_data(self, force_refresh: bool) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]: 121 | """Load prepared data, reshape to a 3D array of shape [num_examples, max_seq_len, num_features], 122 | split into train, validation sets. Preprocessing of the data is done separately using `preprocess_data()`. 123 | 124 | Args: 125 | force_refresh (bool): If True, will rerun this from scratch, rather than using results cached in npz file. 126 | 127 | Returns: 128 | Tuple[np.ndarray, np.ndarray, Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]]: 129 | raw_data, padding_mask, (train_idx, val_idx, test_idx) 130 | """ 131 | 132 | npz_path = self.data_path.replace(".csv", ".npz") 133 | 134 | if os.path.exists(npz_path) and not force_refresh: 135 | 136 | print(f"Found existing cached .npz file ({npz_path}), using cached data. 
Set force_refresh=True to refresh.") 137 | with np.load(npz_path) as data: 138 | raw_data = data["raw_data"] 139 | padding_mask = data["padding_mask"] 140 | train_idx = data["train_idx"] 141 | val_idx = data["val_idx"] 142 | test_idx = data["test_idx"] 143 | 144 | else: 145 | 146 | raw_data, padding_mask = self._load_and_reshape(self.data_path) 147 | _, (train_idx, val_idx, test_idx) = data_division( 148 | raw_data, 149 | seed=self.seed, 150 | divide_rates=[self.train_rate, self.val_rate, 1 - self.train_rate - self.val_rate] 151 | ) 152 | 153 | np.savez(npz_path, raw_data=raw_data, padding_mask=padding_mask, train_idx=train_idx, val_idx=val_idx, test_idx=test_idx) 154 | 155 | return raw_data, padding_mask, (train_idx, val_idx, test_idx) 156 | 157 | def _load_and_reshape(self, file_name: str) -> Tuple[np.ndarray, np.ndarray]: 158 | """Load data from `file_name` and reshape into a 3D array of shape [num_examples, max_seq_len, num_features]. 159 | A padding mask of data will also be produced (same shape), having elements True where time series were padded 160 | (due to being shorter than max_seq_len). 161 | 162 | Note: 163 | The 0th feature is time. 164 | 165 | Args: 166 | file_name (str): Original data CSV file. 167 | 168 | Returns: 169 | Tuple[np.ndarray, np.ndarray]: [0] loaded and reshaped data, [1] corresponding padding. 170 | """ 171 | padding_indicator = -999.0 # This value avoids clashing with any actual data. 172 | 173 | # Load data 174 | if self.debug_data is not False: 175 | if isinstance(self.debug_data, bool): 176 | nrows: Union[int, None] = 10000 177 | else: 178 | assert isinstance(self.debug_data, int), "debug_data argument must be bool or int." 179 | nrows = self.debug_data 180 | else: 181 | nrows = None 182 | ori_data = pd.read_csv(file_name, nrows=nrows) 183 | if ori_data.columns[0] == "Unnamed: 0": # Remove spurious column, so that column 0 is now 'admissionid'. 184 | ori_data = ori_data.drop(["Unnamed: 0"], axis=1) 185 | 186 | # Drop time column if requested. 
187 | if not self.include_time: 188 | ori_data = ori_data.drop(["time"], axis=1) 189 | 190 | # Parameters 191 | uniq_id = np.unique(ori_data["admissionid"]) 192 | no = len(uniq_id) 193 | dim = len(ori_data.columns) - 1 194 | 195 | # Output initialization 196 | assert np.any(ori_data == padding_indicator) == False, f"Padding indicator value {padding_indicator} found in data" 197 | loaded_data = np.empty([no, self.max_seq_len, dim]) # Shape: [no, max_seq_len, dim] 198 | loaded_data.fill(padding_indicator) 199 | 200 | # For each unique id 201 | print("Reshaping data...") 202 | for i in tqdm(range(no)): 203 | 204 | # Extract the time-series data with a certain admissionid 205 | idx = ori_data.index[ori_data["admissionid"] == uniq_id[i]] 206 | curr_data = ori_data.iloc[idx].to_numpy() # Shape: [curr_no, dim + 1] 207 | 208 | # Assign to the preprocessed data (Excluding ID) 209 | curr_no = len(curr_data) 210 | if curr_no >= self.max_seq_len: 211 | loaded_data[i, :, :] = curr_data[:self.max_seq_len, 1:] # Shape: [1, max_seq_len, dim] 212 | else: 213 | if self.pad_before: 214 | loaded_data[i, -curr_no:, :] = curr_data[:, 1:] # Shape: [1, max_seq_len, dim] 215 | else: 216 | loaded_data[i, :curr_no, :] = curr_data[:, 1:] # Shape: [1, max_seq_len, dim] 217 | 218 | padding_mask = loaded_data == padding_indicator 219 | loaded_data = np.where(padding_mask, self.padding_fill, loaded_data) 220 | 221 | return loaded_data, padding_mask 222 | 223 | 224 | # ---------------------------------------------------------------------------------------------------------------------- 225 | # Data preprocessing. 226 | 227 | def preprocess_data( 228 | data: np.ndarray, 229 | padding_mask: np.ndarray, 230 | padding_fill: float, 231 | time_feature_included: bool 232 | ) -> Tuple[np.ndarray, np.ndarray]: 233 | """Preprocess and impute `data`. 234 | 235 | Note: 236 | If `time_feature_included=True`, the 0th feature is time, and it is preprocessed differently to the other 237 | features: not normalized to [0, 1] but shifted by -max_time_for_example. 238 | 239 | Args: 240 | data (np.ndarray of float): 241 | Data as loaded (and reshaped to 3D). Shape [num_examples, max_seq_len, num_features]. 242 | padding_mask (np.ndarray of bool): 243 | Padding mask of data, indicating True where time series were shorter than max_seq_len and were padded. 244 | Same shape as data. 245 | padding_fill (float): 246 | Pad timeseries vectors shorter than max_seq_len with this value. 247 | time_feature_included (bool): 248 | Whether to include time as the 0th feature in each example. 249 | 250 | Returns: 251 | Tuple[np.ndarray, np.ndarray]: [0] preprocessed data, [1] preprocessed and imputed data. 252 | """ 253 | print("Preprocessing data...") 254 | 255 | median_vals = _get_medians(data, padding_mask) 256 | imputed_data = _impute(data, padding_mask, median_vals, padding_fill) 257 | 258 | scaler_imputed = _get_scaler(imputed_data, padding_mask) 259 | imputed_processed_data = \ 260 | _preprocess(imputed_data, padding_mask, scaler_imputed, padding_fill, time_feature_included) 261 | 262 | scaler_original = _get_scaler(data, padding_mask) 263 | processed_data = \ 264 | _preprocess(data, padding_mask, scaler_original, padding_fill, time_feature_included) 265 | 266 | return processed_data, imputed_processed_data 267 | 268 | def _imputation(curr_data: np.ndarray, median_vals: np.ndarray, zero_fill: bool = True) -> np.ndarray: 269 | """Impute missing data using bfill, ffill and median imputation. 
270 | 271 | Args: 272 | curr_data (np.ndarray): Data before imputation. 273 | median_vals (np.ndarray): Median values for each column. 274 | zero_fill (bool, optional): Whather to Fill with zeros the cases where median_val is nan. Defaults to True. 275 | 276 | Returns: 277 | np.ndarray: Imputed data. 278 | """ 279 | 280 | curr_data = pd.DataFrame(data=curr_data) 281 | median_vals = pd.Series(median_vals) 282 | 283 | # Backward fill 284 | imputed_data = curr_data.bfill(axis="rows") 285 | # Forward fill 286 | imputed_data = imputed_data.ffill(axis="rows") 287 | # Median fill 288 | imputed_data = imputed_data.fillna(median_vals) 289 | 290 | # Zero-fill, in case the `median_vals` for a particular feature is `nan`. 291 | if zero_fill: 292 | imputed_data = imputed_data.fillna(0.0) 293 | 294 | if imputed_data.isnull().any().any(): 295 | raise ValueError("NaN values remain after imputation") 296 | 297 | return imputed_data.to_numpy() 298 | 299 | def _get_medians(data: np.ndarray, padding_mask: np.ndarray): 300 | assert len(data.shape) == 3 301 | 302 | data = _to_2d(data) 303 | if padding_mask is not None: 304 | padding_mask = _to_2d(padding_mask) 305 | data_temp = np.where(padding_mask, np.nan, data) # To avoid PADDING_INDICATOR affecting results. 306 | else: 307 | data_temp = data 308 | 309 | # Medians 310 | median_vals = np.nanmedian(data_temp, axis=0) # Shape: [dim + 1] 311 | 312 | return median_vals 313 | 314 | def _get_scaler(data: np.ndarray, padding_mask: np.ndarray): 315 | assert len(data.shape) == 3 316 | 317 | data = _to_2d(data) 318 | if padding_mask is not None: 319 | padding_mask = _to_2d(padding_mask) 320 | data_temp = np.where(padding_mask, np.nan, data) # To avoid PADDING_INDICATOR affecting results. 321 | else: 322 | data_temp = data 323 | 324 | # Scaler 325 | scaler = MinMaxScaler() 326 | scaler.fit(data_temp) # Note that np.nan's will be left untouched. 327 | 328 | return scaler 329 | 330 | def _impute( 331 | data: np.ndarray, 332 | padding_mask: np.ndarray, 333 | median_vals: np.ndarray, 334 | padding_fill: float 335 | ) -> Tuple[np.ndarray, np.ndarray]: 336 | 337 | assert len(data.shape) == 3 338 | 339 | data_imputed_ = np.zeros_like(data) 340 | 341 | for i in range(data.shape[0]): 342 | cur_data = data[i, :, :] 343 | if padding_mask is not None: 344 | cur_data = np.where(padding_mask[i, :, :], np.nan, cur_data) 345 | 346 | # Scale and impute (excluding time) 347 | cur_data_imputed = _imputation(cur_data, median_vals) 348 | 349 | # Update 350 | data_imputed_[i, :, :] = cur_data_imputed 351 | 352 | # Set padding 353 | if padding_mask is not None: 354 | data_imputed_ = np.where(padding_mask, padding_fill, data_imputed_) 355 | 356 | return data_imputed_ 357 | 358 | def _preprocess( 359 | data: np.ndarray, 360 | padding_mask: np.ndarray, 361 | scaler: MinMaxScaler, 362 | padding_fill: float, 363 | time_feature_included: bool, 364 | ) -> Tuple[np.ndarray, np.ndarray]: 365 | 366 | assert len(data.shape) == 3 367 | 368 | data_ = np.zeros_like(data) 369 | 370 | for i in range(data.shape[0]): 371 | cur_data = data[i, :, :] 372 | if padding_mask is not None: 373 | cur_data = np.where(padding_mask[i, :, :], np.nan, cur_data) 374 | 375 | # Preprocess time (0th element of dim. 
2): 376 | if time_feature_included: 377 | preprocessed_time = cur_data[:, 0] - np.nanmin(cur_data[:, 0]) 378 | 379 | # Scale and impute (excluding time) 380 | cur_data = scaler.transform(cur_data) 381 | 382 | # Set time 383 | if time_feature_included: 384 | cur_data[:, 0] = preprocessed_time 385 | 386 | # Update 387 | data_[i, :, :] = cur_data 388 | 389 | # Set padding 390 | if padding_mask is not None: 391 | data_ = np.where(padding_mask, padding_fill, data_) 392 | 393 | return data_ 394 | -------------------------------------------------------------------------------- /data/amsterdam/data_scripts.py: -------------------------------------------------------------------------------- 1 | """Amsterdam UMCdb data preprocessing: scripts. 2 | 3 | Author: Evgeny Saveliev (e.s.saveliev@gmail.com) 4 | """ 5 | from .data_preprocess import combine_csvs, downsample_csv_by_admissionids 6 | 7 | 8 | # Script settings: 9 | run_script = "combine_downsample" 10 | filepaths = { 11 | "source": { 12 | "train_data_filepath": "./train_longitudinal_data.csv", 13 | "test_data_filepath": "./test_longitudinal_data.csv" 14 | }, 15 | "output": { 16 | "out_combined_filepath": "./combined_longitudinal_data.csv", 17 | "out_combined_downsampled_filepath": "./combined_downsampled_longitudinal_data.csv" 18 | } 19 | } 20 | downsample_n_ids = 1000 21 | downsample_seed = 12345 22 | 23 | 24 | def main(): 25 | 26 | if run_script == "combine_downsample": 27 | # Note: requires between 64 and 128 GB of memory. 28 | combine_csvs( 29 | path_train=filepaths["source"]["train_data_filepath"], 30 | path_test=filepaths["source"]["test_data_filepath"], 31 | path_combined=filepaths["output"]["out_combined_filepath"] 32 | ) 33 | downsample_csv_by_admissionids( 34 | path=filepaths["output"]["out_combined_filepath"], 35 | path_downsampled=filepaths["output"]["out_combined_downsampled_filepath"], 36 | downsample_n_ids=downsample_n_ids, 37 | seed=downsample_seed 38 | ) 39 | 40 | 41 | if __name__ == "__main__": 42 | main() 43 | -------------------------------------------------------------------------------- /data/amsterdam/data_utils.py: -------------------------------------------------------------------------------- 1 | """Amsterdam UMCdb data preprocessing: utilities. 2 | 3 | Author: Jinsung Yoon (jsyoon0823@gmail.com) 4 | """ 5 | 6 | import numpy as np 7 | import random 8 | 9 | 10 | def data_division(data: np.ndarray, seed: int, divide_rates: list): 11 | """Divide the dataset into sub datasets. 12 | 13 | Args: 14 | data (np.ndarray): Data. 15 | seed (int): Random seed for data division. 16 | divide_rates (list of float): Ratio for each division. 17 | 18 | Returns: 19 | divided_data: Divided data (list format). 20 | divided_index: Divided data index (list format). 
21 | """ 22 | # sum of the division rates should be 1 23 | assert sum(divide_rates) == 1 24 | 25 | # Output initialization 26 | divided_data = list() 27 | divided_index = list() 28 | 29 | # Set index 30 | no = len(data) 31 | random.seed(seed) 32 | np.random.seed(seed) 33 | index = np.random.permutation(no) 34 | 35 | # Set divided index & data 36 | for i in range(len(divide_rates)): 37 | temp_idx = index[int(no * sum(divide_rates[:i])) : int(no * sum(divide_rates[: (i + 1)]))] 38 | divided_index.append(temp_idx) 39 | 40 | temp_data = [data[j] for j in temp_idx] 41 | divided_data.append(temp_data) 42 | 43 | return divided_data, divided_index 44 | -------------------------------------------------------------------------------- /data/googlestock/__init__.py: -------------------------------------------------------------------------------- 1 | """Loading and preprocessing of Google Stock data from: https://www.kaggle.com/thevirusx3/google-stock-market-data 2 | 3 | Author: Evgeny Saveliev (e.s.saveliev@gmail.com) 4 | """ 5 | 6 | from .data_preprocess import load_stock_data, split_stock_data 7 | -------------------------------------------------------------------------------- /data/googlestock/data_preprocess.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pandas as pd 4 | import numpy as np 5 | from sklearn.model_selection import train_test_split 6 | from sklearn.preprocessing import MinMaxScaler 7 | 8 | 9 | def _read_stock_csv(path): 10 | df = pd.read_csv(path, parse_dates=["Date"], thousands=",") 11 | df["Volume"] = df["Volume"].astype("float") 12 | return df 13 | 14 | 15 | def load_stock_data(train_path, test_path, normalize=True, time=False): 16 | train_path = os.path.abspath(train_path) 17 | test_path = os.path.abspath(test_path) 18 | 19 | df_train = _read_stock_csv(train_path) 20 | df_test = _read_stock_csv(test_path) 21 | 22 | df = df_train.append(df_test, ignore_index=True) # Combine so that can do custom train/val/test split. 23 | 24 | df["Date"] = (df["Date"] - df["Date"].min()) / np.timedelta64(1, "D") # Days since start. 25 | 26 | data = df.to_numpy() 27 | 28 | if normalize: 29 | scaler = MinMaxScaler() 30 | data_no_time = data[:, 1:] 31 | scaler.fit(data_no_time) 32 | data[:, 1:] = scaler.transform(data_no_time) 33 | 34 | if not time: 35 | data = data[:, 1:] 36 | 37 | return data 38 | 39 | 40 | DEFAULT_SPLIT_ORDER = { 41 | "train": 1, 42 | "val": 2, 43 | "test": 3, 44 | } 45 | 46 | 47 | def split_stock_data(data, frac_train, frac_val, split_order=None): 48 | 49 | assert frac_train > 0. and frac_train < 1. 50 | assert frac_val >= 0. and frac_val < 1. 51 | 52 | frac_test = 1. - frac_train - frac_val 53 | assert frac_test + frac_val + frac_train == 1. 54 | 55 | frac_dict = dict() 56 | for k, v in split_order.items(): 57 | if k == "train": 58 | frac_dict[v] = frac_train 59 | elif k == "val": 60 | frac_dict[v] = frac_val 61 | else: 62 | frac_dict[v] = frac_test 63 | 64 | #print(frac_dict) 65 | frac_1_2_of_all = frac_dict[1] + frac_dict[2] 66 | frac_1_of_1_2 = frac_dict[1] / frac_1_2_of_all 67 | #print("frac_1_of_1_2", frac_1_of_1_2) 68 | 69 | if split_order is None: 70 | split_order = DEFAULT_SPLIT_ORDER 71 | assert tuple(sorted(list(split_order.keys()))) == ("test", "train", "val") 72 | assert tuple(sorted(list(split_order.values()))) == (1, 2, 3) 73 | 74 | # Note that shuffle=False. 
75 | data_1_2, data_3 = train_test_split(data, train_size=frac_1_2_of_all, shuffle=False) 76 | data_1, data_2 = train_test_split(data_1_2, train_size=frac_1_of_1_2, shuffle=False) 77 | 78 | split_content = dict() 79 | for k, v in split_order.items(): 80 | if v == 1: 81 | split_content[k] = data_1 82 | elif v == 2: 83 | split_content[k] = data_2 84 | else: 85 | split_content[k] = data_3 86 | 87 | print("Split Google Stock data over time in fractions:\n" 88 | f"'train'={frac_train:.3f}, 'val'={frac_val:.3f}, 'test'={frac_test:.3f}\n" 89 | f"and the subsets are in the following chronological order: {split_order}") 90 | 91 | return split_content["train"], split_content["val"], split_content["test"] 92 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: py36_egm 2 | channels: 3 | - pytorch 4 | - defaults 5 | dependencies: 6 | - cudatoolkit=11.0.221 7 | - jupyterlab=2.2.6 8 | - matplotlib=3.3.2 9 | - notebook=6.1.6 10 | - numpy=1.19.2 11 | - pandas=1.1.3 12 | - pip=20.3.3 13 | - python=3.6.12 14 | - pytorch=1.7.1 15 | - scikit-learn=0.23.2 16 | - scipy=1.5.2 17 | - tqdm=4.55.1 18 | - pip: 19 | - tensorflow-gpu==1.15.0 20 | -------------------------------------------------------------------------------- /generative_models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedmalaa/evaluating-generative-models/093910e487d07959db7d87b54698da60aaeb50c0/generative_models/__init__.py -------------------------------------------------------------------------------- /generative_models/adsgan.py: -------------------------------------------------------------------------------- 1 | """ 2 | Reference: Jinsung Yoon, Lydia N. Drumright, Mihaela van der Schaar, 3 | "Anonymization through Data Synthesis using Generative Adversarial Networks (ADS-GAN): 4 | A harmonizing advancement for AI in medicine," 5 | IEEE Journal of Biomedical and Health Informatics (JBHI), 2019. 6 | Paper link: https://ieeexplore.ieee.org/document/9034117 7 | Last updated Date: December 22th 2020 8 | Code author: Jinsung Yoon (jsyoon0823@gmail.com) 9 | 10 | Minor modifications made by Boris van Breugel (bv292@cam.ac.uk) and Evgeny Saveliev (e.s.saveliev@gmail.com). 11 | ----------------------------- 12 | adsgan.py 13 | - Generate synthetic data for GAN framework 14 | (1) Use original data to generate synthetic data 15 | """ 16 | 17 | #%% Import necessary packages 18 | import tensorflow as tf 19 | import numpy as np 20 | 21 | from tqdm import tqdm 22 | 23 | tf.compat.v1.disable_eager_execution() 24 | 25 | def adsgan(orig_data, params): 26 | """Generate synthetic data for ADSGAN framework. 
27 | 28 | Args: 29 | orig_data: original data 30 | params: Network parameters 31 | mb_size: mini-batch size 32 | z_dim: random state dimension 33 | h_dim: hidden state dimension 34 | lambda: identifiability parameter 35 | iterations: training iterations 36 | 37 | Returns: 38 | synth_data: synthetically generated data 39 | """ 40 | 41 | # Reset the tensorflow graph 42 | tf.compat.v1.reset_default_graph() 43 | 44 | ## Parameters 45 | # Feature no 46 | x_dim = len(orig_data.columns) 47 | 48 | try: 49 | no = params['sample_no'] 50 | except KeyError: 51 | no = len(orig_data) 52 | 53 | # Batch size 54 | mb_size = params['mb_size'] 55 | # Random variable dimension 56 | z_dim = params['z_dim'] 57 | # Hidden unit dimensions 58 | h_dim = params['h_dim'] 59 | # Identifiability parameter 60 | lambda_ = params['lambda'] 61 | # Training iterations 62 | iterations = params['iterations'] 63 | # WGAN-GP parameters 64 | lam = 10 65 | lr = 1e-4 66 | 67 | # Adam optimization 68 | beta_1 = 0.5 69 | 70 | try: 71 | lambda_tester = params['lambda_tester'] 72 | except KeyError: 73 | lambda_tester = False 74 | #%% Data Preprocessing 75 | orig_data = np.asarray(orig_data) 76 | 77 | def data_normalization(orig_data, epsilon = 1e-8): 78 | 79 | min_val = np.min(orig_data, axis=0) 80 | 81 | normalized_data = orig_data - min_val 82 | 83 | max_val = np.max(normalized_data, axis=0) 84 | normalized_data = normalized_data / (max_val + epsilon) 85 | 86 | normalization_params = {"min_val": min_val, "max_val": max_val} 87 | 88 | return normalized_data, normalization_params 89 | 90 | def data_renormalization(normalized_data, normalization_params, epsilon = 1e-8): 91 | 92 | renormalized_data = normalized_data * (normalization_params['max_val'] + epsilon) 93 | renormalized_data = renormalized_data + normalization_params['min_val'] 94 | 95 | return renormalized_data 96 | 97 | orig_data, normalization_params = data_normalization(orig_data) 98 | 99 | #%% Necessary Functions 100 | 101 | # Xavier Initialization Definition 102 | def xavier_init(size): 103 | in_dim = size[0] 104 | xavier_stddev = 1. / tf.sqrt(in_dim / 2.) 
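# (Editor's note, added.) This is Xavier/He-style scaling: 1. / tf.sqrt(in_dim / 2.) equals
# sqrt(2 / in_dim), so the weight standard deviation shrinks with the layer's fan-in and keeps
# the variance of activations roughly constant from layer to layer.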
105 | return tf.random.normal(shape = size, stddev = xavier_stddev) 106 | 107 | def xavier_init_I(size): 108 | if lambda_tester: 109 | 110 | return tf.eye(size[0],size[1]) + xavier_init(size)/10 111 | else: 112 | return xavier_init(size) 113 | 114 | # Sample from uniform distribution 115 | def sample_Z(m, n): 116 | return np.random.uniform(-1., 1., size = [m, n]) 117 | 118 | # Sample from the real data 119 | def sample_X(m, n): 120 | return np.random.permutation(m)[:n] 121 | 122 | #%% Placeholder 123 | # Feature 124 | X = tf.compat.v1.placeholder(tf.float32, shape = [None, x_dim]) 125 | # Random Variable 126 | Z = tf.compat.v1.placeholder(tf.float32, shape = [None, z_dim]) 127 | 128 | #%% Discriminator 129 | # Discriminator 130 | D_W1 = tf.Variable(xavier_init([x_dim, h_dim])) 131 | D_b1 = tf.Variable(tf.zeros(shape=[h_dim])) 132 | 133 | D_W2 = tf.Variable(xavier_init([h_dim,h_dim])) 134 | D_b2 = tf.Variable(tf.zeros(shape=[h_dim])) 135 | 136 | D_W3 = tf.Variable(xavier_init([h_dim,1])) 137 | D_b3 = tf.Variable(tf.zeros(shape=[1])) 138 | 139 | theta_D = [D_W1, D_W2, D_W3, D_b1, D_b2, D_b3] 140 | 141 | if lambda_tester: 142 | D_W4 = tf.Variable(xavier_init([h_dim,h_dim])) 143 | D_b4 = tf.Variable(tf.zeros(shape=[h_dim])) 144 | theta_D+= [D_W4, D_b4] 145 | 146 | 147 | #%% Generator 148 | 149 | G_W1 = tf.Variable(xavier_init_I([z_dim + x_dim, h_dim])) 150 | G_b1 = tf.Variable(tf.zeros(shape=[h_dim])) 151 | 152 | G_W2 = tf.Variable(xavier_init_I([h_dim,h_dim])) 153 | G_b2 = tf.Variable(tf.zeros(shape=[h_dim])) 154 | 155 | G_W3 = tf.Variable(xavier_init_I([h_dim,h_dim])) 156 | G_b3 = tf.Variable(tf.zeros(shape=[h_dim])) 157 | 158 | G_W4 = tf.Variable(xavier_init_I([h_dim, x_dim])) 159 | G_b4 = tf.Variable(tf.zeros(shape=[x_dim])) 160 | theta_G = [G_W1, G_W2, G_W3, G_W4, G_b1, G_b2, G_b3, G_b4] 161 | 162 | #%% Generator and discriminator functions 163 | def generator(z, x): 164 | inputs = tf.concat([x, z], axis = 1) 165 | G_h1 = tf.nn.relu(tf.matmul(inputs, G_W1) + G_b1) 166 | G_h2 = tf.nn.relu(tf.matmul(G_h1, G_W2) + G_b2) 167 | G_h3 = tf.nn.relu(tf.matmul(G_h2, G_W3) + G_b3) 168 | G_log_prob = tf.nn.sigmoid(tf.matmul(G_h3, G_W4) + G_b4) 169 | 170 | return G_log_prob 171 | 172 | def discriminator(x): 173 | D_h1 = tf.nn.relu(tf.matmul(x, D_W1) + D_b1) 174 | D_h2 = tf.nn.relu(tf.matmul(D_h1, D_W2) + D_b2) 175 | if lambda_tester: 176 | D_h4 = tf.nn.relu(tf.matmul(D_h2, D_W4) + D_b4) 177 | else: 178 | D_h4 = D_h2 179 | 180 | out = tf.matmul(D_h4, D_W3) + D_b3 181 | return out 182 | 183 | #%% Structure 184 | G_sample = generator(Z,X) 185 | D_real = discriminator(X) 186 | D_fake = discriminator(G_sample) 187 | 188 | 189 | 190 | # Replacement of Clipping algorithm to Penalty term 191 | # 1. Line 6 in Algorithm 1 192 | eps = tf.random.uniform([mb_size, 1], minval = 0., maxval = 1.) 193 | X_inter = eps*X + (1. - eps) * G_sample 194 | 195 | # 2. 
Line 7 in Algorithm 1 196 | grad = tf.gradients(ys=discriminator(X_inter), xs=[X_inter])[0] 197 | grad_norm = tf.sqrt(tf.reduce_sum(input_tensor=(grad)**2 + 1e-8, axis = 1)) 198 | grad_pen = lam * tf.reduce_mean(input_tensor=(grad_norm - 1)**2) 199 | 200 | # Loss function 201 | D_loss = tf.reduce_mean(input_tensor=D_fake) - tf.reduce_mean(input_tensor=D_real) + grad_pen 202 | 203 | 204 | 205 | 206 | G_loss1 = -tf.sqrt(tf.reduce_mean(input_tensor=tf.square(X - G_sample))) 207 | G_loss2 = -tf.reduce_mean(input_tensor=D_fake) 208 | 209 | G_loss = G_loss2 + lambda_ * G_loss1 210 | 211 | # Solver 212 | D_solver = (tf.compat.v1.train.AdamOptimizer(learning_rate = lr, beta1 = beta_1).minimize(D_loss, var_list = theta_D)) 213 | G_solver = (tf.compat.v1.train.AdamOptimizer(learning_rate = lr, beta1 = beta_1).minimize(G_loss, var_list = theta_G)) 214 | 215 | #%% Iterations 216 | sess = tf.compat.v1.Session() 217 | sess.run(tf.compat.v1.global_variables_initializer()) 218 | 219 | # Iterations 220 | for it in tqdm(range(iterations)): 221 | # Discriminator training 222 | for _ in range(5): 223 | Z_mb = sample_Z(mb_size, z_dim) 224 | 225 | X_idx = sample_X(no, mb_size) 226 | X_mb = orig_data[X_idx,:] 227 | 228 | _, D_loss_curr = sess.run([D_solver, D_loss], feed_dict = {X: X_mb, Z: Z_mb}) 229 | 230 | # Generator Training 231 | Z_mb = sample_Z(mb_size, z_dim) 232 | 233 | X_idx = sample_X(no, mb_size) 234 | X_mb = orig_data[X_idx,:] 235 | 236 | _, G_loss1_curr, G_loss2_curr = sess.run([G_solver, G_loss1, G_loss2], feed_dict = {X: X_mb, Z: Z_mb}) 237 | #if it%10==0: 238 | # print(G_loss1_curr, G_loss2_curr) 239 | 240 | #%% Output Generation 241 | synth_data = sess.run([G_sample], feed_dict = {Z: sample_Z(no, z_dim), X: orig_data}) 242 | synth_data = synth_data[0] 243 | 244 | # Renormalization 245 | synth_data = data_renormalization(synth_data, normalization_params) 246 | 247 | # Binary features 248 | for i in range(x_dim): 249 | if len(np.unique(orig_data[:, i])) == 2: 250 | synth_data[:, i] = np.array(np.round(synth_data[:, i]),dtype='int') 251 | 252 | return synth_data -------------------------------------------------------------------------------- /generative_models/dpgan/__init__.py: -------------------------------------------------------------------------------- 1 | """DPGAN baseline. 
2 | 3 | Source: https://github.com/illidanlab/dpgan 4 | Authors: Liyang Xie, Kaixiang Lin, Shu Wang, Fei Wang, Jiayu Zhou 5 | Paper link: https://arxiv.org/abs/1802.06739 6 | 7 | Modified by: Evgeny Saveliev (e.s.saveliev@gmail.com) 8 | """ 9 | 10 | from .main import DPGAN, dpgan 11 | -------------------------------------------------------------------------------- /generative_models/dpgan/impl/__init__.py: -------------------------------------------------------------------------------- 1 | """Based on the implementation: dpgan/MIMIC-III/ 2 | 3 | Modified by: Evgeny Saveliev (e.s.saveliev@gmail.com) 4 | """ 5 | -------------------------------------------------------------------------------- /generative_models/dpgan/impl/data.py: -------------------------------------------------------------------------------- 1 | from numpy import random 2 | 3 | 4 | class NoiseSampler(object): 5 | def __call__(self, batch_size, z_dim): 6 | return random.normal(size=(batch_size, z_dim)) 7 | # the shape of return is: batch_size*z_dim 8 | # see Medgan line 209, use np.random.normal(), which has defauld std = 1.0 9 | -------------------------------------------------------------------------------- /generative_models/dpgan/impl/fc.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import tensorflow.contrib.layers as tcl # pylint: disable=import-error 3 | from tensorflow.contrib.layers import batch_norm # pylint: disable=import-error 4 | 5 | 6 | class Autoencoder(object): 7 | def __init__(self, inputDim, l2scale, compressDims, aeActivation, decompressDims, dataType): 8 | self.x_dim = inputDim 9 | self.l2scale = l2scale 10 | self.compressDims = compressDims 11 | self.aeActivation = aeActivation 12 | self.decompressDims = decompressDims 13 | self.dataType = dataType 14 | self.name = 'dpgan/fc/autoencoder' 15 | 16 | def __call__(self, x_input): 17 | decodeVariables = {} 18 | with tf.variable_scope(self.name, regularizer=tcl.l2_regularizer(self.l2scale)): 19 | tempVec = x_input 20 | tempDim = self.x_dim 21 | i = 0 22 | for compressDim in self.compressDims: 23 | W = tf.get_variable('aee_W_' + str(i), shape=[tempDim, compressDim]) 24 | b = tf.get_variable('aee_b_' + str(i), shape=[compressDim]) 25 | tempVec = self.aeActivation(tf.add(tf.matmul(tempVec, W), b)) 26 | tempDim = compressDim 27 | i += 1 28 | 29 | i = 0 30 | for decompressDim in self.decompressDims[:-1]: 31 | W = tf.get_variable('aed_W_' + str(i), shape=[tempDim, decompressDim]) 32 | b = tf.get_variable('aed_b_' + str(i), shape=[decompressDim]) 33 | tempVec = self.aeActivation(tf.add(tf.matmul(tempVec, W), b)) 34 | tempDim = decompressDim 35 | decodeVariables['aed_W_' + str(i)] = W 36 | decodeVariables['aed_b_' + str(i)] = b 37 | i += 1 38 | W = tf.get_variable('aed_W_' + str(i), shape=[tempDim, self.decompressDims[-1]]) 39 | b = tf.get_variable('aed_b_' + str(i), shape=[self.decompressDims[-1]]) 40 | decodeVariables['aed_W_' + str(i)] = W 41 | decodeVariables['aed_b_' + str(i)] = b 42 | 43 | if self.dataType == 'binary': 44 | x_reconst = tf.nn.sigmoid(tf.add(tf.matmul(tempVec, W), b)) 45 | loss = tf.reduce_mean(-tf.reduce_sum(x_input * tf.log(x_reconst + 1e-12) + (1. - x_input) * tf.log(1. 
- x_reconst + 1e-12), 1), 0) 46 | else: 47 | x_reconst = tf.nn.relu(tf.add(tf.matmul(tempVec, W), b)) 48 | loss = tf.reduce_mean((x_input - x_reconst) ** 2) 49 | 50 | return loss, decodeVariables 51 | 52 | @property 53 | def vars(self): 54 | return [var for var in tf.trainable_variables() if self.name in var.name] 55 | 56 | 57 | class Generator(object): 58 | def __init__(self, randomDim, l2scale, generatorDims, generatorActivation, dataType): 59 | self.randomDim = randomDim 60 | self.l2scale = l2scale 61 | self.generatorDims = generatorDims 62 | # self.bn_train = bn_train 63 | self.generatorActivation = generatorActivation 64 | # self.bnDecay = bnDecay 65 | self.dataType = dataType 66 | self.name = 'dpgan/fc/g_net' 67 | 68 | def __call__(self, z): 69 | tempVec = z 70 | tempDim = self.randomDim 71 | with tf.variable_scope(self.name, regularizer=tcl.l2_regularizer(self.l2scale)): 72 | for i, genDim in enumerate(self.generatorDims[:-1]): 73 | W = tf.get_variable('W_' + str(i), shape=[tempDim, genDim]) 74 | h = tf.matmul(tempVec, W) 75 | h2 = batch_norm(h) # GAN: batch_norm(h, decay=self.bnDecay, scale=True, is_training=self.bn_train, updates_collections=None) 76 | h3 = self.generatorActivation(h2) 77 | tempVec = h3 # GAN: + tempVec 78 | tempDim = genDim 79 | W = tf.get_variable('W' + str(i), shape=[tempDim, self.generatorDims[-1]]) # pylint: disable=undefined-loop-variable 80 | h = tf.matmul(tempVec, W) 81 | h2 = h # GAN: batch_norm(h, decay=self.bnDecay, scale=True, is_training=self.bn_train, updates_collections=None) 82 | 83 | if self.dataType == 'binary': 84 | h3 = tf.nn.sigmoid(h2) # GAN: tf.nn.tanh(h2) 85 | else: 86 | h3 = tf.nn.relu(h2) 87 | 88 | output = h3 # GAN: + tempVec 89 | return output 90 | 91 | @property 92 | def vars(self): 93 | return [var for var in tf.trainable_variables() if self.name in var.name] 94 | 95 | 96 | class Discriminator(object): 97 | def __init__(self, inputDim, discriminatorDims, discriminatorActivation, l2scale): 98 | self.inputDim = inputDim 99 | self.discriminatorDims = discriminatorDims 100 | self.discriminatorActivation = discriminatorActivation 101 | self.l2scale = l2scale 102 | self.name = 'dpgan/fc/d_net' 103 | 104 | def __call__(self, x_input, keepRate, reuse=False): 105 | # batchSize = tf.shape(x_input)[0] 106 | # inputMean = tf.reshape(tf.tile(tf.reduce_mean(x_input, 0), [batchSize]), (batchSize, self.inputDim)) 107 | # tempVec = tf.concat(axis = 1, values = [x_input, inputMean]) # https://stackoverflow.com/questions/41813665/tensorflow-slim-typeerror-expected-int32-got-list-containing-tensors-of-type 108 | # tempDim = self.inputDim * 2 # need in GAN 109 | tempVec = x_input 110 | tempDim = self.inputDim # remove in GAN 111 | with tf.variable_scope(self.name, reuse=reuse): # GAN: regularizer=tcl.l2_regularizer(self.l2scale) 112 | for i, discDim in enumerate(self.discriminatorDims[:-1]): 113 | W = tf.get_variable('W_' + str(i), shape=[tempDim, discDim]) 114 | b = tf.get_variable('b_' + str(i), shape=[discDim]) 115 | h = self.discriminatorActivation(tf.add(tf.matmul(tempVec, W), b)) 116 | # h = tf.nn.dropout(h, keepRate) # need in GAN 117 | tempVec = h 118 | tempDim = discDim 119 | W = tf.get_variable('W', shape=[tempDim, 1]) 120 | b = tf.get_variable('b', shape=[1]) 121 | y_hat = tf.squeeze(tf.add(tf.matmul(tempVec, W), b)) # need sigmoid in GAN 122 | 123 | return y_hat, self.name 124 | 125 | 126 | class buildDiscriminator(object): 127 | '''Generated data need to go through a decoder before enter discriminator, real data enter discriminator 
directly''' 128 | def __init__(self, inputDim, discriminatorDims, discriminatorActivation, decompressDims, aeActivation, dataType, l2scale): 129 | self.d = Discriminator(inputDim, discriminatorDims, discriminatorActivation, l2scale) # it contains a discriminator 130 | self.inputDim = inputDim 131 | self.decompressDims = decompressDims 132 | self.aeActivation = aeActivation 133 | self.dataType = dataType 134 | self.name = 'dpgan/fc/build_d_net' 135 | 136 | def __call__(self, x_real, x_fake, keepRate, decodeVariables, reuse=True): 137 | y_hat_real, self.name = self.d(x_real, keepRate, reuse=False) 138 | tempVec = x_fake 139 | i = 0 140 | for _ in self.decompressDims[:-1]: 141 | tempVec = self.aeActivation(tf.add(tf.matmul(tempVec, decodeVariables['aed_W_' + str(i)]), decodeVariables['aed_b_' + str(i)])) 142 | i += 1 143 | if self.dataType == 'binary': 144 | x_decoded = tf.nn.sigmoid(tf.add(tf.matmul(tempVec, decodeVariables['aed_W_' + str(i)]), decodeVariables['aed_b_' + str(i)])) 145 | else: 146 | x_decoded = tf.nn.relu(tf.add(tf.matmul(tempVec, decodeVariables['aed_W_' + str(i)]), decodeVariables['aed_b_' + str(i)])) 147 | y_hat_fake, self.name = self.d(x_decoded, keepRate, reuse=True) 148 | d_loss = -tf.reduce_mean(y_hat_real) + tf.reduce_mean(y_hat_fake) 149 | g_loss = -tf.reduce_mean(y_hat_fake) 150 | 151 | return d_loss, g_loss, y_hat_real, y_hat_fake, x_decoded 152 | 153 | @property 154 | def vars(self): 155 | return [var for var in tf.trainable_variables() if self.name in var.name] 156 | -------------------------------------------------------------------------------- /generative_models/dpgan/utilize.py: -------------------------------------------------------------------------------- 1 | # pylint: disable=unbalanced-tuple-unpacking 2 | import pickle 3 | 4 | from matplotlib.pylab import ( 5 | mean, array, nonzero, count_nonzero, putmask, around, split, clip, unique, where, concatenate, random 6 | ) 7 | 8 | from sklearn import linear_model 9 | from sklearn.model_selection import train_test_split 10 | from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score, roc_auc_score 11 | 12 | 13 | def data_readf(top): # pylint: disable=unused-argument 14 | '''Read MIMIC-III data''' 15 | with open('/home/xieliyan/Dropbox/GPU/Data/MIMIC-III/patient_vectors_1071.pickle', 'rb') as f_: # Original MIMIC-III data is in GPU1 16 | MIMIC_ICD9 = pickle.load(f_) # dictionary, each one is a list 17 | MIMIC_data = [] 18 | for value in MIMIC_ICD9: # dictionary to numpy array 19 | if mean(value) == 0.0: # skip all zero vectors, each patients should have as least one disease of course 20 | continue 21 | MIMIC_data.append(value) # amax(MIMIC_data): 540 22 | # MIMIC_data = age_filter(MIMIC_data) # remove those patients with age 18 or younger 23 | # MIMIC_data = binarize(array(MIMIC_data)) # binarize, non zero -> 1, average(MIMIC_data): , type(MIMIC_data[][]): 24 | # index, MIMIC_data = select_code(MIMIC_data, top) # should be done after binarize because we consider the frequency among different patients, select top codes and remove the patients that don't have at least one of these codes, see "applying deep learning to icd-9 multi-label classification from medical records" 25 | # MIMIC_data = MIMIC_data[:, index] # keep only those coordinates (features) correspondent to top ICD9 codes 26 | num_data = (array(MIMIC_data).shape)[0] # data number 27 | dim_data = (array(MIMIC_data).shape)[1] # data dimension 28 | return array(MIMIC_data), num_data, dim_data # (46520, 942) 46520 942 for whole 
dataset 29 | 30 | 31 | def c2b(train, generated, adj): 32 | '''Set the number of 1 in generated data as multiple time of in training data, the rest is set to 0 (or not)''' 33 | 34 | if count_nonzero(generated) <= count_nonzero(train): # special case: number of 1 in generated is <= train, all nonzero in train = 1 35 | putmask(generated, generated > 0, 1.0) 36 | return generated 37 | 38 | p = float(count_nonzero(train))/train.size # percentage of nonzero elements 39 | g = sorted(generated.flatten(), reverse=True) 40 | idx = int(around(adj*p*len(g))) # with adjustment 41 | v = g[idx] # any value large than this set to 1, o.w. to 0 42 | putmask(generated, generated<=v, 0.0) # due to the property of putmask, must first set 0 then set 1 43 | putmask(generated, generated>v, 1.0) 44 | print("Nonzero element portion in training data and adjustment value are:") 45 | print(p, adj) 46 | print("Nonzero element portion in generated data after adjustment of c2b function:") 47 | print(float(count_nonzero(generated))/generated.size) 48 | return generated 49 | 50 | 51 | def c2bcolwise(train, generated, adj): 52 | '''Set the number of 1 in each column in generated data the same as the same column in training data, the rest is set to 0. 53 | Network learn the joint distribution p(x1,...xd), then it should also learn the marginal distribution p(x1),...,p(xd), which 54 | is approximately the frequent of 1 (and 0) in each feature (coordinate) x1...xd, hence it make sense to do so. But 55 | by doing so we "force" the generated data have the same portion of 1 in each feature (coordinate) no matter how the network 56 | is trained (even not trained at all), this doesn't matters since features (coordinates) are dependent, p(x1,...xd) != p(x1)*...*p(xd) 57 | only setting the frequency of 1 in each feature (coordinate) is not enough, it also relies on the training of NN to learn the 58 | dependency among features (coordinates), i.e. conditional probability of x1...xd''' 59 | generated_new = [] # store new one 60 | s = train.sum(axis=0) 61 | print('Nonzero element in each feature (coordinate) in training data: ') 62 | print(list(map(int, s))) # not in scientific notation 63 | print("Adjustment value is: " + str(adj)) 64 | for col in range(len(s)): 65 | col_train = train[:,col] 66 | col_generated = generated[:,col] 67 | if count_nonzero(col_generated) <= count_nonzero(col_train): # special case: number of 1 in generated is <= train, all nonzero in train = 1 68 | putmask(col_generated, generated > 0, 1.0) 69 | generated_new.append(col_generated) 70 | continue 71 | g = sorted(col_generated, reverse=True) 72 | idx = int(adj*s[col]) # with adjustment 73 | v = g[idx] 74 | putmask(col_generated, col_generated<=v, 0.0) 75 | putmask(col_generated, col_generated>v, 1.0) 76 | generated_new.append(col_generated) 77 | generated_new = array(generated_new).T 78 | print('Nonzero element in each feature (coordinate) in generated data: ') 79 | print(list(map(int, generated_new.sum(axis=0)))) 80 | print('Portion of element that is match between training data and generated data') 81 | print(float(sum(train == generated_new))/(train.shape[0]*train.shape[1])) 82 | return generated_new 83 | 84 | 85 | def splitbycol(dataType, _VALIDATION_RATIO, col, MIMIC_data): 86 | '''Separate training and testing for each dimension (col), if we fix column col as label, 87 | we need to take _VALIDATION_RATIO of data with label 1 and _VALIDATION_RATIO of data with label 0 88 | and merge them together as testing set and leave the rest. 
Then balance the rest as training set 89 | by keeping whomever (0 or 1) is smaller and random select same number from the other one. 90 | Finally return training and testing set''' 91 | if dataType == 'binary': 92 | MIMIC_data = clip(MIMIC_data, 0, 1) 93 | _, c = split(MIMIC_data, col) # get column col 94 | if (unique(c).size == 1): # skip column: only one class 95 | return [], [] 96 | MIMIC_data_1 = MIMIC_data[nonzero(c), :][0] # Separate data matrix by label, label==1 97 | MIMIC_data_0 = MIMIC_data[where(c == 0)[0], :] 98 | trainX_1, testX_1 = train_test_split(MIMIC_data_1, test_size=_VALIDATION_RATIO, random_state=0) 99 | trainX_0, testX_0 = train_test_split(MIMIC_data_0, test_size=_VALIDATION_RATIO, random_state=0) 100 | testX = concatenate((testX_1, testX_0), axis=0) 101 | if len(trainX_1) == len(trainX_0): 102 | trainX = concatenate((trainX_1, trainX_0), axis=0) 103 | elif len(trainX_1) < len(trainX_0): 104 | temp_train, temp_test = train_test_split(trainX_0, test_size=len(trainX_1), random_state=0) 105 | trainX = concatenate((trainX_1, temp_test), axis=0) 106 | # testX = concatenate((testX, temp_train), axis=0) # can't merge, test set is already done 107 | else: 108 | temp_train, temp_test = train_test_split(trainX_1, test_size=len(trainX_0), random_state=0) 109 | trainX = concatenate((trainX_0, temp_test), axis=0) 110 | # testX = concatenate((testX, temp_train), axis=0) 111 | if ((array(trainX).shape)[0] == 0 or (array(testX).shape)[0] == 0): # skip column: no data point in training or testing set 112 | return [], [] 113 | return trainX, testX # 114 | 115 | 116 | 117 | def gene_check(col, x_gene): 118 | '''check if each column (coordinate) has one class or not, balance the data set then output''' 119 | _, c = split(x_gene, col) # get column col 120 | if (unique(c).size == 1): # skip column: only one class 121 | return [] 122 | x_gene_1 = x_gene[nonzero(c), :][0] 123 | x_gene_0 = x_gene[where(c == 0)[0], :] 124 | if len(x_gene_1) == len(x_gene_0): 125 | geneX = x_gene 126 | elif len(x_gene_1) < len(x_gene_0): 127 | temp_train, temp_test = train_test_split(x_gene_0, test_size=len(x_gene_1), random_state=0) 128 | geneX = concatenate((x_gene_1, temp_test), axis=0) 129 | else: 130 | temp_train, temp_test = train_test_split(x_gene_1, test_size=len(x_gene_0), random_state=0) 131 | geneX = concatenate((x_gene_0, temp_test), axis=0) 132 | if (array(geneX).shape)[0] == 0: 133 | return [] 134 | return x_gene 135 | 136 | 137 | def statistics(r, g, te, col): 138 | '''Column specific statistics (precision, recall(Sensitivity), f1-score, AUC)''' 139 | f_r, t_r = split(r, col) # separate feature and target 140 | f_g, t_g = split(g, col) 141 | f_te, t_te = split(te, col) # these 6 parts are all numpy array 142 | # t_g[t_g < 1.0] = 0 # hard decision boundary 143 | # t_g[t_g >= 0.5] = 1 144 | if (unique(t_r).size == 1) or (unique(t_g).size == 1): # if only those coordinates correspondent to top codes are kept, no coordinate should be skipped, if those patients that doesn't contain top ICD9 codes were removed, more coordinates will be skipped 145 | return [], [], [], [], [], [], [], [], [], [] 146 | model_r = linear_model.LogisticRegression() # logistic regression, if labels are all 0, this will cause: ValueError: This solver needs samples of at least 2 classes in the data, but the data contains only one class: 0 147 | model_r.fit(f_r, t_r) 148 | label_r = model_r.predict(f_te) # decision boundary is 0 149 | model_g = linear_model.LogisticRegression() 150 | model_g.fit(f_g, t_g) 151 | label_g = 
model_g.predict(f_te) 152 | precision_r = precision_score(t_te, label_r) # precision 153 | precision_g = precision_score(t_te, label_g) 154 | recall_r = recall_score(t_te, label_r) # recall 155 | recall_g = recall_score(t_te, label_g) 156 | acc_r = accuracy_score(t_te, label_r) # accuracy 157 | acc_g = accuracy_score(t_te, label_g) 158 | f1score_r = f1_score(t_te, label_r) # f1-score 159 | f1score_g = f1_score(t_te, label_g) 160 | auc_r = roc_auc_score(t_te, label_r) # AUC 161 | auc_g = roc_auc_score(t_te, label_g) 162 | 163 | return precision_r, precision_g, recall_r, recall_g, acc_r, acc_g, f1score_r, f1score_g, auc_r, auc_g 164 | 165 | 166 | def dwp(r, g, te, db=0.5, C=1.0): 167 | '''Dimension-wise prediction & dimension-wise probability; r for real, g for generated, te for test, all without separated feature and target, all are numpy arrays''' 168 | rv_pre = [] 169 | gv_pre = [] 170 | rv_pro = [] 171 | gv_pro = [] 172 | for i in range(len(r[0])): 173 | print(i) 174 | f_r, t_r = split(r, i) # separate feature and target 175 | f_g, t_g = split(g, i) 176 | f_te, t_te = split(te, i) # these six are all numpy arrays 177 | t_g[t_g < db ] = 0 # hard decision boundary 178 | t_g[t_g >= db ] = 1 179 | if (unique(t_r).size == 1) or (unique(t_g).size == 1): # if only the coordinates corresponding to the top codes are kept, no coordinate should be skipped; if the patients without any top ICD9 code were removed, more coordinates will be skipped 180 | print("skip this coordinate") 181 | continue 182 | model_r = linear_model.LogisticRegression(C=C) # logistic regression; if labels are all 0, this will cause: ValueError: This solver needs samples of at least 2 classes in the data, but the data contains only one class: 0 183 | model_r.fit(f_r, t_r) 184 | label_r = model_r.predict(f_te) 185 | model_g = linear_model.LogisticRegression(C=C) 186 | model_g.fit(f_g, t_g) 187 | label_g = model_g.predict(f_te) 188 | # print(label_r) 189 | # print(mean(model_r.coef_), count_nonzero(model_r.coef_), mean(model_g.coef_), count_nonzero(model_g.coef_)) # statistics of classifiers 190 | # rv.append(match(label_r, t_te)/(len(t_te)+10**(-10))) # simply match 191 | # gv.append(match(label_g, t_te)/(len(t_te)+10**(-10))) 192 | rv_pre.append(f1_score(t_te, label_r)) # F1 score 193 | gv_pre.append(f1_score(t_te, label_g)) 194 | # reg = linear_model.LinearRegression() # least square error 195 | # reg.fit(f_r, t_r) 196 | # target_r = reg.predict(f_te) 197 | # reg = linear_model.LinearRegression() 198 | # reg.fit(f_g, t_g) 199 | # target_g = reg.predict(f_te) 200 | # rv.append(square(linalg.norm(target_r-t_te))) 201 | # gv.append(square(linalg.norm(target_g-t_te))) 202 | rv_pro.append(float(count_nonzero(t_r))/len(t_r)) # dimension-wise probability, see "https://onlinecourses.science.psu.edu/stat504/node/28" 203 | gv_pro.append(float(count_nonzero(t_g))/len(t_g)) 204 | 205 | return rv_pre, gv_pre, rv_pro, gv_pro 206 | 207 | 208 | def load_MIMICIII(dataType, _VALIDATION_RATIO, top): 209 | MIMIC_data, num_data, dim_data = data_readf(top) 210 | if dataType == 'binary': 211 | MIMIC_data = clip(MIMIC_data, 0, 1) 212 | trainX, testX = train_test_split(MIMIC_data, test_size=_VALIDATION_RATIO, random_state=0) 213 | return trainX, testX, dim_data 214 | 215 | 216 | def fig_add_noise(List): 217 | '''add a small amount of noise to results so they are distinguishable on the figure''' 218 | print(len(List)) 219 | print(0.0001*random.randn(len(List))) 220 | List_new = List + 0.0001*random.randn(len(List)) 221 | return List_new 222 | 
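A minimal usage sketch of the c2b post-processing helper defined above (illustrative only, not part of the repository): it uses random stand-in arrays in place of the MIMIC-III patient vectors and assumes the repository root is on PYTHONPATH (with matplotlib and scikit-learn installed, since utilize.py imports them at module load).

import numpy as np
from generative_models.dpgan.utilize import c2b  # assumed import path; adjust if the package layout differs

rng = np.random.RandomState(0)
train = rng.binomial(1, 0.3, size=(200, 50)).astype(float)  # stand-in for binarized patient vectors
generated = rng.uniform(size=(200, 50))                     # raw (continuous) generator output
generated_bin = c2b(train, generated, adj=1.0)              # threshold so roughly 30% of entries become 1
print(np.unique(generated_bin))                             # array([0., 1.])

The adj argument scales the target fraction of ones relative to the training data; adj=1.0 matches it exactly (up to ties at the threshold). Note that c2b modifies the generated array in place before returning it.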
-------------------------------------------------------------------------------- /generative_models/gan.py: -------------------------------------------------------------------------------- 1 | """ 2 | Reference: Jinsung Yoon, Lydia N. Drumright, Mihaela van der Schaar, 3 | "Anonymization through Data Synthesis using Generative Adversarial Networks (ADS-GAN): 4 | A harmonizing advancement for AI in medicine," 5 | IEEE Journal of Biomedical and Health Informatics (JBHI), 2019. 6 | Paper link: https://ieeexplore.ieee.org/document/9034117 7 | Last updated Date: December 22th 2020 8 | Code author: Jinsung Yoon (jsyoon0823@gmail.com) 9 | 10 | Minor modifications made by Boris van Breugel (bv292@cam.ac.uk) and Evgeny Saveliev (e.s.saveliev@gmail.com). 11 | ----------------------------- 12 | adsgan.py 13 | - Generate synthetic data for GAN framework 14 | (1) Use original data to generate synthetic data 15 | """ 16 | 17 | #%% Import necessary packages 18 | import tensorflow as tf 19 | import numpy as np 20 | 21 | from tqdm import tqdm 22 | 23 | tf.compat.v1.disable_eager_execution() 24 | 25 | def gan(orig_data, params): 26 | """Generate synthetic data for ADSGAN framework. 27 | 28 | Args: 29 | orig_data: original data 30 | params: Network parameters 31 | mb_size: mini-batch size 32 | z_dim: random state dimension 33 | h_dim: hidden state dimension 34 | lambda: identifiability parameter 35 | iterations: training iterations 36 | 37 | Returns: 38 | synth_data: synthetically generated data 39 | """ 40 | 41 | # Reset the tensorflow graph 42 | tf.compat.v1.reset_default_graph() 43 | 44 | ## Parameters 45 | # Feature no 46 | x_dim = len(orig_data.columns) 47 | # Sample no 48 | no = len(orig_data) 49 | 50 | # Batch size 51 | mb_size = params['mb_size'] 52 | # Random variable dimension 53 | z_dim = params['z_dim'] 54 | # Hidden unit dimensions 55 | h_dim = params['h_dim'] 56 | # Training iterations 57 | iterations = params['iterations'] 58 | # GAN type 59 | gen_model_name = params['gen_model_name'] 60 | # WGAN-GP parameters 61 | lam = 10 62 | lr = 1e-4 63 | 64 | #%% Data Preprocessing 65 | orig_data = np.asarray(orig_data) 66 | 67 | def data_normalization(orig_data, epsilon = 1e-8): 68 | 69 | min_val = np.min(orig_data, axis=0) 70 | 71 | normalized_data = orig_data - min_val 72 | 73 | max_val = np.max(normalized_data, axis=0) 74 | normalized_data = normalized_data / (max_val + epsilon) 75 | 76 | normalization_params = {"min_val": min_val, "max_val": max_val} 77 | 78 | return normalized_data, normalization_params 79 | 80 | def data_renormalization(normalized_data, normalization_params, epsilon = 1e-8): 81 | 82 | renormalized_data = normalized_data * (normalization_params['max_val'] + epsilon) 83 | renormalized_data = renormalized_data + normalization_params['min_val'] 84 | 85 | return renormalized_data 86 | 87 | orig_data, normalization_params = data_normalization(orig_data) 88 | 89 | #%% Necessary Functions 90 | 91 | # Xavier Initialization Definition 92 | def xavier_init(size): 93 | in_dim = size[0] 94 | xavier_stddev = 1. / tf.sqrt(in_dim / 2.) 
95 | return tf.random.normal(shape = size, stddev = xavier_stddev) 96 | 97 | # Sample from uniform distribution 98 | def sample_Z(m, n): 99 | return np.random.uniform(-1., 1., size = [m, n]) 100 | 101 | # Sample from the real data 102 | def sample_X(m, n): 103 | return np.random.permutation(m)[:n] 104 | 105 | #%% Placeholder 106 | # Feature 107 | X = tf.compat.v1.placeholder(tf.float32, shape = [None, x_dim]) 108 | # Random Variable 109 | Z = tf.compat.v1.placeholder(tf.float32, shape = [None, z_dim]) 110 | 111 | #%% Discriminator 112 | # Discriminator 113 | D_W1 = tf.Variable(xavier_init([x_dim, h_dim])) 114 | D_b1 = tf.Variable(tf.zeros(shape=[h_dim])) 115 | 116 | D_W2 = tf.Variable(xavier_init([h_dim,h_dim])) 117 | D_b2 = tf.Variable(tf.zeros(shape=[h_dim])) 118 | 119 | D_W3 = tf.Variable(xavier_init([h_dim,1])) 120 | D_b3 = tf.Variable(tf.zeros(shape=[1])) 121 | 122 | theta_D = [D_W1, D_W2, D_W3, D_b1, D_b2, D_b3] 123 | 124 | #%% Generator 125 | G_W1 = tf.Variable(xavier_init([z_dim, h_dim])) 126 | G_b1 = tf.Variable(tf.zeros(shape=[h_dim])) 127 | 128 | G_W2 = tf.Variable(xavier_init([h_dim,h_dim])) 129 | G_b2 = tf.Variable(tf.zeros(shape=[h_dim])) 130 | 131 | G_W3 = tf.Variable(xavier_init([h_dim,h_dim])) 132 | G_b3 = tf.Variable(tf.zeros(shape=[h_dim])) 133 | 134 | G_W4 = tf.Variable(xavier_init([h_dim, x_dim])) 135 | G_b4 = tf.Variable(tf.zeros(shape=[x_dim])) 136 | 137 | theta_G = [G_W1, G_W2, G_W3, G_W4, G_b1, G_b2, G_b3, G_b4] 138 | 139 | #%% Generator and discriminator functions 140 | def generator(z): 141 | G_h1 = tf.nn.tanh(tf.matmul(z, G_W1) + G_b1) 142 | G_h2 = tf.nn.tanh(tf.matmul(G_h1, G_W2) + G_b2) 143 | G_h3 = tf.nn.tanh(tf.matmul(G_h2, G_W3) + G_b3) 144 | G_log_prob = tf.nn.sigmoid(tf.matmul(G_h3, G_W4) + G_b4) 145 | 146 | return G_log_prob 147 | 148 | def discriminator(x): 149 | D_h1 = tf.nn.relu(tf.matmul(x, D_W1) + D_b1) 150 | D_h2 = tf.nn.relu(tf.matmul(D_h1, D_W2) + D_b2) 151 | out = (tf.matmul(D_h2, D_W3) + D_b3) 152 | 153 | return out 154 | 155 | #%% Structure 156 | G_sample = generator(Z) 157 | D_real = discriminator(X) 158 | D_fake = discriminator(G_sample) 159 | 160 | if gen_model_name=='wgan': 161 | 162 | # Replacement of Clipping algorithm to Penalty term 163 | # 1. Line 6 in Algorithm 1 164 | eps = tf.random.uniform([mb_size, 1], minval = 0., maxval = 1.) 165 | X_inter = eps*X + (1. - eps) * G_sample 166 | 167 | # 2. 
Line 7 in Algorithm 1 168 | grad = tf.gradients(ys=discriminator(X_inter), xs=[X_inter])[0] 169 | grad_norm = tf.sqrt(tf.reduce_sum(input_tensor=(grad)**2 + 1e-8, axis = 1)) 170 | grad_pen = lam * tf.reduce_mean(input_tensor=(grad_norm - 1)**2) 171 | 172 | # Loss function 173 | D_loss = tf.reduce_mean(input_tensor=D_fake) - tf.reduce_mean(input_tensor=D_real) + grad_pen 174 | 175 | 176 | 177 | elif gen_model_name == 'gan': 178 | D_loss = tf.reduce_mean(input_tensor=D_fake) - tf.reduce_mean(input_tensor=D_real) 179 | 180 | G_loss = -tf.reduce_mean(input_tensor=D_fake) 181 | 182 | 183 | # Solver 184 | D_solver = (tf.compat.v1.train.AdamOptimizer(learning_rate = lr, beta1 = 0.5).minimize(D_loss, var_list = theta_D)) 185 | G_solver = (tf.compat.v1.train.AdamOptimizer(learning_rate = lr, beta1 = 0.5).minimize(G_loss, var_list = theta_G)) 186 | 187 | #%% Iterations 188 | sess = tf.compat.v1.Session() 189 | sess.run(tf.compat.v1.global_variables_initializer()) 190 | 191 | # Iterations 192 | for it in tqdm(range(iterations)): 193 | # Discriminator training 194 | for _ in range(5): 195 | Z_mb = sample_Z(mb_size, z_dim) 196 | 197 | X_idx = sample_X(no,mb_size) 198 | X_mb = orig_data[X_idx,:] 199 | 200 | _, D_loss_curr = sess.run([D_solver, D_loss], feed_dict = {X: X_mb, Z: Z_mb}) 201 | 202 | # Generator Training 203 | Z_mb = sample_Z(mb_size, z_dim) 204 | 205 | X_idx = sample_X(no,mb_size) 206 | X_mb = orig_data[X_idx,:] 207 | 208 | _, G_loss_curr = sess.run([G_solver, G_loss], feed_dict = {X: X_mb, Z: Z_mb}) 209 | 210 | #%% Output Generation 211 | synth_data = sess.run([G_sample], feed_dict = {Z: sample_Z(no, z_dim)}) 212 | synth_data = synth_data[0] 213 | 214 | # Renormalization 215 | synth_data = data_renormalization(synth_data, normalization_params) 216 | 217 | # Binary features 218 | for i in range(x_dim): 219 | if len(np.unique(orig_data[:, i])) == 2: 220 | synth_data[:, i] = np.round(synth_data[:, i]) 221 | 222 | return synth_data -------------------------------------------------------------------------------- /generative_models/pategan.py: -------------------------------------------------------------------------------- 1 | '''PATE-GAN function''' 2 | 3 | # Necessary packages 4 | import tensorflow as tf 5 | import numpy as np 6 | import warnings 7 | #warnings.filterwarnings("ignore") 8 | 9 | tf.compat.v1.disable_eager_execution() 10 | 11 | from sklearn.linear_model import LogisticRegression 12 | 13 | 14 | def pate_lambda (x, teacher_models, lambda_): 15 | '''Returns PATE_lambda(x). 16 | 17 | Args: 18 | - x: feature vector 19 | - teacher_models: a list of teacher models 20 | - lambda_: parameter 21 | 22 | Returns: 23 | - n0, n1: the number of label 0 and 1, respectively 24 | - out: label after adding laplace noise. 25 | ''' 26 | 27 | y_hat = list() 28 | 29 | for teacher in teacher_models: 30 | temp_y = teacher.predict(np.reshape(x, [1,-1])) 31 | y_hat = y_hat + [temp_y] 32 | 33 | y_hat = np.asarray(y_hat) 34 | n0 = sum(y_hat == 0) 35 | n1 = sum(y_hat == 1) 36 | 37 | lap_noise = np.random.laplace(loc=0.0, scale=lambda_) 38 | 39 | out = (n1+lap_noise) / float(n0+n1) 40 | out = int(out>0.5) 41 | 42 | return n0, n1, out 43 | 44 | 45 | def pategan(x_train, parameters): 46 | '''Basic PATE-GAN framework. 
47 | 48 | Args: 49 | - x_train: training data 50 | - parameters: PATE-GAN parameters 51 | - n_s: the number of student training iterations 52 | - batch_size: the batch size for training the student and generator 53 | - k: the number of teachers 54 | - epsilon, delta: Differential privacy parameters 55 | - lambda_: noise size 56 | 57 | Returns: 58 | - x_train_hat: training data generated by the differentially private generator 59 | ''' 60 | 61 | # Reset the graph 62 | tf.compat.v1.reset_default_graph() 63 | 64 | # PATE-GAN parameters 65 | # number of student training iterations 66 | n_s = parameters['n_s'] 67 | # batch size for student and generator training 68 | batch_size = parameters['batch_size'] 69 | # number of teachers 70 | k = parameters['k'] 71 | # epsilon 72 | epsilon = parameters['epsilon'] 73 | # delta 74 | delta = parameters['delta'] 75 | # lambda_ 76 | lambda_ = parameters['lambda'] 77 | 78 | # Other parameters 79 | # initialize alpha 80 | L = 20 81 | alpha = np.zeros([L]) 82 | # initialize epsilon_hat 83 | epsilon_hat = 0 84 | 85 | # Network parameters 86 | no, dim = x_train.shape 87 | # Random sample dimensions 88 | z_dim = int(dim) 89 | # Student hidden dimension 90 | student_h_dim = int(dim) 91 | # Generator hidden dimension 92 | generator_h_dim = int(4*dim) 93 | 94 | ## Partitioning the data into k subsets 95 | x_partition = list() 96 | partition_data_no = int(no/k) 97 | 98 | idx = np.random.permutation(no) 99 | 100 | for i in range(k): 101 | temp_idx = idx[int(i*partition_data_no):int((i+1)*partition_data_no)] 102 | temp_x = x_train[temp_idx, :] 103 | x_partition = x_partition + [temp_x] 104 | 105 | ## Necessary Functions for building NN models 106 | # Xavier Initialization Definition 107 | def xavier_init(size): 108 | in_dim = size[0] 109 | xavier_stddev = 1. / tf.sqrt(in_dim / 2.)
110 | return tf.random.normal(shape = size, stddev = xavier_stddev) 111 | 112 | # Sample from uniform distribution 113 | def sample_Z(m, n): 114 | return np.random.uniform(0., 1., size = [m, n]) 115 | 116 | ## Placeholder 117 | # PATE labels 118 | Y = tf.compat.v1.placeholder(tf.float32, shape = [None, 1]) 119 | # Random Variable 120 | Z = tf.compat.v1.placeholder(tf.float32, shape = [None, z_dim]) 121 | 122 | ## NN variables 123 | # Student 124 | S_W1 = tf.Variable(xavier_init([dim, student_h_dim])) 125 | S_b1 = tf.Variable(tf.zeros(shape=[student_h_dim])) 126 | 127 | S_W2 = tf.Variable(xavier_init([student_h_dim,1])) 128 | S_b2 = tf.Variable(tf.zeros(shape=[1])) 129 | 130 | theta_S = [S_W1, S_W2, S_b1, S_b2] 131 | 132 | # Generator 133 | 134 | G_W1 = tf.Variable(xavier_init([z_dim, generator_h_dim])) 135 | G_b1 = tf.Variable(tf.zeros(shape=[generator_h_dim])) 136 | 137 | G_W2 = tf.Variable(xavier_init([generator_h_dim,generator_h_dim])) 138 | G_b2 = tf.Variable(tf.zeros(shape=[generator_h_dim])) 139 | 140 | G_W3 = tf.Variable(xavier_init([generator_h_dim,dim])) 141 | G_b3 = tf.Variable(tf.zeros(shape=[dim])) 142 | 143 | theta_G = [G_W1, G_W2, G_W3, G_b1, G_b2, G_b3] 144 | 145 | ## Models 146 | def generator(z): 147 | G_h1 = tf.nn.tanh(tf.matmul(z, G_W1) + G_b1) 148 | G_h2 = tf.nn.tanh(tf.matmul(G_h1, G_W2) + G_b2) 149 | G_out = tf.nn.sigmoid(tf.matmul(G_h2, G_W3) + G_b3) 150 | 151 | return G_out 152 | 153 | def student(x): 154 | S_h1 = tf.nn.relu(tf.matmul(x, S_W1) + S_b1) 155 | S_out = tf.matmul(S_h1, S_W2) + S_b2 156 | 157 | return S_out 158 | 159 | ## Loss 160 | G_sample = generator(Z) 161 | S_fake = student(G_sample) 162 | 163 | S_loss = tf.reduce_mean(input_tensor=Y * S_fake) - tf.reduce_mean(input_tensor=(1-Y) * S_fake) 164 | G_loss = -tf.reduce_mean(input_tensor=S_fake) 165 | 166 | # Optimizer 167 | S_solver = (tf.compat.v1.train.RMSPropOptimizer(learning_rate=1e-4) 168 | .minimize(-S_loss, var_list=theta_S)) 169 | G_solver = (tf.compat.v1.train.RMSPropOptimizer(learning_rate=1e-4) 170 | .minimize(G_loss, var_list=theta_G)) 171 | 172 | clip_S = [p.assign(tf.clip_by_value(p, -0.01, 0.01)) for p in theta_S] 173 | 174 | ## Sessions 175 | sess = tf.compat.v1.Session() 176 | sess.run(tf.compat.v1.global_variables_initializer()) 177 | 178 | 179 | min_iterations = 1 180 | iteration=0 181 | ## Iterations 182 | while epsilon_hat < epsilon or iteration < min_iterations: 183 | iteration+=1 184 | 185 | # 1. Train teacher models 186 | teacher_models = list() 187 | 188 | for i in range(k): 189 | 190 | Z_mb = sample_Z(partition_data_no, z_dim) 191 | G_mb = sess.run(G_sample, feed_dict = {Z: Z_mb}) 192 | 193 | temp_x = x_partition[i] # teacher i trains only on its own data partition 194 | idx = np.random.permutation(len(temp_x[:, 0])) 195 | X_mb = temp_x[idx[:partition_data_no], :] 196 | 197 | X_comb = np.concatenate((X_mb, G_mb), axis = 0) 198 | Y_comb = np.concatenate((np.ones([partition_data_no,]), 199 | np.zeros([partition_data_no,])), axis = 0) 200 | 201 | model = LogisticRegression() 202 | model.fit(X_comb, Y_comb) 203 | teacher_models = teacher_models + [model] 204 | 205 | # 2.
Student training 206 | for _ in range(n_s): 207 | 208 | Z_mb = sample_Z(batch_size, z_dim) 209 | G_mb = sess.run(G_sample, feed_dict = {Z: Z_mb}) 210 | Y_mb = list() 211 | 212 | for j in range(batch_size): 213 | n0, n1, r_j = pate_lambda(G_mb[j, :], teacher_models, lambda_) 214 | Y_mb = Y_mb + [r_j] 215 | 216 | # Update moments accountant 217 | q = np.log(2 + lambda_ * abs(n0 - n1)) - np.log(4.0) - \ 218 | (lambda_ * abs(n0 - n1)) 219 | q = np.exp(q) 220 | 221 | # Compute alpha 222 | for l in range(L): 223 | temp1 = 2 * (lambda_**2) * (l+1) * (l+2) 224 | temp2 = (1-q) * ( ((1-q)/(1-q*np.exp(2*lambda_)))**(l+1) ) + \ 225 | q * np.exp(2*lambda_ * (l+1)) 226 | alpha[l] = alpha[l] + np.min([temp1, np.log(temp2)]) 227 | 228 | # PATE labels for G_mb 229 | Y_mb = np.reshape(np.asarray(Y_mb), [-1,1]) 230 | 231 | # Update student 232 | _, D_loss_curr, _ = sess.run([S_solver, S_loss, clip_S], 233 | feed_dict = {Z: Z_mb, Y: Y_mb}) 234 | 235 | # Generator Update 236 | Z_mb = sample_Z(batch_size, z_dim) 237 | _, G_loss_curr = sess.run([G_solver, G_loss], feed_dict = {Z: Z_mb}) 238 | print('G loss',G_loss_curr) 239 | print('D_loss', D_loss_curr) 240 | print(np.mean(Y_mb)) 241 | 242 | # epsilon_hat computation 243 | curr_list = list() 244 | for l in range(L): 245 | temp_alpha = (alpha[l] + np.log(1/delta)) / float(l+1) 246 | curr_list = curr_list + [temp_alpha] 247 | 248 | epsilon_hat = np.min(curr_list) 249 | print(epsilon_hat) 250 | 251 | ## Outputs 252 | x_train_hat = sess.run([G_sample], feed_dict = {Z: sample_Z(no, z_dim)})[0] 253 | 254 | for i in range(dim): 255 | if len(np.unique(x_train[:, i])) == 2: 256 | x_train_hat[:, i] = np.round(x_train_hat[:, i]) 257 | 258 | return x_train_hat 259 | 260 | 261 | ## Main 262 | if __name__ == '__main__': 263 | 264 | x_train = np.random.normal(0, 1, [10000,5]) 265 | 266 | # Normalization 267 | for i in range(len(x_train[0, :])): 268 | x_train[:, i] = x_train[:, i] - np.min(x_train[:, i]) 269 | x_train[:, i] = x_train[:, i] / (np.max(x_train[:, i]) + 1e-8) 270 | 271 | 272 | parameters = {'n_s': 1, 'batch_size': 1000, 273 | 'k': 100, 'epsilon': 100, 'delta': 0.0001, 'lambda': 1} 274 | 275 | x_train_new = pategan(x_train, parameters) 276 | -------------------------------------------------------------------------------- /generative_models/pategan_from_bitbucket.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Jinsung Yoon (0*/13/2018) 3 | PATEGAN 4 | ''' 5 | 6 | #%% Packages 7 | import tensorflow as tf 8 | import numpy as np 9 | from tqdm import tqdm 10 | 11 | #%% Function Start 12 | 13 | def pategan(X_train, Y_train, X_test, Y_test, params): 14 | epsilon = params['epsilon'] 15 | delta = params['delta'] 16 | niter = params['iterations'] 17 | num_teachers = params['k'] 18 | 19 | #%% Parameters 20 | # Batch size 21 | mb_size = 128 22 | 23 | # Feature no 24 | X_dim = len(X_train[0,:]) 25 | 26 | # Sample no 27 | no = len(X_train[:,0]) 28 | 29 | # Random variable dimension 30 | z_dim = int(X_dim/4) 31 | 32 | # Hidden unit dimensions 33 | h_dim = int(X_dim) 34 | 35 | C_dim = 1 36 | 37 | # WGAN-GP Parameters 38 | lam = 10 39 | lr = 1e-4 40 | 41 | lamda =np.sqrt(2*np.log(1.25*(10**(delta))))/epsilon 42 | 43 | #%% Data Preprocessing 44 | X_train = np.asarray(X_train) 45 | 46 | #%% Data Normalization 47 | Min_Val = np.min(X_train,0) 48 | 49 | X_train = X_train - Min_Val 50 | 51 | Max_Val = np.max(X_train,0) 52 | 53 | X_train = X_train / (Max_Val + 1e-8) 54 | 55 | #%% Algorithm Start 56 | 57 | #%% Necessary Functions 58 | 59 | 
# Xavier Initialization Definition 60 | def xavier_init(size): 61 | in_dim = size[0] 62 | xavier_stddev = 1. / tf.sqrt(in_dim / 2.) 63 | return tf.random_normal(shape = size, stddev = xavier_stddev) 64 | 65 | # Sample from uniform distribution 66 | def sample_Z(m, n): 67 | return np.random.uniform(-1., 1., size = [m, n]) 68 | 69 | # Sample from the real data 70 | def sample_X(m, n): 71 | return np.random.permutation(m)[:n] 72 | 73 | #%% Placeholder 74 | 75 | # Feature 76 | X = tf.placeholder(tf.float32, shape = [None, X_dim]) 77 | # Label 78 | Y = tf.placeholder(tf.float32, shape = [None, C_dim]) 79 | # Random Variable 80 | Z = tf.placeholder(tf.float32, shape = [None, z_dim]) 81 | # Conditional Variable 82 | M = tf.placeholder(tf.float32, shape = [None, C_dim]) 83 | 84 | #%% Discriminator 85 | # Discriminator 86 | D_W1 = tf.Variable(xavier_init([X_dim + C_dim, h_dim])) 87 | D_b1 = tf.Variable(tf.zeros(shape=[h_dim])) 88 | 89 | D_W2 = tf.Variable(xavier_init([h_dim,h_dim])) 90 | D_b2 = tf.Variable(tf.zeros(shape=[h_dim])) 91 | 92 | D_W3 = tf.Variable(xavier_init([h_dim,1])) 93 | D_b3 = tf.Variable(tf.zeros(shape=[1])) 94 | 95 | theta_D = [D_W1, D_W2, D_W3, D_b1, D_b2, D_b3] 96 | 97 | #%% Generator 98 | 99 | G_W1 = tf.Variable(xavier_init([z_dim + C_dim, h_dim])) 100 | G_b1 = tf.Variable(tf.zeros(shape=[h_dim])) 101 | 102 | G_W2 = tf.Variable(xavier_init([h_dim,h_dim])) 103 | G_b2 = tf.Variable(tf.zeros(shape=[h_dim])) 104 | 105 | G_W3 = tf.Variable(xavier_init([h_dim,X_dim])) 106 | G_b3 = tf.Variable(tf.zeros(shape=[X_dim])) 107 | 108 | theta_G = [G_W1, G_W2, G_W3, G_b1, G_b2, G_b3] 109 | 110 | #%% Functions 111 | def generator(z, y): 112 | inputs = tf.concat([z,y], axis = 1) 113 | G_h1 = tf.nn.tanh(tf.matmul(inputs, G_W1) + G_b1) 114 | G_h2 = tf.nn.tanh(tf.matmul(G_h1, G_W2) + G_b2) 115 | G_log_prob = tf.nn.sigmoid(tf.matmul(G_h2, G_W3) + G_b3) 116 | 117 | return G_log_prob 118 | 119 | def discriminator(x, y): 120 | inputs = tf.concat([x,y], axis = 1) 121 | D_h1 = tf.nn.relu(tf.matmul(inputs, D_W1) + D_b1) 122 | D_h2 = tf.nn.relu(tf.matmul(D_h1, D_W2) + D_b2) 123 | out = (tf.matmul(D_h2, D_W3) + D_b3) 124 | 125 | return out 126 | 127 | #%% 128 | # Structure 129 | G_sample = generator(Z, Y) 130 | D_real = discriminator(X, Y) 131 | D_fake = discriminator(G_sample, Y) 132 | 133 | #%% 134 | D_entire = tf.concat(axis = 0, values = [D_real, D_fake]) 135 | 136 | #%% 137 | 138 | # Replacement of Clipping algorithm to Penalty term 139 | # 1. Line 6 in Algorithm 1 140 | eps = tf.random_uniform([mb_size, 1], minval = 0., maxval = 1.) 141 | X_inter = eps*X + (1. - eps) * G_sample 142 | 143 | # 2. 
Line 7 in Algorithm 1 144 | grad = tf.gradients(discriminator(X_inter, Y), [X_inter, Y])[0] 145 | grad_norm = tf.sqrt(tf.reduce_sum((grad)**2 + 1e-8, axis = 1)) 146 | grad_pen = lam * tf.reduce_mean((grad_norm - 1)**2) 147 | 148 | # Loss function 149 | D_loss = tf.reduce_mean((1-M) * D_entire) - tf.reduce_mean(M * D_entire) + grad_pen 150 | G_loss = -tf.reduce_mean(D_fake) 151 | 152 | # Solver 153 | D_solver = (tf.train.AdamOptimizer(learning_rate = lr, beta1 = 0.5).minimize(D_loss, var_list = theta_D)) 154 | G_solver = (tf.train.AdamOptimizer(learning_rate = lr, beta1 = 0.5).minimize(G_loss, var_list = theta_G)) 155 | 156 | #%% 157 | # Sessions 158 | sess = tf.Session() 159 | sess.run(tf.global_variables_initializer()) 160 | 161 | #%% 162 | # Iterations 163 | for it in tqdm(range(niter)): 164 | 165 | for _ in range(num_teachers): 166 | #%% Teacher Training 167 | Z_mb = sample_Z(mb_size, z_dim) 168 | 169 | # Teacher 1 170 | X_idx = sample_X(no,mb_size) 171 | X_mb = X_train[X_idx,:] 172 | 173 | Y_mb = np.reshape(Y_train[X_idx], [mb_size,1]) 174 | 175 | #%% 176 | 177 | M_real = np.ones([mb_size,]) 178 | M_fake = np.zeros([mb_size,]) 179 | 180 | M_entire = np.concatenate((M_real, M_fake),0) 181 | 182 | Normal_Add = np.random.normal(loc=0.0, scale=lamda, size = mb_size*2) 183 | 184 | M_entire = M_entire + Normal_Add 185 | 186 | M_entire = (M_entire > 0.5) 187 | 188 | M_mb = np.reshape(M_entire.astype(float), (2*mb_size,1)) 189 | 190 | _, D_loss_curr = sess.run([D_solver, D_loss], feed_dict = {X: X_mb, Z: Z_mb, M: M_mb, Y: Y_mb}) 191 | 192 | 193 | #%% Generator Training 194 | 195 | Z_mb = sample_Z(mb_size, z_dim) 196 | 197 | X_idx = sample_X(no,mb_size) 198 | X_mb = X_train[X_idx,:] 199 | 200 | Y_mb = np.reshape(Y_train[X_idx], [mb_size,1]) 201 | 202 | _, G_loss_curr = sess.run([G_solver, G_loss], feed_dict = {Z: Z_mb, Y: Y_mb}) 203 | print(G_loss_curr) 204 | #%% 205 | 206 | #%% Output Generation 207 | 208 | New_X_train = sess.run([G_sample], feed_dict = {Z: sample_Z(len(X_train[:,0]), z_dim), Y: np.reshape(Y_train, [len(Y_train),1])}) 209 | 210 | New_X_train = New_X_train[0] 211 | 212 | #### Renormalization 213 | 214 | New_X_train = New_X_train * (Max_Val + 1e-8) 215 | 216 | New_X_train = New_X_train + Min_Val 217 | 218 | ## Testing 219 | 220 | New_X_test = sess.run([G_sample], feed_dict = {Z: sample_Z(len(X_test[:,0]), z_dim), Y: np.reshape(Y_test, [len(Y_test),1])}) 221 | 222 | New_X_test = New_X_test[0] 223 | 224 | #### Renormalization 225 | 226 | New_X_test = New_X_test * (Max_Val + 1e-8) 227 | 228 | New_X_test = New_X_test + Min_Val 229 | 230 | return New_X_train, Y_train, New_X_test, Y_test 231 | -------------------------------------------------------------------------------- /generative_models/rgan/experiments/settings/rgan-dp.txt: -------------------------------------------------------------------------------- 1 | { 2 | "custom_experiment": true, 3 | "settings_file": "", 4 | "data": "snp500", 5 | "num_samples": 505, 6 | "seq_length": 1259, 7 | "num_signals": 5, 8 | "normalise": false, 9 | "cond_dim": 0, 10 | "max_val": 1, 11 | "one_hot": false, 12 | "predict_labels": false, 13 | "scale": 0.1, 14 | "freq_low": 1.0, 15 | "freq_high": 5.0, 16 | "amplitude_low": 0.1, 17 | "amplitude_high": 0.9, 18 | "multivariate_mnist": false, 19 | "full_mnist": false, 20 | "data_load_from": "", 21 | "resample_rate_in_min": 15, 22 | "hidden_units_g": 100, 23 | "hidden_units_d": 100, 24 | "kappa": 1, 25 | "latent_dim": 10, 26 | "batch_mean": false, 27 | "learn_scale": false, 28 | "learning_rate": 0.1, 29 | 
"batch_size": 16, 30 | "num_epochs": 100, 31 | "D_rounds": 4, 32 | "G_rounds": 1, 33 | "use_time": false, 34 | "WGAN": false, 35 | "WGAN_clip": false, 36 | "shuffle": true, 37 | "wrong_labels": false, 38 | "identifier": "rgan-dp", 39 | "dp": true, 40 | "l2norm_bound": 1e-05, 41 | "batches_per_lot": 1, 42 | "dp_sigma": 1e-05, 43 | "num_generated_features": 5 44 | } -------------------------------------------------------------------------------- /generative_models/rgan/experiments/settings/rgan.txt: -------------------------------------------------------------------------------- 1 | { 2 | "custom_experiment": true, 3 | "settings_file": "", 4 | "data": "snp500", 5 | "num_samples": 505, 6 | "seq_length": 1259, 7 | "num_signals": 5, 8 | "normalise": false, 9 | "cond_dim": 0, 10 | "max_val": 1, 11 | "one_hot": false, 12 | "predict_labels": false, 13 | "scale": 0.1, 14 | "freq_low": 1.0, 15 | "freq_high": 5.0, 16 | "amplitude_low": 0.1, 17 | "amplitude_high": 0.9, 18 | "multivariate_mnist": false, 19 | "full_mnist": false, 20 | "data_load_from": "", 21 | "resample_rate_in_min": 15, 22 | "hidden_units_g": 100, 23 | "hidden_units_d": 100, 24 | "kappa": 1, 25 | "latent_dim": 10, 26 | "batch_mean": false, 27 | "learn_scale": false, 28 | "learning_rate": 0.1, 29 | "batch_size": 64, 30 | "num_epochs": 500, 31 | "D_rounds": 1, 32 | "G_rounds": 6, 33 | "use_time": false, 34 | "WGAN": false, 35 | "WGAN_clip": false, 36 | "shuffle": true, 37 | "wrong_labels": false, 38 | "identifier": "rgan", 39 | "dp": false, 40 | "l2norm_bound": 1e-05, 41 | "batches_per_lot": 1, 42 | "dp_sigma": null, 43 | "num_generated_features": 5 44 | } -------------------------------------------------------------------------------- /generative_models/rgan/experiments/settings/rgan_dp.txt: -------------------------------------------------------------------------------- 1 | { 2 | "custom_experiment": true, 3 | "settings_file": "", 4 | "data": "amsterdam:combds", 5 | "num_samples": 1000, 6 | "seq_length": 100, 7 | "num_signals": 70, 8 | "normalise": false, 9 | "cond_dim": 0, 10 | "max_val": 1, 11 | "one_hot": false, 12 | "predict_labels": false, 13 | "scale": 0.1, 14 | "freq_low": 1.0, 15 | "freq_high": 5.0, 16 | "amplitude_low": 0.1, 17 | "amplitude_high": 0.9, 18 | "multivariate_mnist": false, 19 | "full_mnist": false, 20 | "data_load_from": "", 21 | "resample_rate_in_min": 15, 22 | "hidden_units_g": 100, 23 | "hidden_units_d": 100, 24 | "kappa": 1, 25 | "latent_dim": 10, 26 | "batch_mean": false, 27 | "learn_scale": false, 28 | "learning_rate": 0.1, 29 | "batch_size": 128, 30 | "num_epochs": 500, 31 | "D_rounds": 3, 32 | "G_rounds": 1, 33 | "use_time": false, 34 | "WGAN": false, 35 | "WGAN_clip": false, 36 | "shuffle": true, 37 | "wrong_labels": false, 38 | "identifier": "rgan_dp", 39 | "dp": true, 40 | "l2norm_bound": 1e-05, 41 | "batches_per_lot": 1, 42 | "dp_sigma": 0.001, 43 | "num_generated_features": 70 44 | } -------------------------------------------------------------------------------- /generative_models/rgan/experiments/settings/sine.txt: -------------------------------------------------------------------------------- 1 | { 2 | "settings_file": "", 3 | "data": "sine", 4 | "num_samples": 14000, 5 | "seq_length": 30, 6 | "num_signals": 4, 7 | "normalise": false, 8 | "cond_dim": 0, 9 | "max_val": 1, 10 | "one_hot": false, 11 | "predict_labels": false, 12 | "scale": 0.1, 13 | "freq_low": 1.0, 14 | "freq_high": 5.0, 15 | "amplitude_low": 0.1, 16 | "amplitude_high": 0.9, 17 | "multivariate_mnist": false, 18 | "full_mnist": 
false, 19 | "data_load_from": "", 20 | "resample_rate_in_min": 15, 21 | "hidden_units_g": 100, 22 | "hidden_units_d": 100, 23 | "kappa": 1, 24 | "latent_dim": 5, 25 | "batch_mean": false, 26 | "learn_scale": false, 27 | "learning_rate": 0.1, 28 | "batch_size": 28, 29 | "num_epochs": 2, 30 | "D_rounds": 5, 31 | "G_rounds": 1, 32 | "use_time": false, 33 | "WGAN": false, 34 | "WGAN_clip": false, 35 | "shuffle": true, 36 | "wrong_labels": false, 37 | "identifier": "sine", 38 | "dp": false, 39 | "l2norm_bound": 1e-05, 40 | "batches_per_lot": 1, 41 | "dp_sigma": 1e-05, 42 | "num_generated_features": 4 43 | } -------------------------------------------------------------------------------- /generative_models/rgan/experiments/settings/test_modified.txt: -------------------------------------------------------------------------------- 1 | { 2 | "settings_file": "", 3 | "data": "sine", 4 | "num_samples": 14000, 5 | "seq_length": 30, 6 | "num_signals": 4, 7 | "normalise": false, 8 | "cond_dim": 0, 9 | "max_val": 1, 10 | "one_hot": false, 11 | "predict_labels": false, 12 | "scale": 0.1, 13 | "freq_low": 1.0, 14 | "freq_high": 5.0, 15 | "amplitude_low": 0.1, 16 | "amplitude_high": 0.9, 17 | "multivariate_mnist": false, 18 | "full_mnist": false, 19 | "data_load_from": "", 20 | "resample_rate_in_min": 15, 21 | "hidden_units_g": 100, 22 | "hidden_units_d": 100, 23 | "kappa": 1, 24 | "latent_dim": 5, 25 | "batch_mean": false, 26 | "learn_scale": false, 27 | "learning_rate": 0.1, 28 | "batch_size": 28, 29 | "num_epochs": 2, 30 | "D_rounds": 5, 31 | "G_rounds": 1, 32 | "use_time": false, 33 | "WGAN": false, 34 | "WGAN_clip": false, 35 | "shuffle": true, 36 | "wrong_labels": false, 37 | "identifier": "test", 38 | "dp": false, 39 | "l2norm_bound": 1e-05, 40 | "batches_per_lot": 1, 41 | "dp_sigma": 1e-05, 42 | "num_generated_features": 4 43 | } -------------------------------------------------------------------------------- /generative_models/timegan/__init__.py: -------------------------------------------------------------------------------- 1 | """Time-series Generative Adversarial Networks (TimeGAN) Codebase. 2 | 3 | Reference: Jinsung Yoon, Daniel Jarrett, Mihaela van der Schaar, 4 | "Time-series Generative Adversarial Networks," 5 | Neural Information Processing Systems (NeurIPS), 2019. 6 | 7 | Paper link: https://papers.nips.cc/paper/8789-time-series-generative-adversarial-networks 8 | 9 | Last updated Date: Jan 19th 2021 10 | Code author: Jinsung Yoon (jsyoon0823@gmail.com) 11 | Code updated by: Evgeny Saveliev (e.s.saveliev@gmail.com) 12 | 13 | ----------------------------- 14 | 15 | timegan.py 16 | utils.py 17 | 18 | """ 19 | 20 | from .timegan import timegan 21 | -------------------------------------------------------------------------------- /generative_models/timegan/timegan.py: -------------------------------------------------------------------------------- 1 | """ 2 | timegan.py 3 | 4 | Note: Use original data as training set to generator synthetic data (time-series) 5 | """ 6 | 7 | # Necessary Packages 8 | import numpy as np 9 | from .utils import ( # pylint: disable=relative-beyond-top-level 10 | extract_time, 11 | rnn_cell, 12 | random_generator, 13 | batch_generator, 14 | ) 15 | 16 | import tensorflow as tf 17 | 18 | 19 | def timegan(ori_data, parameters): 20 | """TimeGAN function. 
21 | 22 | Use original data as training set to generator synthetic data (time-series) 23 | 24 | Args: 25 | - ori_data: original time-series data 26 | - parameters: TimeGAN network parameters 27 | 28 | Returns: 29 | - generated_data: generated time-series data 30 | """ 31 | if parameters is None: 32 | parameters = dict() 33 | parameters["module"] = "gru" 34 | parameters["hidden_dim"] = 10 35 | parameters["num_layer"] = 3 36 | parameters["iterations"] = 20000 37 | parameters["batch_size"] = 128 38 | parameters["print_every_n_iters"] = 1000 39 | 40 | # Initialization on the Graph 41 | tf.reset_default_graph() 42 | 43 | # Basic Parameters 44 | no, seq_len, dim = np.asarray(ori_data).shape 45 | 46 | # Maximum sequence length and each sequence length 47 | ori_time, max_seq_len = extract_time(ori_data) 48 | 49 | def MinMaxScaler(data): 50 | """Min-Max Normalizer. 51 | 52 | Args: 53 | - data: raw data 54 | 55 | Returns: 56 | - norm_data: normalized data 57 | - min_val: minimum values (for renormalization) 58 | - max_val: maximum values (for renormalization) 59 | """ 60 | min_val = np.min(np.min(data, axis=0), axis=0) 61 | data = data - min_val 62 | 63 | max_val = np.max(np.max(data, axis=0), axis=0) 64 | norm_data = data / (max_val + 1e-7) 65 | 66 | return norm_data, min_val, max_val 67 | 68 | # Normalization 69 | ori_data, min_val, max_val = MinMaxScaler(ori_data) 70 | 71 | ## Build a RNN networks 72 | 73 | # Network Parameters 74 | hidden_dim = parameters["hidden_dim"] 75 | num_layers = parameters["num_layer"] 76 | iterations = parameters["iterations"] 77 | batch_size = parameters["batch_size"] 78 | module_name = parameters["module"] 79 | z_dim = dim 80 | gamma = 1 81 | 82 | batch_size = ori_data.shape[0] if ori_data.shape[0] < batch_size else batch_size 83 | 84 | # Input place holders 85 | X = tf.placeholder(tf.float32, [None, max_seq_len, dim], name="myinput_x") 86 | Z = tf.placeholder(tf.float32, [None, max_seq_len, z_dim], name="myinput_z") 87 | T = tf.placeholder(tf.int32, [None], name="myinput_t") 88 | 89 | def embedder(X, T): 90 | """Embedding network between original feature space to latent space. 91 | 92 | Args: 93 | - X: input time-series features 94 | - T: input time information 95 | 96 | Returns: 97 | - H: embeddings 98 | """ 99 | with tf.variable_scope("embedder", reuse=tf.AUTO_REUSE): 100 | e_cell = tf.nn.rnn_cell.MultiRNNCell([rnn_cell(module_name, hidden_dim) for _ in range(num_layers)]) 101 | e_outputs, e_last_states = tf.nn.dynamic_rnn(e_cell, X, dtype=tf.float32, sequence_length=T) 102 | H = tf.contrib.layers.fully_connected(e_outputs, hidden_dim, activation_fn=tf.nn.sigmoid) 103 | return H 104 | 105 | def recovery(H, T): 106 | """Recovery network from latent space to original space. 107 | 108 | Args: 109 | - H: latent representation 110 | - T: input time information 111 | 112 | Returns: 113 | - X_tilde: recovered data 114 | """ 115 | with tf.variable_scope("recovery", reuse=tf.AUTO_REUSE): 116 | r_cell = tf.nn.rnn_cell.MultiRNNCell([rnn_cell(module_name, hidden_dim) for _ in range(num_layers)]) 117 | r_outputs, r_last_states = tf.nn.dynamic_rnn(r_cell, H, dtype=tf.float32, sequence_length=T) 118 | X_tilde = tf.contrib.layers.fully_connected(r_outputs, dim, activation_fn=None) 119 | return X_tilde 120 | 121 | def generator(Z, T): 122 | """Generator function: Generate time-series data in latent space. 
123 | 124 | Args: 125 | - Z: random variables 126 | - T: input time information 127 | 128 | Returns: 129 | - E: generated embedding 130 | """ 131 | with tf.variable_scope("generator", reuse=tf.AUTO_REUSE): 132 | e_cell = tf.nn.rnn_cell.MultiRNNCell([rnn_cell(module_name, hidden_dim) for _ in range(num_layers)]) 133 | e_outputs, e_last_states = tf.nn.dynamic_rnn(e_cell, Z, dtype=tf.float32, sequence_length=T) 134 | E = tf.contrib.layers.fully_connected(e_outputs, hidden_dim, activation_fn=tf.nn.sigmoid) 135 | return E 136 | 137 | def supervisor(H, T): 138 | """Generate next sequence using the previous sequence. 139 | 140 | Args: 141 | - H: latent representation 142 | - T: input time information 143 | 144 | Returns: 145 | - S: generated sequence based on the latent representations generated by the generator 146 | """ 147 | with tf.variable_scope("supervisor", reuse=tf.AUTO_REUSE): 148 | e_cell = tf.nn.rnn_cell.MultiRNNCell([rnn_cell(module_name, hidden_dim) for _ in range(num_layers - 1)]) 149 | e_outputs, e_last_states = tf.nn.dynamic_rnn(e_cell, H, dtype=tf.float32, sequence_length=T) 150 | S = tf.contrib.layers.fully_connected(e_outputs, hidden_dim, activation_fn=tf.nn.sigmoid) 151 | return S 152 | 153 | def discriminator(H, T): 154 | """Discriminate the original and synthetic time-series data. 155 | 156 | Args: 157 | - H: latent representation 158 | - T: input time information 159 | 160 | Returns: 161 | - Y_hat: classification results between original and synthetic time-series 162 | """ 163 | with tf.variable_scope("discriminator", reuse=tf.AUTO_REUSE): 164 | d_cell = tf.nn.rnn_cell.MultiRNNCell([rnn_cell(module_name, hidden_dim) for _ in range(num_layers)]) 165 | d_outputs, d_last_states = tf.nn.dynamic_rnn(d_cell, H, dtype=tf.float32, sequence_length=T) 166 | Y_hat = tf.contrib.layers.fully_connected(d_outputs, 1, activation_fn=None) 167 | return Y_hat 168 | 169 | # Embedder & Recovery 170 | H = embedder(X, T) 171 | X_tilde = recovery(H, T) 172 | 173 | # Generator 174 | E_hat = generator(Z, T) 175 | H_hat = supervisor(E_hat, T) 176 | H_hat_supervise = supervisor(H, T) 177 | 178 | # Synthetic data 179 | X_hat = recovery(H_hat, T) 180 | 181 | # Discriminator 182 | Y_fake = discriminator(H_hat, T) 183 | Y_real = discriminator(H, T) 184 | Y_fake_e = discriminator(E_hat, T) 185 | 186 | # Variables 187 | e_vars = [v for v in tf.trainable_variables() if v.name.startswith("embedder")] 188 | r_vars = [v for v in tf.trainable_variables() if v.name.startswith("recovery")] 189 | g_vars = [v for v in tf.trainable_variables() if v.name.startswith("generator")] 190 | s_vars = [v for v in tf.trainable_variables() if v.name.startswith("supervisor")] 191 | d_vars = [v for v in tf.trainable_variables() if v.name.startswith("discriminator")] 192 | 193 | # Discriminator loss 194 | D_loss_real = tf.losses.sigmoid_cross_entropy(tf.ones_like(Y_real), Y_real) 195 | D_loss_fake = tf.losses.sigmoid_cross_entropy(tf.zeros_like(Y_fake), Y_fake) 196 | D_loss_fake_e = tf.losses.sigmoid_cross_entropy(tf.zeros_like(Y_fake_e), Y_fake_e) 197 | D_loss = D_loss_real + D_loss_fake + gamma * D_loss_fake_e 198 | 199 | # Generator loss 200 | # 1. Adversarial loss 201 | G_loss_U = tf.losses.sigmoid_cross_entropy(tf.ones_like(Y_fake), Y_fake) 202 | G_loss_U_e = tf.losses.sigmoid_cross_entropy(tf.ones_like(Y_fake_e), Y_fake_e) 203 | 204 | # 2. Supervised loss 205 | G_loss_S = tf.losses.mean_squared_error(H[:, 1:, :], H_hat_supervise[:, :-1, :]) 206 | 207 | # 3. 
Two Moments 208 | G_loss_V1 = tf.reduce_mean( 209 | tf.abs(tf.sqrt(tf.nn.moments(X_hat, [0])[1] + 1e-6) - tf.sqrt(tf.nn.moments(X, [0])[1] + 1e-6)) 210 | ) 211 | G_loss_V2 = tf.reduce_mean(tf.abs((tf.nn.moments(X_hat, [0])[0]) - (tf.nn.moments(X, [0])[0]))) 212 | 213 | G_loss_V = G_loss_V1 + G_loss_V2 214 | 215 | # 4. Summation 216 | G_loss = G_loss_U + gamma * G_loss_U_e + 100 * tf.sqrt(G_loss_S) + 100 * G_loss_V 217 | 218 | # Embedder network loss 219 | E_loss_T0 = tf.losses.mean_squared_error(X, X_tilde) 220 | E_loss0 = 10 * tf.sqrt(E_loss_T0) 221 | E_loss = E_loss0 + 0.1 * G_loss_S 222 | 223 | # optimizer 224 | E0_solver = tf.train.AdamOptimizer().minimize(E_loss0, var_list=e_vars + r_vars) 225 | E_solver = tf.train.AdamOptimizer().minimize(E_loss, var_list=e_vars + r_vars) 226 | D_solver = tf.train.AdamOptimizer().minimize(D_loss, var_list=d_vars) 227 | G_solver = tf.train.AdamOptimizer().minimize(G_loss, var_list=g_vars + s_vars) 228 | GS_solver = tf.train.AdamOptimizer().minimize(G_loss_S, var_list=g_vars + s_vars) 229 | 230 | ## TimeGAN training 231 | sess = tf.Session() 232 | sess.run(tf.global_variables_initializer()) 233 | 234 | # 1. Embedding network training 235 | print("Start Embedding Network Training") 236 | 237 | for itt in range(iterations): 238 | # Set mini-batch 239 | X_mb, T_mb = batch_generator(ori_data, ori_time, batch_size) 240 | # Train embedder 241 | _, step_e_loss = sess.run([E0_solver, E_loss_T0], feed_dict={X: X_mb, T: T_mb}) 242 | # Checkpoint 243 | if itt % parameters["print_every_n_iters"] == 0: 244 | print("step: " + str(itt) + "/" + str(iterations) + ", e_loss: " + str(np.round(np.sqrt(step_e_loss), 4))) 245 | 246 | print("Finish Embedding Network Training") 247 | 248 | # 2. Training only with supervised loss 249 | print("Start Training with Supervised Loss Only") 250 | 251 | for itt in range(iterations): 252 | # Set mini-batch 253 | X_mb, T_mb = batch_generator(ori_data, ori_time, batch_size) 254 | # Random vector generation 255 | Z_mb = random_generator(batch_size, z_dim, T_mb, max_seq_len) 256 | # Train generator 257 | _, step_g_loss_s = sess.run([GS_solver, G_loss_S], feed_dict={Z: Z_mb, X: X_mb, T: T_mb}) 258 | # Checkpoint 259 | if itt % parameters["print_every_n_iters"] == 0: 260 | print("step: " + str(itt) + "/" + str(iterations) + ", s_loss: " + str(np.round(np.sqrt(step_g_loss_s), 4))) 261 | 262 | print("Finish Training with Supervised Loss Only") 263 | 264 | # 3. 
Joint Training 265 | print("Start Joint Training") 266 | 267 | for itt in range(iterations): 268 | # Generator training (twice more than discriminator training) 269 | for kk in range(2): 270 | # Set mini-batch 271 | X_mb, T_mb = batch_generator(ori_data, ori_time, batch_size) 272 | # Random vector generation 273 | Z_mb = random_generator(batch_size, z_dim, T_mb, max_seq_len) 274 | # Train generator 275 | _, step_g_loss_u, step_g_loss_s, step_g_loss_v = sess.run( 276 | [G_solver, G_loss_U, G_loss_S, G_loss_V], feed_dict={Z: Z_mb, X: X_mb, T: T_mb} 277 | ) 278 | # Train embedder 279 | _, step_e_loss_t0 = sess.run([E_solver, E_loss_T0], feed_dict={Z: Z_mb, X: X_mb, T: T_mb}) 280 | 281 | # Discriminator training 282 | # Set mini-batch 283 | X_mb, T_mb = batch_generator(ori_data, ori_time, batch_size) 284 | # Random vector generation 285 | Z_mb = random_generator(batch_size, z_dim, T_mb, max_seq_len) 286 | # Check discriminator loss before updating 287 | check_d_loss = sess.run(D_loss, feed_dict={X: X_mb, T: T_mb, Z: Z_mb}) 288 | # Train discriminator (only when the discriminator does not work well) 289 | if check_d_loss > 0.15: 290 | _, step_d_loss = sess.run([D_solver, D_loss], feed_dict={X: X_mb, T: T_mb, Z: Z_mb}) 291 | 292 | # Print multiple checkpoints 293 | if itt % parameters["print_every_n_iters"] == 0: 294 | print( 295 | "step: " 296 | + str(itt) 297 | + "/" 298 | + str(iterations) 299 | + ", d_loss: " 300 | + str(np.round(step_d_loss, 4)) 301 | + ", g_loss_u: " 302 | + str(np.round(step_g_loss_u, 4)) 303 | + ", g_loss_s: " 304 | + str(np.round(np.sqrt(step_g_loss_s), 4)) 305 | + ", g_loss_v: " 306 | + str(np.round(step_g_loss_v, 4)) 307 | + ", e_loss_t0: " 308 | + str(np.round(np.sqrt(step_e_loss_t0), 4)) 309 | ) 310 | print("Finish Joint Training") 311 | 312 | ## Synthetic data generation 313 | Z_mb = random_generator(no, z_dim, ori_time, max_seq_len) 314 | generated_data_curr = sess.run(X_hat, feed_dict={Z: Z_mb, X: ori_data, T: ori_time}) 315 | 316 | generated_data = list() 317 | 318 | for i in range(no): 319 | temp = generated_data_curr[i, : ori_time[i], :] 320 | generated_data.append(temp) 321 | 322 | # Renormalization 323 | generated_data = generated_data * max_val 324 | generated_data = generated_data + min_val 325 | 326 | return generated_data 327 | -------------------------------------------------------------------------------- /generative_models/timegan/utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | utils.py 3 | 4 | (1) train_test_divide: Divide train and test data for both original and synthetic data. 5 | (2) extract_time: Returns Maximum sequence length and each sequence length. 6 | (3) rnn_cell: Basic RNN Cell. 7 | (4) random_generator: random vector generator 8 | (5) batch_generator: mini-batch generator 9 | """ 10 | 11 | ## Necessary Packages 12 | import numpy as np 13 | 14 | import tensorflow.compat.v1 as tf 15 | tf.disable_v2_behavior() 16 | 17 | 18 | def train_test_divide(data_x, data_x_hat, data_t, data_t_hat, train_rate=0.8): 19 | """Divide train and test data for both original and synthetic data. 
20 | 21 | Args: 22 | - data_x: original data 23 | - data_x_hat: generated data 24 | - data_t: original time 25 | - data_t_hat: generated time 26 | - train_rate: ratio of training data from the original data 27 | """ 28 | # Divide train/test index (original data) 29 | no = len(data_x) 30 | idx = np.random.permutation(no) 31 | train_idx = idx[: int(no * train_rate)] 32 | test_idx = idx[int(no * train_rate) :] 33 | 34 | train_x = [data_x[i] for i in train_idx] 35 | test_x = [data_x[i] for i in test_idx] 36 | train_t = [data_t[i] for i in train_idx] 37 | test_t = [data_t[i] for i in test_idx] 38 | 39 | # Divide train/test index (synthetic data) 40 | no = len(data_x_hat) 41 | idx = np.random.permutation(no) 42 | train_idx = idx[: int(no * train_rate)] 43 | test_idx = idx[int(no * train_rate) :] 44 | 45 | train_x_hat = [data_x_hat[i] for i in train_idx] 46 | test_x_hat = [data_x_hat[i] for i in test_idx] 47 | train_t_hat = [data_t_hat[i] for i in train_idx] 48 | test_t_hat = [data_t_hat[i] for i in test_idx] 49 | 50 | return train_x, train_x_hat, test_x, test_x_hat, train_t, train_t_hat, test_t, test_t_hat 51 | 52 | 53 | def extract_time(data): 54 | """Returns Maximum sequence length and each sequence length. 55 | 56 | Args: 57 | - data: original data 58 | 59 | Returns: 60 | - time: extracted time information 61 | - max_seq_len: maximum sequence length 62 | """ 63 | time = list() 64 | max_seq_len = 0 65 | for i in range(len(data)): 66 | max_seq_len = max(max_seq_len, len(data[i][:, 0])) 67 | time.append(len(data[i][:, 0])) 68 | 69 | return time, max_seq_len 70 | 71 | 72 | def rnn_cell(module_name, hidden_dim): 73 | """Basic RNN Cell. 74 | 75 | Args: 76 | - module_name: gru, lstm, or lstmLN 77 | 78 | Returns: 79 | - rnn_cell: RNN Cell 80 | """ 81 | assert module_name in ["gru", "lstm", "lstmLN"] 82 | 83 | # GRU 84 | if module_name == "gru": 85 | rnn_cell = tf.nn.rnn_cell.GRUCell(num_units=hidden_dim, activation=tf.nn.tanh) 86 | # LSTM 87 | elif module_name == "lstm": 88 | rnn_cell = tf.contrib.rnn.BasicLSTMCell(num_units=hidden_dim, activation=tf.nn.tanh) 89 | # LSTM Layer Normalization 90 | elif module_name == "lstmLN": 91 | rnn_cell = tf.contrib.rnn.LayerNormBasicLSTMCell(num_units=hidden_dim, activation=tf.nn.tanh) 92 | return rnn_cell 93 | 94 | 95 | def random_generator(batch_size, z_dim, T_mb, max_seq_len): 96 | """Random vector generation. 97 | 98 | Args: 99 | - batch_size: size of the random vector 100 | - z_dim: dimension of random vector 101 | - T_mb: time information for the random vector 102 | - max_seq_len: maximum sequence length 103 | 104 | Returns: 105 | - Z_mb: generated random vector 106 | """ 107 | Z_mb = list() 108 | for i in range(batch_size): 109 | temp = np.zeros([max_seq_len, z_dim]) 110 | temp_Z = np.random.uniform(0.0, 1, [T_mb[i], z_dim]) 111 | temp[: T_mb[i], :] = temp_Z 112 | Z_mb.append(temp_Z) 113 | return Z_mb 114 | 115 | 116 | def batch_generator(data, time, batch_size): 117 | """Mini-batch generator. 
118 | 119 | Args: 120 | - data: time-series data 121 | - time: time information 122 | - batch_size: the number of samples in each batch 123 | 124 | Returns: 125 | - X_mb: time-series data in each batch 126 | - T_mb: time information in each batch 127 | """ 128 | no = len(data) 129 | idx = np.random.permutation(no) 130 | train_idx = idx[:batch_size] 131 | 132 | X_mb = list(data[i] for i in train_idx) 133 | T_mb = list(time[i] for i in train_idx) 134 | 135 | return X_mb, T_mb 136 | -------------------------------------------------------------------------------- /generative_models/vae.py: -------------------------------------------------------------------------------- 1 | """ 2 | Code author: Boris van Breugel (bv292@cam.ac.uk) 3 | 4 | Based on code by Jinsung Yoon (jsyoon0823@gmail.com) 5 | 6 | ----------------------------- 7 | 8 | Generate synthetic data with VAE framework 9 | (1) Use original data to generate synthetic data 10 | """ 11 | 12 | #%% Import necessary packages 13 | import tensorflow as tf 14 | import numpy as np 15 | 16 | from tqdm import tqdm 17 | 18 | 19 | def vae(orig_data, params): 20 | """Generate synthetic data for VAE framework. 21 | 22 | Args: 23 | orig_data: original data 24 | params: Network parameters 25 | mb_size: mini-batch size 26 | z_dim: random state dimension 27 | h_dim: hidden state dimension 28 | lambda: identifiability parameter 29 | iterations: training iterations 30 | 31 | Returns: 32 | synth_data: synthetically generated data 33 | """ 34 | 35 | # Reset the tensorflow graph 36 | tf.compat.v1.reset_default_graph() 37 | 38 | ## Parameters 39 | # Feature no 40 | x_dim = len(orig_data.columns) 41 | # X_recon no 42 | no = len(orig_data) 43 | 44 | # Batch size 45 | mb_size = params['mb_size'] 46 | # Latent representation dimension 47 | z_dim = params['z_dim'] 48 | # Hidden unit dimensions 49 | h_dim = params['h_dim'] 50 | # Identifiability parameter 51 | 52 | # Training iterations 53 | iterations = params['iterations'] 54 | # VAE type 55 | lr = 1e-4 56 | 57 | #%% Data Preprocessing 58 | orig_data = np.asarray(orig_data) 59 | 60 | def data_normalization(orig_data, epsilon = 1e-8): 61 | 62 | min_val = np.min(orig_data, axis=0) 63 | 64 | normalized_data = orig_data - min_val 65 | 66 | max_val = np.max(normalized_data, axis=0) 67 | normalized_data = normalized_data / (max_val + epsilon) 68 | 69 | normalization_params = {"min_val": min_val, "max_val": max_val} 70 | 71 | return normalized_data, normalization_params 72 | 73 | def data_renormalization(normalized_data, normalization_params, epsilon = 1e-8): 74 | 75 | renormalized_data = normalized_data * (normalization_params['max_val'] + epsilon) 76 | renormalized_data = renormalized_data + normalization_params['min_val'] 77 | 78 | return renormalized_data 79 | 80 | orig_data, normalization_params = data_normalization(orig_data) 81 | 82 | #%% Necessary Functions 83 | 84 | # Xavier Initialization Definition 85 | def xavier_init(size): 86 | in_dim = size[0] 87 | xavier_stddev = 1. / tf.sqrt(in_dim / 2.) 
88 | return tf.random.normal(shape = size, stddev = xavier_stddev) 89 | 90 | # X_recon from uniform distribution 91 | def X_recon_Z(m, n): 92 | return np.random.randn(m, n) 93 | 94 | # X_recon from the real data 95 | def X_recon_X(m, n): 96 | return np.random.permutation(m)[:n] 97 | 98 | def sample_Z(m,n): 99 | return tf.random.normal((m,n), 0, 1, dtype=tf.float32) 100 | 101 | #%% Placeholder 102 | # Feature 103 | X = tf.compat.v1.placeholder(tf.float32, shape = [None, x_dim]) 104 | X_recon = tf.compat.v1.placeholder(tf.float32, shape = [None, x_dim]) 105 | # Random Variable 106 | Z = tf.compat.v1.placeholder(tf.float32, shape = [None, z_dim]) 107 | mu = tf.compat.v1.placeholder(tf.float32, shape = [None, z_dim]) 108 | logvar = tf.compat.v1.placeholder(tf.float32, shape = [None, z_dim]) 109 | 110 | 111 | #%% Encoder 112 | E_W1 = tf.Variable(xavier_init([x_dim, h_dim])) 113 | E_b1 = tf.Variable(tf.zeros(shape=[h_dim])) 114 | 115 | E_W2e = tf.Variable(xavier_init([h_dim, h_dim])) 116 | E_b2e = tf.Variable(tf.zeros(shape=[h_dim])) 117 | 118 | 119 | E_W_sigma = tf.Variable(xavier_init([h_dim,z_dim])) 120 | E_b_sigma = tf.Variable(tf.zeros(shape=[z_dim])) 121 | 122 | E_W_mu = tf.Variable(xavier_init([h_dim,z_dim])) 123 | E_b_mu = tf.Variable(tf.zeros(shape=[z_dim])) 124 | 125 | 126 | # Decoder 127 | 128 | 129 | D_W3 = tf.Variable(xavier_init([z_dim,h_dim])) 130 | D_b3 = tf.Variable(tf.zeros(shape=[h_dim])) 131 | 132 | D_W2d = tf.Variable(xavier_init([h_dim, h_dim])) 133 | D_b2d = tf.Variable(tf.zeros(shape=[h_dim])) 134 | 135 | 136 | D_W4 = tf.Variable(xavier_init([h_dim, x_dim])) 137 | D_b4 = tf.Variable(tf.zeros(shape=[x_dim])) 138 | 139 | theta = [E_W1, E_W_sigma, E_W_mu, D_W3, D_W4, E_b1, 140 | E_b_mu, E_b_sigma, D_b3, D_b4, 141 | E_W2e, E_b2e, D_W2d, D_b2d] 142 | 143 | #%% Generator and discriminator functions 144 | def encoder(x): 145 | E_h1 = tf.nn.tanh(tf.matmul(x, E_W1) + E_b1) 146 | E_h2 = tf.nn.tanh(tf.matmul(E_h1, E_W2e) + E_b2e) 147 | E_hmu = tf.nn.tanh(tf.matmul(E_h2, E_W_mu) + E_b_mu) 148 | E_hsigma = tf.matmul(E_h1, E_W_sigma) + E_b_sigma 149 | return E_hmu, E_hsigma 150 | 151 | def decoder(z): 152 | D_h3 = tf.nn.tanh(tf.matmul(z, D_W3) + D_b3) 153 | D_h4 = tf.nn.tanh(tf.matmul(D_h3, D_W2d) + D_b2d) 154 | x_recon = tf.nn.sigmoid(tf.matmul(D_h4, D_W4) + D_b4) 155 | return x_recon 156 | 157 | 158 | 159 | #%% Structure 160 | mu, logvar = encoder(X) 161 | Z = mu + tf.exp(logvar/2) * tf.random.normal(tf.shape(input=mu), 0, 1, dtype=tf.float32) 162 | 163 | X_recon = decoder(Z) 164 | 165 | 166 | 167 | 168 | loss1 = tf.reduce_mean(input_tensor=tf.square(X_recon-X)) 169 | loss2 = 0.5 * tf.reduce_mean(input_tensor=tf.square(mu) + tf.exp(logvar) - logvar - 1, axis=1) 170 | 171 | loss = loss1 + loss2 172 | # Solver 173 | 174 | solver = (tf.compat.v1.train.AdamOptimizer(learning_rate = lr, beta1 = 0.5).minimize(loss, var_list = theta)) 175 | 176 | #%% Iterations 177 | sess = tf.compat.v1.Session() 178 | sess.run(tf.compat.v1.global_variables_initializer()) 179 | 180 | # Iterations 181 | for it in tqdm(range(iterations)): 182 | # Discriminator training 183 | 184 | X_idx = X_recon_X(no,mb_size) 185 | X_mb = orig_data[X_idx,:] 186 | 187 | _, E_loss1_curr, E_loss2_curr = sess.run([solver, loss1, loss2], feed_dict = {X: X_mb}) 188 | 189 | #%% Output Generation 190 | synth_data = sess.run([X_recon], feed_dict = {Z: np.random.randn(no, z_dim)}) 191 | synth_data = synth_data[0] 192 | print(synth_data.shape) 193 | 194 | # Renormalization 195 | synth_data = data_renormalization(synth_data, 
normalization_params) 196 | 197 | # Binary features 198 | for i in range(x_dim): 199 | if len(np.unique(orig_data[:, i])) == 2: 200 | synth_data[:, i] = np.round(synth_data[:, i]) 201 | 202 | return synth_data -------------------------------------------------------------------------------- /main_timeseries.py: -------------------------------------------------------------------------------- 1 | """Time series data generation. 2 | 3 | Author: Evgeny Saveliev (e.s.saveliev@gmail.com) 4 | """ 5 | import os 6 | 7 | import numpy as np 8 | 9 | from generative_models.timegan import timegan 10 | from data.amsterdam import AmsterdamLoader, preprocess_data, padding_mask_to_seq_lens 11 | 12 | # ---------------------------------------------------------------------------------------------------------------------- 13 | # Set experiment settings here: 14 | 15 | use_data = "amsterdam" 16 | use_model = "timegan" 17 | 18 | generated_data_dir = "./data/ts_generated/" 19 | 20 | amsterdam_data_settings = { 21 | "train_frac": 0.4, 22 | "val_frac": 0.2, 23 | "n_features": 70, 24 | "include_time": False, 25 | "max_timesteps": 100, 26 | "pad_val": -999., 27 | "data_split_seed": 12345, 28 | "data_loading_force_refresh": True, 29 | # -------------------- 30 | "data_path": "data/amsterdam/combined_downsampled_longitudinal_data.csv", 31 | } 32 | 33 | timegan_experiment_settings = { 34 | "model_params": { 35 | "module": "gru", 36 | "hidden_dim": 10, 37 | "num_layer": 3, 38 | "iterations": 1000, 39 | "batch_size": 128, 40 | "print_every_n_iters": 100, 41 | }, 42 | "generated_data_filename": "_timegan.npy" # NOTE: will be replaced with `use_data` value. 43 | } 44 | 45 | # ---------------------------------------------------------------------------------------------------------------------- 46 | # Utilities. 47 | 48 | def prepare_amsterdam(amsterdam_loader, settings): 49 | raw_data, padding_mask, (train_idx, val_idx, test_idx) = \ 50 | amsterdam_loader.load_reshape_split_data(force_refresh=settings["data_loading_force_refresh"]) 51 | processed_data, imputed_processed_data = preprocess_data( 52 | raw_data, 53 | padding_mask, 54 | padding_fill=settings["pad_val"], 55 | time_feature_included=settings["include_time"], 56 | ) 57 | seq_lens = padding_mask_to_seq_lens(padding_mask) 58 | return imputed_processed_data, seq_lens 59 | 60 | # ---------------------------------------------------------------------------------------------------------------------- 61 | 62 | def main(): 63 | 64 | if use_data == "amsterdam": 65 | active_data_settings = amsterdam_data_settings 66 | amsterdam_loader = AmsterdamLoader( 67 | data_path=os.path.abspath(active_data_settings["data_path"]), 68 | max_seq_len=active_data_settings["max_timesteps"], 69 | seed=active_data_settings["data_split_seed"], 70 | train_rate=active_data_settings["train_frac"], 71 | val_rate=active_data_settings["val_frac"], 72 | include_time=active_data_settings["include_time"], 73 | debug_data=False, 74 | pad_before=False, 75 | padding_fill=active_data_settings["pad_val"], 76 | ) 77 | if use_model == "timegan": 78 | # Timegan doesn't take variable-length sequences, use padding value of 0. 79 | amsterdam_loader.padding_fill = 0. 
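        # TimeGAN is trained on fixed-length, zero-padded arrays, hence the override of the
        # -999. sentinel above. A sketch of what the next call is assumed to return (based on
        # prepare_amsterdam above, not verified against the loader internals):
        #   original_data: np.ndarray of shape (n_samples, max_timesteps, n_features)
        #   seq_lens:      per-sample sequence lengths derived from the padding mask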
80 | original_data, seq_lens = prepare_amsterdam(amsterdam_loader=amsterdam_loader, settings=active_data_settings) 81 | 82 | if use_model == "timegan": 83 | active_experiment_settings = timegan_experiment_settings 84 | generated_data = timegan(ori_data=original_data, parameters=active_experiment_settings["model_params"]) 85 | 86 | generated_data_filepath = os.path.join( 87 | generated_data_dir, 88 | active_experiment_settings["generated_data_filename"].replace("", use_data)) 89 | np.save(generated_data_filepath, generated_data) 90 | print(f"Generative model: {use_model}, data: {use_data}\n" 91 | f"Generated and saved timeseries data of shape: {generated_data.shape}. File: {generated_data_filepath}.") 92 | 93 | 94 | if __name__ == "__main__": 95 | main() 96 | -------------------------------------------------------------------------------- /metrics/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedmalaa/evaluating-generative-models/093910e487d07959db7d87b54698da60aaeb50c0/metrics/__init__.py -------------------------------------------------------------------------------- /metrics/combined.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | 4 | Created on Fri Jan 15 15:52:59 2021 5 | 6 | @author: boris 7 | 8 | """ 9 | 10 | 11 | from metrics.feature_distribution import feature_distribution 12 | from metrics.compute_wd import compute_wd 13 | from metrics.compute_identifiability import compute_identifiability 14 | from metrics.fid import compute_frechet_distance 15 | from metrics.parzen import compute_parzen 16 | from metrics.precision_recall import compute_prc 17 | from metrics.prdc import compute_prdc 18 | from metrics.evaluation import compute_alpha_precision 19 | 20 | import torch 21 | import numpy as np 22 | 23 | if torch.cuda.is_available(): 24 | device = 'cuda' 25 | else: 26 | device = 'cpu' 27 | 28 | def compute_metrics(X, Y, which_metric=None, wd_params=None, model=None): 29 | results = {} 30 | emb_types = [''] 31 | 32 | if model is not None: 33 | emb_types.append('_OC') 34 | else: 35 | print('#####################!OC model not defined !##################') 36 | 37 | if wd_params is None: 38 | wd_params = dict() 39 | wd_params['iterations'] = 500 40 | wd_params['h_dim'] = 30 41 | wd_params['z_dim'] = 10 42 | wd_params['mb_size'] = 128 43 | 44 | if which_metric is None: 45 | which_metric = [['WD','FD', 'PRDC', 'OC'], # normal 46 | ['OC']] # additional OneClass 47 | 48 | for emb_index, emb in enumerate(emb_types): 49 | 50 | if emb_index == 1 and len(which_metric[1])>0: 51 | print('Computing metrics for OC embedding') 52 | print('Embedding data into OC representation') 53 | model.to(device) 54 | with torch.no_grad(): 55 | X = model(torch.tensor(X).float().to(device)).cpu().detach().numpy() 56 | Y = model(torch.tensor(Y).float().to(device)).cpu().detach().numpy() 57 | print('Done embedding') 58 | print('X, std X', np.mean(X), np.std(X)) 59 | print('Y, std Y', np.mean(Y), np.std(Y)) 60 | 61 | else: 62 | print('Computing metrics for no additional OneClass embedding') 63 | 64 | 65 | 66 | # (1) Marginal distributions 67 | if 'marg' in which_metric[emb_index]: 68 | 69 | print('Start computing marginal feature distributions') 70 | results[f'feat_dist{emb}'] = feature_distribution(X, Y) 71 | print('Finish computing feature distributions') 72 | print(results[f'feat_dist{emb}']) 73 | 74 | 75 | # (2) Wasserstein Distance (WD) 76 | if 'WD' in 
which_metric[emb_index]: 77 | print('Start computing Wasserstein Distance') 78 | results[f'wd_measure{emb}'] = compute_wd(X, Y, wd_params) 79 | print('WD measure: ',results[f'wd_measure{emb}']) 80 | 81 | 82 | # (3) Identifiability 83 | if 'ID' in which_metric[emb_index]: 84 | print('Start computing identifiability') 85 | results[f'identifiability{emb}'] = compute_identifiability(X, Y) 86 | print('Identifiability measure: ',results[f'identifiability{emb}']) 87 | 88 | 89 | # (4) Frechet distance 90 | if 'FD' in which_metric[emb_index] or 'FID' in which_metric[emb_index]: 91 | results[f'fid_value{emb}'] = compute_frechet_distance(X, Y) 92 | print('Frechet distance', results[f'fid_value{emb}']) 93 | print('Frechet distance/dim', results[f'fid_value{emb}']/Y.shape[-1]) 94 | 95 | 96 | # (5) Parzen 97 | if 'parzen' in which_metric[emb_index]: 98 | results[f'parzen_ll{emb}'], results[f'parzen_std{emb}'] = compute_parzen(X, Y, sigma=0.408) 99 | print(f'Parzen Log-Likelihood of test set = {results["parzen_ll"]}, se: {results["parzen_std"]}') 100 | 101 | 102 | # (6) Precision/Recall 103 | if 'PR' in which_metric[emb_index]: 104 | results[f'PR{emb}'] = compute_prc(X,Y) 105 | elif 'PRDC' in which_metric[emb_index]: 106 | print('Start computing P&R and D&C') 107 | prdc_res = compute_prdc(X,Y) 108 | for key in prdc_res: 109 | print('PRDC:', key, prdc_res[key]) 110 | results[key+emb] = prdc_res[key] 111 | 112 | # (7) OneClass 113 | if 'OC' in which_metric[emb_index]: 114 | if emb_index==1: 115 | emb_center = model.c 116 | else: 117 | emb_center = np.mean(X,axis=0) 118 | print('Start computing OC metrics') 119 | OC_res = compute_alpha_precision(X, Y, emb_center) 120 | alphas, alpha_precision_curve, beta_coverage_curve, Delta_precision_alpha, Delta_coverage_beta, authen = OC_res 121 | results[f'alphas{emb}'] = alphas 122 | results[f'alpha_pc{emb}'] = alpha_precision_curve 123 | results[f'beta_cv{emb}'] = beta_coverage_curve 124 | results[f'auten{emb}'] = authen 125 | results[f'Dpa{emb}'] = Delta_precision_alpha 126 | results[f'Dcb{emb}'] = Delta_coverage_beta 127 | results[f'Daut{emb}'] = np.mean(authen) 128 | print('OneClass: Delta_precision_alpha', results[f'Dpa{emb}']) 129 | print('OneClass: Delta_coverage_beta ', results[f'Dcb{emb}']) 130 | print('OneClass: Delta_autenticity ', results[f'Daut{emb}']) 131 | 132 | 133 | return results -------------------------------------------------------------------------------- /metrics/compute_identifiability.py: -------------------------------------------------------------------------------- 1 | """Anonymization through Data Synthesis using Generative Adversarial Networks: 2 | A harmonizing advancement for AI in medicine (ADS-GAN) Codebase. 3 | 4 | Reference: Jinsung Yoon, Lydia N. Drumright, Mihaela van der Schaar, 5 | "Anonymization through Data Synthesis using Generative Adversarial Networks (ADS-GAN): 6 | A harmonizing advancement for AI in medicine," 7 | IEEE Journal of Biomedical and Health Informatics (JBHI), 2019. 
8 | Paper link: https://ieeexplore.ieee.org/document/9034117 9 | Last updated Date: December 22nd 2020 10 | Code author: Jinsung Yoon (jsyoon0823@gmail.com) 11 | ----------------------------- 12 | compute_identifiability.py 13 | - Compare Identifiability between original data and synthetic data 14 | """ 15 | 16 | # Necessary packages 17 | import numpy as np 18 | from sklearn.neighbors import NearestNeighbors 19 | from scipy.stats import entropy 20 | 21 | # Function start 22 | def compute_identifiability (orig_data, synth_data): 23 | """Compute identifiability between original data and synthetic data. 24 | 25 | Args: 26 | orig_data: original data 27 | synth_data: synthetically generated data 28 | 29 | Returns: 30 | identifiability_value: fraction of original records that lie closer to a synthetic record than to their nearest other original record 31 | """ 32 | 33 | # Entropy computation 34 | def compute_entropy(labels): 35 | value,counts = np.unique(np.round(labels), return_counts=True) 36 | return entropy(counts) 37 | 38 | # Original data 39 | orig_data = np.asarray(orig_data) 40 | 41 | # Parameters 42 | no, x_dim = np.shape(orig_data) 43 | 44 | #%% Weights 45 | W = np.zeros([x_dim,]) 46 | 47 | for i in range(x_dim): 48 | W[i] = compute_entropy(orig_data[:,i]) 49 | 50 | # Normalization (NB: the entropy weights W are overridden with uniform weights below) 51 | orig_data_hat = orig_data.copy() 52 | synth_data_hat = synth_data.copy() 53 | 54 | eps = 0 #1e-16 55 | W = np.ones_like(W) 56 | 57 | for i in range(x_dim): 58 | orig_data_hat[:,i] = orig_data[:,i] * 1./(W[i]+eps) 59 | synth_data_hat[:,i] = synth_data[:,i] * 1./(W[i]+eps) 60 | 61 | #%% r_i computation 62 | nbrs = NearestNeighbors(n_neighbors = 2).fit(orig_data_hat) 63 | distance, _ = nbrs.kneighbors(orig_data_hat) 64 | 65 | # hat{r_i} computation 66 | nbrs_hat = NearestNeighbors(n_neighbors = 1).fit(synth_data_hat) 67 | distance_hat, _ = nbrs_hat.kneighbors(orig_data_hat) 68 | 69 | # See which one is bigger 70 | R_Diff = distance_hat[:,0] - distance[:,1] 71 | identifiability_value = np.sum(R_Diff<0) / float(no) 72 | 73 | return identifiability_value -------------------------------------------------------------------------------- /metrics/compute_wd.py: -------------------------------------------------------------------------------- 1 | """Anonymization through Data Synthesis using Generative Adversarial Networks: 2 | A harmonizing advancement for AI in medicine (ADS-GAN) Codebase. 3 | 4 | Reference: Jinsung Yoon, Lydia N. Drumright, Mihaela van der Schaar, 5 | "Anonymization through Data Synthesis using Generative Adversarial Networks (ADS-GAN): 6 | A harmonizing advancement for AI in medicine," 7 | IEEE Journal of Biomedical and Health Informatics (JBHI), 2019. 8 | Paper link: https://ieeexplore.ieee.org/document/9034117 9 | Last updated Date: December 22nd 2020 10 | Code author: Jinsung Yoon (jsyoon0823@gmail.com) 11 | Updated by: Boris van Breugel (bv292@cam.ac.uk) 12 | 13 | ----------------------------- 14 | compute_wd.py 15 | - Compare Wasserstein distance between original data and synthetic data 16 | """ 17 | 18 | import numpy as np 19 | import tensorflow as tf 20 | from tqdm import tqdm 21 | 22 | tf.compat.v1.disable_eager_execution() 23 | 24 | def compute_wd (orig_data, synth_data, params): 25 | """Compare Wasserstein distance between original data and synthetic data.
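    A small weight-clipped critic (WGAN-style discriminator) is trained on the first
    half of both datasets; the returned value is the critic's mean score gap on the
    held-out second halves, i.e. an estimate of the Wasserstein-1 distance.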
26 | 27 | Args: 28 | orig_data: original data 29 | synth_data: synthetically generated data 30 | params: Network parameters 31 | mb_size: mini-batch size 32 | h_dim: hidden state dimension 33 | iterations: training iterations 34 | 35 | Returns: 36 | WD_value: Wasserstein distance 37 | """ 38 | 39 | # Preprocess the data 40 | orig_data = np.asarray(orig_data) 41 | synth_data = np.asarray(synth_data) 42 | 43 | no, x_dim = np.shape(orig_data) 44 | 45 | # Divide train / test 46 | orig_data_train = orig_data[:int(no/2),:] 47 | orig_data_test = orig_data[int(no/2):,:] 48 | 49 | synth_data_train = synth_data[:int(no/2),:] 50 | synth_data_test = synth_data[int(no/2):,:] 51 | 52 | #%% Parameters 53 | # Batch size 54 | mb_size = params['mb_size'] 55 | # Hidden unit dimensions 56 | h_dim = int(params['h_dim']/2) 57 | # Train iterations 58 | iterations = params['iterations'] 59 | 60 | #%% Necessary Functions 61 | 62 | # Xavier Initialization Definition 63 | def xavier_init(size): 64 | in_dim = size[0] 65 | xavier_stddev = 1. / tf.sqrt(in_dim / 2.) 66 | return tf.random.normal(shape = size, stddev = xavier_stddev) 67 | 68 | # Sample from the real data 69 | def sample_X(m, n): 70 | return np.random.permutation(m)[:n] 71 | 72 | #%% Placeholder 73 | X = tf.compat.v1.placeholder(tf.float32, shape = [None, x_dim]) 74 | X_hat = tf.compat.v1.placeholder(tf.float32, shape = [None, x_dim]) 75 | 76 | #%% Discriminator 77 | # Discriminator 78 | D_W1 = tf.Variable(xavier_init([x_dim, h_dim])) 79 | D_b1 = tf.Variable(tf.zeros(shape=[h_dim])) 80 | 81 | D_W2 = tf.Variable(xavier_init([h_dim,1])) 82 | D_b2 = tf.Variable(tf.zeros(shape=[1])) 83 | 84 | theta_D = [D_W1, D_W2, D_b1, D_b2] 85 | 86 | def discriminator(x): 87 | D_h1 = tf.nn.relu(tf.matmul(x, D_W1) + D_b1) 88 | out = (tf.matmul(D_h1, D_W2) + D_b2) 89 | return out 90 | 91 | # Structure 92 | D_real = discriminator(X) 93 | D_fake = discriminator(X_hat) 94 | 95 | D_loss = tf.reduce_mean(input_tensor=D_real) - tf.reduce_mean(input_tensor=D_fake) 96 | 97 | D_solver = (tf.compat.v1.train.RMSPropOptimizer(learning_rate=1e-4) 98 | .minimize(-D_loss, var_list=theta_D)) 99 | 100 | clip_D = [p.assign(tf.clip_by_value(p, -0.1, 0.1)) for p in theta_D] 101 | 102 | #%% 103 | sess = tf.compat.v1.Session() 104 | sess.run(tf.compat.v1.global_variables_initializer()) 105 | 106 | # Iterations 107 | for it in tqdm(range(iterations)): 108 | 109 | X_idx = sample_X(int(no/2),mb_size) 110 | X_mb = orig_data_train[X_idx,:] 111 | X_hat_mb = synth_data_train[X_idx,:] 112 | 113 | _, D_loss_curr, _ = sess.run([D_solver, D_loss, clip_D], feed_dict = {X: X_mb, X_hat: X_hat_mb}) 114 | 115 | #%% Test 116 | WD_value = sess.run([D_loss], feed_dict = {X: orig_data_test, X_hat: synth_data_test}) 117 | 118 | return WD_value[0] -------------------------------------------------------------------------------- /metrics/evaluation.py: -------------------------------------------------------------------------------- 1 | 2 | # Copyright (c) 2021, Ahmed M. 
Alaa, Boris van Breugel 3 | # Licensed under the BSD 3-clause license (see LICENSE.txt) 4 | 5 | """ 6 | 7 | ----------------------------------------- 8 | Metrics implementation 9 | ----------------------------------------- 10 | 11 | """ 12 | 13 | from __future__ import absolute_import, division, print_function 14 | 15 | import numpy as np 16 | import sys 17 | from sklearn.neighbors import NearestNeighbors 18 | 19 | import logging 20 | import torch 21 | import scipy 22 | 23 | if not sys.warnoptions: 24 | import warnings 25 | warnings.simplefilter("ignore") 26 | 27 | device = 'cpu' # matrices are too big for gpu 28 | 29 | 30 | def compute_alpha_precision(real_data, synthetic_data, emb_center): 31 | 32 | 33 | emb_center = torch.tensor(emb_center, device=device) 34 | 35 | n_steps = 30 36 | nn_size = 2 37 | alphas = np.linspace(0, 1, n_steps) 38 | 39 | 40 | Radii = np.quantile(torch.sqrt(torch.sum((torch.tensor(real_data).float() - emb_center) ** 2, dim=1)), alphas) 41 | 42 | synth_center = torch.tensor(np.mean(synthetic_data, axis=0)).float() 43 | 44 | alpha_precision_curve = [] 45 | beta_coverage_curve = [] 46 | 47 | synth_to_center = torch.sqrt(torch.sum((torch.tensor(synthetic_data).float() - emb_center) ** 2, dim=1)) 48 | 49 | 50 | nbrs_real = NearestNeighbors(n_neighbors = 2, n_jobs=-1, p=2).fit(real_data) 51 | real_to_real, _ = nbrs_real.kneighbors(real_data) 52 | 53 | nbrs_synth = NearestNeighbors(n_neighbors = 1, n_jobs=-1, p=2).fit(synthetic_data) 54 | real_to_synth, real_to_synth_args = nbrs_synth.kneighbors(real_data) 55 | 56 | # Let us find closest real point to any real point, excluding itself (therefore 1 instead of 0) 57 | real_to_real = torch.from_numpy(real_to_real[:,1].squeeze()) 58 | real_to_synth = torch.from_numpy(real_to_synth.squeeze()) 59 | real_to_synth_args = real_to_synth_args.squeeze() 60 | 61 | real_synth_closest = synthetic_data[real_to_synth_args] 62 | 63 | real_synth_closest_d = torch.sqrt(torch.sum((torch.tensor(real_synth_closest).float()- synth_center) ** 2, dim=1)) 64 | closest_synth_Radii = np.quantile(real_synth_closest_d, alphas) 65 | 66 | 67 | 68 | for k in range(len(Radii)): 69 | precision_audit_mask = (synth_to_center <= Radii[k]).detach().float().numpy() 70 | alpha_precision = np.mean(precision_audit_mask) 71 | 72 | beta_coverage = np.mean(((real_to_synth <= real_to_real) * (real_synth_closest_d <= closest_synth_Radii[k])).detach().float().numpy()) 73 | 74 | alpha_precision_curve.append(alpha_precision) 75 | beta_coverage_curve.append(beta_coverage) 76 | 77 | 78 | # See which one is bigger 79 | 80 | authen = real_to_real[real_to_synth_args] < real_to_synth 81 | authenticity = np.mean(authen.numpy()) 82 | 83 | Delta_precision_alpha = 1 - 2 * np.sum(np.abs(np.array(alphas) - np.array(alpha_precision_curve))) * (alphas[1] - alphas[0]) 84 | Delta_coverage_beta = 1 - 2 * np.sum(np.abs(np.array(alphas) - np.array(beta_coverage_curve))) * (alphas[1] - alphas[0]) 85 | 86 | return alphas, alpha_precision_curve, beta_coverage_curve, Delta_precision_alpha, Delta_coverage_beta, authenticity 87 | -------------------------------------------------------------------------------- /metrics/evaluation_old.py: -------------------------------------------------------------------------------- 1 | 2 | # Copyright (c) 2021, Ahmed M. 
Alaa, Boris van Breugel 3 | # Licensed under the BSD 3-clause license (see LICENSE.txt) 4 | 5 | """ 6 | 7 | ----------------------------------------- 8 | Metrics implementation 9 | ----------------------------------------- 10 | 11 | """ 12 | 13 | from __future__ import absolute_import, division, print_function 14 | 15 | import numpy as np 16 | import sys 17 | from sklearn.neighbors import NearestNeighbors 18 | 19 | import logging 20 | import torch 21 | import scipy 22 | 23 | if not sys.warnoptions: 24 | import warnings 25 | warnings.simplefilter("ignore") 26 | 27 | device = 'cpu' # matrices are too big for gpu 28 | 29 | 30 | def compute_alpha_precision_old(real_data, synthetic_data, emb_center): 31 | n_steps = 30 32 | nn_size = 2 33 | alphas = np.linspace(0, 1, 30) 34 | Radii = [np.quantile(torch.sqrt(torch.sum((torch.tensor(real_data).float() - emb_center) ** 2, dim=1)), alphas[k]) for k in range(len(alphas))] 35 | 36 | synth_center = torch.tensor(np.mean(synthetic_data, axis=0)).float() 37 | synth_Radii = [np.quantile(torch.sqrt(torch.sum((torch.tensor(synthetic_data).float() - synth_center) ** 2, dim=1)), alphas[k]) for k in range(len(alphas))] 38 | 39 | alpha_precision_curve = [] 40 | beta_coverage_curve = [] 41 | 42 | synth_to_center = torch.sqrt(torch.sum((torch.tensor(synthetic_data).float() - emb_center) ** 2, dim=1)) 43 | synth_to_synth_center = torch.sqrt(torch.sum((torch.tensor(synthetic_data).float() - synth_center) ** 2, dim=1)) 44 | real_to_center = torch.sqrt(torch.sum((torch.tensor(real_data).float() - emb_center) ** 2, dim=1)) 45 | 46 | real_to_synth = [np.min(np.sum(np.abs(real_data[k, :] - synthetic_data), axis=1)) for k in range(real_data.shape[0])] 47 | real_to_synth_args = [np.argmin(np.sum(np.abs(real_data[k, :] - synthetic_data), axis=1)) for k in range(real_data.shape[0])] 48 | real_to_synth = torch.tensor(np.array(real_to_synth)).float() 49 | real_synth_closest = np.array([synthetic_data[real_to_synth_args[k], :] for k in range(len(real_to_synth_args))]) 50 | 51 | closest_synth_Radii = [np.quantile(torch.sqrt(torch.sum((torch.tensor(real_synth_closest).float() - synth_center) ** 2, dim=1)), alphas[k]) for k in range(len(alphas))] 52 | real_synth_closest_d = torch.sqrt(torch.sum((torch.tensor(real_synth_closest).float()- synth_center) ** 2, dim=1)) 53 | 54 | real_to_real = [np.partition(np.sum(np.abs(real_data[k, :] - real_data), axis=1), nn_size)[nn_size-1] for k in range(real_data.shape[0])] 55 | real_to_real = torch.tensor(np.array(real_to_real)).float() 56 | 57 | real_to_synth_all = [np.min(np.sum(np.abs(real_data[k, :] - synthetic_data), axis=1)) for k in range(real_data.shape[0])] 58 | real_to_real_all = np.array([np.sum(np.abs(real_data[k, :] - real_data), axis=1) for k in range(real_data.shape[0])]) 59 | dist_probs = [1/np.mean(real_to_synth_all[k] <= real_to_real_all[k, :]) for k in range(real_data.shape[0])] 60 | 61 | for k in range(len(Radii)): 62 | 63 | precision_audit_mask = (synth_to_center <= Radii[k]).detach().float().numpy() 64 | alpha_precision = np.mean(precision_audit_mask) 65 | 66 | beta_coverage = np.mean(((real_to_synth <= real_to_real) * (real_synth_closest_d <= closest_synth_Radii[k])).detach().float().numpy()) 67 | 68 | alpha_precision_curve.append(alpha_precision) 69 | beta_coverage_curve.append(beta_coverage) 70 | 71 | 72 | Delta_precision_alpha = 1 - 2 * np.sum(np.abs(np.array(alphas) - np.array(alpha_precision_curve))) * (alphas[1] - alphas[0]) 73 | Delta_coverage_beta = 1 - 2 * np.sum(np.abs(np.array(alphas) - 
np.array(beta_coverage_curve))) * (alphas[1] - alphas[0]) 74 | 75 | dist_ps = np.array(dist_probs) 76 | dist_min = np.min(dist_ps) 77 | dist_max = np.max(dist_ps) 78 | 79 | thresholds = np.linspace(dist_min, dist_max, 1000) 80 | authen = np.array([np.mean(dist_ps >= thresholds[k]) for k in range(len(thresholds))]) 81 | 82 | return alphas, alpha_precision_curve, beta_coverage_curve, Delta_precision_alpha, Delta_coverage_beta, (thresholds, authen) -------------------------------------------------------------------------------- /metrics/feature_distribution.py: -------------------------------------------------------------------------------- 1 | """Anonymization through Data Synthesis using Generative Adversarial Networks: 2 | A harmonizing advancement for AI in medicine (ADS-GAN) Codebase. 3 | 4 | Reference: Jinsung Yoon, Lydia N. Drumright, Mihaela van der Schaar, 5 | "Anonymization through Data Synthesis using Generative Adversarial Networks (ADS-GAN): 6 | A harmonizing advancement for AI in medicine," 7 | IEEE Journal of Biomedical and Health Informatics (JBHI), 2019. 8 | Paper link: https://ieeexplore.ieee.org/document/9034117 9 | Last updated Date: December 22th 2020 10 | Code author: Jinsung Yoon (jsyoon0823@gmail.com) 11 | ----------------------------- 12 | feature_distribution.py 13 | - Compare feature distribution between original data and synthetic data 14 | """ 15 | 16 | # Import necessary packages 17 | import numpy as np 18 | 19 | def feature_distribution (orig_data, synth_data): 20 | """Compare feature distribution between orig data and synth data 21 | 22 | Args: 23 | orig_data: original data 24 | synth_data: synthetically generated data 25 | 26 | Returns: 27 | dist_comp_table: distribution comparison table 28 | """ 29 | 30 | orig_data = np.asarray(orig_data) 31 | 32 | # Parameters 33 | no, dim = np.shape(orig_data) 34 | 35 | # Output initialization 36 | dist_comp_table = np.zeros([dim, 4]) 37 | 38 | for i in range(dim): 39 | 40 | if len(np.unique(orig_data[:, i])) > 2: 41 | dist_comp_table[i,0] = np.mean(synth_data[:,i]) 42 | dist_comp_table[i,1] = np.std(synth_data[:,i]) 43 | 44 | dist_comp_table[i,2] = np.mean(orig_data[:,i]) 45 | dist_comp_table[i,3] = np.std(orig_data[:,i]) 46 | 47 | else: 48 | dist_comp_table[i,0] = np.sum(synth_data[:,i]==1) 49 | dist_comp_table[i,1] = np.sum(synth_data[:,i]==1) / float(no) 50 | 51 | dist_comp_table[i,2] = np.sum(orig_data[:,i]==1) 52 | dist_comp_table[i,3] = np.sum(orig_data[:,i]==1) / float(no) 53 | 54 | return dist_comp_table 55 | -------------------------------------------------------------------------------- /metrics/fid.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | ''' Calculates the Frechet Inception Distance (FID) to evalulate GANs. 3 | 4 | Paper: GANs Trained by a Two Time-Scale Update Rule Converge to a Local Nash Equilibrium. 5 | Code derived from https://github.com/bioinf-jku/TTUR 6 | 7 | The FID metric calculates the distance between two distributions of images. 8 | Typically, we have summary statistics (mean & covariance matrix) of one 9 | of these distributions, while the 2nd distribution is given by a GAN. 10 | 11 | When run as a stand-alone program, it compares the distribution of 12 | images that are stored as PNG/JPEG at a specified location with a 13 | distribution given by summary statistics (in pickle format). 
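    In this repository the same computation is applied to tabular feature matrices and
    one-class embeddings rather than Inception activations: compute_frechet_distance(X1, X2)
    below fits a Gaussian (mean and covariance) to each matrix and evaluates the
    closed-form Frechet distance between the two fits.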
14 | 15 | The FID is calculated by assuming that X_1 and X_2 are the activations of 16 | the pool_3 layer of the inception net for generated samples and real world 17 | samples respectivly. 18 | 19 | See --help to see further details. 20 | ''' 21 | 22 | from __future__ import absolute_import, division, print_function 23 | import numpy as np 24 | from scipy import linalg 25 | import warnings 26 | 27 | def compute_frechet_distance(X1, X2): 28 | """ 29 | Frechet distance between two datasets that are both assumed Gaussian 30 | 31 | 32 | """ 33 | mu1, cov1 = fit_gaussian(X1) 34 | mu2, cov2 = fit_gaussian(X2) 35 | return calculate_frechet_distance(mu1,cov1,mu2,cov2) 36 | 37 | def fit_gaussian(act): 38 | """Calculation of the statistics used by the FID. 39 | Params: 40 | -- act : activations 41 | Returns: 42 | -- mu : The mean over samples of the activations 43 | -- sigma : The covariance matrix of the activations 44 | """ 45 | mu = np.mean(act, axis=0) 46 | sigma = np.cov(act.T) 47 | return mu, sigma 48 | 49 | 50 | def calculate_frechet_distance(mu1, sigma1, mu2, sigma2, eps=1e-6): 51 | """Numpy implementation of the Frechet Distance. 52 | The Frechet distance between two multivariate Gaussians X_1 ~ N(mu_1, C_1) 53 | and X_2 ~ N(mu_2, C_2) is 54 | d^2 = ||mu_1 - mu_2||^2 + Tr(C_1 + C_2 - 2*sqrt(C_1*C_2)). 55 | 56 | Stable version by Dougal J. Sutherland. 57 | 58 | Params: 59 | -- mu1 : Numpy array containing the activations of the pool_3 layer of the 60 | inception net ( like returned by the function 'get_predictions') 61 | for generated samples. 62 | -- mu2 : The sample mean over activations of the pool_3 layer, precalcualted 63 | on an representive data set. 64 | -- sigma1: The covariance matrix over activations of the pool_3 layer for 65 | generated samples. 66 | -- sigma2: The covariance matrix over activations of the pool_3 layer, 67 | precalcualted on an representive data set. 68 | 69 | Returns: 70 | -- : The Frechet Distance. 71 | """ 72 | 73 | mu1 = np.atleast_1d(mu1) 74 | mu2 = np.atleast_1d(mu2) 75 | 76 | sigma1 = np.atleast_2d(sigma1) 77 | sigma2 = np.atleast_2d(sigma2) 78 | 79 | assert mu1.shape == mu2.shape, "Training and test mean vectors have different lengths" 80 | assert sigma1.shape == sigma2.shape, "Training and test covariances have different dimensions" 81 | 82 | diff = mu1 - mu2 83 | 84 | # product might be almost singular 85 | covmean, _ = linalg.sqrtm(sigma1.dot(sigma2), disp=False) 86 | if not np.isfinite(covmean).all(): 87 | msg = "fid calculation produces singular product; adding %s to diagonal of cov estimates" % eps 88 | warnings.warn(msg) 89 | offset = np.eye(sigma1.shape[0]) * eps 90 | covmean = linalg.sqrtm((sigma1 + offset).dot(sigma2 + offset)) 91 | 92 | # numerical error might give slight imaginary component 93 | if np.iscomplexobj(covmean): 94 | if not np.allclose(np.diagonal(covmean).imag, 0, atol=2e-3): 95 | m = np.max(np.abs(covmean.imag)) 96 | raise ValueError("Imaginary component {}".format(m)) 97 | covmean = covmean.real 98 | 99 | tr_covmean = np.trace(covmean) 100 | 101 | return diff.dot(diff) + np.trace(sigma1) + np.trace(sigma2) - 2 * tr_covmean 102 | -------------------------------------------------------------------------------- /metrics/improved_precision_recall.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # This work is licensed under the Creative Commons Attribution-NonCommercial 4 | # 4.0 International License. 
To view a copy of this license, visit 5 | # http://creativecommons.org/licenses/by-nc/4.0/ or send a letter to 6 | # Creative Commons, PO Box 1866, Mountain View, CA 94042, USA. 7 | 8 | """k-NN precision and recall. 9 | Taken from https://github.com/kynkaat/improved-precision-and-recall-metric/ 10 | Paper: https://arxiv.org/pdf/1904.06991.pdf 11 | 12 | 13 | """ 14 | 15 | import numpy as np 16 | import tensorflow as tf 17 | from time import time 18 | 19 | #---------------------------------------------------------------------------- 20 | 21 | def batch_pairwise_distances(U, V): 22 | """Compute pairwise distances between two batches of feature vectors.""" 23 | with tf.compat.v1.variable_scope('pairwise_dist_block'): 24 | # Squared norms of each row in U and V. 25 | norm_u = tf.reduce_sum(tf.square(U), 1) 26 | norm_v = tf.reduce_sum(tf.square(V), 1) 27 | 28 | # norm_u as a column and norm_v as a row vectors. 29 | norm_u = tf.reshape(norm_u, [-1, 1]) 30 | norm_v = tf.reshape(norm_v, [1, -1]) 31 | 32 | # Pairwise squared Euclidean distances. 33 | D = tf.maximum(norm_u - 2*tf.matmul(U, V, False, True) + norm_v, 0.0) 34 | 35 | return D 36 | 37 | #---------------------------------------------------------------------------- 38 | 39 | class DistanceBlock(): 40 | """Provides multi-GPU support to calculate pairwise distances between two batches of feature vectors.""" 41 | def __init__(self, num_features, num_gpus): 42 | self.num_features = num_features 43 | self.num_gpus = num_gpus 44 | 45 | # Initialize TF graph to calculate pairwise distances. 46 | with tf.device('/cpu:0'): 47 | self._features_batch1 = tf.compat.v1.placeholder(tf.float16, shape=[None, self.num_features]) 48 | self._features_batch2 = tf.compat.v1.placeholder(tf.float16, shape=[None, self.num_features]) 49 | features_split2 = tf.split(self._features_batch2, self.num_gpus, axis=0) 50 | distances_split = [] 51 | for gpu_idx in range(self.num_gpus): 52 | with tf.device('/gpu:%d' % gpu_idx): 53 | distances_split.append(batch_pairwise_distances(self._features_batch1, features_split2[gpu_idx])) 54 | self._distance_block = tf.concat(distances_split, axis=1) 55 | 56 | def pairwise_distances(self, U, V): 57 | """Evaluate pairwise distances between two batches of feature vectors.""" 58 | return self._distance_block.eval(feed_dict={self._features_batch1: U, self._features_batch2: V}) 59 | 60 | #---------------------------------------------------------------------------- 61 | 62 | class ManifoldEstimator(): 63 | """Estimates the manifold of given feature vectors.""" 64 | 65 | def __init__(self, distance_block, features, row_batch_size=25000, col_batch_size=50000, 66 | nhood_sizes=[3], clamp_to_percentile=None, eps=1e-5): 67 | """Estimate the manifold of given feature vectors. 68 | 69 | Args: 70 | distance_block: DistanceBlock object that distributes pairwise distance 71 | calculation to multiple GPUs. 72 | features (np.array/tf.Tensor): Matrix of feature vectors to estimate their manifold. 73 | row_batch_size (int): Row batch size to compute pairwise distances 74 | (parameter to trade-off between memory usage and performance). 75 | col_batch_size (int): Column batch size to compute pairwise distances. 76 | nhood_sizes (list): Number of neighbors used to estimate the manifold. 77 | clamp_to_percentile (float): Prune hyperspheres that have radius larger than 78 | the given percentile. 79 | eps (float): Small number for numerical stability. 
80 | """ 81 | num_images = features.shape[0] 82 | self.nhood_sizes = nhood_sizes 83 | self.num_nhoods = len(nhood_sizes) 84 | self.eps = eps 85 | self.row_batch_size = row_batch_size 86 | self.col_batch_size = col_batch_size 87 | self._ref_features = features 88 | self._distance_block = distance_block 89 | 90 | # Estimate manifold of features by calculating distances to k-NN of each sample. 91 | self.D = np.zeros([num_images, self.num_nhoods], dtype=np.float16) 92 | distance_batch = np.zeros([row_batch_size, num_images], dtype=np.float16) 93 | seq = np.arange(max(self.nhood_sizes) + 1, dtype=np.int32) 94 | 95 | for begin1 in range(0, num_images, row_batch_size): 96 | end1 = min(begin1 + row_batch_size, num_images) 97 | row_batch = features[begin1:end1] 98 | 99 | for begin2 in range(0, num_images, col_batch_size): 100 | end2 = min(begin2 + col_batch_size, num_images) 101 | col_batch = features[begin2:end2] 102 | 103 | # Compute distances between batches. 104 | distance_batch[0:end1-begin1, begin2:end2] = self._distance_block.pairwise_distances(row_batch, col_batch) 105 | 106 | # Find the k-nearest neighbor from the current batch. 107 | self.D[begin1:end1, :] = np.partition(distance_batch[0:end1-begin1, :], seq, axis=1)[:, self.nhood_sizes] 108 | 109 | if clamp_to_percentile is not None: 110 | max_distances = np.percentile(self.D, clamp_to_percentile, axis=0) 111 | self.D[self.D > max_distances] = 0 112 | 113 | def evaluate(self, eval_features, return_realism=False, return_neighbors=False): 114 | """Evaluate if new feature vectors are at the manifold.""" 115 | num_eval_images = eval_features.shape[0] 116 | num_ref_images = self.D.shape[0] 117 | distance_batch = np.zeros([self.row_batch_size, num_ref_images], dtype=np.float32) 118 | batch_predictions = np.zeros([num_eval_images, self.num_nhoods], dtype=np.int32) 119 | max_realism_score = np.zeros([num_eval_images,], dtype=np.float32) 120 | nearest_indices = np.zeros([num_eval_images,], dtype=np.int32) 121 | 122 | for begin1 in range(0, num_eval_images, self.row_batch_size): 123 | end1 = min(begin1 + self.row_batch_size, num_eval_images) 124 | feature_batch = eval_features[begin1:end1] 125 | 126 | for begin2 in range(0, num_ref_images, self.col_batch_size): 127 | end2 = min(begin2 + self.col_batch_size, num_ref_images) 128 | ref_batch = self._ref_features[begin2:end2] 129 | 130 | distance_batch[0:end1-begin1, begin2:end2] = self._distance_block.pairwise_distances(feature_batch, ref_batch) 131 | 132 | # From the minibatch of new feature vectors, determine if they are in the estimated manifold. 133 | # If a feature vector is inside a hypersphere of some reference sample, then 134 | # the new sample lies at the estimated manifold. 135 | # The radii of the hyperspheres are determined from distances of neighborhood size k. 
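                # Shape sketch of the check below: distance_batch[0:end1-begin1, :, None] has
                # shape (batch, num_ref_images, 1) and self.D has shape (num_ref_images, num_nhoods),
                # so the comparison broadcasts to (batch, num_ref_images, num_nhoods); np.any over
                # axis=1 then flags an eval sample as "in the manifold" if it falls inside at least
                # one reference hypersphere for the given neighbourhood size.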
136 | samples_in_manifold = distance_batch[0:end1-begin1, :, None] <= self.D 137 | batch_predictions[begin1:end1] = np.any(samples_in_manifold, axis=1).astype(np.int32) 138 | 139 | max_realism_score[begin1:end1] = np.max(self.D[:, 0] / (distance_batch[0:end1-begin1, :] + self.eps), axis=1) 140 | nearest_indices[begin1:end1] = np.argmin(distance_batch[0:end1-begin1, :], axis=1) 141 | 142 | if return_realism and return_neighbors: 143 | return batch_predictions, max_realism_score, nearest_indices 144 | elif return_realism: 145 | return batch_predictions, max_realism_score 146 | elif return_neighbors: 147 | return batch_predictions, nearest_indices 148 | 149 | return batch_predictions 150 | 151 | #---------------------------------------------------------------------------- 152 | 153 | def knn_precision_recall_features(ref_features, eval_features, nhood_sizes=[3], 154 | row_batch_size=25000, col_batch_size=50000, num_gpus=1): 155 | """Calculates k-NN precision and recall for two sets of feature vectors. 156 | 157 | Args: 158 | ref_features (np.array/tf.Tensor): Feature vectors of reference images. 159 | eval_features (np.array/tf.Tensor): Feature vectors of generated images. 160 | nhood_sizes (list): Number of neighbors used to estimate the manifold. 161 | row_batch_size (int): Row batch size to compute pairwise distances 162 | (parameter to trade-off between memory usage and performance). 163 | col_batch_size (int): Column batch size to compute pairwise distances. 164 | num_gpus (int): Number of GPUs used to evaluate precision and recall. 165 | 166 | Returns: 167 | State (dict): Dict that contains precision and recall calculated from 168 | ref_features and eval_features. 169 | """ 170 | state = dict() 171 | num_images = ref_features.shape[0] 172 | num_features = ref_features.shape[1] 173 | 174 | # Initialize DistanceBlock and ManifoldEstimators. 175 | distance_block = DistanceBlock(num_features, num_gpus) 176 | ref_manifold = ManifoldEstimator(distance_block, ref_features, row_batch_size, col_batch_size, nhood_sizes) 177 | eval_manifold = ManifoldEstimator(distance_block, eval_features, row_batch_size, col_batch_size, nhood_sizes) 178 | 179 | # Evaluate precision and recall using k-nearest neighbors. 180 | print('Evaluating k-NN precision and recall with %i samples...' % num_images) 181 | start = time() 182 | 183 | # Precision: How many points from eval_features are in ref_features manifold. 184 | precision = ref_manifold.evaluate(eval_features) 185 | state['precision'] = precision.mean(axis=0) 186 | 187 | # Recall: How many points from ref_features are in eval_features manifold. 188 | recall = eval_manifold.evaluate(ref_features) 189 | state['recall'] = recall.mean(axis=0) 190 | 191 | print('Evaluated k-NN precision and recall in: %gs' % (time() - start)) 192 | 193 | return state['precision'], state['recall'] 194 | 195 | #---------------------------------------------------------------------------- -------------------------------------------------------------------------------- /metrics/parzen.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Parzen window loglikelihood estimate, 4 | Breuleux, O., Bengio, Y., and Vincent, P. (2011). Quickly generating representative samples from an 5 | RBM-derived process. Neural Computation, 23(8), 2053–2073. 
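    In compute_parzen below, a Gaussian kernel density estimate is fitted to the synthetic
    samples and the reported score is the mean log-likelihood (and its standard error) of the
    original samples under that estimate; the bandwidth sigma is cross-validated on a held-out
    split when it is not supplied.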
6 | 7 | 8 | Original code author: Yann N.Dauphin and Ian Goodfellow 9 | https://github.com/goodfeli/adversarial/blob/master/parzen_ll.py 10 | Modified by Boris van Breugel (bv292@cam.ac.uk) 11 | 12 | """ 13 | 14 | import numpy as np 15 | import theano.tensor as T 16 | import theano 17 | from tqdm import tqdm 18 | 19 | 20 | def get_nll(x, parzen, batch_size=10): 21 | """ 22 | Credit: Yann N. Dauphin 23 | """ 24 | 25 | inds = range(x.shape[0]) 26 | n_batches = int(np.ceil(float(len(inds)) / batch_size)) 27 | 28 | nlls = [] 29 | for i in range(n_batches): 30 | nll = parzen(x[inds[i::n_batches]]) 31 | nlls.extend(nll) 32 | 33 | return np.array(nlls) 34 | 35 | 36 | def log_mean_exp(a): 37 | """ 38 | Credit: Yann N. Dauphin 39 | """ 40 | 41 | max_ = a.max(1) 42 | 43 | return max_ + T.log(T.exp(a - max_.dimshuffle(0, 'x')).mean(1)) 44 | 45 | 46 | def theano_parzen(mu, sigma): 47 | """ 48 | Credit: Yann N. Dauphin 49 | """ 50 | 51 | x = T.matrix() 52 | mu = theano.shared(mu) 53 | a = ( x.dimshuffle(0, 'x', 1) - mu.dimshuffle('x', 0, 1) ) / sigma 54 | E = log_mean_exp(-0.5*(a**2).sum(2)) 55 | Z = mu.shape[1] * T.log(sigma * np.sqrt(np.pi * 2)) 56 | 57 | return theano.function([x], E - Z) 58 | 59 | 60 | def cross_validate_sigma(samples, data, sigmas, batch_size): 61 | 62 | lls = [] 63 | for sigma in tqdm(sigmas): 64 | print(sigma) 65 | parzen = theano_parzen(samples, sigma) 66 | tmp = get_nll(data, parzen, batch_size = batch_size) 67 | lls.append(np.asarray(tmp).mean()) 68 | del parzen 69 | 70 | ind = np.argmax(lls) 71 | return sigmas[ind] 72 | 73 | 74 | def compute_parzen(orig_data, synth_data, sigma=None, start_sigma=-0.5, end_sigma=0.5, num_cv_evals=10, batch_size = 10): 75 | # Preprocess the data 76 | orig_data = np.asarray(orig_data) 77 | synth_data = np.asarray(synth_data) 78 | 79 | no, x_dim = np.shape(orig_data) 80 | 81 | 82 | 83 | if sigma is None: 84 | # Divide train / test 85 | orig_data_valid = orig_data[:int(no/5),:] 86 | orig_data_test = orig_data[int(no/5):,:] 87 | 88 | synth_data_valid = synth_data[:int(no/5),:] 89 | synth_data_test = synth_data[int(no/5):,:] 90 | sigma_range = np.logspace(start_sigma, end_sigma, num=num_cv_evals) 91 | sigma = cross_validate_sigma(synth_data_valid, orig_data_valid, sigma_range, batch_size) 92 | else: 93 | orig_data_test = orig_data 94 | synth_data_test = synth_data 95 | # fit and evaluate 96 | print('Using Sigma:', sigma) 97 | parzen = theano_parzen(synth_data_test, sigma) 98 | ll = get_nll(orig_data_test, parzen, batch_size = batch_size) 99 | se = ll.std() / np.sqrt(orig_data_test.shape[0]) 100 | 101 | return ll.mean(), se -------------------------------------------------------------------------------- /metrics/prd_score.py: -------------------------------------------------------------------------------- 1 | # - 2 | # coding=utf-8 3 | # Taken from: 4 | # https://github.com/google/compare_gan/blob/master/compare_gan/src/prd_score.py 5 | # 6 | # Changes: 7 | # - default dpi changed from 150 to 300 8 | # - added handling of cases where P = Q, where precision/recall may be 9 | # just above 1, leading to errors for the f_beta computation 10 | # 11 | # Copyright 2018 Google LLC & Hwalsuk Lee. 12 | # 13 | # Licensed under the Apache License, Version 2.0 (the "License"); 14 | # you may not use this file except in compliance with the License. 
15 | # You may obtain a copy of the License at 16 | # 17 | # http://www.apache.org/licenses/LICENSE-2.0 18 | # 19 | # Unless required by applicable law or agreed to in writing, software 20 | # distributed under the License is distributed on an "AS IS" BASIS, 21 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 22 | # See the License for the specific language governing permissions and 23 | # limitations under the License. 24 | 25 | """Precision and recall computation based on samples from two distributions. 26 | 27 | Given a sample from the true and the fake distribution embedded in some feature 28 | space (say, Inception), it computes the precision and recall via the algorithm 29 | presented in [arxiv.org/abs/1806.00035]. Finally, one can plot the resulting 30 | curves for different models. 31 | 32 | Typical usage example: 33 | 34 | import prd 35 | prd_data_1 = prd.compute_prd_from_embedding(eval_feats_1, ref_feats_1) 36 | prd_data_2 = prd.compute_prd_from_embedding(eval_feats_2, ref_feats_2) 37 | prd.plot([prd_data_1, prd_data_2], ['GAN_1', 'GAN_2']) 38 | """ 39 | 40 | from __future__ import absolute_import 41 | from __future__ import division 42 | from __future__ import print_function 43 | 44 | from matplotlib import pyplot as plt 45 | import numpy as np 46 | import sklearn.cluster 47 | 48 | 49 | def compute_prd(eval_dist, ref_dist, num_angles=1001, epsilon=1e-10): 50 | """Computes the PRD curve for discrete distributions. 51 | 52 | This function computes the PRD curve for the discrete distribution eval_dist 53 | with respect to the reference distribution ref_dist. This implements the 54 | algorithm in [arxiv.org/abs/1806.2281349]. The PRD will be computed for an 55 | equiangular grid of num_angles values between [0, pi/2]. 56 | 57 | Args: 58 | eval_dist: 1D NumPy array or list of floats with the probabilities of the 59 | different states under the distribution to be evaluated. 60 | ref_dist: 1D NumPy array or list of floats with the probabilities of the 61 | different states under the reference distribution. 62 | num_angles: Number of angles for which to compute PRD. Must be in [3, 1e6]. 63 | The default value is 1001. 64 | epsilon: Angle for PRD computation in the edge cases 0 and pi/2. The PRD 65 | will be computes for epsilon and pi/2-epsilon, respectively. 66 | The default value is 1e-10. 67 | 68 | Returns: 69 | precision: NumPy array of shape [num_angles] with the precision for the 70 | different ratios. 71 | recall: NumPy array of shape [num_angles] with the recall for the different 72 | ratios. 73 | 74 | Raises: 75 | ValueError: If not 0 < epsilon <= 0.1. 76 | ValueError: If num_angles < 3. 77 | """ 78 | 79 | if not (epsilon > 0 and epsilon < 0.1): 80 | raise ValueError('epsilon must be in (0, 0.1] but is %s.' % str(epsilon)) 81 | if not (num_angles >= 3 and num_angles <= 1e6): 82 | raise ValueError('num_angles must be in [3, 1e6] but is %d.' 
% num_angles) 83 | 84 | # Compute slopes for linearly spaced angles between [0, pi/2] 85 | angles = np.linspace(epsilon, np.pi/2 - epsilon, num=num_angles) 86 | slopes = np.tan(angles) 87 | 88 | # Broadcast slopes so that second dimension will be states of the distribution 89 | slopes_2d = np.expand_dims(slopes, 1) 90 | 91 | # Broadcast distributions so that first dimension represents the angles 92 | ref_dist_2d = np.expand_dims(ref_dist, 0) 93 | eval_dist_2d = np.expand_dims(eval_dist, 0) 94 | 95 | # Compute precision and recall for all angles in one step via broadcasting 96 | precision = np.minimum(ref_dist_2d*slopes_2d, eval_dist_2d).sum(axis=1) 97 | recall = precision / slopes 98 | 99 | # handle numerical instabilities leaing to precision/recall just above 1 100 | max_val = max(np.max(precision), np.max(recall)) 101 | if max_val > 1.001: 102 | raise ValueError('Detected value > 1.001, this should not happen.') 103 | precision = np.clip(precision, 0, 1) 104 | recall = np.clip(recall, 0, 1) 105 | 106 | return precision, recall 107 | 108 | 109 | def _cluster_into_bins(eval_data, ref_data, num_clusters): 110 | """Clusters the union of the data points and returns the cluster distribution. 111 | 112 | Clusters the union of eval_data and ref_data into num_clusters using minibatch 113 | k-means. Then, for each cluster, it computes the number of points from 114 | eval_data and ref_data. 115 | 116 | Args: 117 | eval_data: NumPy array of data points from the distribution to be evaluated. 118 | ref_data: NumPy array of data points from the reference distribution. 119 | num_clusters: Number of cluster centers to fit. 120 | 121 | Returns: 122 | Two NumPy arrays, each of size num_clusters, where i-th entry represents the 123 | number of points assigned to the i-th cluster. 124 | """ 125 | 126 | cluster_data = np.vstack([eval_data, ref_data]) 127 | kmeans = sklearn.cluster.MiniBatchKMeans(n_clusters=num_clusters, n_init=10) 128 | labels = kmeans.fit(cluster_data).labels_ 129 | 130 | eval_labels = labels[:len(eval_data)] 131 | ref_labels = labels[len(eval_data):] 132 | 133 | eval_bins = np.histogram(eval_labels, bins=num_clusters, 134 | range=[0, num_clusters], density=True)[0] 135 | ref_bins = np.histogram(ref_labels, bins=num_clusters, 136 | range=[0, num_clusters], density=True)[0] 137 | return eval_bins, ref_bins 138 | 139 | 140 | def compute_prd_from_embedding(eval_data, ref_data, num_clusters=20, 141 | num_angles=1001, num_runs=10, 142 | enforce_balance=True): 143 | """Computes PRD data from sample embeddings. 144 | 145 | The points from both distributions are mixed and then clustered. This leads 146 | to a pair of histograms of discrete distributions over the cluster centers 147 | on which the PRD algorithm is executed. 148 | 149 | The number of points in eval_data and ref_data must be equal since 150 | unbalanced distributions bias the clustering towards the larger dataset. The 151 | check can be disabled by setting the enforce_balance flag to False (not 152 | recommended). 153 | 154 | Args: 155 | eval_data: NumPy array of data points from the distribution to be evaluated. 156 | ref_data: NumPy array of data points from the reference distribution. 157 | num_clusters: Number of cluster centers to fit. The default value is 20. 158 | num_angles: Number of angles for which to compute PRD. Must be in [3, 1e6]. 159 | The default value is 1001. 160 | num_runs: Number of independent runs over which to average the PRD data. 
161 | enforce_balance: If enabled, throws exception if eval_data and ref_data do 162 | not have the same length. The default value is True. 163 | 164 | Returns: 165 | precision: NumPy array of shape [num_angles] with the precision for the 166 | different ratios. 167 | recall: NumPy array of shape [num_angles] with the recall for the different 168 | ratios. 169 | 170 | Raises: 171 | ValueError: If len(eval_data) != len(ref_data) and enforce_balance is set to 172 | True. 173 | """ 174 | 175 | if enforce_balance and len(eval_data) != len(ref_data): 176 | raise ValueError( 177 | 'The number of points in eval_data %d is not equal to the number of ' 178 | 'points in ref_data %d. To disable this exception, set enforce_balance ' 179 | 'to False (not recommended).' % (len(eval_data), len(ref_data))) 180 | 181 | eval_data = np.array(eval_data, dtype=np.float64) 182 | ref_data = np.array(ref_data, dtype=np.float64) 183 | precisions = [] 184 | recalls = [] 185 | for _ in range(num_runs): 186 | eval_dist, ref_dist = _cluster_into_bins(eval_data, ref_data, num_clusters) 187 | precision, recall = compute_prd(eval_dist, ref_dist, num_angles) 188 | precisions.append(precision) 189 | recalls.append(recall) 190 | precision = np.mean(precisions, axis=0) 191 | recall = np.mean(recalls, axis=0) 192 | return precision, recall 193 | 194 | 195 | def _prd_to_f_beta(precision, recall, beta=1, epsilon=1e-10): 196 | """Computes F_beta scores for the given precision/recall values. 197 | 198 | The F_beta scores for all precision/recall pairs will be computed and 199 | returned. 200 | 201 | For precision p and recall r, the F_beta score is defined as: 202 | F_beta = (1 + beta^2) * (p * r) / ((beta^2 * p) + r) 203 | 204 | Args: 205 | precision: 1D NumPy array of precision values in [0, 1]. 206 | recall: 1D NumPy array of precision values in [0, 1]. 207 | beta: Beta parameter. Must be positive. The default value is 1. 208 | epsilon: Small constant to avoid numerical instability caused by division 209 | by 0 when precision and recall are close to zero. 210 | 211 | Returns: 212 | NumPy array of same shape as precision and recall with the F_beta scores for 213 | each pair of precision/recall. 214 | 215 | Raises: 216 | ValueError: If any value in precision or recall is outside of [0, 1]. 217 | ValueError: If beta is not positive. 218 | """ 219 | 220 | if not ((precision >= 0).all() and (precision <= 1).all()): 221 | raise ValueError('All values in precision must be in [0, 1].') 222 | if not ((recall >= 0).all() and (recall <= 1).all()): 223 | raise ValueError('All values in recall must be in [0, 1].') 224 | if beta <= 0: 225 | raise ValueError('Given parameter beta %s must be positive.' % str(beta)) 226 | 227 | return (1 + beta**2) * (precision * recall) / ( 228 | (beta**2 * precision) + recall + epsilon) 229 | 230 | 231 | def prd_to_max_f_beta_pair(precision, recall, beta=8): 232 | """Computes max. F_beta and max. F_{1/beta} for precision/recall pairs. 233 | 234 | Computes the maximum F_beta and maximum F_{1/beta} score over all pairs of 235 | precision/recall values. This is useful to compress a PRD plot into a single 236 | pair of values which correlate with precision and recall. 237 | 238 | For precision p and recall r, the F_beta score is defined as: 239 | F_beta = (1 + beta^2) * (p * r) / ((beta^2 * p) + r) 240 | 241 | Args: 242 | precision: 1D NumPy array or list of precision values in [0, 1]. 243 | recall: 1D NumPy array or list of precision values in [0, 1]. 244 | beta: Beta parameter. Must be positive. 
The default value is 8. 245 | 246 | Returns: 247 | f_beta: Maximum F_beta score. 248 | f_beta_inv: Maximum F_{1/beta} score. 249 | 250 | Raises: 251 | ValueError: If beta is not positive. 252 | """ 253 | 254 | if not ((precision >= 0).all() and (precision <= 1).all()): 255 | raise ValueError('All values in precision must be in [0, 1].') 256 | if not ((recall >= 0).all() and (recall <= 1).all()): 257 | raise ValueError('All values in recall must be in [0, 1].') 258 | if beta <= 0: 259 | raise ValueError('Given parameter beta %s must be positive.' % str(beta)) 260 | 261 | f_beta = np.max(_prd_to_f_beta(precision, recall, beta)) 262 | f_beta_inv = np.max(_prd_to_f_beta(precision, recall, 1/beta)) 263 | return f_beta, f_beta_inv 264 | 265 | 266 | def plot(precision_recall_pairs, labels=None, out_path=None, 267 | legend_loc='lower left', dpi=300): 268 | """Plots precision recall curves for distributions. 269 | 270 | Creates the PRD plot for the given data and stores the plot in a given path. 271 | 272 | Args: 273 | precision_recall_pairs: List of prd_data to plot. Each item in this list is 274 | a 2D array of precision and recall values for the 275 | same number of ratios. 276 | labels: Optional list of labels of same length as list_of_prd_data. The 277 | default value is None. 278 | out_path: Output path for the resulting plot. If None, the plot will be 279 | opened via plt.show(). The default value is None. 280 | legend_loc: Location of the legend. The default value is 'lower left'. 281 | dpi: Dots per inch (DPI) for the figure. The default value is 150. 282 | 283 | Raises: 284 | ValueError: If labels is a list of different length than list_of_prd_data. 285 | """ 286 | 287 | 288 | if labels is not None and len(labels) != len(precision_recall_pairs): 289 | raise ValueError( 290 | 'Length of labels %d must be identical to length of ' 291 | 'precision_recall_pairs %d.' 292 | % (len(labels), len(precision_recall_pairs))) 293 | 294 | fig = plt.figure(figsize=(3.5, 3.5), dpi=dpi) 295 | plot_handle = fig.add_subplot(111) 296 | plot_handle.tick_params(axis='both', which='major', labelsize=12) 297 | 298 | for i in range(len(precision_recall_pairs)): 299 | precision, recall = precision_recall_pairs[i] 300 | label = labels[i] if labels is not None else None 301 | plt.plot(recall, precision, label=label, alpha=0.5, linewidth=3) 302 | 303 | if labels is not None: 304 | plt.legend(loc=legend_loc) 305 | 306 | plt.xlim([0, 1]) 307 | plt.ylim([0, 1]) 308 | plt.xlabel('Recall', fontsize=12) 309 | plt.ylabel('Precision', fontsize=12) 310 | plt.tight_layout() 311 | if out_path is None: 312 | plt.show() 313 | else: 314 | plt.savefig(out_path, bbox_inches='tight', dpi=dpi) 315 | plt.close() -------------------------------------------------------------------------------- /metrics/prdc.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | Taken from https://github.com/clovaai/generative-evaluation-prdc 5 | 6 | prdc 7 | Copyright (c) 2020-present NAVER Corp. 8 | MIT license 9 | 10 | """ 11 | 12 | import numpy as np 13 | import sklearn.metrics 14 | 15 | __all__ = ['compute_prdc'] 16 | 17 | 18 | def compute_pairwise_distance(data_x, data_y=None): 19 | """ 20 | Args: 21 | data_x: numpy.ndarray([N, feature_dim], dtype=np.float32) 22 | data_y: numpy.ndarray([N, feature_dim], dtype=np.float32) 23 | Returns: 24 | numpy.ndarray([N, N], dtype=np.float32) of pairwise distances. 
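        For example, data_x of shape (1000, 64) and data_y of shape (500, 64) would give a
        (1000, 500) matrix of Euclidean distances; when data_y is None the matrix is square.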
25 | """ 26 | if data_y is None: 27 | data_y = data_x 28 | dists = sklearn.metrics.pairwise_distances( 29 | data_x, data_y, metric='euclidean', n_jobs=8) 30 | return dists 31 | 32 | 33 | def get_kth_value(unsorted, k, axis=-1): 34 | """ 35 | Args: 36 | unsorted: numpy.ndarray of any dimensionality. 37 | k: int 38 | Returns: 39 | kth values along the designated axis. 40 | """ 41 | indices = np.argpartition(unsorted, k, axis=axis)[..., :k] 42 | k_smallests = np.take_along_axis(unsorted, indices, axis=axis) 43 | kth_values = k_smallests.max(axis=axis) 44 | return kth_values 45 | 46 | 47 | def compute_nearest_neighbour_distances(input_features, nearest_k): 48 | """ 49 | Args: 50 | input_features: numpy.ndarray([N, feature_dim], dtype=np.float32) 51 | nearest_k: int 52 | Returns: 53 | Distances to kth nearest neighbours. 54 | """ 55 | distances = compute_pairwise_distance(input_features) 56 | radii = get_kth_value(distances, k=nearest_k + 1, axis=-1) 57 | return radii 58 | 59 | 60 | def compute_prdc(real_features, fake_features, nearest_k=5): 61 | """ 62 | Computes precision, recall, density, and coverage given two manifolds. 63 | Args: 64 | real_features: numpy.ndarray([N, feature_dim], dtype=np.float32) 65 | fake_features: numpy.ndarray([N, feature_dim], dtype=np.float32) 66 | nearest_k: int. 67 | Returns: 68 | dict of precision, recall, density, and coverage. 69 | """ 70 | 71 | real_nearest_neighbour_distances = compute_nearest_neighbour_distances( 72 | real_features, nearest_k) 73 | fake_nearest_neighbour_distances = compute_nearest_neighbour_distances( 74 | fake_features, nearest_k) 75 | distance_real_fake = compute_pairwise_distance( 76 | real_features, fake_features) 77 | 78 | precision = ( 79 | distance_real_fake < 80 | np.expand_dims(real_nearest_neighbour_distances, axis=1) 81 | ).any(axis=0).mean() 82 | 83 | recall = ( 84 | distance_real_fake < 85 | np.expand_dims(fake_nearest_neighbour_distances, axis=0) 86 | ).any(axis=1).mean() 87 | 88 | density = (1. / float(nearest_k)) * ( 89 | distance_real_fake < 90 | np.expand_dims(real_nearest_neighbour_distances, axis=1) 91 | ).sum(axis=0).mean() 92 | 93 | coverage = ( 94 | distance_real_fake.min(axis=1) < 95 | real_nearest_neighbour_distances 96 | ).mean() 97 | 98 | return dict(precision=precision, recall=recall, 99 | density=density, coverage=coverage) -------------------------------------------------------------------------------- /metrics/precision_recall.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | 4 | From 5 | https://github.com/msmsajjadi/precision-recall-distributions/blob/master/prd_from_image_folders.py 6 | """ 7 | 8 | # coding=utf-8 9 | # Copyright: Mehdi S. M. 
Sajjadi (msajjadi.com) 10 | 11 | import metrics.prd_score as prd 12 | from metrics.improved_precision_recall import knn_precision_recall_features 13 | 14 | def compute_prc(orig_data,synth_data, params=None, plot_path=None, improved_version=True, verbose=True): 15 | if verbose: 16 | print('computing PRD') 17 | if improved_version: 18 | prd_data = knn_precision_recall_features(orig_data,synth_data) 19 | else: 20 | if params is None: 21 | params = {} 22 | params['num_clusters'] = 20 23 | params['num_angles'] = 1001 24 | params['num_runs'] = 10 25 | prd_data = prd.compute_prd_from_embedding( 26 | eval_data=synth_data, 27 | ref_data=orig_data, 28 | num_clusters=params['num_clusters'], 29 | num_angles=params['num_angles'], 30 | num_runs=params['num_runs']) 31 | 32 | precision, recall = prd_data 33 | 34 | if verbose: 35 | print('plotting results') 36 | 37 | f_beta = prd.prd_to_max_f_beta_pair(precision, recall, beta=8) 38 | print('%.3f %.3f' % (f_beta[0], f_beta[1])) 39 | 40 | return prd_data 41 | -------------------------------------------------------------------------------- /predictive_models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedmalaa/evaluating-generative-models/093910e487d07959db7d87b54698da60aaeb50c0/predictive_models/__init__.py -------------------------------------------------------------------------------- /representations/OneClass.py: -------------------------------------------------------------------------------- 1 | 2 | # Copyright (c) 2021, Ahmed M. Alaa 3 | # Licensed under the BSD 3-clause license (see LICENSE.txt) 4 | 5 | """ 6 | 7 | ----------------------------------------- 8 | One-class representations 9 | ----------------------------------------- 10 | 11 | """ 12 | 13 | from __future__ import absolute_import, division, print_function 14 | 15 | import numpy as np 16 | import sys 17 | 18 | import logging 19 | import torch 20 | import torch.nn as nn 21 | 22 | if not sys.warnoptions: 23 | import warnings 24 | warnings.simplefilter("ignore") 25 | 26 | from representations.networks import * 27 | 28 | from torch.autograd import Variable 29 | 30 | # One-class loss functions 31 | # ------------------------ 32 | 33 | 34 | def OneClassLoss(outputs, c): 35 | 36 | dist = torch.sum((outputs - c) ** 2, dim=1) 37 | loss = torch.mean(dist) 38 | 39 | return loss 40 | 41 | 42 | def SoftBoundaryLoss(outputs, R, c, nu): 43 | 44 | dist = torch.sum((outputs - c) ** 2, dim=1) 45 | scores = dist - R ** 2 46 | loss = R ** 2 + (1 / nu) * torch.mean(torch.max(torch.zeros_like(scores), scores)) 47 | 48 | scores = dist 49 | loss = (1 / nu) * torch.mean(torch.max(torch.zeros_like(scores), scores)) 50 | 51 | return loss 52 | 53 | 54 | LossFns = dict({"OneClass": OneClassLoss, "SoftBoundary": SoftBoundaryLoss}) 55 | 56 | # Base network 57 | # --------------------- 58 | 59 | class BaseNet(nn.Module): 60 | 61 | """Base class for all neural networks.""" 62 | 63 | def __init__(self): 64 | 65 | super().__init__() 66 | 67 | self.logger = logging.getLogger(self.__class__.__name__) 68 | self.rep_dim = None # representation dimensionality, i.e. 
dim of the last layer 69 | 70 | def forward(self, *input): 71 | 72 | """Forward pass logic 73 | 74 | :return: Network output 75 | """ 76 | raise NotImplementedError 77 | 78 | def summary(self): 79 | 80 | """Network summary.""" 81 | 82 | net_parameters = filter(lambda p: p.requires_grad, self.parameters()) 83 | params = sum([np.prod(p.size()) for p in net_parameters]) 84 | 85 | self.logger.info('Trainable parameters: {}'.format(params)) 86 | self.logger.info(self) 87 | 88 | 89 | def get_radius(dist:torch.Tensor, nu:float): 90 | 91 | """Optimally solve for radius R via the (1-nu)-quantile of distances.""" 92 | 93 | return np.quantile(np.sqrt(dist.clone().data.float().numpy()), 1 - nu) 94 | 95 | class OneClassLayer(BaseNet): 96 | 97 | def __init__(self, params=None, hyperparams=None): 98 | 99 | super().__init__() 100 | 101 | # set all representation parameters - remove these lines 102 | 103 | self.rep_dim = params["rep_dim"] 104 | self.input_dim = params["input_dim"] 105 | self.num_layers = params["num_layers"] 106 | self.num_hidden = params["num_hidden"] 107 | self.activation = params["activation"] 108 | self.dropout_prob = params["dropout_prob"] 109 | self.dropout_active = params["dropout_active"] 110 | self.loss_type = params["LossFn"] 111 | self.train_prop = params['train_prop'] 112 | self.learningRate = params['lr'] 113 | self.epochs = params['epochs'] 114 | self.warm_up_epochs = params['warm_up_epochs'] 115 | self.weight_decay = params['weight_decay'] 116 | if torch.cuda.is_available(): 117 | self.device = torch.device('cuda') # Make this an option 118 | else: 119 | self.device = torch.device('cpu') 120 | # set up the network 121 | 122 | self.model = build_network(network_name="feedforward", params=params).to(self.device) 123 | 124 | # create the loss function 125 | 126 | self.c = hyperparams["center"].to(self.device) 127 | self.R = hyperparams["Radius"] 128 | self.nu = hyperparams["nu"] 129 | 130 | self.loss_fn = LossFns[self.loss_type] 131 | 132 | 133 | def forward(self, x): 134 | 135 | x = self.model(x) 136 | 137 | return x 138 | 139 | 140 | def fit(self, x_train, verbosity=True): 141 | 142 | 143 | self.optimizer = torch.optim.AdamW(self.model.parameters(), lr=self.learningRate, weight_decay = self.weight_decay) 144 | self.X = torch.tensor(x_train.reshape((-1, self.input_dim))).float() 145 | 146 | if self.train_prop != 1: 147 | x_train, x_val = x_train[:int(self.train_prop*len(x_train))], x_train[int(self.train_prop*len(x_train)):] 148 | inputs_val = Variable(torch.from_numpy(x_val).to(self.device)).float() 149 | 150 | self.losses = [] 151 | self.loss_vals = [] 152 | 153 | 154 | for epoch in range(self.epochs): 155 | 156 | # Converting inputs and labels to Variable 157 | 158 | inputs = Variable(torch.from_numpy(x_train)).to(self.device).float() 159 | 160 | self.model.zero_grad() 161 | 162 | self.optimizer.zero_grad() 163 | 164 | # get output from the model, given the inputs 165 | outputs = self.model(inputs) 166 | 167 | # get loss for the predicted output 168 | 169 | if self.loss_type=="SoftBoundary": 170 | 171 | self.loss = self.loss_fn(outputs=outputs, R=self.R, c=self.c, nu=self.nu) 172 | 173 | elif self.loss_type=="OneClass": 174 | 175 | self.loss = self.loss_fn(outputs=outputs, c=self.c) 176 | 177 | 178 | #self.c = torch.mean(torch.tensor(outputs).float(), dim=0) 179 | 180 | # get gradients w.r.t to parameters 181 | self.loss.backward(retain_graph=True) 182 | self.losses.append(self.loss.detach().cpu().numpy()) 183 | 184 | # update parameters 185 | self.optimizer.step() 186 | 
187 | if (epoch >= self.warm_up_epochs) and (self.loss_type=="SoftBoundary"): 188 | 189 | dist = torch.sum((outputs - self.c) ** 2, dim=1) 190 | #self.R = torch.tensor(get_radius(dist, self.nu)) 191 | 192 | if self.train_prop != 1.0: 193 | with torch.no_grad(): 194 | 195 | # get output from the model, given the inputs 196 | outputs = self.model(inputs_val) 197 | 198 | # get loss for the predicted output 199 | 200 | if self.loss_type=="SoftBoundary": 201 | 202 | loss_val = self.loss_fn(outputs=outputs, R=self.R, c=self.c, nu=self.nu) 203 | 204 | elif self.loss_type=="OneClass": 205 | 206 | loss_val = self.loss_fn(outputs=outputs, c=self.c).detach().cpu().numpy() 207 | 208 | self.loss_vals.append(loss_val) 209 | 210 | 211 | 212 | 213 | if verbosity: 214 | if self.train_prop == 1: 215 | print('epoch {}, loss {}'.format(epoch, self.loss.item())) 216 | else: 217 | print('epoch {:4}, train loss {:.4e}, val loss {:.4e}'.format(epoch, self.loss.item(),loss_val)) 218 | 219 | 220 | -------------------------------------------------------------------------------- /representations/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedmalaa/evaluating-generative-models/093910e487d07959db7d87b54698da60aaeb50c0/representations/__init__.py -------------------------------------------------------------------------------- /representations/networks.py: -------------------------------------------------------------------------------- 1 | 2 | # Copyright (c) 2021, Ahmed M. Alaa 3 | # Licensed under the BSD 3-clause license (see LICENSE.txt) 4 | 5 | """ 6 | 7 | ----------------------------------------- 8 | Construction of feature representations 9 | ----------------------------------------- 10 | 11 | + build_network: 12 | -------------- 13 | | 14 | +--------> feedforward_network: 15 | | 16 | +--------> recurrent_network: 17 | | 18 | +--------> MNIST_network: 19 | 20 | """ 21 | 22 | # TODO: add arguments details 23 | 24 | 25 | from __future__ import absolute_import, division, print_function 26 | 27 | # import numpy as np 28 | # import pandas as pd 29 | import sys 30 | 31 | if not sys.warnoptions: 32 | import warnings 33 | warnings.simplefilter("ignore") 34 | 35 | import torch 36 | # from torch.autograd import Variable 37 | # import torch.nn.functional as nnf 38 | # from torch.utils.data import random_split 39 | # from torch.optim import SGD 40 | from torch import nn 41 | 42 | 43 | # from copy import deepcopy 44 | # import time 45 | 46 | torch.manual_seed(1) 47 | 48 | # Global variables 49 | 50 | ACTIVATION_DICT = {"ReLU": torch.nn.ReLU(), 51 | "Hardtanh": torch.nn.Hardtanh(), 52 | "ReLU6": torch.nn.ReLU6(), 53 | "Sigmoid": torch.nn.Sigmoid(), 54 | "Tanh": torch.nn.Tanh(), 55 | "ELU": torch.nn.ELU(), 56 | "CELU": torch.nn.CELU(), 57 | "SELU": torch.nn.SELU(), 58 | "GLU": torch.nn.GLU(), 59 | "LeakyReLU": torch.nn.LeakyReLU(), 60 | "LogSigmoid": torch.nn.LogSigmoid(), 61 | "Softplus": torch.nn.Softplus()} 62 | 63 | 64 | def build_network(network_name, params): 65 | 66 | if network_name=="feedforward": 67 | 68 | net = feedforward_network(params) 69 | 70 | return net 71 | 72 | 73 | def feedforward_network(params): 74 | 75 | """Architecture for a Feedforward Neural Network 76 | 77 | Args: 78 | 79 | ::params:: 80 | 81 | ::params["input_dim"]:: 82 | ::params["rep_dim"]:: 83 | ::params["num_hidden"]:: 84 | ::params["activation"]:: 85 | ::params["num_layers"]:: 86 | ::params["dropout_prob"]:: 87 | ::params["dropout_active"]:: 88 |
::params["LossFn"]:: 89 | 90 | Returns: 91 | 92 | ::_architecture:: 93 | 94 | """ 95 | 96 | modules = [] 97 | 98 | if params["dropout_active"]: 99 | 100 | modules.append(torch.nn.Dropout(p=params["dropout_prob"])) 101 | 102 | # Input layer 103 | 104 | modules.append(torch.nn.Linear(params["input_dim"], params["num_hidden"],bias=False)) 105 | modules.append(ACTIVATION_DICT[params["activation"]]) 106 | 107 | # Intermediate layers 108 | 109 | for u in range(params["num_layers"] - 1): 110 | 111 | if params["dropout_active"]: 112 | 113 | modules.append(torch.nn.Dropout(p=params["dropout_prob"])) 114 | 115 | modules.append(torch.nn.Linear(params["num_hidden"], params["num_hidden"], 116 | bias=False)) 117 | modules.append(ACTIVATION_DICT[params["activation"]]) 118 | 119 | 120 | # Output layer 121 | 122 | modules.append(torch.nn.Linear(params["num_hidden"], params["rep_dim"],bias=False)) 123 | 124 | _architecture = nn.Sequential(*modules) 125 | 126 | return _architecture 127 | -------------------------------------------------------------------------------- /representations/ts_embedding/__init__.py: -------------------------------------------------------------------------------- 1 | """Timeseries encoding to a fixed size vector representation. 2 | 3 | Author: Evgeny Saveliev (e.s.saveliev@gmail.com) 4 | """ 5 | 6 | from .seq2seq_autoencoder import Encoder, Decoder, Seq2Seq, init_hidden, compute_loss 7 | from .training import train_seq2seq_autoencoder, iterate_eval_set 8 | -------------------------------------------------------------------------------- /representations/ts_embedding/seq2seq_autoencoder.py: -------------------------------------------------------------------------------- 1 | """Seq-2-Seq autoencoder. 2 | """ 3 | import torch 4 | import torch.nn as nn 5 | from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence 6 | 7 | 8 | class Encoder(nn.Module): 9 | def __init__(self, input_size, hidden_size, num_rnn_layers): 10 | super(Encoder, self).__init__() 11 | 12 | self.input_size = input_size 13 | self.hidden_size = hidden_size 14 | self.num_rnn_layers = num_rnn_layers 15 | 16 | self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size, num_layers=num_rnn_layers, batch_first=True) 17 | 18 | def forward(self, x, x_seq_lengths, hc, padding_value, max_seq_len): 19 | x = pack_padded_sequence(x, x_seq_lengths, batch_first=True, enforce_sorted=False) 20 | x, hc = self.lstm(x, hc) 21 | x, x_seq_lens = pad_packed_sequence(x, batch_first=True, padding_value=padding_value, total_length=max_seq_len) 22 | return x, x_seq_lens, hc 23 | 24 | 25 | class Decoder(nn.Module): 26 | def __init__(self, input_size, hidden_size, num_rnn_layers): 27 | super(Decoder, self).__init__() 28 | 29 | self.input_size = input_size 30 | self.hidden_size = hidden_size 31 | self.num_rnn_layers = num_rnn_layers 32 | 33 | self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size, num_layers=num_rnn_layers, batch_first=True) 34 | self.linear = nn.Linear(hidden_size, input_size) 35 | 36 | def forward(self, x, x_seq_lengths, hc, padding_value, max_seq_len): 37 | batch_size = x.shape[0] 38 | x = pack_padded_sequence(x, x_seq_lengths, batch_first=True, enforce_sorted=False) 39 | x, hc = self.lstm(x, hc) 40 | x, x_seq_lens = pad_packed_sequence(x, batch_first=True, padding_value=padding_value, total_length=max_seq_len) 41 | # x = x.contiguous() 42 | x = x.view(-1, self.hidden_size) 43 | x = self.linear(x) 44 | x = x.view(batch_size, -1, self.input_size) 45 | return x, x_seq_lens, hc 46 | 47 | 48 | class 
Seq2Seq(nn.Module): 49 | def __init__(self, encoder, decoder): 50 | super(Seq2Seq, self).__init__() 51 | assert encoder.input_size == decoder.input_size 52 | assert encoder.hidden_size == decoder.hidden_size 53 | self.encoder = encoder 54 | self.decoder = decoder 55 | def forward(self, x_enc, x_dec, x_seq_lengths, hc_init, padding_value, max_seq_len): 56 | # print(x_enc.dtype, x_dec.dtype, x_seq_lengths.dtype, hc_init[0].dtype, hc_init[1].dtype) 57 | x_enc_out, _, hc_enc = self.encoder(x_enc, x_seq_lengths, hc_init, padding_value, max_seq_len) 58 | # print("x_enc.shape", x_enc.shape) 59 | # print("x_enc_out.shape", x_enc_out.shape) 60 | x_dec_out, _, hc_dec = self.decoder(x_dec, x_seq_lengths, hc_enc, padding_value, max_seq_len) 61 | return x_dec_out, hc_enc 62 | def get_embeddings_only(self, x_enc, x_seq_lengths, hc_init, padding_value, max_seq_len): 63 | _, _, hc_enc = self.encoder(x_enc, x_seq_lengths, hc_init, padding_value, max_seq_len) 64 | return hc_enc 65 | 66 | 67 | def init_hidden(batch_size, hidden_size, num_rnn_layers, device): 68 | h = torch.zeros(num_rnn_layers, batch_size, hidden_size, device=device, dtype=torch.float32) 69 | c = torch.zeros(num_rnn_layers, batch_size, hidden_size, device=device, dtype=torch.float32) 70 | return (h, c) 71 | 72 | 73 | def compute_loss(loss_function, x_pred, x_targ, x_seq_len): 74 | assert x_pred.shape == x_targ.shape 75 | 76 | mask = torch.ones_like(x_pred, dtype=int).to(x_pred.device) 77 | mask_seq_len = x_seq_len - 1 # As target sequence is one shorter. 78 | for idx, l in enumerate(mask_seq_len): 79 | mask[idx, l.item():, :] = 0. 80 | 81 | x_pred *= mask 82 | x_targ *= mask 83 | 84 | loss = loss_function(x_pred, x_targ) 85 | return loss 86 | -------------------------------------------------------------------------------- /representations/ts_embedding/training.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | try: 4 | import IPython.display 5 | except ImportError: 6 | print("IPython not found, ts_embedding > training live plot will not work.") 7 | 8 | import torch 9 | import torch.nn as nn 10 | # import torch.optim as optim 11 | 12 | from .seq2seq_autoencoder import init_hidden, compute_loss 13 | 14 | 15 | loss_function = nn.MSELoss(reduction="none") 16 | 17 | 18 | def iterate_eval_set(seq2seq, dataloader, padding_value, max_seq_len): 19 | epoch_test_loss = 0. 
20 | 21 | seq2seq.eval() 22 | n_samples_test = 0 23 | with torch.no_grad(): 24 | for iter_, (x, x_len, x_rev, x_rev_shift) in enumerate(dataloader): 25 | batch_size = x.shape[0] 26 | n_samples_test += batch_size 27 | 28 | hc_init = init_hidden( 29 | batch_size=batch_size, 30 | hidden_size=seq2seq.encoder.hidden_size, 31 | num_rnn_layers=seq2seq.encoder.num_rnn_layers, 32 | device=x.device) 33 | 34 | x_dec_out, hc_repr = seq2seq( 35 | x_enc=x, 36 | x_dec=x_rev, 37 | x_seq_lengths=x_len, 38 | hc_init=hc_init, 39 | padding_value=padding_value, 40 | max_seq_len=max_seq_len 41 | ) 42 | 43 | loss_tensor = compute_loss( 44 | loss_function=loss_function, x_pred=x_dec_out, x_targ=x_rev_shift, x_seq_len=x_len) 45 | loss = loss_tensor.mean() 46 | epoch_test_loss += loss.item() * batch_size 47 | 48 | epoch_test_loss /= n_samples_test 49 | 50 | return epoch_test_loss 51 | 52 | 53 | def train_seq2seq_autoencoder( 54 | seq2seq, 55 | optimizer, 56 | train_dataloader, 57 | val_dataloader, 58 | n_epochs, 59 | batch_size, 60 | padding_value, 61 | max_seq_len, 62 | jupyter_live_plot_enabled=False 63 | ): 64 | 65 | train_losses, val_losses = np.full([n_epochs], np.nan), np.full([n_epochs], np.nan) 66 | x_axis = list(range(1, n_epochs + 1)) 67 | 68 | for epoch in range(n_epochs): 69 | epoch_train_loss = 0. 70 | epoch_val_loss = 0. 71 | # print(f"Epoch {epoch}") 72 | 73 | seq2seq.train() 74 | n_samples_train = 0 75 | for iter_, (x, x_len, x_rev, x_rev_shift) in enumerate(train_dataloader): 76 | batch_size = x.shape[0] 77 | n_samples_train += batch_size 78 | 79 | optimizer.zero_grad() 80 | hc_init = init_hidden( 81 | batch_size=batch_size, 82 | hidden_size=seq2seq.encoder.hidden_size, 83 | num_rnn_layers=seq2seq.encoder.num_rnn_layers, 84 | device=x.device) 85 | 86 | x_dec_out, hc_repr = seq2seq( 87 | x_enc=x, 88 | x_dec=x_rev, 89 | x_seq_lengths=x_len, 90 | hc_init=hc_init, 91 | padding_value=padding_value, 92 | max_seq_len=max_seq_len 93 | ) 94 | 95 | loss_tensor = compute_loss( 96 | loss_function=loss_function, x_pred=x_dec_out, x_targ=x_rev_shift, x_seq_len=x_len) 97 | loss = loss_tensor.mean() 98 | epoch_train_loss += loss.item() * batch_size 99 | 100 | loss.backward() 101 | optimizer.step() 102 | 103 | epoch_train_loss /= n_samples_train 104 | 105 | epoch_val_loss = iterate_eval_set( 106 | seq2seq=seq2seq, dataloader=val_dataloader, padding_value=padding_value, max_seq_len=max_seq_len) 107 | 108 | train_losses[epoch] = epoch_train_loss 109 | val_losses[epoch] = epoch_val_loss 110 | 111 | if jupyter_live_plot_enabled or (not jupyter_live_plot_enabled and epoch == n_epochs-1): 112 | # A live updating plot showing the training and validation over time (i.e. over epochs). 
113 | plt.plot(x_axis, train_losses, label = "training loss") 114 | plt.plot(x_axis, val_losses, label = "validation loss") 115 | plt.title("Training Tracker") 116 | plt.legend() 117 | x_max = n_epochs 118 | y_max = np.nanmax(train_losses) 119 | plt.xlim(1, x_max) 120 | plt.ylim(0, y_max) 121 | if jupyter_live_plot_enabled: 122 | IPython.display.clear_output(wait=True) 123 | plt.show() 124 | plt.savefig("./training_log.png", dpi=300) 125 | 126 | print(f"Epoch {epoch}: Tr.Ls.={epoch_train_loss:.3f} Vl.Ls.={epoch_val_loss:.3f}") 127 | -------------------------------------------------------------------------------- /representations/ts_embedding/utils.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | import numpy as np 4 | 5 | import torch 6 | from torch.utils.data import TensorDataset, DataLoader 7 | 8 | from .seq2seq_autoencoder import init_hidden 9 | 10 | 11 | def rearrange_data(x, x_len, pad_val, eos_val): 12 | """Take in sequence `x` [dims `(n_samples, max_seq_len, n_features)`, data type `float`] and an array of 13 | sequence lengths `x_len` [dims `(n_samples,)`, data type `int`] and return: 14 | * a reversed sequence `x_rev`, same dims as `x`, and padded at the same indices as `x`. 15 | * a reversed and shifted (forward by one) sequence `x_rev_shifted`, same dims as `x`, and padded at the same 16 | indices as `x`. Like `x_rev` but sequence elements at x_{t} become x_{t-1}, so element at `t=0` is lost 17 | and the element at `t=t_end_of_sequence` is assigned `eos_val`. 18 | Note that `x` is expected to be padded at the end along the sequence dimension, rather than at the beginning. 19 | 20 | Args: 21 | x (np.ndarray): sequence data [dims `(n_samples, max_seq_len, n_features)`, data type `float`]. 22 | x_len (np.ndarray): array of sequence lengths [dims `(n_samples,)`, data type `int`]. 23 | pad_val (float): padding value to use in output arrays. 24 | eos_val (float): end-of-sequence indicator value to use in the output `x_rev_shifted`. 25 | 26 | Returns: 27 | Tuple[np.ndarray, np.ndarray]: x_rev, x_rev_shifted 28 | """ 29 | x_rev = np.full_like(x, pad_val) 30 | x_rev_shifted = np.full_like(x, pad_val) 31 | for idx, l in enumerate(x_len): 32 | x_rev[idx][:l] = x[idx][:l][::-1].copy() 33 | x_rev_shifted[idx][:l-1] = x_rev[idx][1:l] 34 | x_rev_shifted[idx][l-1] = eos_val 35 | return x_rev, x_rev_shifted 36 | 37 | 38 | def data_to_tensors(x, x_len, x_rev, x_rev_shifted, float_type, device): 39 | X = torch.tensor(x, device=device, dtype=float_type) 40 | X_rev = torch.tensor(x_rev, device=device, dtype=float_type) 41 | X_rev_shifted = torch.tensor(x_rev_shifted, device=device, dtype=float_type) 42 | X_len = torch.tensor(x_len, dtype=int) # CPU by requirement of packing. 43 | return X, X_len, X_rev, X_rev_shifted 44 | 45 | 46 | def inference_data_to_tensors(x, x_len, float_type, device): 47 | X = torch.tensor(x, device=device, dtype=float_type) 48 | X_len = torch.tensor(x_len, dtype=int) # CPU by requirement of packing. 49 | return X, X_len 50 | 51 | 52 | def _generate_dummy_data(n_samples, min_timesteps, max_timesteps, n_features, pad_val, seed): 53 | np.random.seed(seed) 54 | 55 | seq_lengths = np.random.randint(low=min_timesteps, high=max_timesteps+1, size=n_samples) 56 | # ^ We assume all features for the same example have same seq length. 
57 | 58 | data = np.full((n_samples, max_timesteps, n_features), pad_val) 59 | for i, length in enumerate(seq_lengths): 60 | generated_data = np.random.randn(length, n_features) 61 | data[i, 0:length, :] = generated_data 62 | 63 | return data, seq_lengths 64 | 65 | 66 | def generate_dummy_data( 67 | n_samples: int, 68 | min_timesteps: int, 69 | max_timesteps: int, 70 | n_features: int, 71 | pad_val: float, 72 | eos_val: float, 73 | seed: int, 74 | to_tensors: bool, 75 | float_type: Optional[torch.dtype] = None, 76 | device: Optional[torch.device] = None): 77 | 78 | x, x_len = _generate_dummy_data(n_samples, min_timesteps, max_timesteps, n_features, pad_val, seed) 79 | x_rev, x_rev_shifted = rearrange_data(x, x_len, pad_val, eos_val) 80 | 81 | if to_tensors: 82 | x, x_len, x_rev, x_rev_shifted = data_to_tensors( 83 | x, x_len, x_rev, x_rev_shifted, float_type=float_type, device=device) 84 | 85 | return x, x_len, x_rev, x_rev_shifted 86 | 87 | 88 | def make_dataloader(data_tensors, **dataloader_kwargs): 89 | dataset = TensorDataset(*data_tensors) 90 | dataloader = DataLoader(dataset, **dataloader_kwargs) 91 | return dataset, dataloader 92 | 93 | 94 | def _hc_repr_to_np(hc_repr): 95 | h, c = hc_repr 96 | batch_size = h.shape[1] 97 | h, c = h.view(batch_size, -1), c.view(batch_size, -1) 98 | h, c = h.detach().cpu().numpy(), c.detach().cpu().numpy() 99 | hc = np.hstack([h, c]) 100 | return hc 101 | 102 | 103 | def get_embeddings(seq2seq, dataloaders, padding_value, max_seq_len): 104 | """Put together the embeddings: stack horizontally the arrays of h and c; stack vertically these arrays. 105 | """ 106 | hc_np_list = [] 107 | for dataloader in dataloaders: 108 | seq2seq.eval() 109 | with torch.no_grad(): 110 | for iter_, dataloader_items in enumerate(dataloader): 111 | x, x_len = dataloader_items[0], dataloader_items[1] 112 | batch_size = x.shape[0] 113 | hc_init = init_hidden( 114 | batch_size=batch_size, 115 | hidden_size=seq2seq.encoder.hidden_size, 116 | num_rnn_layers=seq2seq.encoder.num_rnn_layers, 117 | device=x.device) 118 | hc_repr = seq2seq.get_embeddings_only( 119 | x_enc=x, 120 | x_seq_lengths=x_len, 121 | hc_init=hc_init, 122 | padding_value=padding_value, 123 | max_seq_len=max_seq_len) 124 | hc_np = _hc_repr_to_np(hc_repr) 125 | hc_np_list.append(hc_np) 126 | hc_all = np.vstack(hc_np_list) 127 | return hc_all 128 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | keras==2.4.3 2 | scikit-learn==0.23.2 3 | pillow==8.1.0 4 | pandas==1.2.0 5 | matplotlib==3.3.2 6 | tqdm==4.55.1 7 | theano 8 | torch==1.7.1 -------------------------------------------------------------------------------- /requirements_dpgan.txt: -------------------------------------------------------------------------------- 1 | # Requirements for running main_tabular.py with 'dpgan' option 2 | # cudatoolkit: 10.0 3 | # cudnn: 7.6.5 4 | matplotlib==3.3.2 5 | numpy==1.19.2 6 | pandas==1.2.1 7 | torch==1.4.0 8 | scikit-learn==0.23.2 9 | scipy==1.5.2 10 | tqdm==4.56.0 11 | tensorflow-gpu==1.15.0 12 | theano==1.0.5 -------------------------------------------------------------------------------- /requirements_timegan.txt: -------------------------------------------------------------------------------- 1 | numpy==1.19.2 2 | pandas==1.1.3 3 | tqdm==4.55.1 4 | scikit-learn==0.23.2 5 | tensorflow==1.15.0 -------------------------------------------------------------------------------- 
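A note on the data layout used by representations/ts_embedding/utils.py above: rearrange_data builds the decoder targets by reversing each valid (unpadded) prefix and then shifting it forward by one step, placing an end-of-sequence marker in the freed slot. A tiny worked example, with values, pad_val and eos_val chosen arbitrarily for illustration:

import numpy as np

from representations.ts_embedding.utils import rearrange_data

# One sample with 3 valid steps and 1 feature, padded to max_seq_len = 4 with pad_val = 0.
x = np.array([[[1.], [2.], [3.], [0.]]])
x_len = np.array([3])

x_rev, x_rev_shifted = rearrange_data(x, x_len, pad_val=0., eos_val=-1.)

# x_rev[0]         -> [[3.], [2.], [1.], [0.]]   valid steps reversed, padding left in place
# x_rev_shifted[0] -> [[2.], [1.], [-1.], [0.]]  x_rev moved forward one step; -1. marks end of sequence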
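Putting the seq2seq_autoencoder.py, training.py and utils.py pieces together, the intended flow is: build padded/reversed tensors, wrap them in dataloaders, train the Seq2Seq autoencoder, then read off the encoder's final hidden states as fixed-size embeddings. The sketch below is a minimal, hypothetical example of that flow, using the dummy-data helper in place of a real dataset; all sizes and hyperparameter values are illustrative, not the settings used elsewhere in this repository.

import torch

from representations.ts_embedding import Encoder, Decoder, Seq2Seq, train_seq2seq_autoencoder
from representations.ts_embedding.utils import generate_dummy_data, make_dataloader, get_embeddings

PAD, EOS = -1.0, -2.0          # padding and end-of-sequence values (illustrative)
MAX_LEN, N_FEATURES = 10, 5
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Each call returns (x, x_len, x_rev, x_rev_shifted) tensors, already on the chosen device.
train_tensors = generate_dummy_data(
    n_samples=1000, min_timesteps=3, max_timesteps=MAX_LEN, n_features=N_FEATURES,
    pad_val=PAD, eos_val=EOS, seed=0, to_tensors=True, float_type=torch.float32, device=device)
val_tensors = generate_dummy_data(
    n_samples=200, min_timesteps=3, max_timesteps=MAX_LEN, n_features=N_FEATURES,
    pad_val=PAD, eos_val=EOS, seed=1, to_tensors=True, float_type=torch.float32, device=device)

_, train_dl = make_dataloader(train_tensors, batch_size=64, shuffle=True)
_, val_dl = make_dataloader(val_tensors, batch_size=64, shuffle=False)

encoder = Encoder(input_size=N_FEATURES, hidden_size=32, num_rnn_layers=1)
decoder = Decoder(input_size=N_FEATURES, hidden_size=32, num_rnn_layers=1)
seq2seq = Seq2Seq(encoder, decoder).to(device)
optimizer = torch.optim.Adam(seq2seq.parameters(), lr=1e-3)

train_seq2seq_autoencoder(
    seq2seq, optimizer, train_dl, val_dl,
    n_epochs=20, batch_size=64, padding_value=PAD, max_seq_len=MAX_LEN)

# Embeddings are the flattened final (h, c) encoder states, stacked across the given loaders;
# pass shuffle=False dataloaders if the row order must match the original data order.
embeddings = get_embeddings(seq2seq, [train_dl, val_dl], padding_value=PAD, max_seq_len=MAX_LEN)
print(embeddings.shape)        # (1200, 2 * num_rnn_layers * hidden_size) = (1200, 64)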
/utils/__init__.py: -------------------------------------------------------------------------------- 1 | """Common utilities. 2 | """ 3 | 4 | from .utils import * 5 | -------------------------------------------------------------------------------- /utils/utils.py: -------------------------------------------------------------------------------- 1 | def check_tf2(): 2 | found = False 3 | message = "Note: TensorFlow 2.x not found, some functionality may not be available." 4 | try: 5 | import tensorflow as tf 6 | if str(tf.__version__).split(".")[0] == "2": # pylint: disable=no-member 7 | found = True 8 | except ImportError: 9 | pass 10 | if not found: 11 | print(message) 12 | return found 13 | --------------------------------------------------------------------------------
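For completeness, representations/OneClass.py and representations/networks.py above are configured through two dictionaries whose keys are read in OneClassLayer.__init__ and feedforward_network. The following is a minimal, hypothetical sketch of fitting the one-class representation to an array of feature vectors; every numeric value is a placeholder rather than a recommended setting.

import numpy as np
import torch

from representations.OneClass import OneClassLayer

X = np.random.randn(500, 32).astype(np.float32)     # e.g. embeddings or tabular features

params = {
    "rep_dim": 16, "input_dim": X.shape[1],
    "num_layers": 2, "num_hidden": 64,
    "activation": "ReLU",                  # any key of ACTIVATION_DICT in networks.py
    "dropout_prob": 0.2, "dropout_active": False,
    "LossFn": "OneClass",                  # or "SoftBoundary"
    "train_prop": 1,                       # 1 = train on everything, no validation split
    "lr": 1e-3, "epochs": 100, "warm_up_epochs": 10, "weight_decay": 1e-2,
}
hyperparams = {
    "center": torch.ones(params["rep_dim"]) * 10,   # fixed centre c; non-zero to avoid a trivial collapse with bias-free layers
    "Radius": 1.0,                                   # only used by the SoftBoundary loss
    "nu": 0.1,
}

model = OneClassLayer(params=params, hyperparams=hyperparams)
model.fit(X, verbosity=False)

with torch.no_grad():
    Z = model(torch.from_numpy(X).float().to(model.device))    # one-class representations of X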
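Finally, the metric helpers shown earlier (metrics/prdc.py, and metrics/prd_score.py as used by metrics/precision_recall.py) all operate on two arrays of feature vectors, one real and one synthetic. Below is a short, hypothetical sketch of calling them directly; the random features and the nearest_k value are placeholders.

import numpy as np

import metrics.prd_score as prd
from metrics.prdc import compute_prdc

real_features = np.random.rand(1000, 64).astype(np.float32)   # e.g. one-class or seq2seq embeddings of real data
fake_features = np.random.rand(1000, 64).astype(np.float32)   # embeddings of the synthetic data

# Precision / recall / density / coverage from k-NN manifold estimates.
prdc_scores = compute_prdc(real_features, fake_features, nearest_k=5)
print(prdc_scores)   # dict with keys: precision, recall, density, coverage

# Clustering-based PRD curve and its max F_beta / F_{1/beta} summary, as in compute_prc above.
precision, recall = prd.compute_prd_from_embedding(
    eval_data=fake_features, ref_data=real_features,
    num_clusters=20, num_angles=1001, num_runs=10)
f_beta, f_beta_inv = prd.prd_to_max_f_beta_pair(precision, recall, beta=8)
prd.plot([(precision, recall)], labels=['synthetic'], out_path='prd_curve.png')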