├── README.md ├── code ├── Decoder.py ├── Encoder.py ├── Evaluation.py ├── Generator.py ├── Graph.py ├── Helpers.py ├── LaugelEtAl.py ├── Loglik.py └── Sampling.py ├── data ├── givme │ ├── give_me_types.csv │ ├── give_me_types_c.csv │ ├── give_me_x.csv │ ├── give_me_x_c.csv │ └── give_me_y.csv └── heloc │ ├── heloc_types.csv │ ├── heloc_types_alt.csv │ ├── heloc_types_c_alt.csv │ ├── heloc_x.csv │ ├── heloc_x_c.csv │ └── heloc_y.csv └── preprocessing ├── Preprocessing_GiveMeSomeCredit.ipynb └── Preprocessing_Heloc.ipynb /README.md: -------------------------------------------------------------------------------- 1 | # C-CHVAE 2 | 3 | ## Set up 4 | Counterfactual explanations can be obtained by identifying the smallest change made to an input vector that influences a prediction in a positive way. Classic examples can be found in credit scoring or health contexts, where one tries to change a classifier's decision from ’loan rejected’ to ’awarded’ or from ’high risk of cardiovascular disease’ to ’low risk’. Our approach ensures that the produced counterfactuals are **proximate** (i.e., not local outliers) and **connected** to regions with substantial data density (i.e., close to correctly classified observations), two requirements known as **counterfactual faithfulness**. 5 | 6 | ## Intuition 7 | We suggest embedding the counterfactual search into a data density approximator, here a variational autoencoder (VAE). The idea is to use the VAE as a search device to find counterfactuals that are proximate and connected to the input data. Given the original tabular data, the encoder specifies a lower-dimensional, real-valued and dense representation of that data, $z$. It is therefore the encoder that determines in which low-dimensional neighbourhood we should search for potential counterfactuals. Next, we perturb the low-dimensional representation, $z + \delta$, and feed the perturbed representation into the decoder. For small perturbations the decoder produces a potential counterfactual by reconstructing the input data from the perturbed representation; such a counterfactual is likely under the data distribution. Finally, the potential counterfactual is passed to the pretrained classifier to check whether the prediction has changed. 8 | 9 | ## On running the (C-)HVAE 10 | To run the (C-)HVAE you have to predefine each input's type; choose one of the following: *real* (for inputs defined on the real line), *pos* (for inputs defined on the positive part of the real line), *count* (for count inputs), *cat* (for categorical inputs) and *ordinal* (for ordinal inputs). To see an example, have a look at the *types*.csv files within the *data* folder. 
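For illustration, a minimal types file for five inputs might look as follows. The two-column layout shown here is an assumption based on the fields the loading code reads (`type` and `dim` via `csv.DictReader`); the shipped `*_types*.csv` files in the *data* folder are the authoritative reference. For *cat* and *ordinal* inputs, `dim` is the number of categories/levels; for the remaining types it is typically 1.

```
type,dim
real,1
pos,1
count,1
cat,3
ordinal,4
```

The perturbation step described above can be sketched in a few lines of NumPy (a re-implementation sketch of the sampling used in `samples_perturbation_z` in `code/Generator.py`; the function name below is illustrative, not part of the repository's API): directions are drawn from a Gaussian and rescaled so that the $\ell_p$-norm of each perturbation lies in a chosen range $[l, h)$.

```
import numpy as np

def sample_lp_perturbations(n_samples, z_dim, p=2, low=0.0, high=0.5, seed=0):
    """Draw perturbations delta with ||delta||_p uniformly rescaled into [low, high)."""
    rng = np.random.default_rng(seed)
    delta = rng.normal(size=(n_samples, z_dim))               # random directions
    radius = rng.uniform(low, high, size=(n_samples, 1))      # target lengths in [low, high)
    norm = np.linalg.norm(delta, ord=p, axis=1, keepdims=True)
    return delta * radius / norm                              # rescaled perturbations

delta = sample_lp_perturbations(1000, z_dim=2)
# z_tilde = z + delta is then decoded; reconstructions that flip the pretrained
# classifier's prediction are kept as counterfactual candidates.
```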
11 | 12 | 13 | ## Bibtex 14 | ``` 15 | @inproceedings{pawelczyk_learning2019, 16 | author = {Pawelczyk, Martin and Broelemann, Klaus and Kasneci, Gjergji}, 17 | title = {Learning Model-Agnostic Counterfactual Explanations for Tabular Data}, 18 | year = {2020}, 19 | publisher = {Association for Computing Machinery}, 20 | address = {New York, NY, USA}, 21 | booktitle = {Proceedings of The Web Conference 2020}, 22 | pages = {3126–3132}, 23 | numpages = {7}, 24 | keywords = {Transparency, Counterfactual explanations, Interpretability}, 25 | location = {Taipei, Taiwan}, 26 | series = {WWW '20} 27 | } 28 | ``` 29 | 30 | -------------------------------------------------------------------------------- /code/Decoder.py: -------------------------------------------------------------------------------- 1 | 2 | import tensorflow as tf 3 | import numpy as np 4 | from tensorflow.python.ops.parallel_for.gradients import batch_jacobian 5 | 6 | 7 | 8 | 9 | def decoder(samples_z, z_dim, y_dim, y_dim_partition, batch_size, types_list): 10 | 11 | samples = dict.fromkeys(['s', 'z', 'y', 'x'], []) 12 | gradients = dict.fromkeys(['g1', 'g2', 'g3'], []) 13 | 14 | samples['z'] = samples_z 15 | 16 | with tf.GradientTape() as g_1: 17 | g_1.watch(samples_z) 18 | # Create deterministic layer y 19 | samples['y'] = tf.layers.dense(inputs=samples_z, units=y_dim, activation=None, 20 | kernel_initializer=tf.random_normal_initializer(stddev=0.05), name='layer_h1_', reuse=None) 21 | 22 | gradients['g1'] = g_1.gradient(samples['y'], samples_z) 23 | 24 | with tf.GradientTape() as g_2: 25 | g_2.watch(samples['y']) 26 | grouped_samples_y = y_partition(samples['y'], types_list, y_dim_partition) 27 | 28 | gradients['g2'] = g_2.gradient(grouped_samples_y, samples['y']) 29 | 30 | with tf.GradientTape() as g_3: 31 | g_3.watch(grouped_samples_y) 32 | # Compute the parameters h_y 33 | theta = theta_estimation_from_y(grouped_samples_y, types_list, batch_size, reuse=None) 34 | 35 | gradients['g3'] = g_3.gradient(theta, grouped_samples_y) 36 | 37 | 38 | return theta, samples, gradients 39 | 40 | 41 | def y_partition(samples_y, types_list, y_dim_partition): 42 | grouped_samples_y = [] 43 | # First element must be 0 and the length of the partition vector must be len(types_list)+1 44 | if len(y_dim_partition) != len(types_list): 45 | raise Exception("The length of the partition vector must match the number of variables in the data + 1") 46 | 47 | # Insert a 0 at the beginning of the cumsum vector 48 | partition_vector_cumsum = np.insert(np.cumsum(y_dim_partition), 0, 0) 49 | for i in range(len(types_list)): 50 | grouped_samples_y.append(samples_y[:, partition_vector_cumsum[i]:partition_vector_cumsum[i + 1]]) 51 | 52 | return grouped_samples_y 53 | 54 | 55 | def observed_data_layer(observed_data, output_dim, name, reuse): 56 | # Train a layer with the observed data and reuse it for the missing data 57 | obs_output = tf.layers.dense(inputs=observed_data, units=output_dim, activation=None, 58 | kernel_initializer=tf.random_normal_initializer(stddev=0.05), name=name, reuse=reuse, 59 | trainable=True) 60 | 61 | return obs_output 62 | 63 | 64 | def theta_estimation_from_y(samples_y, types_list, batch_size, reuse): 65 | theta = [] 66 | 67 | # Independet yd -> Compute p(xd|yd) 68 | for i, d in enumerate(samples_y): 69 | 70 | observed_y = samples_y[i] 71 | nObs = tf.shape(observed_y)[0] 72 | 73 | # Different layer models for each type of variable 74 | if types_list[i]['type'] == 'real': 75 | params = theta_real(observed_y, types_list, i, reuse) 76 | 
77 | elif types_list[i]['type'] == 'pos': 78 | params = theta_pos(observed_y, types_list, i, reuse) 79 | 80 | elif types_list[i]['type'] == 'count': 81 | params = theta_count(observed_y, types_list, i, reuse) 82 | 83 | elif types_list[i]['type'] == 'cat': 84 | params = theta_cat(observed_y, types_list, batch_size, i, reuse) 85 | 86 | elif types_list[i]['type'] == 'ordinal': 87 | params = theta_ordinal(observed_y, types_list, i, reuse) 88 | 89 | theta.append(params) 90 | 91 | return theta 92 | 93 | 94 | def theta_real(observed_y, types_list, i, reuse): 95 | # Mean layer (To DO) 96 | h2_mean = observed_data_layer(observed_y, output_dim=types_list[i]['dim'], name='layer_h2' + str(i), reuse=reuse) 97 | # Sigma Layer (To DO) 98 | h2_sigma = observed_data_layer(observed_y, output_dim=types_list[i]['dim'], name='layer_h2_sigma' + str(i), 99 | reuse=reuse) 100 | 101 | return [h2_mean, h2_sigma] 102 | 103 | 104 | def theta_pos(observed_y, types_list, i, reuse): 105 | # Mean layer 106 | h2_mean = observed_data_layer(observed_y, output_dim=types_list[i]['dim'], name='layer_h2' + str(i), reuse=reuse) 107 | # Sigma Layer 108 | h2_sigma = observed_data_layer(observed_y, output_dim=types_list[i]['dim'], name='layer_h2_sigma' + str(i), 109 | reuse=reuse) 110 | 111 | return [h2_mean, h2_sigma] 112 | 113 | 114 | def theta_count(observed_y, types_list, i, reuse): 115 | # Lambda Layer 116 | h2_lambda = observed_data_layer(observed_y, output_dim=types_list[i]['dim'], name='layer_h2' + str(i), reuse=reuse) 117 | 118 | return h2_lambda 119 | 120 | 121 | def theta_cat(observed_y, types_list, batch_size, i, reuse): 122 | # Log pi layer, with zeros in the first value to avoid the identificability problem 123 | h2_log_pi_partial = observed_data_layer(observed_y, output_dim=int(types_list[i]['dim']) - 1, 124 | name='layer_h2' + str(i), reuse=reuse) 125 | h2_log_pi = tf.concat([tf.zeros([batch_size, 1]), h2_log_pi_partial], 1) 126 | 127 | return h2_log_pi 128 | 129 | 130 | def theta_ordinal(observed_y, types_list, i, reuse): 131 | # Theta layer, Dimension of ordinal - 1 132 | h2_theta = observed_data_layer(observed_y, output_dim=int(types_list[i]['dim']) - 1, name='layer_h2' + str(i), 133 | reuse=reuse) 134 | # Mean layer, a single value 135 | h2_mean = observed_data_layer(observed_y, output_dim=1, name='layer_h2_sigma' + str(i), reuse=reuse) 136 | 137 | return [h2_theta, h2_mean] 138 | 139 | 140 | def decoder_test_time(samples_z, z_dim, y_dim, y_dim_partition, batch_size, types_list): 141 | samples = dict.fromkeys(['s', 'z', 'y', 'x'], []) 142 | 143 | samples['z'] = samples_z 144 | 145 | # Create deterministic layer y 146 | samples['y'] = tf.layers.dense(inputs=samples_z, units=y_dim, activation=None, 147 | kernel_initializer=tf.random_normal_initializer(stddev=0.05), name='layer_h1_', 148 | reuse=True) 149 | 150 | grouped_samples_y = y_partition(samples['y'], types_list, y_dim_partition) 151 | 152 | # Compute the parameters h_y 153 | theta = theta_estimation_from_y(grouped_samples_y, types_list, batch_size, reuse=True) 154 | 155 | return theta, samples -------------------------------------------------------------------------------- /code/Encoder.py: -------------------------------------------------------------------------------- 1 | 2 | import tensorflow as tf 3 | 4 | def encoder(X_list, batch_size, z_dim, s_dim, tau): 5 | 6 | samples = dict.fromkeys(['s', 'z', 'y', 'x'], []) 7 | q_params = dict() 8 | X = tf.concat(X_list, 1) 9 | 10 | # Create the proposal of q(s|x^o): categorical(x^~) 11 | samples['s'], 
q_params['s'] = s_proposal_multinomial(X, batch_size, s_dim, tau, reuse=None) 12 | 13 | # Create the proposal of q(z|s,x^o): N(mu(x^~,s), SIGMA(x^~,s))??? 14 | samples['z'], q_params['z'] = z_proposal_GMM_factorized(X_list, samples['s'], batch_size, z_dim, reuse=None) 15 | 16 | return samples, q_params 17 | 18 | 19 | def encoder_c(X_list, X_list_c, batch_size, z_dim, s_dim, tau): 20 | 21 | samples = dict.fromkeys(['s', 'z', 'y', 'x'], []) 22 | q_params = dict() 23 | X = tf.concat(X_list, 1) 24 | X_c = tf.concat(X_list_c, 1) 25 | 26 | # Create the proposal of q(s|x^o): categorical(x^~) 27 | samples['s'], q_params['s'] = s_proposal_multinomial_c(X, X_c, batch_size, s_dim, tau, reuse=None) 28 | 29 | # Create the proposal of q(z|s,x^o): N(mu(x^~,s), SIGMA(x^~,s))??? 30 | samples['z'], q_params['z'] = z_proposal_GMM_factorized_c(X_list, X_c, samples['s'], batch_size, z_dim, reuse=None) 31 | 32 | return samples, q_params 33 | 34 | 35 | def encoder_vae(X_list, X_list_c, batch_size, z_dim, s_dim, tau): 36 | 37 | samples = dict.fromkeys(['s', 'z', 'y', 'x'], []) 38 | q_params = dict() 39 | X = tf.concat(X_list, 1) 40 | X_c = tf.concat(X_list_c, 1) 41 | 42 | # Create the proposal of q(s|x^o): categorical(x^~) 43 | samples['s'], q_params['s'] = s_proposal_multinomial_c(X, X_c, batch_size, s_dim, tau, reuse=None) 44 | 45 | # Create the proposal of q(z|s,x^o): N(mu(x^~,s), SIGMA(x^~,s))??? 46 | samples['z'], q_params['z'] = z_proposal_GMM_factorized_c(X_list, X_c, samples['s'], batch_size, z_dim, reuse=None) 47 | 48 | return samples, q_params 49 | 50 | 51 | 52 | def z_proposal_GMM_factorized(X, samples_s, batch_size, z_dim, reuse): 53 | mean_qz = [] 54 | log_var_qz = [] 55 | 56 | for i, d in enumerate(X): 57 | observed_data = d 58 | observed_s = samples_s 59 | 60 | # Mean layer 61 | aux_m_qz = tf.layers.dense(inputs=tf.concat([observed_data, observed_s], 1), units=z_dim, activation=None, 62 | kernel_initializer=tf.random_normal_initializer(stddev=0.05), 63 | name='layer_1_' + 'mean_enc_z' + str(i), reuse=reuse) 64 | 65 | 66 | # Logvar layers 67 | aux_lv_qz = tf.layers.dense(inputs=tf.concat([observed_data, observed_s], 1), units=z_dim, activation=None, 68 | kernel_initializer=tf.random_normal_initializer(stddev=0.05), 69 | name='layer_1_' + 'logvar_enc_z' + str(i), reuse=reuse) 70 | 71 | mean_qz.append(aux_m_qz) 72 | log_var_qz.append(aux_lv_qz) 73 | 74 | # Input prior 75 | log_var_qz.append(tf.zeros([batch_size, z_dim])) 76 | mean_qz.append(tf.zeros([batch_size, z_dim])) 77 | 78 | # Compute full parameters, as a product of Gaussians distribution 79 | log_var_qz_joint = -tf.reduce_logsumexp(tf.negative(log_var_qz), 0) 80 | mean_qz_joint = tf.multiply(tf.exp(log_var_qz_joint), 81 | tf.reduce_sum(tf.multiply(mean_qz, tf.exp(tf.negative(log_var_qz))), 0)) 82 | 83 | # Avoid numerical problems 84 | # log_var_qz = tf.clip_by_value(log_var_qz, -15.0, 15.0) 85 | # Rep-trick 86 | eps = tf.random_normal((batch_size, z_dim), 0, 1, dtype=tf.float32) 87 | samples_z = mean_qz_joint + tf.multiply(tf.exp(log_var_qz_joint / 2), eps) 88 | 89 | return samples_z, [mean_qz_joint, log_var_qz_joint] 90 | 91 | 92 | def z_proposal_GMM_factorized_c(X, X_c, samples_s, batch_size, z_dim, reuse): 93 | mean_qz = [] 94 | log_var_qz = [] 95 | 96 | for i, d in enumerate(X): 97 | observed_data = d 98 | observed_s = samples_s 99 | 100 | # Mean layer 101 | aux_m_qz = tf.layers.dense(inputs=tf.concat([observed_data, observed_s, X_c], 1), units=z_dim, activation=None, 102 | kernel_initializer=tf.random_normal_initializer(stddev=0.05), 103 
| name='layer_1_' + 'mean_enc_z' + str(i), reuse=reuse) 104 | 105 | # Logvar layers 106 | aux_lv_qz = tf.layers.dense(inputs=tf.concat([observed_data, observed_s, X_c], 1), units=z_dim, activation=None, 107 | kernel_initializer=tf.random_normal_initializer(stddev=0.05), 108 | name='layer_1_' + 'logvar_enc_z' + str(i), reuse=reuse) 109 | 110 | mean_qz.append(aux_m_qz) 111 | log_var_qz.append(aux_lv_qz) 112 | 113 | # Input prior 114 | log_var_qz.append(tf.zeros([batch_size, z_dim])) 115 | mean_qz.append(tf.zeros([batch_size, z_dim])) 116 | 117 | # Compute full parameters, as a product of Gaussians distribution 118 | log_var_qz_joint = -tf.reduce_logsumexp(tf.negative(log_var_qz), 0) 119 | mean_qz_joint = tf.multiply(tf.exp(log_var_qz_joint), 120 | tf.reduce_sum(tf.multiply(mean_qz, tf.exp(tf.negative(log_var_qz))), 0)) 121 | 122 | # Avoid numerical problems 123 | # log_var_qz = tf.clip_by_value(log_var_qz, -15.0, 15.0) 124 | # Rep-trick 125 | eps = tf.random_normal((batch_size, z_dim), 0, 1, dtype=tf.float32) 126 | samples_z = mean_qz_joint + tf.multiply(tf.exp(log_var_qz_joint / 2), eps) 127 | 128 | return samples_z, [mean_qz_joint, log_var_qz_joint] 129 | 130 | 131 | def z_proposal_distribution_GMM(x_list, x_list_c, samples_s, z_dim, reuse): 132 | # We propose a GMM for z 133 | 134 | x = tf.concat(x_list, 1) 135 | x_c = tf.concat(x_list_c, 1) 136 | 137 | h1 = tf.layers.dense(inputs=tf.concat([x, samples_s, x_c], 1), units=z_dim, activation=tf.nn.relu, 138 | kernel_initializer=tf.random_normal_initializer(stddev=0.05), 139 | name='layer_1_enc', reuse=reuse) 140 | 141 | # Mean layer 142 | aux_m_qz = tf.layers.dense(inputs=h1, units=z_dim, activation=None, 143 | kernel_initializer=tf.random_normal_initializer(stddev=0.05), 144 | name='layer_2_' + 'mean_enc_z', reuse=reuse) 145 | 146 | # Logvar layers 147 | aux_lv_qz = tf.layers.dense(inputs=h1, units=z_dim, activation=None, 148 | kernel_initializer=tf.random_normal_initializer(stddev=0.05), 149 | name='layer_2_' + 'logvar_enc_z', reuse=reuse) 150 | 151 | # Input prior 152 | 153 | log_var_qz.append(tf.zeros([batch_size, z_dim])) 154 | mean_qz.append(tf.zeros([batch_size, z_dim])) 155 | 156 | # Compute full parameters, as a product of Gaussians distribution 157 | log_var_qz_joint = -tf.reduce_logsumexp(tf.negative(log_var_qz), 0) 158 | mean_qz_joint = tf.multiply(tf.exp(log_var_qz_joint),tf.reduce_sum(tf.multiply(mean_qz, tf.exp(tf.negative(log_var_qz))), 0)) 159 | 160 | # Avoid numerical problems 161 | # log_var_qz = tf.clip_by_value(log_var_qz, -15.0, 15.0) 162 | # Rep-trick 163 | eps = tf.random_normal((batch_size, z_dim), 0, 1, dtype=tf.float32) 164 | samples_z = mean_qz_joint + tf.multiply(tf.exp(log_var_qz_joint / 2), eps) 165 | 166 | return mean_pz, log_var_pz 167 | 168 | 169 | 170 | 171 | def s_proposal_multinomial(X, batch_size, s_dim, tau, reuse): 172 | # Categorical(\pi(x^~)) 173 | # We propose a categorical distribution to create a GMM for the latent space z 174 | log_pi = tf.layers.dense(inputs=X, units=s_dim, activation=None, 175 | kernel_initializer=tf.random_normal_initializer(stddev=0.05), name='layer_1_' + 'enc_s', 176 | reuse=reuse) 177 | 178 | # Gumbel-softmax trick (tau is temperature parameter) 179 | U = -tf.log(-tf.log(tf.random_uniform([batch_size, s_dim]))) 180 | samples_s = tf.nn.softmax((log_pi + U) / tau) 181 | 182 | return samples_s, log_pi 183 | 184 | 185 | def s_proposal_multinomial_c(X, X_c, batch_size, s_dim, tau, reuse): 186 | # Categorical(\pi(x^~)) 187 | # We propose a categorical distribution to create a GMM 
for the latent space z 188 | log_pi = tf.layers.dense(inputs=tf.concat([X, X_c], 1), units=s_dim, activation=None, 189 | kernel_initializer=tf.random_normal_initializer(stddev=0.05), name='layer_1_' + 'enc_s', 190 | reuse=reuse) 191 | 192 | # Gumbel-softmax trick (tau is temperature parameter) 193 | U = -tf.log(-tf.log(tf.random_uniform([batch_size, s_dim]))) 194 | samples_s = tf.nn.softmax((log_pi + U) / tau) 195 | 196 | return samples_s, log_pi 197 | 198 | 199 | 200 | def z_distribution_GMM(samples_s, z_dim, reuse): 201 | # We propose a GMM for z 202 | mean_pz = tf.layers.dense(inputs=samples_s, units=z_dim, activation=None, 203 | kernel_initializer=tf.random_normal_initializer(stddev=0.05), 204 | name='layer_1_' + 'mean_dec_z', reuse=reuse) 205 | 206 | log_var_pz = tf.zeros([tf.shape(samples_s)[0], z_dim]) 207 | 208 | # Avoid numerical problems 209 | log_var_pz = tf.clip_by_value(log_var_pz, -15.0, 15.0) 210 | 211 | return mean_pz, log_var_pz 212 | 213 | -------------------------------------------------------------------------------- /code/Evaluation.py: -------------------------------------------------------------------------------- 1 | 2 | import tensorflow as tf 3 | import Loglik 4 | 5 | def loglik_evaluation(batch_data_list, types_list, theta, normalization_params, reuse): 6 | 7 | log_p_x = [] 8 | samples_x = [] 9 | params_x = [] 10 | 11 | # Independet yd -> Compute log(p(xd|yd)) 12 | # batch data list is a list of tensors with different dimensions depending on data type 13 | 14 | for i, d in enumerate(batch_data_list): 15 | 16 | # Select the likelihood for the types of variables 17 | # For that we need to import 'loglik_models_missing_normalize' as function 18 | loglik_function = getattr(Loglik, 'loglik_' + types_list[i]['type']) 19 | 20 | out = loglik_function(d, types_list[i], theta[i], normalization_params[i], 21 | kernel_initializer=tf.random_normal_initializer(stddev=0.05), name='layer_1_mean_dec_x' + str(i), reuse=reuse) 22 | 23 | log_p_x.append(out['log_p_x']) 24 | samples_x.append(out['samples']) 25 | params_x.append(out['params']) 26 | 27 | return log_p_x, samples_x, params_x 28 | 29 | 30 | 31 | def loglik_evaluation_test(batch_data_list, theta, normalization_params, list_type): 32 | 33 | samples_x_perturbed = [] 34 | params_x_perturbed = [] 35 | 36 | # batch data list is a list of tensors with different dimensions depending on data type 37 | # needed here for loop; nothing else! 
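    # The likelihood model is looked up by name from Loglik.py: e.g. a feature typed
    # 'real' dispatches to Loglik.loglik_test_real and 'cat' to Loglik.loglik_test_cat,
    # mirroring the 'loglik_<type>' getattr pattern used in loglik_evaluation above.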
38 | 39 | for i, d in enumerate(batch_data_list): 40 | 41 | # Select the likelihood for the types of variables 42 | # For that we need to import 'loglik_models_missing_normalize' as function 43 | loglik_function = getattr(Loglik, 'loglik_test_' + list_type[i]['type']) 44 | 45 | out = loglik_function(theta[i], normalization_params[i], list_type[i]) 46 | 47 | samples_x_perturbed.append(out['samples']) 48 | params_x_perturbed.append(out['params']) 49 | 50 | return samples_x_perturbed, params_x_perturbed 51 | 52 | 53 | 54 | 55 | def cost_function(log_p_x, p_params, q_params, types_list, z_dim, y_dim, s_dim): 56 | # KL(q(s|x)|p(s)) 57 | log_pi = q_params['s'] 58 | pi_param = tf.nn.softmax(log_pi) 59 | KL_s = -tf.nn.softmax_cross_entropy_with_logits(logits=log_pi, labels=pi_param) + tf.log(float(s_dim)) 60 | 61 | # KL(q(z|s,x)|p(z|s)) 62 | mean_pz, log_var_pz = p_params['z'] 63 | mean_qz, log_var_qz = q_params['z'] 64 | KL_z = -0.5 * z_dim + 0.5 * tf.reduce_sum( 65 | tf.exp(log_var_qz - log_var_pz) + tf.square(mean_pz - mean_qz) / tf.exp(log_var_pz) - log_var_qz + log_var_pz, 66 | 1) 67 | 68 | # Eq[log_p(x|y)] 69 | loss_reconstruction = tf.reduce_sum(log_p_x, 0) 70 | 71 | # Complete ELBO 72 | #ELBO = tf.reduce_mean(loss_reconstruction - KL_z - KL_s, 0) 73 | ELBO = tf.reduce_mean(1.20*loss_reconstruction - (KL_z + KL_s), 0) 74 | 75 | return ELBO, loss_reconstruction, KL_z, KL_s 76 | 77 | 78 | def kl_z_diff(p_params, q_params, degree_active, batch_size, z_dim): 79 | # method to check whether one is within the polarized regime 80 | 81 | # parameters 82 | mean_pz, log_var_pz = p_params['z'] 83 | mean_qz, log_var_qz = q_params['z'] 84 | 85 | ones = tf.ones([batch_size, z_dim]) 86 | 87 | # index according to global importance 88 | index = tf.greater(degree_active*ones, tf.reduce_mean(tf.exp(log_var_qz), 0)) 89 | 90 | mean_qz_approx = tf.reshape(tf.boolean_mask(mean_qz, index), [batch_size, -1]) 91 | mean_pz_approx = tf.reshape(tf.boolean_mask(mean_pz, index), [batch_size, -1]) 92 | log_var_qz_approx = tf.reshape(tf.boolean_mask(log_var_qz, index), [batch_size, -1]) 93 | log_var_pz_approx = tf.reshape(tf.boolean_mask(log_var_pz, index), [batch_size, -1]) 94 | 95 | kl_approx = tf.reduce_mean(tf.reduce_sum(tf.exp(log_var_qz_approx - log_var_pz_approx) + tf.square(mean_pz_approx - mean_qz_approx) / tf.exp(log_var_pz_approx) - log_var_qz_approx + log_var_pz_approx, 1), 0) 96 | kl = tf.reduce_mean(tf.reduce_sum(tf.exp(log_var_qz - log_var_pz) + tf.square(mean_pz - mean_qz) / tf.exp(log_var_pz) - log_var_qz + log_var_pz, 1), 0) 97 | 98 | delta_kl = tf.divide(tf.abs(tf.subtract(kl_approx, kl)), kl) 99 | 100 | return [delta_kl, kl_approx, kl, index] 101 | 102 | -------------------------------------------------------------------------------- /code/Generator.py: -------------------------------------------------------------------------------- 1 | 2 | import tensorflow as tf 3 | import Encoder 4 | import Decoder 5 | import Evaluation 6 | 7 | def samples_generator(batch_data_list, X_list, types_list, batch_size, z_dim, y_dim, y_dim_partition, s_dim, tau, normalization_params): 8 | 9 | samples_test = dict.fromkeys(['s' ,'z' ,'y' ,'x'] ,[]) 10 | test_params = dict() 11 | X = tf.concat(X_list ,1) 12 | 13 | # Create the proposal of q(s|x^o) 14 | _, params = Encoder.s_proposal_multinomial(X, batch_size, s_dim, tau, reuse=True) 15 | samples_test['s'] = tf.one_hot(tf.argmax(params ,1) ,depth=s_dim) 16 | 17 | # Create the proposal of q(z|s,x^o) 18 | _, params = Encoder.z_proposal_GMM_factorized(X_list, samples_test['s'], 
batch_size, z_dim, reuse=True) 19 | samples_test['z'] = params[0] 20 | 21 | # Create deterministic layer y 22 | samples_test['y'] = tf.layers.dense(inputs=samples_test['z'], 23 | units=y_dim, 24 | activation=None, 25 | kernel_initializer=tf.random_normal_initializer(stddev=0.05), 26 | trainable=True, 27 | name= 'layer_h1_', reuse=True) 28 | 29 | grouped_samples_y = Decoder.y_partition(samples_test['y'], types_list, y_dim_partition) 30 | 31 | # Compute the parameters h_y 32 | theta = Decoder.theta_estimation_from_y(grouped_samples_y, types_list, batch_size, reuse=True) 33 | 34 | # Compute loglik and output of the VAE 35 | log_p_x, samples_test['x'], test_params['x'] = Evaluation.loglik_evaluation(batch_data_list, 36 | types_list, 37 | theta, 38 | normalization_params, 39 | reuse=True) 40 | 41 | return samples_test, test_params, log_p_x, theta 42 | 43 | 44 | 45 | def samples_generator_c(batch_data_list, X_list, X_list_c, types_list, batch_size, z_dim, y_dim, y_dim_partition, s_dim, tau, normalization_params): 46 | 47 | samples_test = dict.fromkeys(['s' ,'z' ,'y' ,'x'] ,[]) 48 | test_params = dict() 49 | X = tf.concat(X_list ,1) 50 | X_c = tf.concat(X_list_c, 1) 51 | 52 | # Create the proposal of q(s|x^o) 53 | _, params = Encoder.s_proposal_multinomial_c(X, X_c, batch_size, s_dim, tau, reuse=True) 54 | samples_test['s'] = tf.one_hot(tf.argmax(params, 1), depth=s_dim) 55 | 56 | # Create the proposal of q(z|s,x^o) 57 | _, params = Encoder.z_proposal_GMM_factorized_c(X_list, X_c, samples_test['s'], batch_size, z_dim, reuse=True) 58 | samples_test['z'] = params[0] 59 | 60 | # Create deterministic layer y 61 | samples_test['y'] = tf.layers.dense(inputs=samples_test['z'], 62 | units=y_dim, 63 | activation=None, 64 | kernel_initializer=tf.random_normal_initializer(stddev=0.05), 65 | trainable=True, 66 | name='layer_h1_', reuse=True) 67 | 68 | grouped_samples_y = Decoder.y_partition(samples_test['y'], types_list, y_dim_partition) 69 | 70 | # Compute the parameters h_y 71 | theta = Decoder.theta_estimation_from_y(grouped_samples_y, types_list, batch_size, reuse=True) 72 | 73 | # Compute loglik and output of the VAE 74 | log_p_x, samples_test['x'], test_params['x'] = Evaluation.loglik_evaluation(batch_data_list, 75 | types_list, 76 | theta, 77 | normalization_params, 78 | reuse=True) 79 | 80 | return samples_test, test_params, log_p_x, theta 81 | 82 | 83 | 84 | 85 | def samples_perturbation_z(batch_data_list, X_list, types_list, z_dim, y_dim, y_dim_partition, s_dim, tau, 86 | normalization_params, nsamples, batch_size, p, l, h): 87 | # I ended up not using this one 88 | # should be: batch_size size = nsamples 89 | 90 | samples_test = dict.fromkeys(['s', 'z', 'y_tilde', 'z_tilde', 'x_tilde'], []) 91 | test_params = dict() 92 | X = tf.concat(X_list, 1) 93 | 94 | # -----------------------------------------------------------------------------------# 95 | # Encoder: Test Time 96 | 97 | # Create the proposal of q(s|x^o) 98 | _, params = Encoder.s_proposal_multinomial(X, batch_size, s_dim, tau, reuse=True) 99 | samples_test['s'] = tf.one_hot(tf.argmax(params, 1), depth=s_dim) 100 | 101 | # Create the proposal of q(z|s,x^o) 102 | _, params = Encoder.z_proposal_GMM_factorized(X_list, samples_test['s'], batch_size, z_dim, reuse=True) 103 | samples_test['z'] = params[0] 104 | 105 | # -----------------------------------------------------------------------------------# 106 | # counterfactual step 107 | 108 | # z = samples_test['z'] 109 | delta_z = tf.random_normal((nsamples, z_dim), 0, 1, 110 | dtype=tf.float32) # 
http://mathworld.wolfram.com/HyperspherePointPicking.html 111 | d = tf.add(tf.multiply(tf.random_uniform((nsamples, 1), 0, 1, dtype=tf.float32), (h - l)), l) # length range [l, h) 112 | norm_p = tf.norm(delta_z, ord=p, axis=1) 113 | norm_p = tf.reshape(norm_p, [-1, 1]) # right format 114 | d_norm = tf.div(d, norm_p) # rescale/normalize factor 115 | delta_z = tf.multiply(delta_z, d_norm) # shape: (nsamples x z_dim) 116 | 117 | # -----------------------------------------------------------------------------------# 118 | # Decoder: Test Time 119 | 120 | # during counterfactual search 121 | z_tilde = tf.add(samples_test['z'], delta_z) # gives (nsamples x z_dim) vector 122 | samples_test['z_tilde'] = tf.reshape(z_tilde, [-1, z_dim]) # use reshape to avoid rank error 123 | 124 | # Create deterministic layer y 125 | samples_test['y_tilde'] = tf.layers.dense(inputs=samples_test['z_tilde'], 126 | units=y_dim, 127 | activation=None, 128 | kernel_initializer=tf.random_normal_initializer(stddev=0.05), 129 | trainable=True, 130 | name='layer_h1_', reuse=True) 131 | 132 | grouped_samples_y = Decoder.y_partition(samples_test['y_tilde'], types_list, y_dim_partition) 133 | 134 | # Compute the parameters h_y 135 | theta = Decoder.theta_estimation_from_y(grouped_samples_y, types_list, batch_size, reuse=True) 136 | 137 | # Compute loglik and output of the VAE 138 | log_p_x, samples_test['x_tilde'], test_params['x'] = Evaluation.loglik_evaluation(batch_data_list, 139 | types_list, 140 | theta, 141 | normalization_params, 142 | reuse=True) 143 | 144 | return samples_test, delta_z, d, theta -------------------------------------------------------------------------------- /code/Graph.py: -------------------------------------------------------------------------------- 1 | 2 | import tensorflow as tf 3 | import numpy as np 4 | import Helpers 5 | import Encoder 6 | import Decoder 7 | import Evaluation 8 | import Generator 9 | 10 | # MASTER of Disaster 11 | 12 | def C_HVAE_graph(types_file, learning_rate=1e-4, z_dim=1, y_dim=1, s_dim=1, y_dim_partition=[], nsamples=1000, p=2): 13 | 14 | # -----------------------------------------------------------------------------------# 15 | # Preliminaries 16 | 17 | # Load remaining placeholders 18 | print('[*] Defining placeholders') 19 | 20 | 21 | # Placeholder for batch_size (required for counterfactual search loop) 22 | batch_size = tf.placeholder(dtype=tf.int32) 23 | # Placeholder for Gumbel-softmax parameter 24 | tau = tf.placeholder(tf.float32, shape=()) 25 | batch_data_list, types_list = Helpers.place_holder_types(types_file, batch_size) 26 | 27 | # Batch normalization of the data 28 | X_list, normalization_params, X_list_noisy = Helpers.batch_normalization(batch_data_list, types_list, batch_size) 29 | 30 | 31 | # Set dimensionality of Y 32 | if y_dim_partition: 33 | y_dim_output = np.sum(y_dim_partition) 34 | else: 35 | y_dim_partition = y_dim * np.ones(len(types_list), dtype=int) 36 | y_dim_output = np.sum(y_dim_partition) 37 | 38 | # -----------------------------------------------------------------------------------# 39 | # (HVAE) Encoder and Decoder for training time 40 | 41 | # Encoder 42 | print('[*] Defining Encoder...') 43 | samples, q_params = Encoder.encoder(X_list_noisy, batch_size, z_dim, s_dim, tau) 44 | 45 | samples_s = samples['s'] 46 | samples_z = samples['z'] 47 | p_params = dict() 48 | 49 | # Create the distribution of p(z|s) 50 | p_params['z'] = Encoder.z_distribution_GMM(samples['s'], z_dim, reuse=None) 51 | 52 | # Decoder 53 | print('[*] Defining 
Decoder...') 54 | theta, samples, gradient_decoder = Decoder.decoder(samples_z, z_dim, y_dim_output, y_dim_partition, batch_size, types_list) 55 | 56 | samples['s'] = samples_s 57 | # Compute loglik and output of the VAE 58 | log_p_x, samples['x'], p_params['x'] = Evaluation.loglik_evaluation(batch_data_list, 59 | types_list, 60 | theta, 61 | normalization_params, 62 | reuse=None) 63 | 64 | # Evaluate active vs passive variables 65 | degree_active = 0.95# must be less than 1 (not used in paper) 66 | delta_kl = Evaluation.kl_z_diff(p_params, q_params, degree_active, batch_size, z_dim) 67 | 68 | 69 | # -----------------------------------------------------------------------------------# 70 | # optimize ELBO 71 | 72 | print('[*] Defining Cost function...') 73 | ELBO, loss_reconstruction, KL_z, KL_s = Evaluation.cost_function(log_p_x, 74 | p_params, 75 | q_params, 76 | types_list, 77 | z_dim, 78 | y_dim_output, 79 | s_dim) 80 | 81 | optim = tf.train.AdamOptimizer(learning_rate).minimize(-ELBO) 82 | 83 | # -----------------------------------------------------------------------------------# 84 | # Generator function for test time sample generation 85 | samples_test, test_params, log_p_x_test, theta_test = Generator.samples_generator(batch_data_list, 86 | X_list, 87 | types_list, 88 | batch_size, 89 | z_dim, 90 | y_dim_output, 91 | y_dim_partition, 92 | s_dim, 93 | tau, 94 | normalization_params) 95 | 96 | # -----------------------------------------------------------------------------------# 97 | # Decoder for test time counterfactuals 98 | # 'samples_perturbed': does not contain 'x' samples 99 | 100 | print('[*] Defining Test Time Decoder...') 101 | theta_perturbed, samples_perturbed = Decoder.decoder_test_time(samples_z, 102 | z_dim, 103 | y_dim_output, 104 | y_dim_partition, 105 | batch_size, 106 | types_list) 107 | 108 | # Evaluation Function not necessary here 109 | '''log_p_x, samples_perturbed['x'], p_params_x_perturbed = Evaluation.loglik_evaluation(batch_data_list, 110 | types_list, 111 | theta_perturbed, 112 | normalization_params, 113 | reuse=True)''' 114 | 115 | # -----------------------------------------------------------------------------------# 116 | # Packing results 117 | 118 | tf_nodes = {'batch_size': batch_size,#feed 119 | 'ground_batch': batch_data_list,#feed 120 | 'tau_GS': tau,#feed, 121 | #'predict_proba': predict_proba,#feed 122 | 'samples_z': samples_z,#feed 123 | 'samples': samples, 124 | 'log_p_x': log_p_x, 125 | 'loss_re': loss_reconstruction, 126 | 'loss': -ELBO, 127 | 'optim': optim, 128 | 'KL_s': KL_s, 129 | 'KL_z': KL_z, 130 | 'X': X_list, 131 | 'p_params': p_params, 132 | 'q_params': q_params, 133 | 'samples_test': samples_test, 134 | 'test_params': test_params, 135 | 'log_p_x_test': log_p_x_test, 136 | 'samples_perturbed': samples_perturbed, 137 | 'theta_test': theta_test, 138 | 'theta_perturbed': theta_perturbed, 139 | 'normalization_params': normalization_params, 140 | 'gradient_decoder': gradient_decoder, 141 | 'delta_kl': delta_kl} 142 | 143 | return tf_nodes 144 | 145 | 146 | # MASTER of Disaster for conditional density approximations 147 | 148 | def C_CHVAE_graph(types_file, types_file_c, learning_rate=1e-3, z_dim=1, y_dim=1, s_dim=1, y_dim_partition=[], nsamples=1000, p=2, degree_active=0.95): 149 | 150 | # -----------------------------------------------------------------------------------# 151 | # Preliminaries 152 | 153 | # Load placeholders 154 | print('[*] Defining placeholders') 155 | 156 | # c: short for 'conditional' 157 | # Placeholder for 
batch_size (required for counterfactual search loop) 158 | batch_size = tf.placeholder(dtype=tf.int32) 159 | # Placeholder for Gumbel-softmax parameter 160 | tau = tf.placeholder(tf.float32, shape=()) 161 | batch_data_list, types_list = Helpers.place_holder_types(types_file, batch_size) 162 | batch_data_list_c, types_list_c = Helpers.place_holder_types(types_file_c, batch_size) 163 | 164 | 165 | # Batch normalization of the data 166 | X_list, normalization_params, X_list_noisy = Helpers.batch_normalization(batch_data_list, types_list, batch_size) 167 | # Batch normalization of the data 168 | X_list_c, _, X_list_noisy_c = Helpers.batch_normalization(batch_data_list_c, types_list, batch_size) 169 | 170 | 171 | # Set dimensionality of Y 172 | if y_dim_partition: 173 | y_dim_output = np.sum(y_dim_partition) 174 | else: 175 | y_dim_partition = y_dim * np.ones(len(types_list), dtype=int) 176 | y_dim_output = np.sum(y_dim_partition) 177 | 178 | # -----------------------------------------------------------------------------------# 179 | # (HVAE) Encoder and Decoder for training time 180 | 181 | # Encoder 182 | print('[*] Defining Encoder...') 183 | samples, q_params = Encoder.encoder_c(X_list, X_list_c, batch_size, z_dim, s_dim, tau) 184 | 185 | samples_s = samples['s'] 186 | samples_z = samples['z'] 187 | p_params = dict() 188 | 189 | # Create the distribution of p(z|s) 190 | p_params['z'] = Encoder.z_distribution_GMM(samples['s'], z_dim, reuse=None) 191 | 192 | # Decoder 193 | print('[*] Defining Decoder...') 194 | theta, samples, gradient_decoder = Decoder.decoder(samples_z, z_dim, y_dim_output, y_dim_partition, batch_size, types_list) 195 | 196 | samples['s'] = samples_s 197 | # Compute loglik and output of the VAE 198 | log_p_x, samples['x'], p_params['x'] = Evaluation.loglik_evaluation(batch_data_list, 199 | types_list, 200 | theta, 201 | normalization_params, 202 | reuse=None) 203 | 204 | # -----------------------------------------------------------------------------------# 205 | # optimize ELBO 206 | 207 | print('[*] Defining Cost function...') 208 | ELBO, loss_reconstruction, KL_z, KL_s = Evaluation.cost_function(log_p_x, 209 | p_params, 210 | q_params, 211 | types_list, 212 | z_dim, 213 | y_dim_output, 214 | s_dim) 215 | 216 | optim = tf.train.AdamOptimizer(learning_rate).minimize(-ELBO) 217 | 218 | # -----------------------------------------------------------------------------------# 219 | # Generator function for test time sample generation 220 | samples_test, test_params, log_p_x_test, theta_test = Generator.samples_generator_c(batch_data_list, 221 | X_list, X_list_c, 222 | types_list, 223 | batch_size, 224 | z_dim, 225 | y_dim_output, 226 | y_dim_partition, 227 | s_dim, 228 | tau, 229 | normalization_params) 230 | 231 | # -----------------------------------------------------------------------------------# 232 | # Decoder for test time counterfactuals 233 | # 'samples_perturbed': does not contain 'x' samples 234 | 235 | print('[*] Defining Test Time Decoder...') 236 | theta_perturbed, samples_perturbed = Decoder.decoder_test_time(samples_z, 237 | z_dim, 238 | y_dim_output, 239 | y_dim_partition, 240 | batch_size, 241 | types_list) 242 | 243 | # Evaluation Function not necessary here 244 | degree_active = degree_active# must be less than 1 245 | delta_kl = Evaluation.kl_z_diff(p_params, q_params, degree_active, batch_size, z_dim) 246 | 247 | # -----------------------------------------------------------------------------------# 248 | # Packing results 249 | 250 | tf_nodes = 
{'batch_size': batch_size, #feed 251 | 'ground_batch': batch_data_list, #feed 252 | 'ground_batch_c': batch_data_list_c, #feed 253 | 'tau_GS': tau, #feed, 254 | 'samples_z': samples_z, #feed 255 | 'samples': samples, 256 | 'log_p_x': log_p_x, 257 | 'loss_re': loss_reconstruction, 258 | 'loss': -ELBO, 259 | 'optim': optim, 260 | 'KL_s': KL_s, 261 | 'KL_z': KL_z, 262 | 'X': X_list, 263 | 'p_params': p_params, 264 | 'q_params': q_params, 265 | 'samples_test': samples_test, 266 | 'test_params': test_params, 267 | 'log_p_x_test': log_p_x_test, 268 | 'samples_perturbed': samples_perturbed, 269 | 'theta_test': theta_test, 270 | 'theta_perturbed': theta_perturbed, 271 | 'normalization_params': normalization_params, 272 | 'gradient_decoder': gradient_decoder, 273 | 'delta_kl': delta_kl} 274 | 275 | return tf_nodes -------------------------------------------------------------------------------- /code/Helpers.py: -------------------------------------------------------------------------------- 1 | 2 | import tensorflow as tf 3 | import numpy as np 4 | from sklearn.metrics import mean_squared_error 5 | from sklearn import preprocessing 6 | from sklearn.model_selection import train_test_split 7 | from sklearn.linear_model import LogisticRegression 8 | from sklearn.ensemble import RandomForestClassifier 9 | from sklearn.svm import SVC 10 | from sklearn.model_selection import GridSearchCV 11 | from sklearn.neighbors import LocalOutlierFactor 12 | from sklearn.decomposition import PCA 13 | from sklearn.preprocessing import StandardScaler 14 | from sklearn.cluster import DBSCAN 15 | from scipy.stats import moment 16 | import csv 17 | import argparse 18 | 19 | 20 | 21 | 22 | # Argument Parser 23 | def getArgs(argv=None): 24 | parser = argparse.ArgumentParser(description='Default parameters of the models', formatter_class=argparse.ArgumentDefaultsHelpFormatter) 25 | parser.add_argument('--batch_size', type=int, default=100, help='Size of the batches') 26 | parser.add_argument('--epochs', type=int, default=80, help='Number of epochs of the simulations') 27 | parser.add_argument('--train', type=int, default=1, help='Training model flag') 28 | parser.add_argument('--display', type=int, default=1, help='Display option flag') 29 | parser.add_argument('--save', type=int, default=1000, help='Save variables every save iterations') 30 | parser.add_argument('--restore', type=int, default=0, help='To restore session, to keep training or evaluation') 31 | parser.add_argument('--dim_latent_s', type=int, default=3, help='Dimension of the categorical space') 32 | parser.add_argument('--dim_latent_z', type=int, default=2, help='Dimension of the Z latent space') 33 | parser.add_argument('--dim_latent_y', type=int, default=5, help='Dimension of the Y latent space') 34 | parser.add_argument('--dim_latent_y_partition', type=int, nargs='+', help='Partition of the Y latent space') 35 | parser.add_argument('--save_file', type=str, default='new_mnist_zdim5_ydim10_4images_', help='Save file name') 36 | parser.add_argument('--data_file', type=str, default='MNIST_data', help='File with the data') 37 | parser.add_argument('--data_file_c', type=str, default='MNIST_data', help='File with the conditioning data') 38 | parser.add_argument('--types_file', type=str, default='mnist_train_types2.csv', help='File with the types of the data') 39 | parser.add_argument('--types_file_c', type=str, default='mnist_train_types2.csv', help='File with the types of the conditioning data') 40 | parser.add_argument('--classifier', type=str, 
default='RLinearR', help='Classification model (RandomForest, SVM or else RLinearR)') 41 | parser.add_argument('--classifier_two', type=str, default='RandomForest', help='Classification model (RandomForest, SVM or else RLinearR)') 42 | parser.add_argument('--norm_latent_space', type=int, default=2, help='To measure distance between latent variables') 43 | parser.add_argument('--step_size', type=float, default=0.5, help='Step size for Random Search') 44 | parser.add_argument('--search_samples', type=int, default=1000, help='Nunber search samples for counterfactual search') 45 | parser.add_argument('--data_y_file', type=str, default='cs_y_training', help='File with the y data') 46 | parser.add_argument('--ncounterfactuals', type=int, default=25, help='First #counterf. test data points for which we find counterf.') 47 | parser.add_argument('--boundary', type=float, default=-0.5, help='Boundary y = def. for simple classifier') 48 | parser.add_argument('--degree_active', type=float, default=1, help='active latent variable threshold') 49 | 50 | return parser.parse_args(argv) 51 | 52 | 53 | def next_batch(data, types_dict, batch_size, index_batch): 54 | 55 | # Create minibath 56 | batch_xs = data[index_batch * batch_size:(index_batch + 1) * batch_size, :] 57 | 58 | # Slipt variables of the batches 59 | data_list = [] 60 | initial_index = 0 61 | for d in types_dict: 62 | dim = int(d['dim']) 63 | data_list.append(batch_xs[:, initial_index:initial_index + dim]) 64 | initial_index += dim 65 | 66 | return data_list 67 | 68 | def next_batch_y(y, batch_size, index_batch): 69 | return y[index_batch * batch_size:(index_batch + 1) * batch_size, :] 70 | 71 | 72 | 73 | 74 | 75 | def samples_concatenation(samples): 76 | for i, batch in enumerate(samples): 77 | if i == 0: 78 | samples_x = np.concatenate(batch['x'], 1) 79 | samples_y = batch['y'] 80 | samples_z = batch['z'] 81 | samples_s = batch['s'] 82 | else: 83 | samples_x = np.concatenate([samples_x, np.concatenate(batch['x'], 1)], 0) 84 | samples_y = np.concatenate([samples_y, batch['y']], 0) 85 | samples_z = np.concatenate([samples_z, batch['z']], 0) 86 | samples_s = np.concatenate([samples_s, batch['s']], 0) 87 | 88 | return samples_s, samples_z, samples_y, samples_x 89 | 90 | 91 | def discrete_variables_transformation(data, types_dict): 92 | ind_ini = 0 93 | output = [] 94 | for d in range(len(types_dict)): 95 | ind_end = ind_ini + int(types_dict[d]['dim']) 96 | if types_dict[d]['type'] == 'cat': 97 | output.append(np.reshape(np.argmax(data[:, ind_ini:ind_end], 1), [-1, 1])) 98 | elif types_dict[d]['type'] == 'ordinal': 99 | output.append(np.reshape(np.sum(data[:, ind_ini:ind_end], 1) - 1, [-1, 1])) 100 | else: 101 | output.append(data[:, ind_ini:ind_end]) 102 | ind_ini = ind_end 103 | 104 | return np.concatenate(output, 1) 105 | 106 | 107 | def read_data(data_file, types_file): 108 | # Read types of data from data file 109 | with open(data_file, 'r') as f: 110 | data = [[float(x) for x in rec] for rec in csv.reader(f, delimiter=',')] 111 | data = np.array(data) 112 | 113 | # Read types of data from data file 114 | with open(types_file) as f: 115 | types_dict = [{k: v for k, v in row.items()} 116 | for row in csv.DictReader(f, skipinitialspace=True)] 117 | 118 | # Construct the data matrices 119 | data_complete = [] 120 | for i in range(np.shape(data)[1]): 121 | 122 | if types_dict[i]['type'] == 'cat': 123 | # Get categories 124 | cat_data = [int(x) for x in data[:, i]] 125 | categories, indexes = np.unique(cat_data, return_inverse=True) 126 | # 
Transform categories to a vector of 0:n_categories 127 | new_categories = np.arange(int(types_dict[i]['dim'])) 128 | cat_data = new_categories[indexes] 129 | # Create one hot encoding for the categories 130 | aux = np.zeros([np.shape(data)[0], len(new_categories)]) 131 | aux[np.arange(np.shape(data)[0]), cat_data] = 1 132 | data_complete.append(aux) 133 | 134 | elif types_dict[i]['type'] == 'ordinal': 135 | # Get categories 136 | cat_data = [int(x) for x in data[:, i]] 137 | categories, indexes = np.unique(cat_data, return_inverse=True) 138 | # Transform categories to a vector of 0:n_categories 139 | new_categories = np.arange(int(types_dict[i]['dim'])) 140 | cat_data = new_categories[indexes] 141 | # Create thermometer encoding for the categories 142 | aux = np.zeros([np.shape(data)[0], 1 + len(new_categories)]) 143 | aux[:, 0] = 1 144 | aux[np.arange(np.shape(data)[0]), 1 + cat_data] = -1 145 | aux = np.cumsum(aux, 1) 146 | data_complete.append(aux[:, :-1]) 147 | 148 | else: 149 | data_complete.append(np.transpose([data[:, i]])) 150 | 151 | n_samples = np.shape(data)[0] 152 | # n_variables = len(types_dict) 153 | 154 | data = np.concatenate(data_complete, 1) 155 | 156 | return data, types_dict, n_samples 157 | 158 | 159 | def p_distribution_params_concatenation(params, types_dict, z_dim, s_dim): 160 | keys = params[0].keys() 161 | out_dict = {key: [] for key in keys} 162 | 163 | for i, batch in enumerate(params): 164 | 165 | for d, k in enumerate(keys): 166 | 167 | if k == 'z' or k == 'y': 168 | if i == 0: 169 | out_dict[k] = batch[k] 170 | else: 171 | out_dict[k] = np.concatenate([out_dict[k], batch[k]], 1) 172 | 173 | elif k == 'x': 174 | if i == 0: 175 | out_dict[k] = batch[k] 176 | else: 177 | for v in range(len(types_dict)): 178 | if types_dict[v]['type'] == 'pos' or types_dict[v]['type'] == 'real': 179 | out_dict[k][v] = np.concatenate([out_dict[k][v], batch[k][v]], 1) 180 | else: 181 | out_dict[k][v] = np.concatenate([out_dict[k][v], batch[k][v]], 0) 182 | 183 | return out_dict 184 | 185 | 186 | def q_distribution_params_concatenation(params, z_dim, s_dim): 187 | keys = params[0].keys() 188 | out_dict = {key: [] for key in keys} 189 | 190 | for i, batch in enumerate(params): 191 | for d, k in enumerate(keys): 192 | out_dict[k].append(batch[k]) 193 | 194 | out_dict['z'] = np.concatenate(out_dict['z'], 1) 195 | out_dict['s'] = np.concatenate(out_dict['s'], 0) 196 | 197 | return out_dict 198 | 199 | 200 | def statistics(loglik_params, types_dict): 201 | loglik_mean = [] 202 | loglik_mode = [] 203 | 204 | for d, attrib in enumerate(loglik_params): 205 | if types_dict[d]['type'] == 'real': 206 | # Normal distribution (mean, sigma) 207 | loglik_mean.append(attrib[0]) 208 | loglik_mode.append(attrib[0]) 209 | # Only for log-normal 210 | elif types_dict[d]['type'] == 'pos': 211 | # Log-normal distribution (mean, sigma) 212 | loglik_mean.append(np.exp(attrib[0] + 0.5 * attrib[1]) - 1.0) 213 | loglik_mode.append(np.exp(attrib[0] - attrib[1]) - 1.0) 214 | elif types_dict[d]['type'] == 'count': 215 | # Poisson distribution (lambda) 216 | loglik_mean.append(attrib) 217 | loglik_mode.append(np.floor(attrib)) 218 | 219 | else: 220 | # Categorical and ordinal (mode imputation for both) 221 | loglik_mean.append(np.reshape(np.argmax(attrib, 1), [-1, 1])) 222 | loglik_mode.append(np.reshape(np.argmax(attrib, 1), [-1, 1])) 223 | 224 | return np.transpose(np.squeeze(loglik_mean)), np.transpose(np.squeeze(loglik_mode)) 225 | 226 | 227 | def error_computation(x_train, x_hat, types_dict): 228 | 
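    # Per-feature reconstruction error, chosen by type:
    #   'cat'     -> mean misclassification rate,
    #   'ordinal' -> mean absolute shift divided by the number of categories,
    #   otherwise -> root mean squared error normalised by the feature's range.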
error_observed = [] 229 | ind_ini = 0 230 | for dd in range(len(types_dict)): 231 | 232 | # Mean classification error 233 | if types_dict[dd]['type'] == 'cat': 234 | ind_end = ind_ini + 1 235 | error_observed.append(np.mean(x_train[:, ind_ini:ind_end] != x_hat[:, ind_ini:ind_end])) 236 | 237 | # Mean "shift" error 238 | elif types_dict[dd]['type'] == 'ordinal': 239 | ind_end = ind_ini + 1 240 | error_observed.append( 241 | np.mean(np.abs(x_train[:, ind_ini:ind_end] - x_hat[:, ind_ini:ind_end])) / int(types_dict[dd]['dim'])) 242 | 243 | # Normalized root mean square error 244 | else: 245 | ind_end = ind_ini + int(types_dict[dd]['dim']) 246 | norm_term = np.max(x_train[:, dd]) - np.min(x_train[:, dd]) 247 | error_observed.append( 248 | np.sqrt(mean_squared_error(x_train[:, ind_ini:ind_end], x_hat[:, ind_ini:ind_end])) / norm_term) 249 | 250 | ind_ini = ind_end 251 | 252 | return error_observed 253 | 254 | 255 | def place_holder_types(types_file, batch_size): 256 | # Read the types of the data from the files 257 | with open(types_file) as f: 258 | types_list = [{k: v for k, v in row.items()} 259 | for row in csv.DictReader(f, skipinitialspace=True)] 260 | 261 | # Create placeholders for every data type, with appropriate dimensions 262 | batch_data_list = [] 263 | for i in range(len(types_list)): 264 | batch_data_list.append(tf.placeholder(tf.float32, shape=(None, types_list[i]['dim']))) 265 | tf.concat(batch_data_list, axis=1) 266 | 267 | return batch_data_list, types_list 268 | 269 | 270 | def batch_normalization(batch_data_list, types_list, batch_size): 271 | normalized_data = [] 272 | normalization_parameters = [] 273 | noisy_data = [] 274 | 275 | for i, d in enumerate(batch_data_list): 276 | 277 | observed_data = d 278 | 279 | if types_list[i]['type'] == 'real': 280 | # We transform the data to a gaussian with mean 0 and std 1 281 | data_mean, data_var = tf.nn.moments(observed_data, 0) 282 | data_var = tf.clip_by_value(data_var, 1e-6, 1e20) # Avoid zero values 283 | aux_X = tf.nn.batch_normalization(observed_data, data_mean, data_var, offset=0.0, scale=1.0, 284 | variance_epsilon=1e-6) 285 | 286 | aux_X_noisy = aux_X + tf.random_normal((batch_size, 1), 0, 0.05, dtype=tf.float32) 287 | 288 | normalized_data.append(aux_X) 289 | noisy_data.append(aux_X_noisy) 290 | normalization_parameters.append([data_mean, data_var]) 291 | 292 | # When using log-normal 293 | elif types_list[i]['type'] == 'pos': 294 | 295 | # We transform the log of the data to a gaussian with mean 0 and std 1 296 | observed_data_log = tf.log(1 + observed_data) 297 | data_mean_log, data_var_log = tf.nn.moments(observed_data_log, 0) 298 | data_var_log = tf.clip_by_value(data_var_log, 1e-6, 1e20) # Avoid zero values 299 | aux_X = tf.nn.batch_normalization(observed_data_log, data_mean_log, data_var_log, offset=0.0, scale=1.0, 300 | variance_epsilon=1e-6) 301 | 302 | normalized_data.append(aux_X) 303 | normalization_parameters.append([data_mean_log, data_var_log]) 304 | 305 | elif types_list[i]['type'] == 'count': 306 | 307 | # We transform the log of the data to a gaussian with mean 0 and std 1 308 | observed_data_log = tf.log(1 + observed_data) 309 | data_mean_log, data_var_log = tf.nn.moments(observed_data_log, 0) 310 | data_var_log = tf.clip_by_value(data_var_log, 1e-6, 1e20) # Avoid zero values 311 | aux_X = tf.nn.batch_normalization(observed_data_log, data_mean_log, data_var_log, offset=0.0, scale=1.0, 312 | variance_epsilon=1e-6) 313 | 314 | normalized_data.append(aux_X) 315 | 
normalization_parameters.append([data_mean_log, data_var_log]) 316 | 317 | 318 | else: 319 | # Don't normalize the categorical and ordinal variables 320 | normalized_data.append(d) 321 | normalization_parameters.append(tf.convert_to_tensor([0.0, 1.0], dtype=tf.float32)) # No normalization here 322 | 323 | aux_X_noisy = d + tf.random_normal((batch_size, 1), 0, 0.05, dtype=tf.float32) 324 | noisy_data.append(aux_X_noisy) 325 | 326 | 327 | return normalized_data, normalization_parameters, noisy_data 328 | 329 | 330 | # normalization function 331 | 332 | def normalization_classification(batch_data_list, types_list): 333 | normalized_data = [] 334 | normalization_parameters = [] 335 | 336 | for i in range(len(types_list)): 337 | 338 | observed_data = batch_data_list[:, i] 339 | 340 | if types_list[i]['type'] == 'real': 341 | # We transform the data to a gaussian with mean 0 and std 1 342 | data_mean = np.mean(observed_data) 343 | data_var = moment(observed_data, 2) 344 | data_var = np.clip(data_var, 1e-6, 1e20) 345 | data_std = np.sqrt(data_var) 346 | aux_X = preprocessing.scale(observed_data) 347 | 348 | normalized_data.append(aux_X) 349 | normalization_parameters.append([data_mean, data_std]) 350 | 351 | # When using log-normal 352 | elif types_list[i]['type'] == 'pos': 353 | # #We transform the log of the data to a gaussian with mean 0 and std 1 354 | observed_data = observed_data 355 | data_mean = np.mean(observed_data) 356 | data_var = moment(observed_data, 2) 357 | data_var = np.clip(data_var, 1e-6, 1e20) # Avoid zero values 358 | data_std = np.sqrt(data_var) 359 | 360 | aux_X = preprocessing.scale(observed_data) 361 | 362 | normalized_data.append(aux_X) 363 | normalization_parameters.append([data_mean, data_std]) 364 | 365 | elif types_list[i]['type'] == 'count': 366 | 367 | # Input log of the data 368 | observed_data = observed_data 369 | data_mean = np.mean(observed_data) 370 | data_var = moment(observed_data, 2) 371 | data_var = np.clip(data_var, 1e-6, 1e20) # Avoid zero values 372 | data_std = np.sqrt(data_var) 373 | 374 | aux_X = preprocessing.scale(observed_data) 375 | 376 | normalized_data.append(aux_X) 377 | normalization_parameters.append([data_mean, data_std]) 378 | 379 | else: 380 | # Don't normalize the categorical and ordinal variables 381 | normalized_data.append(observed_data) 382 | normalization_parameters.append([0.0, 1.0]) # No normalization here 383 | 384 | return normalized_data, normalization_parameters 385 | 386 | 387 | 388 | def replicate_data_list(data_list, num_replications): 389 | # data_list: expected to have 1 row 390 | # num_replications: expected to have #rows = nsamples 391 | new_data_list = [] 392 | 393 | for i in range(len(data_list)): 394 | if i == 0: 395 | new_data_list = [np.repeat(data_list[i], num_replications, axis=0)] 396 | else: 397 | new_data_list.append(np.repeat(data_list[i], num_replications, axis=0)) 398 | 399 | return new_data_list 400 | 401 | 402 | # stylised classifier 403 | def f_star(x_tilde, boundary): 404 | y = x_tilde[:,1] > boundary 405 | y = y*1 406 | return y 407 | 408 | 409 | def indices_to_one_hot(data, nb_classes): 410 | """Convert an iterable of indices to one-hot encoded labels.""" 411 | targets = np.array(data).reshape(-1) 412 | return np.eye(nb_classes)[targets] 413 | 414 | 415 | def sequence_mask(pseudo_cat, dim_ord, batch_size): 416 | x = np.linspace(1, dim_ord, dim_ord).reshape(1, -1) 417 | x = ~(np.repeat(x, batch_size, axis=0).T > pseudo_cat).T 418 | x = x * 1 419 | return x 420 | 421 | 422 | def cat_sample(logits): 423 
| u = np.random.uniform(0, 1, logits.shape) 424 | return np.argmax(logits - np.log(-np.log(u)), axis=1) 425 | 426 | 427 | def Compute_LOF(neighbors, x_train, x_test): 428 | # x_test: - np array 429 | # x_test_counterfactual: - np array 430 | # x_train: train data - np array 431 | 432 | clf = LocalOutlierFactor(n_neighbors=neighbors, contamination=0.01, novelty=True) 433 | clf.fit(x_train) 434 | 435 | X_outlier = clf.predict(x_test) 436 | 437 | return X_outlier 438 | 439 | 440 | def Connectedness(x_train, x_counter, number, epsilon, min_samples): 441 | x_counter.shape 442 | 443 | dbscan_list = [] 444 | n, _ = x_counter.shape 445 | 446 | for i in range(n): 447 | density_control = np.r_[x_train[0:number, :], x_counter[i, :].reshape(1, -1)] 448 | density_pred = DBSCAN(eps=epsilon, min_samples=min_samples).fit(density_control) 449 | dbscan_list.append(density_pred.labels_[-1]) 450 | 451 | not_connected = np.array(dbscan_list.count(-1)) / n #count occurcene of (-1) labels & divide by number of test set 452 | 453 | return not_connected, np.array(dbscan_list) 454 | 455 | 456 | def Read_Split_Data(test_size, classifier, data_total, data_total_c, y_true, types_dict, types_dict_c, normalization): 457 | out = dict() 458 | 459 | # out_training: training x and y 460 | # out_test: test x and y 461 | # out_train_pos: x with corresponding positive predicted label on train set 462 | # out_test_counter: x with corresponding negative predicted label on test set 463 | 464 | 465 | # Split into test and train data 466 | train_data, test_data, train_data_c, test_data_c, y_train, y_test = train_test_split(data_total, 467 | data_total_c, 468 | y_true, 469 | random_state=619, 470 | test_size=test_size) 471 | 472 | n_train, _ = np.shape(train_data) 473 | df = np.r_[train_data, test_data] 474 | df_c = np.r_[train_data_c, test_data_c] 475 | 476 | df_norm, df_param = normalization_classification(df, types_dict) 477 | df_norm = np.transpose(np.array(df_norm)) 478 | df_c_norm, df_c_param = normalization_classification(df_c, types_dict_c) 479 | df_c_norm = np.transpose(np.array(df_c_norm)) 480 | 481 | train_data_norm = df_norm[0:n_train, :] 482 | test_data_norm = df_norm[n_train::, :] 483 | train_data_c_norm = df_c_norm[0:n_train, :] 484 | test_data_c_norm = df_c_norm[n_train::, :] 485 | 486 | # Concatenate free and conditioning features 487 | train_concat = np.c_[train_data_c, train_data] 488 | test_concat = np.c_[test_data_c, test_data] 489 | train_concat_norm = np.c_[train_data_c_norm, train_data_norm] 490 | test_concat_norm = np.c_[test_data_c_norm, test_data_norm] 491 | 492 | 493 | if normalization == True: 494 | train_concat_x = train_concat_norm 495 | test_concat_x = test_concat_norm 496 | 497 | # not normalized 498 | train_data_not = train_data 499 | train_data_c_not = train_data_c 500 | train_data_concat_not = np.c_[train_data_c_not, train_data_not] 501 | 502 | test_data_not = test_data 503 | test_data_c_not = test_data_c 504 | test_data_concat_not = np.c_[test_data_c_not, test_data_not] 505 | 506 | # normalized data 507 | train_data = train_data_norm 508 | train_data_c = train_data_c_norm 509 | test_data = test_data_norm 510 | test_data_c = test_data_c_norm 511 | 512 | else: 513 | train_concat_x = train_concat 514 | test_concat_x = test_concat 515 | 516 | 517 | # classifcation model training: Random forest or LR model: use default values 518 | if classifier == 'RandomForest': 519 | clf = RandomForestClassifier(random_state=619) 520 | 521 | param_grid = {'bootstrap': [True], 522 | 'max_depth': [3, 5, 7], 523 | 
'min_samples_leaf': [5], 524 | 'min_samples_split': [4, 10], 525 | 'n_estimators': [50, 100]} 526 | 527 | grid = GridSearchCV(estimator=clf, 528 | param_grid=param_grid, 529 | scoring='roc_auc', 530 | cv=3, 531 | n_jobs=-1, 532 | verbose=2) 533 | 534 | grid.fit(train_concat_x, y_train.reshape(-1)) 535 | clf = grid.best_estimator_ 536 | 537 | 538 | inv_y_train = 1 - y_train 539 | ## grid search 540 | clf_ar = LogisticRegression(max_iter=1000, solver='lbfgs', random_state=619) 541 | grid_ar = GridSearchCV( 542 | clf_ar, param_grid={'C': np.logspace(-4, 3)}, 543 | cv=5, 544 | scoring='roc_auc', 545 | return_train_score=True) 546 | grid_ar.fit(train_concat_x, inv_y_train.reshape(-1)) 547 | clf_ar = grid_ar.best_estimator_ 548 | 549 | 550 | elif classifier == 'SVM': 551 | 552 | clf = SVC(random_state=619) 553 | 554 | tuned_parameters = [{'kernel': ['rbf'], 'C': [0.01, 1, 10]}] 555 | # tuned_parameters = [{'alpha': [0.0001, 0.001]}] 556 | 557 | grid = GridSearchCV(clf, tuned_parameters, cv=3, n_jobs=-1) 558 | grid.fit(train_concat_x, y_train.reshape(-1)) 559 | clf = grid.best_estimator_ 560 | print(grid.cv_results_) 561 | 562 | # for AR algorithm as placeholder 563 | inv_y_train = 1 - y_train 564 | ## grid search 565 | clf_ar = LogisticRegression(max_iter=1000, solver='lbfgs', random_state=619) 566 | grid_ar = GridSearchCV( 567 | clf_ar, param_grid={'C': np.logspace(-4, 3)}, 568 | cv=5, 569 | scoring='roc_auc', 570 | return_train_score=True) 571 | grid_ar.fit(train_concat_x, inv_y_train.reshape(-1)) 572 | clf_ar = grid_ar.best_estimator_ 573 | 574 | 575 | else: 576 | 577 | ## grid search 578 | clf = LogisticRegression(max_iter=1000, solver='lbfgs', random_state=619) 579 | grid = GridSearchCV( 580 | clf, param_grid={'C': np.logspace(-4, 3)}, 581 | cv=5, 582 | scoring='roc_auc', 583 | return_train_score=True) 584 | grid.fit(train_concat_x, y_train.reshape(-1)) 585 | clf = grid.best_estimator_ 586 | 587 | # for AR algorithm 588 | inv_y_train = 1 - y_train 589 | ## grid search 590 | clf_ar = LogisticRegression(max_iter=1000, solver='lbfgs', random_state=619) 591 | grid_ar = GridSearchCV( 592 | clf_ar, param_grid={'C': np.logspace(-4, 3)}, 593 | cv=5, 594 | scoring='roc_auc', 595 | return_train_score=True) 596 | grid_ar.fit(train_concat_x, inv_y_train.reshape(-1)) 597 | clf_ar = grid_ar.best_estimator_ 598 | 599 | 600 | 601 | # THESE GUYS/GURLS WILL NEED OUR HELP (TEST SET) 602 | index_predicted_denied = np.where(clf.predict(test_concat_x) == 1)[0] 603 | 604 | test_data_c_denied = test_data_c[index_predicted_denied, :] 605 | test_data_denied = test_data[index_predicted_denied, :] 606 | y_test_denied = y_test[index_predicted_denied] 607 | test_concat_x_denied = np.c_[test_data_c_denied, test_data_denied] 608 | ncounterfactuals, _ = test_concat_x_denied.shape 609 | 610 | # FROM THESE GUYS WE HAVE POSTIVE RECORD & they have predicted positive record (TRAINING SET) 611 | index_predicted_nodefault = (clf.predict(train_concat_x) == 0) 612 | index_true_nodefault = (y_train.reshape(-1) == 0) 613 | intersection_no = (index_predicted_nodefault * 1 + index_true_nodefault * 1) 614 | index_intersection_no = (intersection_no == 2) #(nodefault + predicted nodefault) index 615 | 616 | train_data_c_pos = train_data_c[index_intersection_no, :] 617 | train_data_pos = train_data[index_intersection_no, :] 618 | train_concat_x_pos = np.c_[train_data_c_pos, train_data_pos] 619 | y_train_pos = y_train[index_intersection_no] 620 | 621 | if normalization == True: 622 | 623 | test_data_denied_not = 
test_data_not[index_predicted_denied, :] 624 | test_data_c_denied_not = test_data_c_not[index_predicted_denied, :] 625 | test_concat_x_denied_not = np.c_[test_data_c_denied_not, test_data_denied_not] 626 | 627 | train_data_c_pos_not = train_data_c_not[index_intersection_no, :] 628 | train_data_pos_not = train_data_not[index_intersection_no, :] 629 | train_concat_x_pos_not = np.c_[train_data_c_pos_not, train_data_pos_not] 630 | 631 | else: 632 | 633 | test_concat_x_denied_not = _ 634 | 635 | test_data_denied_not = _ 636 | test_data_c_denied_not = _ 637 | train_concat_x_pos_not = _ 638 | 639 | test_data_concat_not = _ 640 | train_data_not = _ 641 | train_data_c_not = _ 642 | train_data_concat_not = _ 643 | 644 | 645 | # return 646 | out['training'] = [train_concat_x, train_data, train_data_c, y_train] 647 | out['training_not'] = [train_data_concat_not, train_data_not, train_data_c_not, y_train] 648 | out['test'] = [test_concat_x, y_test] 649 | out['test_not'] = [test_data_concat_not, y_test] 650 | out['test_counter'] = [test_concat_x_denied, test_data_denied, test_data_c_denied, y_test_denied] 651 | out['test_counter_not'] = [test_concat_x_denied_not, test_data_denied_not, test_data_c_denied_not, y_test_denied] 652 | out['train_pos'] = [train_concat_x_pos, y_train_pos] 653 | out['train_pos_not'] = [train_concat_x_pos_not, y_train_pos] 654 | out['normalization_parameters'] = [df_param, df_c_param] 655 | 656 | return ncounterfactuals, clf, out, clf_ar, grid 657 | 658 | 659 | def compute_cdf(data): 660 | # per free feature 661 | # relies on computing histogram first 662 | # num_bins: # bins in histogram 663 | # you can use bin_edges & norm_cdf to plot cdf 664 | 665 | n, p = np.shape(data) 666 | # num_bins = n 667 | norm_cdf = np.zeros((n, p)) 668 | 669 | for j in range(p): 670 | counts, bin_edges = np.histogram(data[:, j], bins=n, normed=True) 671 | cdf = np.cumsum(counts) 672 | norm_cdf[:, j] = cdf / cdf[-1] 673 | # plt.plot (bin_edges[1:], norm_cdf) 674 | 675 | return bin_edges[1:], norm_cdf 676 | 677 | 678 | def max_percentile_shift(norm_cdfs, norm_cdfs_counterfactual): 679 | # (3) in ustun et al 680 | delta_cdfs = np.abs(norm_cdfs - norm_cdfs_counterfactual) 681 | cost = np.max(delta_cdfs, 1) 682 | return cost 683 | 684 | 685 | def total_percentile_shift(norm_cdfs, norm_cdfs_counterfactual): 686 | inv_counterfactual = norm_cdfs_counterfactual 687 | inv = norm_cdfs 688 | ratio = np.abs(inv_counterfactual - inv) 689 | cost = np.sum(ratio, 1) 690 | return cost 691 | 692 | 693 | def total_log_percentile_shift(norm_cdfs, norm_cdfs_counterfactual): 694 | # (4) in ustun et al 695 | inv_counterfactual = np.clip(1-norm_cdfs_counterfactual, 0.01, 0.99) 696 | inv = np.clip(1-norm_cdfs, 0.01, 0.99) 697 | ratio = np.abs(np.log(np.clip((inv_counterfactual/inv), 0.01, 10))) 698 | cost = np.sum(ratio, 1) 699 | return cost 700 | 701 | 702 | def denormalization(norm_para, norm_para_c, samples, samples_c): 703 | 704 | # norm_para & norm_para_c: numpy arrays 705 | # samples: numpy arrays 706 | 707 | n, p = np.shape(samples) 708 | n_c, p_c = np.shape(samples_c) 709 | 710 | norm_samples = np.zeros((n, p)) 711 | norm_samples_c = np.zeros((n_c, p_c)) 712 | 713 | for i in range(p): 714 | norm_samples[:, i] = (samples[:, i] - norm_para[i, 0])/norm_para[i, 1] 715 | 716 | for i in range(p_c): 717 | norm_samples_c[:, i] = (samples_c[:, i] - norm_para_c[i, 0])/norm_para_c[i, 1] 718 | 719 | return norm_samples, norm_samples_c 720 | 721 | 722 | # standardize data 723 | def standardize(data): 724 | scaler = 
StandardScaler() 725 | a = scaler.fit(data) 726 | a = scaler.transform(data) 727 | 728 | return a, scaler 729 | 730 | # reduce dim of data 731 | def reduce_dim(data, dim): 732 | pca = PCA(n_components= dim) 733 | components = pca.fit_transform(data) 734 | return components -------------------------------------------------------------------------------- /code/LaugelEtAl.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from numpy import linalg as LA 3 | from scipy.spatial.distance import cdist 4 | 5 | # rejection sampling algorithm comes from LSE lecture notes 6 | # alternatively see WOLFRAM: http://mathworld.wolfram.com/CirclePointPicking.html 7 | # # http://mathworld.wolfram.com/HyperspherePointPicking.html 8 | 9 | def unit_circumference_coordinates(r, n, coordinates): 10 | # r: radius 11 | # n: number of samples 12 | 13 | x1 = np.random.uniform(-1, 1, n) 14 | x2 = np.random.uniform(-1, 1, n) 15 | index = np.where((x1 ** 2 + x2 ** 2) < 1) # accepted samples 16 | x1 = x1[index] 17 | x2 = x2[index] 18 | # coordinates 19 | x = ((x1) ** 2 - (x2) ** 2) / ((x1) ** 2 + (x2) ** 2) * r 20 | y = (2 * (x1) * (x2)) / ((x1) ** 2 + (x2) ** 2) * r 21 | 22 | a = coordinates[0] 23 | b = coordinates[1] # 1x2 vector 24 | a = a + x 25 | b = b + y 26 | 27 | return a, b 28 | 29 | 30 | def hyper_sphere_coordindates(n_search_samples, x, h, l, p): 31 | 32 | delta_x = np.random.randn(n_search_samples, x.shape[1]) # http://mathworld.wolfram.com/HyperspherePointPicking.html 33 | d = np.random.rand(n_search_samples) * (h - l) + l # length range [l, h) 34 | norm_p = np.linalg.norm(delta_x, ord=p, axis=1) 35 | d_norm = np.divide(d, norm_p).reshape(-1, 1) # rescale/normalize factor 36 | delta_x = np.multiply(delta_x, d_norm) 37 | x_tilde = x + delta_x # x tilde 38 | 39 | return x_tilde, d 40 | 41 | 42 | def Laugel_Search(ncounterfactuals, out, search_samples, clf): 43 | 44 | # this function IS NOT GENERAL: works for "give me credit" 45 | x_tilde_star_list = [] 46 | 47 | # Set parameters 48 | p = 2 49 | 50 | threshold = 200 51 | 52 | for i in range(ncounterfactuals): 53 | 54 | # Test data 55 | test_data_replicated = np.repeat(out['test_counter'][1][i, :].reshape(1, -1), search_samples, axis=0) 56 | test_data_c_replicated = np.repeat(out['test_counter'][2][i, :].reshape(1, -1), search_samples, axis=0) 57 | 58 | l = 0 59 | step = 0.5 60 | h = l + step 61 | 62 | # counter to stop 63 | count = 0 64 | counter_step = 1 65 | 66 | 67 | while True: 68 | 69 | count = count + counter_step 70 | 71 | if (count > threshold) is True: 72 | x_tilde_star = None 73 | break 74 | 75 | # STEP 1 of Algorithm 76 | # sample points on hyper sphere around test point 77 | x_tilde, _ = hyper_sphere_coordindates(search_samples, test_data_replicated, h, l, p) 78 | # one way: #x_tilde = np.ceil(x_tilde); another x_tilde = np.around(x_tilde,1) 79 | x_tilde = np.c_[test_data_c_replicated, x_tilde] 80 | 81 | # STEP 2 of Algorithm 82 | # compute l_1 distance 83 | distances = np.abs((x_tilde - np.c_[test_data_c_replicated, test_data_replicated])).sum(axis=1) 84 | 85 | # counterfactual labels 86 | y_tilde = clf.predict(x_tilde) 87 | cla_index = np.where(y_tilde != 1) 88 | 89 | x_tilde_candidates = x_tilde[cla_index] 90 | candidates_dist = distances[cla_index] 91 | 92 | if len(candidates_dist) == 0: # no candidate generated 93 | l = h 94 | h = l + step 95 | else: # certain candidates generated 96 | min_index = np.argmin(candidates_dist) 97 | x_tilde_star = x_tilde_candidates[min_index] 98 | break 
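                # Growing-sphere style search: the annulus [l, h) around the test point is widened
                # step by step; the first ring containing at least one sample whose prediction flips
                # yields the candidate with minimal l1 distance, and the loop gives up
                # (x_tilde_star = None) once `count` exceeds `threshold`.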
99 | 100 | x_tilde_star_list.append(x_tilde_star) 101 | X_test_counterfactual = np.array(x_tilde_star_list) 102 | 103 | return X_test_counterfactual -------------------------------------------------------------------------------- /code/Loglik.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on 27 05 2019 5 | 6 | @based on Nazabal et al 2018 7 | 8 | List of loglikelihoods for the types of variables considered in this paper. 9 | Basically, we create the different layers needed in the decoder and during the 10 | generation of new samples 11 | 12 | The variable reuse indicates the mode of this functions 13 | - reuse = None -> Decoder implementation 14 | - reuse = True -> Samples generator implementation 15 | 16 | """ 17 | 18 | import tensorflow as tf 19 | import numpy as np 20 | import Helpers 21 | from scipy.special import softmax 22 | from scipy.special import expit 23 | 24 | 25 | def loglik_real(batch_data, list_type, theta, normalization_params, kernel_initializer, name, reuse): 26 | 27 | output=dict() 28 | epsilon = tf.constant(1e-6, dtype=tf.float32) 29 | 30 | #Data outputs 31 | data = batch_data 32 | 33 | data_mean, data_var = normalization_params 34 | data_var = tf.clip_by_value(data_var, epsilon, np.inf) 35 | 36 | est_mean, est_var = theta 37 | est_var = tf.clip_by_value(tf.nn.softplus(est_var), epsilon, 1.0) #Must be positive 38 | 39 | # Affine transformation of the parameters 40 | est_mean = tf.sqrt(data_var)*est_mean + data_mean 41 | est_var = data_var*est_var 42 | 43 | #Compute loglik 44 | log_p_x = -0.5 * tf.reduce_sum(tf.squared_difference(data, est_mean)/est_var, 1) - int(list_type['dim'])*0.5*tf.log(2* np.pi) - 0.5*tf.reduce_sum(tf.log(est_var),1) 45 | 46 | #Outputs 47 | output['log_p_x'] = log_p_x 48 | output['params'] = [est_mean, est_var] 49 | output['samples'] = tf.contrib.distributions.Normal(est_mean, tf.sqrt(est_var)).sample() 50 | 51 | return output 52 | 53 | def loglik_pos(batch_data, list_type, theta, normalization_params, kernel_initializer, name, reuse): 54 | 55 | #Log-normal distribution 56 | output = dict() 57 | epsilon = tf.constant(1e-6, dtype=tf.float32) 58 | 59 | #Data outputs 60 | data_mean_log, data_var_log = normalization_params 61 | data_var_log = tf.clip_by_value(data_var_log, epsilon, np.inf) 62 | 63 | data = batch_data 64 | data_log = tf.log(1.0 + data) 65 | 66 | est_mean, est_var = theta 67 | est_var = tf.clip_by_value(tf.nn.softplus(est_var), epsilon, 1.0) 68 | 69 | # Affine transformation of the parameters 70 | est_mean = tf.sqrt(data_var_log)*est_mean + data_mean_log 71 | est_var = data_var_log*est_var 72 | 73 | #Compute loglik 74 | log_p_x = -0.5 * tf.reduce_sum(tf.squared_difference(data_log,est_mean)/est_var,1) \ 75 | - 0.5*tf.reduce_sum(tf.log(2*np.pi*est_var),1) - tf.reduce_sum(data_log,1) 76 | 77 | output['log_p_x'] = log_p_x 78 | output['params'] = [est_mean, est_var] 79 | output['samples'] = tf.exp(tf.contrib.distributions.Normal(est_mean,tf.sqrt(est_var)).sample()) - 1.0 80 | 81 | return output 82 | 83 | def loglik_cat(batch_data, list_type, theta, normalization_params, kernel_initializer, name, reuse): 84 | 85 | output=dict() 86 | 87 | #Data outputs 88 | data = batch_data 89 | 90 | log_pi = theta 91 | 92 | #Compute loglik 93 | log_p_x = -tf.nn.softmax_cross_entropy_with_logits(logits=log_pi,labels=data) 94 | 95 | output['log_p_x'] = log_p_x 96 | output['params'] = log_pi 97 | output['samples'] = 
tf.one_hot(tf.contrib.distributions.Categorical(probs=tf.nn.softmax(log_pi)).sample(),depth=int(list_type['dim'])) 98 | 99 | return output 100 | 101 | def loglik_ordinal(batch_data, list_type, theta, normalization_params, kernel_initializer, name, reuse): 102 | 103 | output=dict() 104 | epsilon = tf.constant(1e-6, dtype=tf.float32) 105 | 106 | #Data outputs 107 | data = batch_data 108 | batch_size = tf.shape(data)[0] 109 | 110 | # We need to force that the outputs of the network increase with the categories 111 | partition_param, mean_param = theta 112 | mean_value = tf.reshape(mean_param,[-1,1]) 113 | theta_values = tf.cumsum(tf.clip_by_value(tf.nn.softplus(partition_param), epsilon, 1e20),1) 114 | sigmoid_est_mean = tf.nn.sigmoid(theta_values - mean_value) 115 | mean_probs = tf.concat([sigmoid_est_mean,tf.ones([batch_size,1],tf.float32)],1) - tf.concat([tf.zeros([batch_size,1],tf.float32),sigmoid_est_mean],1) 116 | 117 | #Code needed to compute samples from an ordinal distribution 118 | true_values = tf.one_hot(tf.reduce_sum(tf.cast(data,tf.int32),1)-1,int(list_type['dim'])) 119 | 120 | #Compute loglik 121 | log_p_x = tf.log(tf.clip_by_value(tf.reduce_sum(mean_probs*true_values,1),epsilon,1e20)) 122 | 123 | output['log_p_x'] = log_p_x 124 | output['params'] = mean_probs 125 | output['samples'] = tf.sequence_mask(1+tf.contrib.distributions.Categorical(logits=tf.log(tf.clip_by_value(mean_probs,epsilon,1e20))).sample(), int(list_type['dim']),dtype=tf.float32) 126 | 127 | return output 128 | 129 | def loglik_count(batch_data, list_type, theta, normalization_params, kernel_initializer, name, reuse): 130 | 131 | output=dict() 132 | epsilon = tf.constant(1e-6, dtype=tf.float32) 133 | 134 | #Data outputs 135 | data = batch_data 136 | 137 | est_lambda = theta 138 | est_lambda = tf.clip_by_value(tf.nn.softplus(est_lambda), epsilon, 1e20) 139 | 140 | log_p_x = -tf.reduce_sum(tf.nn.log_poisson_loss(targets=data, log_input=tf.log(est_lambda), compute_full_loss=True), 1) 141 | 142 | output['log_p_x'] = log_p_x 143 | output['params'] = est_lambda 144 | output['samples'] = tf.contrib.distributions.Poisson(est_lambda).sample() 145 | 146 | return output 147 | 148 | 149 | def loglik_test_real(theta, normalization_params, list_type): 150 | 151 | output = dict() 152 | epsilon = 1e-6 153 | 154 | # Data outputs 155 | data_mean, data_var = normalization_params 156 | data_var = np.clip(data_var, epsilon, np.inf) 157 | 158 | # Estimated parameters 159 | est_mean, est_var = theta 160 | soft_plus_est_var = np.log(1 + np.exp(-np.abs(est_var))) + np.maximum(est_var, 0) 161 | est_var = np.clip(soft_plus_est_var, epsilon, 1.0) # Must be positive 162 | 163 | # Affine transformation of the parameters 164 | est_mean = np.sqrt(data_var) * est_mean + data_mean 165 | est_var = data_var * est_var 166 | 167 | # Outputs 168 | output['samples'] = np.random.normal(est_mean, np.sqrt(est_var)) 169 | output['params'] = [est_mean, est_var] 170 | 171 | return output 172 | 173 | 174 | def loglik_test_pos(theta, normalization_params, list_type): 175 | 176 | # Log-normal distribution 177 | output = dict() 178 | epsilon = 1e-6 179 | 180 | # Data outputs 181 | data_mean_log, data_var_log = normalization_params 182 | data_var_log = np.clip(data_var_log, epsilon, np.inf) 183 | 184 | est_mean, est_var = theta 185 | soft_plus_est_var = np.log(1 + np.exp(-np.abs(est_var))) + np.maximum(est_var, 0) 186 | est_var = np.clip(soft_plus_est_var, epsilon, 1.0) 187 | 188 | # Affine transformation of the parameters 189 | est_mean = np.sqrt(data_var_log) 
* est_mean + data_mean_log 190 | est_var = data_var_log * est_var 191 | 192 | output['samples'] = np.exp(np.random.normal(est_mean, np.sqrt(est_var))) - 1.0 193 | output['params'] = [est_mean, est_var] 194 | 195 | return output 196 | 197 | 198 | def loglik_test_cat(theta, normalization_params, list_type): 199 | output = dict() 200 | 201 | # Data outputs 202 | log_pi = theta 203 | 204 | est_cat = Helpers.cat_sample(log_pi) 205 | estimated_samples = Helpers.indices_to_one_hot(est_cat, int(list_type['dim'])) 206 | 207 | output['samples'] = estimated_samples 208 | output['params'] = log_pi 209 | 210 | return output 211 | 212 | 213 | def loglik_test_ordinal(theta, normalization_params, list_type): 214 | output = dict() 215 | epsilon = 1e-6 216 | 217 | # We need to force that the outputs of the network increase with the categories 218 | partition_param, mean_param = theta 219 | 220 | batch_size = mean_param.shape[0] 221 | 222 | mean_value = mean_param.reshape(-1, 1) 223 | soft_plus_partition_param = np.log(1 + np.exp(-np.abs(partition_param))) + np.maximum(partition_param, 0) 224 | 225 | theta_values = np.cumsum(np.clip(soft_plus_partition_param, epsilon, 1e20), axis=1) 226 | sigmoid_est_mean = expit(theta_values - mean_value) 227 | mean_probs = np.c_[sigmoid_est_mean, np.ones(batch_size)] - np.c_[np.zeros(batch_size), sigmoid_est_mean] 228 | mean_probs = np.clip(mean_probs, epsilon, 1e20) 229 | 230 | mean_logits = np.log(mean_probs/(1-mean_probs)) 231 | 232 | pseudo_cat = 1 + Helpers.cat_sample(mean_logits) 233 | 234 | output['samples'] = Helpers.sequence_mask(pseudo_cat, batch_size, int(list_type['dim'])) 235 | output['params'] = mean_probs 236 | 237 | return output 238 | 239 | 240 | 241 | def loglik_test_count(theta, normalization_params, list_type): 242 | output = dict() 243 | epsilon = 1e-6 244 | 245 | est_lambda = theta 246 | soft_plus_lambda = np.log(1 + np.exp(-np.abs(est_lambda))) + np.maximum(est_lambda, 0) 247 | est_lambda = np.clip(soft_plus_lambda, epsilon, 1e20) 248 | 249 | output['samples'] = np.random.poisson(est_lambda) 250 | output['params'] = est_lambda 251 | 252 | return output 253 | 254 | -------------------------------------------------------------------------------- /code/Sampling.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import time 4 | #import csv 5 | #import pandas as pd 6 | #import matplotlib 7 | #import random 8 | #from matplotlib import pyplot as plt 9 | #import seaborn as sns 10 | #from numpy import linalg as LA 11 | #from scipy.spatial.distance import cdist 12 | #from sklearn.model_selection import train_test_split 13 | #from sklearn.neighbors import LocalOutlierFactor 14 | #from sklearn.cluster import DBSCAN 15 | #from sklearn import preprocessing 16 | #from sklearn.linear_model import LogisticRegression 17 | #from sklearn.ensemble import RandomForestClassifier 18 | from sklearn.preprocessing import StandardScaler 19 | 20 | from tqdm import tqdm, tqdm_notebook 21 | 22 | from recourse.builder import RecourseBuilder 23 | from recourse.builder import ActionSet 24 | 25 | # import functions 26 | import Helpers 27 | import Evaluation 28 | import Graph 29 | 30 | #import Encoder 31 | #import Decoder 32 | #import Generator 33 | #import Loglik 34 | #import LaugelEtAl 35 | np.random.seed(619) 36 | 37 | def print_loss(epoch, start_time, avg_loss, avg_KL_s, avg_KL_z): 38 | print("Epoch: [%2d] time: %4.4f, train_loglik: %.8f, KL_z: %.8f, KL_s: %.8f, ELBO: %.8f" 39 | % (epoch, 
time.time() - start_time, avg_loss, avg_KL_z, avg_KL_s, avg_loss - avg_KL_z - avg_KL_s)) 40 | 41 | # -----------------------------------------------------------------------------------# 42 | ############################# Running the C-CHVAE search ########################## 43 | # -----------------------------------------------------------------------------------# 44 | 45 | def sampling(settings, types_dict, types_dict_c, out, ncounterfactuals, clf, n_batches_train, n_samples_train, k, n_input, degree_active): 46 | 47 | argvals = settings.split() 48 | args = Helpers.getArgs(argvals) 49 | 50 | # Creating graph 51 | sess_HVAE = tf.Graph() 52 | 53 | with sess_HVAE.as_default(): 54 | # args.model_name: excluded 55 | tf_nodes = Graph.C_CHVAE_graph(args.types_file, args.types_file_c, 56 | learning_rate=1e-3, z_dim=args.dim_latent_z, 57 | y_dim=args.dim_latent_y, s_dim=args.dim_latent_s, 58 | y_dim_partition=args.dim_latent_y_partition, nsamples=1000, p=2) 59 | 60 | # start session 61 | with tf.Session(graph=sess_HVAE) as session: 62 | # Add ops to save and restore all the variables. 63 | saver = tf.train.Saver() 64 | print('Initizalizing Variables ...') 65 | tf.global_variables_initializer().run() 66 | 67 | # -----------------------------------------------------------------------------------# 68 | # Apply on training data 69 | 70 | print('Training the CHVAE ...') 71 | if (args.train == 1): 72 | 73 | start_time = time.time() 74 | # Training cycle 75 | 76 | loglik_epoch = [] 77 | KL_s_epoch = [] 78 | KL_z_epoch = [] 79 | for epoch in tqdm(range(args.epochs)): 80 | avg_loss = 0. 81 | avg_KL_s = 0. 82 | avg_KL_z = 0. 83 | samples_list = [] 84 | p_params_list = [] 85 | q_params_list = [] 86 | log_p_x_total = [] 87 | 88 | # Annealing of Gumbel-Softmax parameter 89 | tau = np.max([1.0 - 0.001 * epoch, 1e-3]) 90 | 91 | # Randomize the data in the mini-batches 92 | train_data = out['training'][1] 93 | train_data_c = out['training'][2] 94 | random_perm = np.random.permutation(range(np.shape(train_data)[0])) 95 | train_data_aux = train_data[random_perm, :] 96 | train_data_aux_c = train_data_c[random_perm, :] 97 | 98 | for i in range(n_batches_train): 99 | # Create inputs for the feed_dict 100 | data_list = Helpers.next_batch(train_data_aux, types_dict, args.batch_size, index_batch=i) # DONE 101 | data_list_c = Helpers.next_batch(train_data_aux_c, types_dict_c, args.batch_size, index_batch=i) # DONE 102 | 103 | # Create feed dictionary 104 | feedDict = {i: d for i, d in zip(tf_nodes['ground_batch'], data_list)} 105 | feedDict.update({i: d for i, d in zip(tf_nodes['ground_batch_c'], data_list_c)}) 106 | feedDict[tf_nodes['tau_GS']] = tau 107 | feedDict[tf_nodes['batch_size']] = args.batch_size 108 | 109 | # Running VAE 110 | _, X_list, loss, KL_z, KL_s, samples, log_p_x, p_params, q_params = session.run( 111 | [tf_nodes['optim'], 112 | tf_nodes['X'], 113 | tf_nodes['loss_re'], 114 | tf_nodes['KL_z'], 115 | tf_nodes['KL_s'], 116 | tf_nodes['samples'], 117 | tf_nodes['log_p_x'], 118 | tf_nodes['p_params'], 119 | tf_nodes['q_params']], 120 | feed_dict=feedDict) 121 | 122 | # Collect all samples, distirbution parameters and logliks in lists 123 | if i == 0: 124 | samples_list = [samples] 125 | p_params_list = [p_params] 126 | q_params_list = [q_params] 127 | log_p_x_total = [log_p_x] 128 | else: 129 | samples_list.append(samples) 130 | p_params_list.append(p_params) 131 | q_params_list.append(q_params) 132 | log_p_x_total.append(log_p_x) 133 | 134 | # Compute average loss 135 | avg_loss += np.mean(loss) 136 | 
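                    # Running sums over the mini-batches; print_loss later divides by n_batches_train
                    # and reports the ELBO as loglik - KL_z - KL_s.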
avg_KL_s += np.mean(KL_s) 137 | avg_KL_z += np.mean(KL_z) 138 | 139 | # Concatenate samples in arrays 140 | s_total, z_total, y_total, est_data = Helpers.samples_concatenation(samples_list) 141 | 142 | # Transform discrete variables back to the original values 143 | train_data_transformed = Helpers.discrete_variables_transformation( 144 | train_data_aux[:n_batches_train * args.batch_size, :], types_dict) 145 | est_data_transformed = Helpers.discrete_variables_transformation(est_data, types_dict) 146 | 147 | # Create global dictionary of the distribution parameters 148 | p_params_complete = Helpers.p_distribution_params_concatenation(p_params_list, # DONE 149 | types_dict, 150 | args.dim_latent_z, 151 | args.dim_latent_s) 152 | 153 | q_params_complete = Helpers.q_distribution_params_concatenation(q_params_list, # DONE 154 | args.dim_latent_z, 155 | args.dim_latent_s) 156 | 157 | # Compute mean and mode of our loglik models: these correspond to the estimated values 158 | loglik_mean, loglik_mode = Helpers.statistics(p_params_complete['x'], types_dict) # DONE 159 | 160 | # Try this for the errors 161 | error_train_mean = Helpers.error_computation(train_data_transformed, loglik_mean, types_dict) 162 | error_train_mode = Helpers.error_computation(train_data_transformed, loglik_mode, types_dict) 163 | error_train_samples = Helpers.error_computation(train_data_transformed, est_data_transformed, types_dict) 164 | 165 | # Display logs per epoch step 166 | if epoch % args.display == 0: 167 | print_loss(epoch, start_time, avg_loss / n_batches_train, avg_KL_s / n_batches_train, 168 | avg_KL_z / n_batches_train) 169 | print("") 170 | 171 | # Plot evolution of test loglik 172 | loglik_per_variable = np.sum(np.concatenate(log_p_x_total, 1), 1) / n_samples_train 173 | 174 | loglik_epoch.append(loglik_per_variable) 175 | 176 | # -----------------------------------------------------------------------------------# 177 | # Apply on test data 178 | 179 | for i in range(1): 180 | samples_test_list = [] 181 | test_params_list = [] 182 | log_p_x_test_list = [] 183 | data_c_list = [] 184 | 185 | test_data_counter = out['test_counter'][1] 186 | test_data_c_counter = out['test_counter'][2] 187 | y_test_counter = out['test_counter'][3] 188 | n_samples_test = test_data_counter.shape[0] 189 | 190 | # Create test minibatch 191 | data_list = Helpers.next_batch(test_data_counter, types_dict, n_samples_test, index_batch=i) 192 | data_list_c = Helpers.next_batch(test_data_c_counter, types_dict_c, n_samples_test, index_batch=i) # DONE 193 | 194 | # Constant Gumbel-Softmax parameter (where we have finished the annealing 195 | tau = 1e-3 196 | 197 | # Create feed dictionary 198 | feedDict = {i: d for i, d in zip(tf_nodes['ground_batch'], data_list)} 199 | feedDict.update({i: d for i, d in zip(tf_nodes['ground_batch_c'], data_list_c)}) 200 | feedDict[tf_nodes['tau_GS']] = tau 201 | feedDict[tf_nodes['batch_size']] = ncounterfactuals # n_samples_test 202 | 203 | # Get samples from the generator function (computing the mode of all distributions) 204 | samples_test, log_p_x_test, test_params, theta_test, normalization_params_test, X, delta_kl = session.run( 205 | [tf_nodes['samples_test'], 206 | tf_nodes['log_p_x_test'], 207 | tf_nodes['test_params'], 208 | tf_nodes['theta_test'], 209 | tf_nodes['normalization_params'], 210 | tf_nodes['X'], 211 | tf_nodes['delta_kl']], 212 | feed_dict=feedDict) 213 | 214 | samples_test_list.append(samples_test) 215 | test_params_list.append(test_params) 216 | log_p_x_test_list.append(log_p_x_test) 
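            # All denied test observations are pushed through the generator in a single batch; the
            # drawn samples are concatenated and mapped back to the original discrete encoding below.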
217 | data_c_list.append(data_list_c) 218 | 219 | # Concatenate samples in arrays 220 | s_total_test, z_total_test, y_total_test, samples_total_test = Helpers.samples_concatenation(samples_test_list) 221 | 222 | # Transform discrete variables back to the original values 223 | est_samples_transformed = Helpers.discrete_variables_transformation(samples_total_test, types_dict) 224 | 225 | # -----------------------------------------------------------------------------------# 226 | # Find k Attainable Counterfactuals 227 | print('[*] Find Attainable Counterfactuals...') 228 | 229 | counter_batch_size = 1 # counterfactual batch size (i.e. look for counterfactuals one by one) 230 | data_concat = [] 231 | data_concat_c = [] 232 | counterfactuals = [] 233 | latent_tilde = [] 234 | latent = [] 235 | 236 | search_samples = args.search_samples 237 | p = args.norm_latent_space 238 | 239 | for i in tqdm(range(ncounterfactuals)): 240 | 241 | s = (k, n_input) # preallocate k spots; # inputs 242 | sz = (k, args.dim_latent_z) 243 | s = np.zeros(s) 244 | sz = np.zeros(sz) 245 | ik = 0 # counter 246 | 247 | l = 0 248 | step = args.step_size 249 | 250 | x_adv, y_adv, z_adv, d_adv = None, None, None, None 251 | 252 | 253 | #scale test observations 254 | scaled_test, scaler_test = Helpers.standardize(test_data_counter) 255 | 256 | # get one test observation 257 | data_list = Helpers.next_batch(test_data_counter, types_dict, counter_batch_size, index_batch=i) 258 | data_list_c = Helpers.next_batch(test_data_c_counter, types_dict_c, counter_batch_size, index_batch=i) 259 | hat_y_test = np.repeat(y_test_counter[i] * 1, search_samples, axis=0) 260 | test_data_c_replicated = np.repeat(test_data_c_counter[i, :].reshape(1, -1), search_samples, axis=0) 261 | replicated_scaled_test = np.repeat(scaled_test[i, :].reshape(1, -1), search_samples, axis=0) 262 | 263 | 264 | # get replicated observations (observation replicated nsamples times) 265 | #replicated_scaled_test = Helpers.replicate_data_list(data_list_scaled, search_samples) 266 | replicated_data_list = Helpers.replicate_data_list(data_list, search_samples) 267 | replicated_data_list_c = Helpers.replicate_data_list(data_list_c, search_samples) 268 | replicated_z = np.repeat(z_total_test[i].reshape(-1, args.dim_latent_z), search_samples, axis=0) 269 | 270 | h = l + step 271 | # counter to stop 272 | count = 0 273 | counter_step = 1 274 | max_step = 500 275 | 276 | while True: 277 | 278 | count = count + counter_step 279 | 280 | if (count > max_step) == True: 281 | sz = None 282 | s = None 283 | z = z_total_test[i].reshape(-1, args.dim_latent_z) 284 | break 285 | 286 | if degree_active == 1: #choose all latent features for search 287 | 288 | delta_z = np.random.randn(search_samples, replicated_z.shape[1]) # http://mathworld.wolfram.com/HyperspherePointPicking.html 289 | d = np.random.rand(search_samples) * (h - l) + l # length range [l, h) 290 | norm_p = np.linalg.norm(delta_z, ord=p, axis=1) 291 | d_norm = np.divide(d, norm_p).reshape(-1, 1) # rescale/normalize factor 292 | delta_z = np.multiply(delta_z, d_norm) 293 | z_tilde = replicated_z + delta_z # z tilde 294 | 295 | else: 296 | 297 | delta_z = np.random.randn(search_samples, replicated_z.shape[1]) # http://mathworld.wolfram.com/HyperspherePointPicking.html 298 | d = np.random.rand(search_samples) * (h - l) + l # length range [l, h) 299 | norm_p = np.linalg.norm(delta_z, ord=p, axis=1) 300 | d_norm = np.divide(d, norm_p).reshape(-1, 1) # rescale/normalize factor 301 | delta_z = np.multiply(delta_z, d_norm) 302 | 
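                        # Only the most informative latent coordinates are perturbed in this branch:
                        # delta_kl[3] (returned by the graph) acts as a 0/1 mask over the latent
                        # dimensions, so the multiplication below zeroes the remaining entries of delta_z.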
303 | mask = np.tile(delta_kl[3][0, :] * 1, 304 | (search_samples, 1)) # only alter most important latent features 305 | delta_z = np.multiply(delta_z, mask) 306 | 307 | z_tilde = replicated_z + delta_z 308 | 309 | 310 | # create feed dictionary 311 | feedDict = {i: d for i, d in zip(tf_nodes['ground_batch'], replicated_data_list)} 312 | feedDict.update({i: d for i, d in zip(tf_nodes['ground_batch_c'], replicated_data_list_c)}) 313 | feedDict[tf_nodes['samples_z']] = z_tilde 314 | feedDict[tf_nodes['tau_GS']] = tau 315 | feedDict[tf_nodes['batch_size']] = search_samples 316 | 317 | theta_perturbed, samples_perturbed = session.run([tf_nodes['theta_perturbed'], 318 | tf_nodes['samples_perturbed']], feed_dict=feedDict) 319 | 320 | x_tilde, params_x_perturbed = Evaluation.loglik_evaluation_test(X_list, 321 | theta_perturbed, 322 | normalization_params_test, 323 | types_dict) 324 | x_tilde = np.concatenate(x_tilde, axis=1) 325 | scaled_tilde = scaler_test.transform(x_tilde) 326 | d_scale = np.sum(np.abs(scaled_tilde - replicated_scaled_test), axis=1) 327 | 328 | x_tilde = np.c_[test_data_c_replicated, x_tilde] 329 | y_tilde = clf.predict(x_tilde) 330 | 331 | indices_adv = np.where(y_tilde == 0)[0] 332 | 333 | if len(indices_adv) == 0: # no candidate generated 334 | l = h 335 | h = l + step 336 | elif all(s[k - 1, :] == 0): # not k candidates generated 337 | 338 | indx = indices_adv[np.argmin(d_scale[indices_adv])] 339 | assert (y_tilde[indx] != 1) 340 | 341 | s[ik, :] = x_tilde[indx, :] 342 | sz[ik, :] = z_tilde[indx, :] 343 | z = z_total_test[i].reshape(-1, args.dim_latent_z) 344 | 345 | ik = ik + 1 # up the count 346 | l = h 347 | h = l + step 348 | else: # k candidates genereated 349 | break 350 | 351 | data_concat.append(np.concatenate(data_list, axis=1)) 352 | data_concat_c.append(np.concatenate(data_list_c, axis=1)) 353 | counterfactuals.append(s) 354 | latent_tilde.append(sz) 355 | latent.append(z) 356 | 357 | cchvae_counterfactuals = np.array(counterfactuals) 358 | return cchvae_counterfactuals 359 | 360 | -------------------------------------------------------------------------------- /data/givme/give_me_types.csv: -------------------------------------------------------------------------------- 1 | type,dim,nclass 2 | pos,1, 3 | count,1, 4 | pos,1, 5 | pos,1, 6 | count,1, 7 | count,1, 8 | count,1, 9 | count,1, 10 | -------------------------------------------------------------------------------- /data/givme/give_me_types_c.csv: -------------------------------------------------------------------------------- 1 | type,dim,nclass 2 | count,1, 3 | count,1, 4 | -------------------------------------------------------------------------------- /data/heloc/heloc_types.csv: -------------------------------------------------------------------------------- 1 | type,dim,nclass 2 | real,1, 3 | real,1, 4 | real,1, 5 | real,1, 6 | real,1, 7 | real,1, 8 | real,1, 9 | real,1, 10 | real,1, 11 | real,1, 12 | real,1, 13 | real,1, 14 | real,1, 15 | real,1, 16 | real,1, 17 | real,1, 18 | real,1, 19 | -------------------------------------------------------------------------------- /data/heloc/heloc_types_alt.csv: -------------------------------------------------------------------------------- 1 | type,dim,nclass 2 | count,1, 3 | count,1, 4 | count,1, 5 | count,1, 6 | count,1, 7 | count,1, 8 | count,1, 9 | count,1, 10 | count,1, 11 | count,1, 12 | count,1, 13 | count,1, 14 | count,1, 15 | count,1, 16 | count,1, 17 | count,1, 18 | count,1, 19 | 
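Throughout the code above, each entry of `types_list` is indexed with the keys `'type'`, `'dim'` and `'nclass'`, which mirror the header row of these *types*.csv files. Below is a minimal sketch of loading such a file into that structure, assuming a hypothetical `read_types` helper (the repository's own loader is not shown in this excerpt):

```python
import csv

def read_types(path):
    # Every CSV row becomes a dict such as {'type': 'count', 'dim': '1', 'nclass': ''},
    # matching accesses like types_list[i]['type'] and int(list_type['dim']) above.
    with open(path) as f:
        return [dict(row) for row in csv.DictReader(f)]

types_list = read_types('data/heloc/heloc_types_alt.csv')
print(types_list[0]['type'], int(types_list[0]['dim']))
```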
-------------------------------------------------------------------------------- /data/heloc/heloc_types_c_alt.csv: -------------------------------------------------------------------------------- 1 | type,dim,nclass 2 | real,1, 3 | real,1, 4 | real,1, 5 | -------------------------------------------------------------------------------- /preprocessing/Preprocessing_GiveMeSomeCredit.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 11, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "import pandas as pd\n", 11 | "import matplotlib\n", 12 | "from matplotlib import pyplot as plt" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 17, 18 | "metadata": {}, 19 | "outputs": [ 20 | { 21 | "name": "stdout", 22 | "output_type": "stream", 23 | "text": [ 24 | "Index(['SeriousDlqin2yrs', 'RevolvingUtilizationOfUnsecuredLines', 'age',\n", 25 | " 'NumberOfTime30-59DaysPastDueNotWorse', 'DebtRatio', 'MonthlyIncome',\n", 26 | " 'NumberOfOpenCreditLinesAndLoans', 'NumberOfTimes90DaysLate',\n", 27 | " 'NumberRealEstateLoansOrLines', 'NumberOfTime60-89DaysPastDueNotWorse',\n", 28 | " 'NumberOfDependents'],\n", 29 | " dtype='object')\n" 30 | ] 31 | } 32 | ], 33 | "source": [ 34 | "# Credit Data Processing\n", 35 | "# For further info on constraints imposed below: see also appendix in Ustun et al (2018)\n", 36 | "raw_df = pd.read_csv('cs-training.txt')\n", 37 | "processed_df = raw_df\n", 38 | "\n", 39 | "# drop NAs & unnamed column & convert boolean to numeric\n", 40 | "processed_df = processed_df.dropna()\n", 41 | "processed_df = processed_df.drop(columns='Unnamed: 0')\n", 42 | "processed_df = processed_df + 0 \n", 43 | "processed_df = processed_df.loc[processed_df['age']<88]\n", 44 | "\n", 45 | "# look at column names\n", 46 | "print(processed_df.columns)" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 18, 52 | "metadata": {}, 53 | "outputs": [ 54 | { 55 | "name": "stdout", 56 | "output_type": "stream", 57 | "text": [ 58 | "Index(['RevolvingUtilizationOfUnsecuredLines',\n", 59 | " 'NumberOfTime30-59DaysPastDueNotWorse', 'DebtRatio', 'MonthlyIncome',\n", 60 | " 'NumberOfOpenCreditLinesAndLoans', 'NumberOfTimes90DaysLate',\n", 61 | " 'NumberRealEstateLoansOrLines', 'NumberOfTime60-89DaysPastDueNotWorse'],\n", 62 | " dtype='object')\n" 63 | ] 64 | } 65 | ], 66 | "source": [ 67 | "# Labels, protected & free featuers\n", 68 | "# labels\n", 69 | "epsilon = 1e-4\n", 70 | "# we clip 0 to avoid evaluation errors when using log normal likelihood\n", 71 | "\n", 72 | "labels = processed_df[processed_df.columns[0]]\n", 73 | "labels.columns = [processed_df.columns[0]]\n", 74 | "# conditioning set/protected set\n", 75 | "conditionals = processed_df[[processed_df.columns[2], processed_df.columns[10]]]\n", 76 | "conditionals.columns = [processed_df.columns[2], processed_df.columns[10]]\n", 77 | "# free features\n", 78 | "free = processed_df.drop(columns=[processed_df.columns[0], processed_df.columns[2], processed_df.columns[10]])\n", 79 | "free[free.columns[0]] = np.clip(free.values[:,0], epsilon, 1e20)\n", 80 | "free[free.columns[2]] = np.clip(free.values[:,2], epsilon, 1e20)\n", 81 | "free[free.columns[3]] = np.clip(free.values[:,3], epsilon, 1e20)\n", 82 | "print(free.columns)" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": 19, 88 | "metadata": {}, 89 | "outputs": [], 90 | "source": [ 91 | "# Save to 
CSV\n", 92 | "#free.to_csv('give_me_x_35.csv', header = False, index = False)\n", 93 | "#conditionals.to_csv('give_me_x_c_35.csv', header = False, index = False)\n", 94 | "#labels.to_csv('give_me_y_35.csv', header = False, index = False)" 95 | ] 96 | } 97 | ], 98 | "metadata": { 99 | "kernelspec": { 100 | "display_name": "Python 3", 101 | "language": "python", 102 | "name": "python3" 103 | }, 104 | "language_info": { 105 | "codemirror_mode": { 106 | "name": "ipython", 107 | "version": 3 108 | }, 109 | "file_extension": ".py", 110 | "mimetype": "text/x-python", 111 | "name": "python", 112 | "nbconvert_exporter": "python", 113 | "pygments_lexer": "ipython3", 114 | "version": "3.6.8" 115 | } 116 | }, 117 | "nbformat": 4, 118 | "nbformat_minor": 2 119 | } 120 | -------------------------------------------------------------------------------- /preprocessing/Preprocessing_Heloc.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "import pandas as pd\n", 11 | "import matplotlib\n", 12 | "from matplotlib import pyplot as plt" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 2, 18 | "metadata": {}, 19 | "outputs": [], 20 | "source": [ 21 | "# Credit Data Processing\n", 22 | "# For further info on constraints imposed below: see also appendix in Ustun et al (2018)\n", 23 | "raw_df = pd.read_csv('heloc.csv')\n", 24 | "processed_df = raw_df\n", 25 | "# drop NAs & unnamed column & convert boolean to numeric & only keep positive records\n", 26 | "processed_df = processed_df.dropna()\n", 27 | "processed_df.columns\n", 28 | "d = {'Good':0,'Bad':1}\n", 29 | "processed_df['RiskPerformance'] = processed_df['RiskPerformance'].replace(d)\n", 30 | "prcessed_df = processed_df + 0" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 3, 36 | "metadata": {}, 37 | "outputs": [], 38 | "source": [ 39 | "# dropped due to many missing (aka negative) observations\n", 40 | "#processed_df = processed_df.drop(columns=['MSinceMostRecentDelq', 'NetFractionInstallBurden', 'MSinceMostRecentTradeOpen', 'NumTradesOpeninLast12M', 'NumInstallTradesWBalance', 'NumInqLast6Mexcl7days', 'MaxDelq2PublicRecLast12M', 'MaxDelqEver']) \n", 41 | "processed_df = processed_df.drop(columns=['MSinceMostRecentDelq', 'MSinceMostRecentInqexcl7days', 'NetFractionInstallBurden'])" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 4, 47 | "metadata": {}, 48 | "outputs": [ 49 | { 50 | "data": { 51 | "text/plain": [ 52 | "Index(['RiskPerformance', 'ExternalRiskEstimate', 'MSinceOldestTradeOpen',\n", 53 | " 'MSinceMostRecentTradeOpen', 'AverageMInFile', 'NumSatisfactoryTrades',\n", 54 | " 'NumTrades60Ever2DerogPubRec', 'NumTrades90Ever2DerogPubRec',\n", 55 | " 'PercentTradesNeverDelq', 'MaxDelq2PublicRecLast12M', 'MaxDelqEver',\n", 56 | " 'NumTotalTrades', 'NumTradesOpeninLast12M', 'PercentInstallTrades',\n", 57 | " 'NumInqLast6M', 'NumInqLast6Mexcl7days', 'NetFractionRevolvingBurden',\n", 58 | " 'NumRevolvingTradesWBalance', 'NumInstallTradesWBalance',\n", 59 | " 'NumBank2NatlTradesWHighUtilization', 'PercentTradesWBalance'],\n", 60 | " dtype='object')" 61 | ] 62 | }, 63 | "execution_count": 4, 64 | "metadata": {}, 65 | "output_type": "execute_result" 66 | } 67 | ], 68 | "source": [ 69 | "col = processed_df.columns\n", 70 | "col" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | 
"execution_count": 5, 76 | "metadata": {}, 77 | "outputs": [], 78 | "source": [ 79 | "#processed_df = processed_df.loc[processed_df['MSinceOldestTradeOpen']<300]\n", 80 | "#processed_df = processed_df.loc[processed_df['AverageMInFile']<200]\n", 81 | "#processed_df = processed_df.loc[processed_df['NumSatisfactoryTrades']<60]\n", 82 | "#processed_df = processed_df.loc[processed_df['NumTrades60Ever2DerogPubRec']<8]\n", 83 | "#processed_df = processed_df.loc[processed_df['NumTrades90Ever2DerogPubRec']<8]\n", 84 | "#processed_df = processed_df.loc[processed_df['NumTotalTrades']<60]\n", 85 | "#processed_df = processed_df.loc[processed_df['PercentInstallTrades']<60]\n", 86 | "#processed_df = processed_df.loc[processed_df['MSinceMostRecentInqexcl7days']<8]\n", 87 | "#processed_df = processed_df.loc[processed_df['NumInqLast6M']<10]\n", 88 | "#processed_df = processed_df.loc[processed_df['NetFractionRevolvingBurden']<100]\n", 89 | "#processed_df = processed_df.loc[processed_df['NumRevolvingTradesWBalance']<20]\n", 90 | "#processed_df = processed_df.loc[processed_df['NumBank2NatlTradesWHighUtilization']<8]" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": 6, 96 | "metadata": {}, 97 | "outputs": [ 98 | { 99 | "name": "stdout", 100 | "output_type": "stream", 101 | "text": [ 102 | "598\n", 103 | "827\n", 104 | "588\n", 105 | "588\n", 106 | "588\n" 107 | ] 108 | } 109 | ], 110 | "source": [ 111 | "print((processed_df.values[:,1] < 0).sum())\n", 112 | "print((processed_df.values[:,2] < 0).sum())\n", 113 | "print((processed_df.values[:,3] < 0).sum())\n", 114 | "print((processed_df.values[:,4] < 0).sum())\n", 115 | "print((processed_df.values[:,5] < 0).sum())" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": 7, 121 | "metadata": {}, 122 | "outputs": [ 123 | { 124 | "name": "stdout", 125 | "output_type": "stream", 126 | "text": [ 127 | "588\n", 128 | "588\n", 129 | "588\n", 130 | "588\n", 131 | "588\n" 132 | ] 133 | } 134 | ], 135 | "source": [ 136 | "print((processed_df.values[:,6] < 0).sum())\n", 137 | "print((processed_df.values[:,7] < 0).sum())\n", 138 | "print((processed_df.values[:,8] < 0).sum())\n", 139 | "print((processed_df.values[:,9] < 0).sum())\n", 140 | "print((processed_df.values[:,10] < 0).sum())" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": 8, 146 | "metadata": {}, 147 | "outputs": [ 148 | { 149 | "name": "stdout", 150 | "output_type": "stream", 151 | "text": [ 152 | "588\n", 153 | "588\n", 154 | "588\n", 155 | "588\n", 156 | "588\n" 157 | ] 158 | } 159 | ], 160 | "source": [ 161 | "print((processed_df.values[:,11] < 0).sum())\n", 162 | "print((processed_df.values[:,12] < 0).sum())\n", 163 | "print((processed_df.values[:,13] < 0).sum())\n", 164 | "print((processed_df.values[:,14] < 0).sum())\n", 165 | "print((processed_df.values[:,15] < 0).sum())" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": 9, 171 | "metadata": {}, 172 | "outputs": [ 173 | { 174 | "data": { 175 | "text/plain": [ 176 | "(array([5.880e+02, 0.000e+00, 0.000e+00, 9.296e+03, 3.940e+02, 1.400e+02,\n", 177 | " 2.700e+01, 9.000e+00, 3.000e+00, 2.000e+00]),\n", 178 | " array([-9. , -6.2, -3.4, -0.6, 2.2, 5. , 7.8, 10.6, 13.4, 16.2, 19. 
]),\n", 179 | " )" 180 | ] 181 | }, 182 | "execution_count": 9, 183 | "metadata": {}, 184 | "output_type": "execute_result" 185 | }, 186 | { 187 | "data": { 188 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYcAAAD8CAYAAACcjGjIAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvOIA7rQAAEsVJREFUeJzt3X+s3XV9x/Hne638VGmBC8G22jIbBXVK10CRxRDqyo8Zyx+QlJhRTZdGxzbUJQ62ZBWVTBcjSKK4RuqKcRSGbDRYx5oCWbZo4RaQXwV7BUavIL2upToNzup7f5zPpYd+Ttvbc+6931Pu85GcnO/38/18z/d97/m2r/v9fL/neyIzkSSp3e80XYAkqf8YDpKkiuEgSaoYDpKkiuEgSaoYDpKkiuEgSaoYDpKkiuEgSapMb7qAbp144ok5d+7cpsuQpMPGli1bfpqZA2Ppe9iGw9y5cxkcHGy6DEk6bETEf4+1r8NKkqSK4SBJqhgOkqSK4SBJqhgOkqTKQcMhItZExI6IeKyt7fiI2BgR28rzzNIeEXFDRAxFxCMRsaBtneWl/7aIWN7W/vsR8WhZ54aIiPH+ISVJh2YsRw7/CFywT9tVwKbMnA9sKvMAFwLzy2MlcCO0wgRYBZwFnAmsGg2U0mdl23r7bkuSNMkOGg6Z+R/Azn2alwJry/Ra4OK29puz5fvAjIg4BTgf2JiZOzNzF7ARuKAse2Nmfi9b31d6c9trSZIa0u05h5Mz8wWA8nxSaZ8FbG/rN1zaDtQ+3KFdktSg8f6EdKfzBdlFe+cXj1hJawiKN7/5zd3UB8C71r6rq/UeXf5o19uUpMNJt0cOL5YhIcrzjtI+DMxp6zcbeP4g7bM7tHeUmaszc2FmLhwYGNPtQSRJXeg2HNYDo1ccLQfubGu/vFy1tAjYXYad7gaWRMTMciJ6CXB3WfbziFhUrlK6vO21JEkNOeiwUkTcApwLnBgRw7SuOvo8cFtErACeAy4t3TcAFwFDwC+BjwBk5s6I+CzwQOn3mcwcPcn9MVpXRB0NfLc8JEkNOmg4ZOZl+1m0uEPfBK7Yz+usAdZ0aB8E3nmwOiRJk8dPSEuSKoaDJKliOEiSKoaDJKliOEiSKoaDJKliOEiSKoaDJKliOEiSKlM+HD684S1NlyBJfWfKh4MkqWY4SJIqhoMkqWI4SJIqhoMkqWI4SJIqhoMkqWI4SJIqhoMkqWI4SJIqhoMkqWI4SJIqhoMkqWI4SJIqhoMkqWI4SJIqhoMkqWI4SJIqhoMkqWI4SJIqhoMkqWI4SJIqhoMkqdJTOETEJyLi8Yh4LCJuiYijImJeRGyOiG0RcWtEHFH6Hlnmh8ryuW2vc3Vpfyoizu/tR5Ik9arrcIiIWcBfAAsz853ANGAZ8AXgusycD+wCVpRVVgC7MvOtwHWlHxFxelnvHcAFwFcjYlq3dUmSetfrsNJ04OiImA4cA7wAnAfcXpavBS4u00vLPGX54oiI0r4uM3+Vmc8AQ8CZPdYlSepB1+GQmT8Gvgg8RysUdgNbgJcyc0/pNgzMKtOzgO1l3T2l/wnt7R3WeZWIWBkRgxExODIy0m3pkqSD6GVYaSatv/rnAW8CjgUu7NA1R1fZz7L9tdeNmaszc2FmLhwYGDj0oiVJY9LLsNL7gWcycyQzfw3cAbwXmFGGmQBmA8+X6WFgDkBZfhyws729wzqSpAb0Eg7PAYsi4phy7mAx8ARwL3BJ6bMcuLNMry/zlOX3ZGaW9mXlaqZ5wHzg/h7qkiT1aPrBu3SWmZsj4nbgQWAP8BCwGvgOsC4iPlfabiqr3AR8MyKGaB0xLCuv83hE3EYrWPYAV2Tmb7qtS5LUu67DASAzVwGr9ml+mg5XG2Xmy8Cl+3mda4Fre6lFkjR+/IS0JKliOEiSKoaDJKliOEiSKoaDJKliOEiSKoaDJKliOEiSKoaDJKliOEiSKoaDJKliOEiSKoaDJKliOEiSKoaDJKliOEiSKoaDJKliOEiSKoaDJKliOEiSKoaDJKliOEiSKoaDJKliOEiSKoaDJKliOEiSKoaDJKliOEiSKoaDJKliOEiSKoaDJKliOEiSKj2FQ0TMiIjbI+LJiNgaEWdHxPERsTEitpXnmaVvRMQNETEUEY9ExIK211le+m+LiOW9/lCSpN70euTwZeDfMvPtwLuBrcBVwKbMnA9sKvMAFwLzy2MlcCNARBwPrALOAs4EVo0GiiSpGV2HQ0S8EXgfcBNAZv5fZr4ELAXWlm5rgYvL9FLg5mz5PjAjIk4Bzgc2ZubOzNwFbAQu6LYuSVLvejlyOBUYAb4REQ9FxNcj4ljg5Mx8AaA8n1T6zwK2t60/XNr21y5Jakgv4TAdWADcmJlnAL9g7xBSJ9GhLQ/QXr9AxMqIGIyIwZGRkUOtV5I0Rr2EwzAwnJmby/zttMLixTJcRHne0dZ/Ttv6s4HnD9BeyczVmbkwMxcODAz0ULok6UC6DofM/AmwPSLeVpoWA08A64HRK46WA3eW6fXA5eWqpUXA7jLsdDewJCJmlhPRS0qbJKkh03tc/8+Bb0XEEcDTwEdoBc5tEbECeA64tPTdAFwEDAG/LH3JzJ0R8VnggdLvM5m5s8e6JEk96CkcMvNhYGGHRYs79E3giv28zhpgTS+1SJLGj5+QliRVDAdJUsVwkCRVDAdJUsVwkCRVDAdJUsVwkCRVDAdJUsVwkCRVDAdJUsVwkCRVDAdJUsVwkCRVDAdJUsVwkCRVDAdJUsVwkCRVDAdJUsVwkCRVDAdJUsVwkCRVDAdJUsVwkCRVDAdJUsVwkCRVDAdJUsVwkCRVDAdJUsVwkCRVDAdJUsVwkCRVpjddwFSx9e2ndb3uaU9uHcdKJOngej5yiIhpEfFQRNxV5udFxOaI2BYRt0bEEaX9yDI/VJbPbXuNq0v7UxFxfq81SZJ6Mx7DSlcC7X/afgG4LjPnA7uAFaV9BbArM98KXFf6ERGnA8uAdwAXAF+NiGnjUJckqUs9hUNEzAb+CPh6mQ/gPOD20mUtcHGZXlrmKcsXl/5LgXWZ+avMfAYYAs7spS5JUm96PXK4HvgU8NsyfwLwUmbuKfPDwKwyPQvYDlCW7y79X2nvsI4kqQFdh0NEfADYkZlb2ps7dM2DLDvQOvtuc2VEDEbE4MjIyCHVK0kau16OHM4BPhgRzwLraA0nXQ/MiIjRq6BmA8+X6WFgDkBZfhyws729wzqvkpmrM3NhZi4cGBjooXRJ0oF0HQ6ZeXVmzs7MubROKN+TmR8C7gUuKd2WA3eW6fVlnrL8nszM0r6sXM00D5gP3N9tXZKk3k3E5xz+ClgXEZ8DHgJuKu03Ad+MiCFaRwzLADLz8Y
i4DXgC2ANckZm/mYC6JEljNC7hkJn3AfeV6afpcLVRZr4MXLqf9a8Frh2PWiRJvfP2GZKkiuEgSaoYDpKkiuEgSaoYDpKkiuEgSar4fQ6HoJfvZJCkw4nhMFV8+riGtru7me1K6onDSpKkiuEgSaoYDpKkiuEgSaoYDpKkiuEgSaoYDpKkiuEgSar4Ibg2H/3el/e77GtnXzmJlUhSszxykCRVDAdJUsVwkCRVDAdJUsVwkCRVDAdJUsVwkCRVDAdJUsVwkCRV/IT04aCpr/iUNGV55CBJqhgOkqSK4SBJqhgOkqSK4SBJqhgOkqRK1+EQEXMi4t6I2BoRj0fElaX9+IjYGBHbyvPM0h4RcUNEDEXEIxGxoO21lpf+2yJiee8/liSpF70cOewB/jIzTwMWAVdExOnAVcCmzJwPbCrzABcC88tjJXAjtMIEWAWcBZwJrBoNFElSM7oOh8x8ITMfLNM/B7YCs4ClwNrSbS1wcZleCtycLd8HZkTEKcD5wMbM3JmZu4CNwAXd1iVJ6t24nHOIiLnAGcBm4OTMfAFaAQKcVLrNAra3rTZc2vbX3mk7KyNiMCIGR0ZGxqN0SVIHPYdDRLwe+Dbw8cz82YG6dmjLA7TXjZmrM3NhZi4cGBg49GIlSWPSUzhExOtoBcO3MvOO0vxiGS6iPO8o7cPAnLbVZwPPH6BdktSQXq5WCuAmYGtmfqlt0Xpg9Iqj5cCdbe2Xl6uWFgG7y7DT3cCSiJhZTkQvKW2SpIb0clfWc4A/Bh6NiIdL218Dnwdui4gVwHPApWXZBuAiYAj4JfARgMzcGRGfBR4o/T6TmTt7qEuS1KOuwyEz/5PO5wsAFnfon8AV+3mtNcCabmuRJI0vPyEtSar4ZT+aWE1+UdGndze3bekw55GDJKnikcMYffR7X+aec1vT593X8dSJJL1meOQgSaoYDpKkiuEgSaoYDpKkiuEgSaoYDpKkipeyHga2rntT1+uetswb3Eo6dB45SJIqU/LI4ba/2/PK9IZ3tz7gJknayyMHSVLFcJAkVQwHSVLFcJAkVQwHSVJlSl6tpCmiqS8a8kuG9BpgOHThnnO/8qp5v99B0muNw0qSpIrhIEmqGA6SpIrhIEmqTMkT0hve/bscNfOTrZldX2q2GEnqQx45SJIqhoMkqWI4SJIqhgPwsucdJOlVpuQJaWlCedsOvQYYDuOg/XYa/XYrjW6/f9rvnpamNoeVJEmVvgmHiLggIp6KiKGIuKrpeiRpKuuLcIiIacBXgAuB04HLIuL0yazBk9KStFe/nHM4ExjKzKcBImIdsBR4otGqutDP5x8ORbfnKsDzFY1p6kQ4eDL8NahfwmEWsL1tfhg4a7KLeHnXl/beVmMcvFaCQtLU0y/hEB3asuoUsRJYWWb/NyKe6n6T3xmdOBH4aYf2pu1TV18YW03XTHwh+zh8f1eTb2LquqbTP+Exm1q/q970WtNbxtqxX8JhGJjTNj8bqMYmMnM1sHo8NxwRg5m5cDxfczz0Y139WBP0Z139WBP0Z139WBP0Z12TWVNfnJAGHgDmR8S8iDgCWAasb7gmSZqy+uLIITP3RMSfAXcD04A1mfl4w2VJ0pTVF+EAkJkbgA0NbHpch6nGUT/W1Y81QX/W1Y81QX/W1Y81QX/WNWk1RWZ13leSNMX1yzkHSVIfmdLh0NQtOyJiTUTsiIjH2tqOj4iNEbGtPM8s7RERN5QaH4mIBRNU05yIuDcitkbE4xFxZZ/UdVRE3B8RPyh1XVPa50XE5lLXreVCBiLiyDI/VJbPnYi6yramRcRDEXFXH9X0bEQ8GhEPR8RgaWv6PZwREbdHxJNl/zq7D2p6W/kdjT5+FhEf74O6PlH288ci4pay/zezX2XmlHzQOvH9I+BU4AjgB8Dpk7Tt9wELgMfa2v4euKpMXwV8oUxfBHyX1mdBFgGbJ6imU4AFZfoNwA9p3cqk6boCeH2Zfh2wuWzvNmBZaf8a8LEy/afA18r0MuDWCXwfPwn8E3BXme+Hmp4FTtynren3cC3wJ2X6CGBG0zXtU9804Ce0PgPQWF20Pgz8DHB02/704ab2qwn9pffzAzgbuLtt/mrg6knc/lxeHQ5PAaeU6VOAp8r0PwCXdeo3wfXdCfxhP9UFHAM8SOvT8z8Fpu/7XtK64u3sMj299IsJqGU2sAk4D7ir/KfRaE3l9Z+lDofG3kPgjeU/vOiXmjrUuAT4r6brYu+dIo4v+8ldwPlN7VdTeVip0y07ZjVUC8DJmfkCQHk+qbRPep3l8PQMWn+lN15XGb55GNgBbKR1xPdSZu7psO1X6irLdwMnTEBZ1wOfAn5b5k/og5qgdWeBf4+ILdG6owA0+x6eCowA3yhDcF+PiGMbrmlfy4BbynRjdWXmj4EvAs8BL9DaT7bQ0H41lcNhTLfs6AOTWmdEvB74NvDxzPzZgbp2aJuQujLzN5n5Hlp/rZ8JnHaAbU94XRHxAWBHZm5pb26ypjbnZOYCWnc4viIi3neAvpNR13RaQ6g3ZuYZwC9oDdc0WdPejbXG7z8I/PPBunZoG+/9aiatG47OA94EHEvrfdzfdie0pqkcDmO6ZcckejEiTgEozztK+6TVGRGvoxUM38rMO/qlrlGZ+RJwH60x3xkRMfo5nfZtv1JXWX4csHOcSzkH+GBEPAusozW0dH3DNQGQmc+X5x3Av9AK0ybfw2FgODM3l/nbaYVFv+xXFwIPZuaLZb7Jut4PPJOZI5n5a+AO4L00tF9N5XDot1t2rAeWl+nltMb8R9svL1dLLAJ2jx72jqeICOAmYGtmtn+5RdN1DUTEjDJ9NK1/QFuBe4FL9lPXaL2XAPdkGZQdL5l5dWbOzsy5tPabezLzQ03WBBARx0bEG0anaY2lP0aD72Fm/gTYHhFvK02Lad2Kv9H9qs1l7B1SGt1+U3U9ByyKiGPKv8fR31Uz+9VEnujp9wetKxB+SGsM+28mcbu30BpT/DWt9F9Ba6xwE7CtPB9f+gatL0L6EfAosHCCavoDWoekjwAPl8dFfVDX7wEPlboeA/62tJ8K3A8M0RoSOLK0H1Xmh8ryUyf4vTyXvVcrNVpT2f4PyuPx0X26D97D9wCD5T38V2Bm0zWVbR0D/A9wXFtb07+ra4Any77+TeDIpvYrPyEtSapM5WElSdJ+GA6SpIrhIEmqGA6SpIrhIEmqGA6SpIrhIEmqGA6SpMr/A8xkpSYy/c9MAAAAAElFTkSuQmCC\n", 189 | "text/plain": [ 190 | "
" 191 | ] 192 | }, 193 | "metadata": { 194 | "needs_background": "light" 195 | }, 196 | "output_type": "display_data" 197 | } 198 | ], 199 | "source": [ 200 | "plt.hist(processed_df.values[:,1])\n", 201 | "plt.hist(processed_df.values[:,2])\n", 202 | "plt.hist(processed_df.values[:,3])\n", 203 | "plt.hist(processed_df.values[:,4])\n", 204 | "plt.hist(processed_df.values[:,5])\n", 205 | "plt.hist(processed_df.values[:,6])" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": 10, 211 | "metadata": {}, 212 | "outputs": [ 213 | { 214 | "data": { 215 | "text/plain": [ 216 | "(array([5.880e+02, 0.000e+00, 0.000e+00, 7.005e+03, 2.046e+03, 7.020e+02,\n", 217 | " 9.900e+01, 1.400e+01, 3.000e+00, 2.000e+00]),\n", 218 | " array([-9. , -6.2, -3.4, -0.6, 2.2, 5. , 7.8, 10.6, 13.4, 16.2, 19. ]),\n", 219 | " )" 220 | ] 221 | }, 222 | "execution_count": 10, 223 | "metadata": {}, 224 | "output_type": "execute_result" 225 | }, 226 | { 227 | "data": { 228 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYcAAAD8CAYAAACcjGjIAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvOIA7rQAAEYpJREFUeJzt3X+s3XV9x/Hna1TEH9EWqU5bXDE2KrpsshusuhgCGwIayxLJWNxoCEvj0vl7Udw/9RdREyNKhhgiaDEGJOhGM9lIw4+4JRMpYkSspg1scKVCTQGdxh+d7/1xPlcP/dzblnsOPffQ5yNpzvm+z+d7zvubb9vX/Xy/3/O9qSokSRr2e5NuQJK09BgOkqSO4SBJ6hgOkqSO4SBJ6hgOkqSO4SBJ6hw0HJJcmeShJN8dqh2bZFuSne1xRasnySVJdiX5TpKThtbZ0MbvTLJhqP4nSe5q61ySJOPeSEnS43MoM4cvAGfsV7sQuKmq1gI3tWWAM4G17c9G4DIYhAmwGXgVcDKweS5Q2piNQ+vt/1mSpMNs2cEGVNXXk6zZr7weOKU93wLcCryv1a+qwdeuv5FkeZLnt7HbqmovQJJtwBlJbgWeVVX/1epXAWcD/3awvo477rhas2b/tiRJC7njjjt+XFUrD2XsQcNhAc+rqt0AVbU7yXNbfRVw/9C42VY7UH12nvq8kmxkMMvghS98Idu3b19k+5J05EnyP4c6dtwnpOc7X1CLqM+rqi6vqpmqmlm58pDCT5K0CIsNhwfb4SLa40OtPgscPzRuNfDAQeqr56lLkiZoseGwFZi74mgDcP1Q/bx21dI64NF2+OlG4PQkK9qJ6NOBG9trP02yrl2ldN7Qe0mSJuSg5xySXM3ghPJxSWYZXHX0MeDaJBcA9wHntOE3AGcBu4CfA+cDVNXeJB8Gbm/jPjR3chr4OwZXRD2NwYnog56MliQ9sTKtv89hZmamPCEtSYcuyR1VNXMoY/2GtCSpYzhIkjqGgySpYzhIkjqL/Ya05rHmwq8d8PX//tgbDlMnkjQaZw6SpI7hIEnqGA6SpI7hIEnqGA6SpI7hIEnqGA6SpI7hIEnqGA6SpI7hIEnqGA6SpI7hIEnqGA6SpI7hIEnqGA6SpI7hIEnqGA6SpI7hIEnqGA6SpI7hIEnqGA6SpI7hIEnqGA6SpI7hIEnqGA6SpI7hIEnqGA6SpI7hIEnqGA6SpM5I4ZDkXUnuTvLdJFcnOSbJCUluS7IzyZeTHN3GPrUt72qvrxl6n/e3+g+SvH60TZIkjWrR4ZBkFfB2YKaqXgEcBZwLfBy4uKrWAg8DF7RVLgAerqoXAxe3cSQ5sa33cuAM4DNJjlpsX5Kk0Y16WGkZ8LQky4CnA7uBU4Hr2utbgLPb8/Vtmfb6aUnS6tdU1S+r6l5gF3DyiH1Jkkaw6HCoqh8CnwDuYxAKjwJ3AI9U1b42bBZY1Z6vAu5v6+5r458zXJ9nHUnSBIxyWGkFg5/6TwBeADwDOHOeoTW3ygKvLVSf7zM3JtmeZPuePXsef9OSpEMyymGlPwPurao9VfVr4KvAa4Dl7TATwGrggfZ8FjgeoL3+bGDvcH2edR6jqi6vqpmqmlm5cuUIrUuSDmSUcLgPWJfk6e3cwWnA94BbgDe3MRuA69vzrW2Z9vrNVVWtfm67mukEYC3wzRH6kiSNaNnBh8yvqm5Lch3wLWAfcCdwOfA14JokH2m1K9oqVwBfTLKLwYzh3PY+dye5lkGw7AM2VdX/LbYvSdLoFh0OAFW1Gdi8X/ke5rnaqKp+AZyzwPtcBFw0Si+SpPHxG9KSpI7hIEnqGA6SpI7hIEnqGA6SpI7hIEnqGA6SpI7hIEnqGA6SpM5I35DW+P3hlj98zPJdG+6aUCfSEeoDz57AZz56+D/zIJw5SJI6hoMkqWM4SJI6hoMkqWM4SJI6hoMkqeOlrGP0tnsvO8iINxyWPiRpVM4cJEkdw0GS1DEcJEkdw0GS1DEcJEkdw0GS1DEcJEkdw0GS1DEcJEkdw0GS1DEcJEkdw0GS1DEcJEkdw0GS1DEcJEkdw0GS1DEcJEmdkcIhyfIk1yX5fpIdSV6d5Ngk25LsbI8r2tgkuSTJriTfSXLS0PtsaON3Jtkw6kZJkkYz6szh08C/V9VLgT8CdgAXAjdV1VrgprYMcCawtv3ZCFwGkORYYDPwKuBkYPNcoEiSJmPR4ZDkWcDrgCsAqupXVfUIsB7Y0oZtAc5uz9cDV9XAN4DlSZ4PvB7YVlV7q+phYBtwxmL7kiSNbpSZw4uAPcDnk9yZ5HNJngE8r6p2A7TH57bxq4D7h9afbbWF6pKkCRklHJYBJwGXVdUrgZ/xu0NI88k8tTpAvX+DZGOS7Um279mz5/H2K0k6RKOEwywwW1W3teXrGITFg+1wEe3xoaHxxw+tvxp44AD1TlVdXlUzVTWzcuXKEVqXJB3IosOhqn4E3J/kJa10GvA9YCswd8XRBuD69nwrcF67amkd8Gg77HQjcHqSFe1E9OmtJkmakGUjrv824EtJjgbuAc5n
EDjXJrkAuA84p429ATgL2AX8vI2lqvYm+TBwexv3oaraO2JfkqQRjBQOVfVtYGael06bZ2wBmxZ4nyuBK0fpRZI0Pn5DWpLUMRwkSR3DQZLUMRwkSR3DQZLUMRwkSR3DQZLUMRwkSR3DQZLUMRwkSR3DQZLUMRwkSR3DQZLUMRwkSR3DQZLUMRwkSR3DQZLUMRwkSR3DQZLUMRwkSR3DQZLUMRwkSR3DQZLUMRwkSZ1lk25A87v2o/sA2PHRl/229rLv75hUO5KOMM4cJEkdw0GS1DEcJEkdw0GS1DEcJEkdw0GS1DEcJEkdw0GS1DEcJEkdw0GS1Bk5HJIcleTOJP/alk9IcluSnUm+nOToVn9qW97VXl8z9B7vb/UfJHn9qD1JkkYzjpnDO4Dhm/58HLi4qtYCDwMXtPoFwMNV9WLg4jaOJCcC5wIvB84APpPkqDH0JUlapJHCIclq4A3A59pygFOB69qQLcDZ7fn6tkx7/bQ2fj1wTVX9sqruBXYBJ4/SlyRpNKPOHD4FvBf4TVt+DvBIVe1ry7PAqvZ8FXA/QHv90Tb+t/V51nmMJBuTbE+yfc+ePSO2LklayKLDIckbgYeq6o7h8jxD6yCvHWidxxarLq+qmaqaWbly5ePqV5J06Eb5fQ6vBd6U5CzgGOBZDGYSy5Msa7OD1cADbfwscDwwm2QZ8Gxg71B9zvA6kqQJWPTMoareX1Wrq2oNgxPKN1fVW4BbgDe3YRuA69vzrW2Z9vrNVVWtfm67mukEYC3wzcX2JUka3RPxm+DeB1yT5CPAncAVrX4F8MUkuxjMGM4FqKq7k1wLfA/YB2yqqv97Avpasna89He/7e3aCfYhSXPGEg5VdStwa3t+D/NcbVRVvwDOWWD9i4CLxtGLJGl0fkNaktQxHCRJnSfinIOWoEvfevNEPnfTZ0+dyOdKGo0zB0lSx3CQJHUMB0lSx3CQJHUMB0lSx3CQJHUMB0lSx3CQJHUMB0lSx3CQJHUMB0lSx3CQJHUMB0lSx3CQJHUMB0lSx3CQJHUMB0lSx3CQJHUMB0lSx3CQJHUMB0lSx3CQJHUMB0lSx3CQJHUMB0lSZ9mkG9CT26VvvXlin73ps6dO7LOlaefMQZLUMRwkSR3DQZLUMRwkSR1PSB9GC56cPeXSQ1r/5gme3JV0ZFn0zCHJ8UluSbIjyd1J3tHqxybZlmRne1zR6klySZJdSb6T5KSh99rQxu9MsmH0zZIkjWKUw0r7gPdU1cuAdcCmJCcCFwI3VdVa4Ka2DHAmsLb92QhcBoMwATYDrwJOBjbPBYokaTIWfVipqnYDu9vznybZAawC1gOntGFbgFuB97X6VVVVwDeSLE/y/DZ2W1XtBUiyDTgDuHqxvS1Vv3j4kwd8/ZgV7z5MnUjSgY3lhHSSNcArgduA57XgmAuQ57Zhq4D7h1abbbWF6vN9zsYk25Ns37NnzzhalyTNY+RwSPJM4CvAO6vqJwcaOk+tDlDvi1WXV9VMVc2sXLny8TcrSTokI4VDkqcwCIYvVdVXW/nBdriI9vhQq88Cxw+tvhp44AB1SdKEjHK1UoArgB1VNXwwfSswd8XRBuD6ofp57aqldcCj7bDTjcDpSVa0E9Gnt5okaUJG+Z7Da4G/Ae5K8u1W+0fgY8C1SS4A7gPOaa/dAJwF7AJ+DpwPUFV7k3wYuL2N+9DcyWlJ0mSMcrXSfzL/+QKA0+YZX8CmBd7rSuDKxfYiSRovb58hSeoYDpKkjuEgSeoYDpKkjuEgSeoYDpKkjr/PQdLS9YFnT7qDI5YzB0lSx3CQJHUMB0lSx3CQJHUMB0lSx3CQJHUMB0lSx3CQJHUMB0lSx3CQJHUMB0lSx3sr6Unr0rfePJHP3fTZUyfyudI4OXOQJHWOyJnDpH6ilKRp4cxBktQxHCRJnSPysNJS9YuHP7nga8esePdh7ETSkc6ZgySpc0TOHPwJXXqc/HWdRxxnDpKkjuEgSeoYDpKkjuEgSeoYDpKkjuEgSeockZeySk+kid0N9vf/YiKfqycnZw6SpM6SCYckZyT5QZJdSS6cdD+SdCRbEoeVkhwFXAr8OTAL3J5ka1V9b7KdSdPj0h/988Q+20NaTz5LIhyAk4FdVXUPQJJrgPWA4dAc6JYf4G0/JI3XUgmHVcD9Q8uzwKsm1MtU8n5RmqRJzVqcsTxxlko4ZJ5adYOSjcDGtvi/SX4w/la+No43OQ748TjeaDzGsk2w5LZrbNyu6fLb7fr7CTcyNh/M4dpXf3CoA5dKOMwCxw8trwYe2H9QVV0OXH64mlqsJNurambSfYyb2zVd3K7psRS3aalcrXQ7sDbJCUmOBs4Ftk64J0k6Yi2JmUNV7Uvy98CNwFHAlVV194TbkqQj1pIIB4CqugG4YdJ9jMmSP/S1SG7XdHG7pseS26ZUded9JUlHuKVyzkGStIQYDmP2ZLgNSJLjk9ySZEeSu5O8o9WPTbItyc72uGLSvS5GkqOS3JnkX9vyCUlua9v15XZRxFRJsjzJdUm+3/bbq58M+yvJu9rfwe8muTrJMdO4v5JcmeShJN8dqs27fzJwSfs/5DtJTppEz4bDGA3dBuRM4ETgr5KcONmuFmUf8J6qehmwDtjUtuNC4KaqWgvc1Jan0TuAHUPLHwcubtv1MHDBRLoazaeBf6+qlwJ/xGD7pnp/JVkFvB2YqapXMLhY5Vymc399AThjv9pC++dMYG37sxG47DD1+BiGw3j99jYgVfUrYO42IFOlqnZX1bfa858y+I9mFYNt2dKGbQHOnkyHi5dkNfAG4HNtOcCpwHVtyNRtV5JnAa8DrgCoql9V1SM8CfYXg4tmnpZkGfB0YDdTuL+q6uvA3v3KC+2f9cBVNfANYHmS5x+eTn/HcBiv+W4DsmpCvYxFkjXAK4HbgOdV1W4YBAjw3Ml1tmifAt4L/KYtPwd4pKr2teVp3GcvAvYAn2+Hyz6X5BlM+f6qqh8CnwDuYxAKjwJ3MP37a85C+2dJ/D9iOIzXId0GZFokeSbwFeCdVfWTSfczqiRvBB6qqjuGy/MMnbZ9tgw4Cbisql4J/IwpO4Q0n3YMfj1wAvAC4BkMDrnsb9r218Esib+ThsN4HdJtQKZBkqcwCIYvVdVXW/nBuelte3xoUv0t0muBNyX5bwaH/E5lMJNY3g5bwHTus1lgtqpua8vXMQiLad9ffwbcW1V7qurXwFeB1zD9+2vOQvtnSfw/YjiM15PiNiDtOPwVwI6qGr7d61ZgQ3u+Abj+cPc2iqp6f1Wtrqo1DPbNzVX1FuAW4M1t2DRu14+A+5O8pJVOY3C7+6neXwwOJ61L8vT2d3Juu6Z6fw1ZaP9sBc5rVy2tAx6dO/x0OPkluDFLchaDn0bnbgNy0YRbetyS/CnwH8Bd/O7Y/D8yOO9wLfBCBv9wz6mq/U+yTYUkpwD/UFVvTPIiBjOJY4E7gb+uql9Osr/HK8kfMzjJfjRwD3A+gx/+pnp/Jfkg8JcMrqC7E/hbBsffp2p/JbkaOIXBHWUfBDYD/8I8+6cF4T8xuLrp58D5VbX9sPd
sOEiS9udhJUlSx3CQJHUMB0lSx3CQJHUMB0lSx3CQJHUMB0lSx3CQJHX+H6Fw0PqKkMWvAAAAAElFTkSuQmCC\n", 229 | "text/plain": [ 230 | "
" 231 | ] 232 | }, 233 | "metadata": { 234 | "needs_background": "light" 235 | }, 236 | "output_type": "display_data" 237 | } 238 | ], 239 | "source": [ 240 | "plt.hist(processed_df.values[:,7])\n", 241 | "plt.hist(processed_df.values[:,8])\n", 242 | "plt.hist(processed_df.values[:,9])\n", 243 | "plt.hist(processed_df.values[:,10])\n", 244 | "plt.hist(processed_df.values[:,11])\n", 245 | "plt.hist(processed_df.values[:,12])" 246 | ] 247 | }, 248 | { 249 | "cell_type": "code", 250 | "execution_count": 11, 251 | "metadata": {}, 252 | "outputs": [], 253 | "source": [ 254 | "df = processed_df.values\n", 255 | "n, p = np.shape(df)\n", 256 | "\n", 257 | "# keep only pos values\n", 258 | "for i in range (p):\n", 259 | " df = df[df[:,i] >= 0] " 260 | ] 261 | }, 262 | { 263 | "cell_type": "code", 264 | "execution_count": 12, 265 | "metadata": {}, 266 | "outputs": [], 267 | "source": [ 268 | "# get label\n", 269 | "y = df[:,0]\n", 270 | "# get df\n", 271 | "df_ = df[:,1::]" 272 | ] 273 | }, 274 | { 275 | "cell_type": "code", 276 | "execution_count": 13, 277 | "metadata": {}, 278 | "outputs": [], 279 | "source": [ 280 | "col[1::]\n", 281 | "free = pd.DataFrame(df_, dtype='float')\n", 282 | "free.columns = [col[1::]]\n", 283 | "free = free + 1\n", 284 | "\n", 285 | "labels = pd.DataFrame(y, dtype='float')\n", 286 | "labels.columns = [col[0]]" 287 | ] 288 | }, 289 | { 290 | "cell_type": "code", 291 | "execution_count": 14, 292 | "metadata": {}, 293 | "outputs": [], 294 | "source": [ 295 | "conditionals = free[['ExternalRiskEstimate', 'MSinceOldestTradeOpen', 'AverageMInFile']]" 296 | ] 297 | }, 298 | { 299 | "cell_type": "code", 300 | "execution_count": 15, 301 | "metadata": {}, 302 | "outputs": [ 303 | { 304 | "data": { 305 | "text/html": [ 306 | "
\n", 307 | "\n", 320 | "\n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | "
ExternalRiskEstimateMSinceOldestTradeOpenAverageMInFile
056.0145.085.0
168.067.025.0
267.0170.074.0
382.0334.0133.0
460.0138.079.0
\n", 362 | "
" 363 | ], 364 | "text/plain": [ 365 | " ExternalRiskEstimate MSinceOldestTradeOpen AverageMInFile\n", 366 | "0 56.0 145.0 85.0\n", 367 | "1 68.0 67.0 25.0\n", 368 | "2 67.0 170.0 74.0\n", 369 | "3 82.0 334.0 133.0\n", 370 | "4 60.0 138.0 79.0" 371 | ] 372 | }, 373 | "execution_count": 15, 374 | "metadata": {}, 375 | "output_type": "execute_result" 376 | } 377 | ], 378 | "source": [ 379 | "conditionals.head()" 380 | ] 381 | }, 382 | { 383 | "cell_type": "code", 384 | "execution_count": 16, 385 | "metadata": {}, 386 | "outputs": [ 387 | { 388 | "data": { 389 | "text/html": [ 390 | "
\n", 391 | "\n", 404 | "\n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | "
ExternalRiskEstimateMSinceOldestTradeOpenMSinceMostRecentTradeOpenAverageMInFileNumSatisfactoryTradesNumTrades60Ever2DerogPubRecNumTrades90Ever2DerogPubRecPercentTradesNeverDelqMaxDelq2PublicRecLast12MMaxDelqEverNumTotalTradesNumTradesOpeninLast12MPercentInstallTradesNumInqLast6MNumInqLast6Mexcl7daysNetFractionRevolvingBurdenNumRevolvingTradesWBalanceNumInstallTradesWBalanceNumBank2NatlTradesWHighUtilizationPercentTradesWBalance
count8291.0000008291.0000008291.0000008291.0000008291.0000008291.0000008291.0000008291.0000008291.0000008291.0000008291.000008291.0000008291.0000008291.0000008291.0000008291.0000008291.0000008291.0000008291.0000008291.000000
mean72.783018202.6205529.28404378.44602623.5808711.5560251.35761793.8310226.7415277.38318725.293212.98757735.6733812.4734052.41587336.9232905.3274643.5133282.12507567.845495
std9.66510894.4787828.66124330.96353911.0593561.2185110.96141910.7081371.6282001.80989212.595461.86355215.5534472.1200662.07983228.4469313.0384341.6569941.54706220.508632
min37.0000003.0000001.0000005.0000002.0000001.0000001.00000021.0000001.0000003.0000002.000001.0000003.0000001.0000001.0000001.0000001.0000002.0000001.0000008.000000
25%65.000000139.0000004.00000059.00000016.0000001.0000001.00000091.0000006.0000007.00000016.000002.00000024.0000001.0000001.00000011.5000003.0000002.0000001.00000051.000000
50%73.000000188.0000007.00000076.00000022.0000001.0000001.00000098.0000007.0000007.00000023.000003.00000034.0000002.0000002.00000032.0000005.0000003.0000002.00000068.000000
75%81.000000257.50000012.00000096.00000030.0000002.0000001.000000101.0000008.0000009.00000032.000004.00000046.0000003.0000003.00000058.0000007.0000004.0000003.00000084.000000
max95.000000804.000000107.000000245.00000080.00000020.00000020.000000101.00000010.0000009.000000105.0000020.00000094.00000067.00000067.000000233.00000033.00000024.00000019.000000101.000000
\n", 617 | "
" 618 | ], 619 | "text/plain": [ 620 | " ExternalRiskEstimate MSinceOldestTradeOpen MSinceMostRecentTradeOpen \\\n", 621 | "count 8291.000000 8291.000000 8291.000000 \n", 622 | "mean 72.783018 202.620552 9.284043 \n", 623 | "std 9.665108 94.478782 8.661243 \n", 624 | "min 37.000000 3.000000 1.000000 \n", 625 | "25% 65.000000 139.000000 4.000000 \n", 626 | "50% 73.000000 188.000000 7.000000 \n", 627 | "75% 81.000000 257.500000 12.000000 \n", 628 | "max 95.000000 804.000000 107.000000 \n", 629 | "\n", 630 | " AverageMInFile NumSatisfactoryTrades NumTrades60Ever2DerogPubRec \\\n", 631 | "count 8291.000000 8291.000000 8291.000000 \n", 632 | "mean 78.446026 23.580871 1.556025 \n", 633 | "std 30.963539 11.059356 1.218511 \n", 634 | "min 5.000000 2.000000 1.000000 \n", 635 | "25% 59.000000 16.000000 1.000000 \n", 636 | "50% 76.000000 22.000000 1.000000 \n", 637 | "75% 96.000000 30.000000 2.000000 \n", 638 | "max 245.000000 80.000000 20.000000 \n", 639 | "\n", 640 | " NumTrades90Ever2DerogPubRec PercentTradesNeverDelq \\\n", 641 | "count 8291.000000 8291.000000 \n", 642 | "mean 1.357617 93.831022 \n", 643 | "std 0.961419 10.708137 \n", 644 | "min 1.000000 21.000000 \n", 645 | "25% 1.000000 91.000000 \n", 646 | "50% 1.000000 98.000000 \n", 647 | "75% 1.000000 101.000000 \n", 648 | "max 20.000000 101.000000 \n", 649 | "\n", 650 | " MaxDelq2PublicRecLast12M MaxDelqEver NumTotalTrades \\\n", 651 | "count 8291.000000 8291.000000 8291.00000 \n", 652 | "mean 6.741527 7.383187 25.29321 \n", 653 | "std 1.628200 1.809892 12.59546 \n", 654 | "min 1.000000 3.000000 2.00000 \n", 655 | "25% 6.000000 7.000000 16.00000 \n", 656 | "50% 7.000000 7.000000 23.00000 \n", 657 | "75% 8.000000 9.000000 32.00000 \n", 658 | "max 10.000000 9.000000 105.00000 \n", 659 | "\n", 660 | " NumTradesOpeninLast12M PercentInstallTrades NumInqLast6M \\\n", 661 | "count 8291.000000 8291.000000 8291.000000 \n", 662 | "mean 2.987577 35.673381 2.473405 \n", 663 | "std 1.863552 15.553447 2.120066 \n", 664 | "min 1.000000 3.000000 1.000000 \n", 665 | "25% 2.000000 24.000000 1.000000 \n", 666 | "50% 3.000000 34.000000 2.000000 \n", 667 | "75% 4.000000 46.000000 3.000000 \n", 668 | "max 20.000000 94.000000 67.000000 \n", 669 | "\n", 670 | " NumInqLast6Mexcl7days NetFractionRevolvingBurden \\\n", 671 | "count 8291.000000 8291.000000 \n", 672 | "mean 2.415873 36.923290 \n", 673 | "std 2.079832 28.446931 \n", 674 | "min 1.000000 1.000000 \n", 675 | "25% 1.000000 11.500000 \n", 676 | "50% 2.000000 32.000000 \n", 677 | "75% 3.000000 58.000000 \n", 678 | "max 67.000000 233.000000 \n", 679 | "\n", 680 | " NumRevolvingTradesWBalance NumInstallTradesWBalance \\\n", 681 | "count 8291.000000 8291.000000 \n", 682 | "mean 5.327464 3.513328 \n", 683 | "std 3.038434 1.656994 \n", 684 | "min 1.000000 2.000000 \n", 685 | "25% 3.000000 2.000000 \n", 686 | "50% 5.000000 3.000000 \n", 687 | "75% 7.000000 4.000000 \n", 688 | "max 33.000000 24.000000 \n", 689 | "\n", 690 | " NumBank2NatlTradesWHighUtilization PercentTradesWBalance \n", 691 | "count 8291.000000 8291.000000 \n", 692 | "mean 2.125075 67.845495 \n", 693 | "std 1.547062 20.508632 \n", 694 | "min 1.000000 8.000000 \n", 695 | "25% 1.000000 51.000000 \n", 696 | "50% 2.000000 68.000000 \n", 697 | "75% 3.000000 84.000000 \n", 698 | "max 19.000000 101.000000 " 699 | ] 700 | }, 701 | "execution_count": 16, 702 | "metadata": {}, 703 | "output_type": "execute_result" 704 | } 705 | ], 706 | "source": [ 707 | "free.describe()" 708 | ] 709 | }, 710 | { 711 | "cell_type": "code", 712 | "execution_count": 17, 713 | 
"metadata": {}, 714 | "outputs": [ 715 | { 716 | "data": { 717 | "text/html": [ 718 | "
\n", 719 | "\n", 732 | "\n", 733 | " \n", 734 | " \n", 735 | " \n", 736 | " \n", 737 | " \n", 738 | " \n", 739 | " \n", 740 | " \n", 741 | " \n", 742 | " \n", 743 | " \n", 744 | " \n", 745 | " \n", 746 | " \n", 747 | " \n", 748 | " \n", 749 | " \n", 750 | " \n", 751 | " \n", 752 | " \n", 753 | " \n", 754 | " \n", 755 | " \n", 756 | " \n", 757 | " \n", 758 | " \n", 759 | " \n", 760 | " \n", 761 | " \n", 762 | " \n", 763 | " \n", 764 | " \n", 765 | " \n", 766 | " \n", 767 | " \n", 768 | " \n", 769 | " \n", 770 | " \n", 771 | " \n", 772 | " \n", 773 | " \n", 774 | " \n", 775 | " \n", 776 | " \n", 777 | " \n", 778 | " \n", 779 | " \n", 780 | " \n", 781 | " \n", 782 | " \n", 783 | " \n", 784 | " \n", 785 | " \n", 786 | " \n", 787 | " \n", 788 | " \n", 789 | " \n", 790 | " \n", 791 | "
ExternalRiskEstimateMSinceOldestTradeOpenAverageMInFile
count8291.0000008291.0000008291.000000
mean72.783018202.62055278.446026
std9.66510894.47878230.963539
min37.0000003.0000005.000000
25%65.000000139.00000059.000000
50%73.000000188.00000076.000000
75%81.000000257.50000096.000000
max95.000000804.000000245.000000
\n", 792 | "
" 793 | ], 794 | "text/plain": [ 795 | " ExternalRiskEstimate MSinceOldestTradeOpen AverageMInFile\n", 796 | "count 8291.000000 8291.000000 8291.000000\n", 797 | "mean 72.783018 202.620552 78.446026\n", 798 | "std 9.665108 94.478782 30.963539\n", 799 | "min 37.000000 3.000000 5.000000\n", 800 | "25% 65.000000 139.000000 59.000000\n", 801 | "50% 73.000000 188.000000 76.000000\n", 802 | "75% 81.000000 257.500000 96.000000\n", 803 | "max 95.000000 804.000000 245.000000" 804 | ] 805 | }, 806 | "execution_count": 17, 807 | "metadata": {}, 808 | "output_type": "execute_result" 809 | } 810 | ], 811 | "source": [ 812 | "conditionals.describe()" 813 | ] 814 | }, 815 | { 816 | "cell_type": "code", 817 | "execution_count": 18, 818 | "metadata": {}, 819 | "outputs": [ 820 | { 821 | "name": "stderr", 822 | "output_type": "stream", 823 | "text": [ 824 | "C:\\Users\\fred0\\Anaconda3\\lib\\site-packages\\pandas\\core\\generic.py:3812: PerformanceWarning: dropping on a non-lexsorted multi-index without a level parameter may impact performance.\n", 825 | " new_axis = axis.drop(labels, errors=errors)\n" 826 | ] 827 | }, 828 | { 829 | "data": { 830 | "text/html": [ 831 | "
\n", 832 | "\n", 845 | "\n", 846 | " \n", 847 | " \n", 848 | " \n", 849 | " \n", 850 | " \n", 851 | " \n", 852 | " \n", 853 | " \n", 854 | " \n", 855 | " \n", 856 | " \n", 857 | " \n", 858 | " \n", 859 | " \n", 860 | " \n", 861 | " \n", 862 | " \n", 863 | " \n", 864 | " \n", 865 | " \n", 866 | " \n", 867 | " \n", 868 | " \n", 869 | " \n", 870 | " \n", 871 | " \n", 872 | " \n", 873 | " \n", 874 | " \n", 875 | " \n", 876 | " \n", 877 | " \n", 878 | " \n", 879 | " \n", 880 | " \n", 881 | " \n", 882 | " \n", 883 | " \n", 884 | " \n", 885 | " \n", 886 | " \n", 887 | " \n", 888 | " \n", 889 | " \n", 890 | " \n", 891 | " \n", 892 | " \n", 893 | " \n", 894 | " \n", 895 | " \n", 896 | " \n", 897 | " \n", 898 | " \n", 899 | " \n", 900 | " \n", 901 | " \n", 902 | " \n", 903 | " \n", 904 | " \n", 905 | " \n", 906 | " \n", 907 | " \n", 908 | " \n", 909 | " \n", 910 | " \n", 911 | " \n", 912 | " \n", 913 | " \n", 914 | " \n", 915 | " \n", 916 | " \n", 917 | " \n", 918 | " \n", 919 | " \n", 920 | " \n", 921 | " \n", 922 | " \n", 923 | " \n", 924 | " \n", 925 | " \n", 926 | " \n", 927 | " \n", 928 | " \n", 929 | " \n", 930 | " \n", 931 | " \n", 932 | " \n", 933 | " \n", 934 | " \n", 935 | " \n", 936 | " \n", 937 | " \n", 938 | " \n", 939 | " \n", 940 | " \n", 941 | " \n", 942 | " \n", 943 | " \n", 944 | " \n", 945 | " \n", 946 | " \n", 947 | " \n", 948 | " \n", 949 | " \n", 950 | " \n", 951 | " \n", 952 | " \n", 953 | " \n", 954 | " \n", 955 | " \n", 956 | " \n", 957 | " \n", 958 | " \n", 959 | " \n", 960 | " \n", 961 | " \n", 962 | " \n", 963 | " \n", 964 | " \n", 965 | " \n", 966 | " \n", 967 | " \n", 968 | " \n", 969 | " \n", 970 | " \n", 971 | " \n", 972 | " \n", 973 | " \n", 974 | " \n", 975 | " \n", 976 | " \n", 977 | " \n", 978 | " \n", 979 | " \n", 980 | " \n", 981 | " \n", 982 | " \n", 983 | " \n", 984 | " \n", 985 | " \n", 986 | " \n", 987 | " \n", 988 | " \n", 989 | " \n", 990 | " \n", 991 | " \n", 992 | " \n", 993 | " \n", 994 | " \n", 995 | " \n", 996 | " \n", 997 | " \n", 998 | " \n", 999 | " \n", 1000 | " \n", 1001 | " \n", 1002 | " \n", 1003 | " \n", 1004 | " \n", 1005 | " \n", 1006 | " \n", 1007 | " \n", 1008 | " \n", 1009 | " \n", 1010 | " \n", 1011 | " \n", 1012 | " \n", 1013 | " \n", 1014 | " \n", 1015 | " \n", 1016 | " \n", 1017 | " \n", 1018 | " \n", 1019 | " \n", 1020 | " \n", 1021 | " \n", 1022 | " \n", 1023 | " \n", 1024 | " \n", 1025 | " \n", 1026 | " \n", 1027 | " \n", 1028 | " \n", 1029 | " \n", 1030 | "
MSinceMostRecentTradeOpenNumSatisfactoryTradesNumTrades60Ever2DerogPubRecNumTrades90Ever2DerogPubRecPercentTradesNeverDelqMaxDelq2PublicRecLast12MMaxDelqEverNumTotalTradesNumTradesOpeninLast12MPercentInstallTradesNumInqLast6MNumInqLast6Mexcl7daysNetFractionRevolvingBurdenNumRevolvingTradesWBalanceNumInstallTradesWBalanceNumBank2NatlTradesWHighUtilizationPercentTradesWBalance
count8291.0000008291.0000008291.0000008291.0000008291.0000008291.0000008291.0000008291.000008291.0000008291.0000008291.0000008291.0000008291.0000008291.0000008291.0000008291.0000008291.000000
mean9.28404323.5808711.5560251.35761793.8310226.7415277.38318725.293212.98757735.6733812.4734052.41587336.9232905.3274643.5133282.12507567.845495
std8.66124311.0593561.2185110.96141910.7081371.6282001.80989212.595461.86355215.5534472.1200662.07983228.4469313.0384341.6569941.54706220.508632
min1.0000002.0000001.0000001.00000021.0000001.0000003.0000002.000001.0000003.0000001.0000001.0000001.0000001.0000002.0000001.0000008.000000
25%4.00000016.0000001.0000001.00000091.0000006.0000007.00000016.000002.00000024.0000001.0000001.00000011.5000003.0000002.0000001.00000051.000000
50%7.00000022.0000001.0000001.00000098.0000007.0000007.00000023.000003.00000034.0000002.0000002.00000032.0000005.0000003.0000002.00000068.000000
75%12.00000030.0000002.0000001.000000101.0000008.0000009.00000032.000004.00000046.0000003.0000003.00000058.0000007.0000004.0000003.00000084.000000
max107.00000080.00000020.00000020.000000101.00000010.0000009.000000105.0000020.00000094.00000067.00000067.000000233.00000033.00000024.00000019.000000101.000000
\n", 1031 | "
" 1032 | ], 1033 | "text/plain": [ 1034 | " MSinceMostRecentTradeOpen NumSatisfactoryTrades \\\n", 1035 | "count 8291.000000 8291.000000 \n", 1036 | "mean 9.284043 23.580871 \n", 1037 | "std 8.661243 11.059356 \n", 1038 | "min 1.000000 2.000000 \n", 1039 | "25% 4.000000 16.000000 \n", 1040 | "50% 7.000000 22.000000 \n", 1041 | "75% 12.000000 30.000000 \n", 1042 | "max 107.000000 80.000000 \n", 1043 | "\n", 1044 | " NumTrades60Ever2DerogPubRec NumTrades90Ever2DerogPubRec \\\n", 1045 | "count 8291.000000 8291.000000 \n", 1046 | "mean 1.556025 1.357617 \n", 1047 | "std 1.218511 0.961419 \n", 1048 | "min 1.000000 1.000000 \n", 1049 | "25% 1.000000 1.000000 \n", 1050 | "50% 1.000000 1.000000 \n", 1051 | "75% 2.000000 1.000000 \n", 1052 | "max 20.000000 20.000000 \n", 1053 | "\n", 1054 | " PercentTradesNeverDelq MaxDelq2PublicRecLast12M MaxDelqEver \\\n", 1055 | "count 8291.000000 8291.000000 8291.000000 \n", 1056 | "mean 93.831022 6.741527 7.383187 \n", 1057 | "std 10.708137 1.628200 1.809892 \n", 1058 | "min 21.000000 1.000000 3.000000 \n", 1059 | "25% 91.000000 6.000000 7.000000 \n", 1060 | "50% 98.000000 7.000000 7.000000 \n", 1061 | "75% 101.000000 8.000000 9.000000 \n", 1062 | "max 101.000000 10.000000 9.000000 \n", 1063 | "\n", 1064 | " NumTotalTrades NumTradesOpeninLast12M PercentInstallTrades NumInqLast6M \\\n", 1065 | "count 8291.00000 8291.000000 8291.000000 8291.000000 \n", 1066 | "mean 25.29321 2.987577 35.673381 2.473405 \n", 1067 | "std 12.59546 1.863552 15.553447 2.120066 \n", 1068 | "min 2.00000 1.000000 3.000000 1.000000 \n", 1069 | "25% 16.00000 2.000000 24.000000 1.000000 \n", 1070 | "50% 23.00000 3.000000 34.000000 2.000000 \n", 1071 | "75% 32.00000 4.000000 46.000000 3.000000 \n", 1072 | "max 105.00000 20.000000 94.000000 67.000000 \n", 1073 | "\n", 1074 | " NumInqLast6Mexcl7days NetFractionRevolvingBurden \\\n", 1075 | "count 8291.000000 8291.000000 \n", 1076 | "mean 2.415873 36.923290 \n", 1077 | "std 2.079832 28.446931 \n", 1078 | "min 1.000000 1.000000 \n", 1079 | "25% 1.000000 11.500000 \n", 1080 | "50% 2.000000 32.000000 \n", 1081 | "75% 3.000000 58.000000 \n", 1082 | "max 67.000000 233.000000 \n", 1083 | "\n", 1084 | " NumRevolvingTradesWBalance NumInstallTradesWBalance \\\n", 1085 | "count 8291.000000 8291.000000 \n", 1086 | "mean 5.327464 3.513328 \n", 1087 | "std 3.038434 1.656994 \n", 1088 | "min 1.000000 2.000000 \n", 1089 | "25% 3.000000 2.000000 \n", 1090 | "50% 5.000000 3.000000 \n", 1091 | "75% 7.000000 4.000000 \n", 1092 | "max 33.000000 24.000000 \n", 1093 | "\n", 1094 | " NumBank2NatlTradesWHighUtilization PercentTradesWBalance \n", 1095 | "count 8291.000000 8291.000000 \n", 1096 | "mean 2.125075 67.845495 \n", 1097 | "std 1.547062 20.508632 \n", 1098 | "min 1.000000 8.000000 \n", 1099 | "25% 1.000000 51.000000 \n", 1100 | "50% 2.000000 68.000000 \n", 1101 | "75% 3.000000 84.000000 \n", 1102 | "max 19.000000 101.000000 " 1103 | ] 1104 | }, 1105 | "execution_count": 18, 1106 | "metadata": {}, 1107 | "output_type": "execute_result" 1108 | } 1109 | ], 1110 | "source": [ 1111 | "free = free.drop(columns=['ExternalRiskEstimate', 'MSinceOldestTradeOpen', 'AverageMInFile'])\n", 1112 | "free.describe()" 1113 | ] 1114 | }, 1115 | { 1116 | "cell_type": "code", 1117 | "execution_count": 19, 1118 | "metadata": {}, 1119 | "outputs": [ 1120 | { 1121 | "data": { 1122 | "text/html": [ 1123 | "
\n", 1124 | "\n", 1137 | "\n", 1138 | " \n", 1139 | " \n", 1140 | " \n", 1141 | " \n", 1142 | " \n", 1143 | " \n", 1144 | " \n", 1145 | " \n", 1146 | " \n", 1147 | " \n", 1148 | " \n", 1149 | " \n", 1150 | " \n", 1151 | " \n", 1152 | " \n", 1153 | " \n", 1154 | " \n", 1155 | " \n", 1156 | " \n", 1157 | " \n", 1158 | " \n", 1159 | " \n", 1160 | " \n", 1161 | " \n", 1162 | " \n", 1163 | " \n", 1164 | " \n", 1165 | " \n", 1166 | "
RiskPerformance
01.0
11.0
21.0
31.0
41.0
\n", 1167 | "
" 1168 | ], 1169 | "text/plain": [ 1170 | " RiskPerformance\n", 1171 | "0 1.0\n", 1172 | "1 1.0\n", 1173 | "2 1.0\n", 1174 | "3 1.0\n", 1175 | "4 1.0" 1176 | ] 1177 | }, 1178 | "execution_count": 19, 1179 | "metadata": {}, 1180 | "output_type": "execute_result" 1181 | } 1182 | ], 1183 | "source": [ 1184 | "labels.head()" 1185 | ] 1186 | }, 1187 | { 1188 | "cell_type": "code", 1189 | "execution_count": 20, 1190 | "metadata": {}, 1191 | "outputs": [ 1192 | { 1193 | "data": { 1194 | "text/html": [ 1195 | "
\n", 1196 | "\n", 1209 | "\n", 1210 | " \n", 1211 | " \n", 1212 | " \n", 1213 | " \n", 1214 | " \n", 1215 | " \n", 1216 | " \n", 1217 | " \n", 1218 | " \n", 1219 | " \n", 1220 | " \n", 1221 | " \n", 1222 | " \n", 1223 | " \n", 1224 | " \n", 1225 | " \n", 1226 | " \n", 1227 | " \n", 1228 | " \n", 1229 | " \n", 1230 | " \n", 1231 | " \n", 1232 | " \n", 1233 | " \n", 1234 | " \n", 1235 | " \n", 1236 | " \n", 1237 | " \n", 1238 | " \n", 1239 | " \n", 1240 | " \n", 1241 | " \n", 1242 | " \n", 1243 | " \n", 1244 | " \n", 1245 | " \n", 1246 | " \n", 1247 | " \n", 1248 | " \n", 1249 | " \n", 1250 | "
ExternalRiskEstimateMSinceOldestTradeOpenAverageMInFile
056.0145.085.0
168.067.025.0
267.0170.074.0
382.0334.0133.0
460.0138.079.0
\n", 1251 | "
" 1252 | ], 1253 | "text/plain": [ 1254 | " ExternalRiskEstimate MSinceOldestTradeOpen AverageMInFile\n", 1255 | "0 56.0 145.0 85.0\n", 1256 | "1 68.0 67.0 25.0\n", 1257 | "2 67.0 170.0 74.0\n", 1258 | "3 82.0 334.0 133.0\n", 1259 | "4 60.0 138.0 79.0" 1260 | ] 1261 | }, 1262 | "execution_count": 20, 1263 | "metadata": {}, 1264 | "output_type": "execute_result" 1265 | } 1266 | ], 1267 | "source": [ 1268 | "conditionals.head()" 1269 | ] 1270 | }, 1271 | { 1272 | "cell_type": "code", 1273 | "execution_count": 21, 1274 | "metadata": {}, 1275 | "outputs": [], 1276 | "source": [ 1277 | "#Save to CSV\n", 1278 | "#pd.DataFrame(free.values).to_csv('heloc_x.csv', header = False, index = False)\n", 1279 | "#pd.DataFrame(conditionals.values).to_csv(\"heloc_x_c.csv\", header = False, index = False)\n", 1280 | "#labels.to_csv('heloc_y.csv', header = False, index = False)" 1281 | ] 1282 | } 1283 | ], 1284 | "metadata": { 1285 | "kernelspec": { 1286 | "display_name": "Python 3", 1287 | "language": "python", 1288 | "name": "python3" 1289 | }, 1290 | "language_info": { 1291 | "codemirror_mode": { 1292 | "name": "ipython", 1293 | "version": 3 1294 | }, 1295 | "file_extension": ".py", 1296 | "mimetype": "text/x-python", 1297 | "name": "python", 1298 | "nbconvert_exporter": "python", 1299 | "pygments_lexer": "ipython3", 1300 | "version": "3.6.8" 1301 | } 1302 | }, 1303 | "nbformat": 4, 1304 | "nbformat_minor": 2 1305 | } 1306 | --------------------------------------------------------------------------------