├── README.md ├── code ├── Decoder.py ├── Encoder.py ├── Evaluation.py ├── Generator.py ├── Graph.py ├── Helpers.py ├── LaugelEtAl.py ├── Loglik.py └── Sampling.py ├── data ├── givme │ ├── give_me_types.csv │ ├── give_me_types_c.csv │ ├── give_me_x.csv │ ├── give_me_x_c.csv │ └── give_me_y.csv └── heloc │ ├── heloc_types.csv │ ├── heloc_types_alt.csv │ ├── heloc_types_c_alt.csv │ ├── heloc_x.csv │ ├── heloc_x_c.csv │ └── heloc_y.csv └── preprocessing ├── Preprocessing_GiveMeSomeCredit.ipynb └── Preprocessing_Heloc.ipynb /README.md: -------------------------------------------------------------------------------- 1 | # C-CHVAE 2 | 3 | ## Set up 4 | Counterfactual explanations can be obtained by identifying the smallest change made to an input vector that influences a prediction in a positive way. Classic examples can be found in credit scoring or health contexts, where one tries to change a classifier's decision from ’loan rejected’ to ’awarded’ or from ’high risk of cardiovascular disease’ to ’low risk’. Our approach ensures that the produced counterfactuals are **proximate** (i.e., not local outliers) and **connected** to regions with substantial data density (i.e., close to correctly classified observations), two requirements known as **counterfactual faithfulness**. 5 | 6 | ## Intuition 7 | We suggest embedding the counterfactual search into a data density approximator, here a variational autoencoder (VAE). The idea is to use the VAE as a search device to find counterfactuals that are proximate and connected to the input data. Given the original tabular data, the encoder specifies a lower-dimensional, real-valued and dense representation of that data, $z$. It is therefore the encoder that determines in which low-dimensional neighbourhood we should search for potential counterfactuals. Next, we perturb the low-dimensional representation, $z + \delta$, and feed the perturbed representation into the decoder. For small perturbations the decoder produces a potential counterfactual by reconstructing the input data from the perturbed representation; such a counterfactual is likely under the data distribution. Finally, the potential counterfactual is passed to the pretrained classifier to check whether the prediction has changed. 8 | 9 | ## On running the (C-)HVAE 10 | To run the (C-)HVAE you have to predefine each input's type; choose one of the following: *real* (for inputs defined on the real line), *pos* (for inputs defined on the positive part of the real line), *count* (for count inputs), *cat* (for categorical inputs) and *ordinal* (for ordinal inputs). To see an example, have a look at the *types*.csv files within the *data* folder. 
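For illustration, a minimal types file for five inputs might look as follows. The two-column layout shown here is an assumption based on the fields the loading code reads (`type` and `dim` via `csv.DictReader`); the shipped `*_types*.csv` files in the *data* folder are the authoritative reference. For *cat* and *ordinal* inputs, `dim` is the number of categories/levels; for the remaining types it is typically 1.

```
type,dim
real,1
pos,1
count,1
cat,3
ordinal,4
```

The perturbation step described above can be sketched in a few lines of NumPy (a re-implementation sketch of the sampling used in `samples_perturbation_z` in `code/Generator.py`; the function name below is illustrative, not part of the repository's API): directions are drawn from a Gaussian and rescaled so that the $\ell_p$-norm of each perturbation lies in a chosen range $[l, h)$.

```
import numpy as np

def sample_lp_perturbations(n_samples, z_dim, p=2, low=0.0, high=0.5, seed=0):
    """Draw perturbations delta with ||delta||_p uniformly rescaled into [low, high)."""
    rng = np.random.default_rng(seed)
    delta = rng.normal(size=(n_samples, z_dim))               # random directions
    radius = rng.uniform(low, high, size=(n_samples, 1))      # target lengths in [low, high)
    norm = np.linalg.norm(delta, ord=p, axis=1, keepdims=True)
    return delta * radius / norm                              # rescaled perturbations

delta = sample_lp_perturbations(1000, z_dim=2)
# z_tilde = z + delta is then decoded; reconstructions that flip the pretrained
# classifier's prediction are kept as counterfactual candidates.
```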
11 | 12 | 13 | ## Bibtex 14 | ``` 15 | @inproceedings{pawelczyk_learning2019, 16 | author = {Pawelczyk, Martin and Broelemann, Klaus and Kasneci, Gjergji}, 17 | title = {Learning Model-Agnostic Counterfactual Explanations for Tabular Data}, 18 | year = {2020}, 19 | publisher = {Association for Computing Machinery}, 20 | address = {New York, NY, USA}, 21 | booktitle = {Proceedings of The Web Conference 2020}, 22 | pages = {3126–3132}, 23 | numpages = {7}, 24 | keywords = {Transparency, Counterfactual explanations, Interpretability}, 25 | location = {Taipei, Taiwan}, 26 | series = {WWW '20} 27 | } 28 | ``` 29 | 30 | -------------------------------------------------------------------------------- /code/Decoder.py: -------------------------------------------------------------------------------- 1 | 2 | import tensorflow as tf 3 | import numpy as np 4 | from tensorflow.python.ops.parallel_for.gradients import batch_jacobian 5 | 6 | 7 | 8 | 9 | def decoder(samples_z, z_dim, y_dim, y_dim_partition, batch_size, types_list): 10 | 11 | samples = dict.fromkeys(['s', 'z', 'y', 'x'], []) 12 | gradients = dict.fromkeys(['g1', 'g2', 'g3'], []) 13 | 14 | samples['z'] = samples_z 15 | 16 | with tf.GradientTape() as g_1: 17 | g_1.watch(samples_z) 18 | # Create deterministic layer y 19 | samples['y'] = tf.layers.dense(inputs=samples_z, units=y_dim, activation=None, 20 | kernel_initializer=tf.random_normal_initializer(stddev=0.05), name='layer_h1_', reuse=None) 21 | 22 | gradients['g1'] = g_1.gradient(samples['y'], samples_z) 23 | 24 | with tf.GradientTape() as g_2: 25 | g_2.watch(samples['y']) 26 | grouped_samples_y = y_partition(samples['y'], types_list, y_dim_partition) 27 | 28 | gradients['g2'] = g_2.gradient(grouped_samples_y, samples['y']) 29 | 30 | with tf.GradientTape() as g_3: 31 | g_3.watch(grouped_samples_y) 32 | # Compute the parameters h_y 33 | theta = theta_estimation_from_y(grouped_samples_y, types_list, batch_size, reuse=None) 34 | 35 | gradients['g3'] = g_3.gradient(theta, grouped_samples_y) 36 | 37 | 38 | return theta, samples, gradients 39 | 40 | 41 | def y_partition(samples_y, types_list, y_dim_partition): 42 | grouped_samples_y = [] 43 | # First element must be 0 and the length of the partition vector must be len(types_list)+1 44 | if len(y_dim_partition) != len(types_list): 45 | raise Exception("The length of the partition vector must match the number of variables in the data + 1") 46 | 47 | # Insert a 0 at the beginning of the cumsum vector 48 | partition_vector_cumsum = np.insert(np.cumsum(y_dim_partition), 0, 0) 49 | for i in range(len(types_list)): 50 | grouped_samples_y.append(samples_y[:, partition_vector_cumsum[i]:partition_vector_cumsum[i + 1]]) 51 | 52 | return grouped_samples_y 53 | 54 | 55 | def observed_data_layer(observed_data, output_dim, name, reuse): 56 | # Train a layer with the observed data and reuse it for the missing data 57 | obs_output = tf.layers.dense(inputs=observed_data, units=output_dim, activation=None, 58 | kernel_initializer=tf.random_normal_initializer(stddev=0.05), name=name, reuse=reuse, 59 | trainable=True) 60 | 61 | return obs_output 62 | 63 | 64 | def theta_estimation_from_y(samples_y, types_list, batch_size, reuse): 65 | theta = [] 66 | 67 | # Independet yd -> Compute p(xd|yd) 68 | for i, d in enumerate(samples_y): 69 | 70 | observed_y = samples_y[i] 71 | nObs = tf.shape(observed_y)[0] 72 | 73 | # Different layer models for each type of variable 74 | if types_list[i]['type'] == 'real': 75 | params = theta_real(observed_y, types_list, i, reuse) 76 | 
77 | elif types_list[i]['type'] == 'pos': 78 | params = theta_pos(observed_y, types_list, i, reuse) 79 | 80 | elif types_list[i]['type'] == 'count': 81 | params = theta_count(observed_y, types_list, i, reuse) 82 | 83 | elif types_list[i]['type'] == 'cat': 84 | params = theta_cat(observed_y, types_list, batch_size, i, reuse) 85 | 86 | elif types_list[i]['type'] == 'ordinal': 87 | params = theta_ordinal(observed_y, types_list, i, reuse) 88 | 89 | theta.append(params) 90 | 91 | return theta 92 | 93 | 94 | def theta_real(observed_y, types_list, i, reuse): 95 | # Mean layer (To DO) 96 | h2_mean = observed_data_layer(observed_y, output_dim=types_list[i]['dim'], name='layer_h2' + str(i), reuse=reuse) 97 | # Sigma Layer (To DO) 98 | h2_sigma = observed_data_layer(observed_y, output_dim=types_list[i]['dim'], name='layer_h2_sigma' + str(i), 99 | reuse=reuse) 100 | 101 | return [h2_mean, h2_sigma] 102 | 103 | 104 | def theta_pos(observed_y, types_list, i, reuse): 105 | # Mean layer 106 | h2_mean = observed_data_layer(observed_y, output_dim=types_list[i]['dim'], name='layer_h2' + str(i), reuse=reuse) 107 | # Sigma Layer 108 | h2_sigma = observed_data_layer(observed_y, output_dim=types_list[i]['dim'], name='layer_h2_sigma' + str(i), 109 | reuse=reuse) 110 | 111 | return [h2_mean, h2_sigma] 112 | 113 | 114 | def theta_count(observed_y, types_list, i, reuse): 115 | # Lambda Layer 116 | h2_lambda = observed_data_layer(observed_y, output_dim=types_list[i]['dim'], name='layer_h2' + str(i), reuse=reuse) 117 | 118 | return h2_lambda 119 | 120 | 121 | def theta_cat(observed_y, types_list, batch_size, i, reuse): 122 | # Log pi layer, with zeros in the first value to avoid the identificability problem 123 | h2_log_pi_partial = observed_data_layer(observed_y, output_dim=int(types_list[i]['dim']) - 1, 124 | name='layer_h2' + str(i), reuse=reuse) 125 | h2_log_pi = tf.concat([tf.zeros([batch_size, 1]), h2_log_pi_partial], 1) 126 | 127 | return h2_log_pi 128 | 129 | 130 | def theta_ordinal(observed_y, types_list, i, reuse): 131 | # Theta layer, Dimension of ordinal - 1 132 | h2_theta = observed_data_layer(observed_y, output_dim=int(types_list[i]['dim']) - 1, name='layer_h2' + str(i), 133 | reuse=reuse) 134 | # Mean layer, a single value 135 | h2_mean = observed_data_layer(observed_y, output_dim=1, name='layer_h2_sigma' + str(i), reuse=reuse) 136 | 137 | return [h2_theta, h2_mean] 138 | 139 | 140 | def decoder_test_time(samples_z, z_dim, y_dim, y_dim_partition, batch_size, types_list): 141 | samples = dict.fromkeys(['s', 'z', 'y', 'x'], []) 142 | 143 | samples['z'] = samples_z 144 | 145 | # Create deterministic layer y 146 | samples['y'] = tf.layers.dense(inputs=samples_z, units=y_dim, activation=None, 147 | kernel_initializer=tf.random_normal_initializer(stddev=0.05), name='layer_h1_', 148 | reuse=True) 149 | 150 | grouped_samples_y = y_partition(samples['y'], types_list, y_dim_partition) 151 | 152 | # Compute the parameters h_y 153 | theta = theta_estimation_from_y(grouped_samples_y, types_list, batch_size, reuse=True) 154 | 155 | return theta, samples -------------------------------------------------------------------------------- /code/Encoder.py: -------------------------------------------------------------------------------- 1 | 2 | import tensorflow as tf 3 | 4 | def encoder(X_list, batch_size, z_dim, s_dim, tau): 5 | 6 | samples = dict.fromkeys(['s', 'z', 'y', 'x'], []) 7 | q_params = dict() 8 | X = tf.concat(X_list, 1) 9 | 10 | # Create the proposal of q(s|x^o): categorical(x^~) 11 | samples['s'], 
q_params['s'] = s_proposal_multinomial(X, batch_size, s_dim, tau, reuse=None) 12 | 13 | # Create the proposal of q(z|s,x^o): N(mu(x^~,s), SIGMA(x^~,s))??? 14 | samples['z'], q_params['z'] = z_proposal_GMM_factorized(X_list, samples['s'], batch_size, z_dim, reuse=None) 15 | 16 | return samples, q_params 17 | 18 | 19 | def encoder_c(X_list, X_list_c, batch_size, z_dim, s_dim, tau): 20 | 21 | samples = dict.fromkeys(['s', 'z', 'y', 'x'], []) 22 | q_params = dict() 23 | X = tf.concat(X_list, 1) 24 | X_c = tf.concat(X_list_c, 1) 25 | 26 | # Create the proposal of q(s|x^o): categorical(x^~) 27 | samples['s'], q_params['s'] = s_proposal_multinomial_c(X, X_c, batch_size, s_dim, tau, reuse=None) 28 | 29 | # Create the proposal of q(z|s,x^o): N(mu(x^~,s), SIGMA(x^~,s))??? 30 | samples['z'], q_params['z'] = z_proposal_GMM_factorized_c(X_list, X_c, samples['s'], batch_size, z_dim, reuse=None) 31 | 32 | return samples, q_params 33 | 34 | 35 | def encoder_vae(X_list, X_list_c, batch_size, z_dim, s_dim, tau): 36 | 37 | samples = dict.fromkeys(['s', 'z', 'y', 'x'], []) 38 | q_params = dict() 39 | X = tf.concat(X_list, 1) 40 | X_c = tf.concat(X_list_c, 1) 41 | 42 | # Create the proposal of q(s|x^o): categorical(x^~) 43 | samples['s'], q_params['s'] = s_proposal_multinomial_c(X, X_c, batch_size, s_dim, tau, reuse=None) 44 | 45 | # Create the proposal of q(z|s,x^o): N(mu(x^~,s), SIGMA(x^~,s))??? 46 | samples['z'], q_params['z'] = z_proposal_GMM_factorized_c(X_list, X_c, samples['s'], batch_size, z_dim, reuse=None) 47 | 48 | return samples, q_params 49 | 50 | 51 | 52 | def z_proposal_GMM_factorized(X, samples_s, batch_size, z_dim, reuse): 53 | mean_qz = [] 54 | log_var_qz = [] 55 | 56 | for i, d in enumerate(X): 57 | observed_data = d 58 | observed_s = samples_s 59 | 60 | # Mean layer 61 | aux_m_qz = tf.layers.dense(inputs=tf.concat([observed_data, observed_s], 1), units=z_dim, activation=None, 62 | kernel_initializer=tf.random_normal_initializer(stddev=0.05), 63 | name='layer_1_' + 'mean_enc_z' + str(i), reuse=reuse) 64 | 65 | 66 | # Logvar layers 67 | aux_lv_qz = tf.layers.dense(inputs=tf.concat([observed_data, observed_s], 1), units=z_dim, activation=None, 68 | kernel_initializer=tf.random_normal_initializer(stddev=0.05), 69 | name='layer_1_' + 'logvar_enc_z' + str(i), reuse=reuse) 70 | 71 | mean_qz.append(aux_m_qz) 72 | log_var_qz.append(aux_lv_qz) 73 | 74 | # Input prior 75 | log_var_qz.append(tf.zeros([batch_size, z_dim])) 76 | mean_qz.append(tf.zeros([batch_size, z_dim])) 77 | 78 | # Compute full parameters, as a product of Gaussians distribution 79 | log_var_qz_joint = -tf.reduce_logsumexp(tf.negative(log_var_qz), 0) 80 | mean_qz_joint = tf.multiply(tf.exp(log_var_qz_joint), 81 | tf.reduce_sum(tf.multiply(mean_qz, tf.exp(tf.negative(log_var_qz))), 0)) 82 | 83 | # Avoid numerical problems 84 | # log_var_qz = tf.clip_by_value(log_var_qz, -15.0, 15.0) 85 | # Rep-trick 86 | eps = tf.random_normal((batch_size, z_dim), 0, 1, dtype=tf.float32) 87 | samples_z = mean_qz_joint + tf.multiply(tf.exp(log_var_qz_joint / 2), eps) 88 | 89 | return samples_z, [mean_qz_joint, log_var_qz_joint] 90 | 91 | 92 | def z_proposal_GMM_factorized_c(X, X_c, samples_s, batch_size, z_dim, reuse): 93 | mean_qz = [] 94 | log_var_qz = [] 95 | 96 | for i, d in enumerate(X): 97 | observed_data = d 98 | observed_s = samples_s 99 | 100 | # Mean layer 101 | aux_m_qz = tf.layers.dense(inputs=tf.concat([observed_data, observed_s, X_c], 1), units=z_dim, activation=None, 102 | kernel_initializer=tf.random_normal_initializer(stddev=0.05), 103 
| name='layer_1_' + 'mean_enc_z' + str(i), reuse=reuse) 104 | 105 | # Logvar layers 106 | aux_lv_qz = tf.layers.dense(inputs=tf.concat([observed_data, observed_s, X_c], 1), units=z_dim, activation=None, 107 | kernel_initializer=tf.random_normal_initializer(stddev=0.05), 108 | name='layer_1_' + 'logvar_enc_z' + str(i), reuse=reuse) 109 | 110 | mean_qz.append(aux_m_qz) 111 | log_var_qz.append(aux_lv_qz) 112 | 113 | # Input prior 114 | log_var_qz.append(tf.zeros([batch_size, z_dim])) 115 | mean_qz.append(tf.zeros([batch_size, z_dim])) 116 | 117 | # Compute full parameters, as a product of Gaussians distribution 118 | log_var_qz_joint = -tf.reduce_logsumexp(tf.negative(log_var_qz), 0) 119 | mean_qz_joint = tf.multiply(tf.exp(log_var_qz_joint), 120 | tf.reduce_sum(tf.multiply(mean_qz, tf.exp(tf.negative(log_var_qz))), 0)) 121 | 122 | # Avoid numerical problems 123 | # log_var_qz = tf.clip_by_value(log_var_qz, -15.0, 15.0) 124 | # Rep-trick 125 | eps = tf.random_normal((batch_size, z_dim), 0, 1, dtype=tf.float32) 126 | samples_z = mean_qz_joint + tf.multiply(tf.exp(log_var_qz_joint / 2), eps) 127 | 128 | return samples_z, [mean_qz_joint, log_var_qz_joint] 129 | 130 | 131 | def z_proposal_distribution_GMM(x_list, x_list_c, samples_s, z_dim, reuse): 132 | # We propose a GMM for z 133 | 134 | x = tf.concat(x_list, 1) 135 | x_c = tf.concat(x_list_c, 1) 136 | 137 | h1 = tf.layers.dense(inputs=tf.concat([x, samples_s, x_c], 1), units=z_dim, activation=tf.nn.relu, 138 | kernel_initializer=tf.random_normal_initializer(stddev=0.05), 139 | name='layer_1_enc', reuse=reuse) 140 | 141 | # Mean layer 142 | aux_m_qz = tf.layers.dense(inputs=h1, units=z_dim, activation=None, 143 | kernel_initializer=tf.random_normal_initializer(stddev=0.05), 144 | name='layer_2_' + 'mean_enc_z', reuse=reuse) 145 | 146 | # Logvar layers 147 | aux_lv_qz = tf.layers.dense(inputs=h1, units=z_dim, activation=None, 148 | kernel_initializer=tf.random_normal_initializer(stddev=0.05), 149 | name='layer_2_' + 'logvar_enc_z', reuse=reuse) 150 | 151 | # Input prior 152 | 153 | log_var_qz.append(tf.zeros([batch_size, z_dim])) 154 | mean_qz.append(tf.zeros([batch_size, z_dim])) 155 | 156 | # Compute full parameters, as a product of Gaussians distribution 157 | log_var_qz_joint = -tf.reduce_logsumexp(tf.negative(log_var_qz), 0) 158 | mean_qz_joint = tf.multiply(tf.exp(log_var_qz_joint),tf.reduce_sum(tf.multiply(mean_qz, tf.exp(tf.negative(log_var_qz))), 0)) 159 | 160 | # Avoid numerical problems 161 | # log_var_qz = tf.clip_by_value(log_var_qz, -15.0, 15.0) 162 | # Rep-trick 163 | eps = tf.random_normal((batch_size, z_dim), 0, 1, dtype=tf.float32) 164 | samples_z = mean_qz_joint + tf.multiply(tf.exp(log_var_qz_joint / 2), eps) 165 | 166 | return mean_pz, log_var_pz 167 | 168 | 169 | 170 | 171 | def s_proposal_multinomial(X, batch_size, s_dim, tau, reuse): 172 | # Categorical(\pi(x^~)) 173 | # We propose a categorical distribution to create a GMM for the latent space z 174 | log_pi = tf.layers.dense(inputs=X, units=s_dim, activation=None, 175 | kernel_initializer=tf.random_normal_initializer(stddev=0.05), name='layer_1_' + 'enc_s', 176 | reuse=reuse) 177 | 178 | # Gumbel-softmax trick (tau is temperature parameter) 179 | U = -tf.log(-tf.log(tf.random_uniform([batch_size, s_dim]))) 180 | samples_s = tf.nn.softmax((log_pi + U) / tau) 181 | 182 | return samples_s, log_pi 183 | 184 | 185 | def s_proposal_multinomial_c(X, X_c, batch_size, s_dim, tau, reuse): 186 | # Categorical(\pi(x^~)) 187 | # We propose a categorical distribution to create a GMM 
for the latent space z 188 | log_pi = tf.layers.dense(inputs=tf.concat([X, X_c], 1), units=s_dim, activation=None, 189 | kernel_initializer=tf.random_normal_initializer(stddev=0.05), name='layer_1_' + 'enc_s', 190 | reuse=reuse) 191 | 192 | # Gumbel-softmax trick (tau is temperature parameter) 193 | U = -tf.log(-tf.log(tf.random_uniform([batch_size, s_dim]))) 194 | samples_s = tf.nn.softmax((log_pi + U) / tau) 195 | 196 | return samples_s, log_pi 197 | 198 | 199 | 200 | def z_distribution_GMM(samples_s, z_dim, reuse): 201 | # We propose a GMM for z 202 | mean_pz = tf.layers.dense(inputs=samples_s, units=z_dim, activation=None, 203 | kernel_initializer=tf.random_normal_initializer(stddev=0.05), 204 | name='layer_1_' + 'mean_dec_z', reuse=reuse) 205 | 206 | log_var_pz = tf.zeros([tf.shape(samples_s)[0], z_dim]) 207 | 208 | # Avoid numerical problems 209 | log_var_pz = tf.clip_by_value(log_var_pz, -15.0, 15.0) 210 | 211 | return mean_pz, log_var_pz 212 | 213 | -------------------------------------------------------------------------------- /code/Evaluation.py: -------------------------------------------------------------------------------- 1 | 2 | import tensorflow as tf 3 | import Loglik 4 | 5 | def loglik_evaluation(batch_data_list, types_list, theta, normalization_params, reuse): 6 | 7 | log_p_x = [] 8 | samples_x = [] 9 | params_x = [] 10 | 11 | # Independet yd -> Compute log(p(xd|yd)) 12 | # batch data list is a list of tensors with different dimensions depending on data type 13 | 14 | for i, d in enumerate(batch_data_list): 15 | 16 | # Select the likelihood for the types of variables 17 | # For that we need to import 'loglik_models_missing_normalize' as function 18 | loglik_function = getattr(Loglik, 'loglik_' + types_list[i]['type']) 19 | 20 | out = loglik_function(d, types_list[i], theta[i], normalization_params[i], 21 | kernel_initializer=tf.random_normal_initializer(stddev=0.05), name='layer_1_mean_dec_x' + str(i), reuse=reuse) 22 | 23 | log_p_x.append(out['log_p_x']) 24 | samples_x.append(out['samples']) 25 | params_x.append(out['params']) 26 | 27 | return log_p_x, samples_x, params_x 28 | 29 | 30 | 31 | def loglik_evaluation_test(batch_data_list, theta, normalization_params, list_type): 32 | 33 | samples_x_perturbed = [] 34 | params_x_perturbed = [] 35 | 36 | # batch data list is a list of tensors with different dimensions depending on data type 37 | # needed here for loop; nothing else! 
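    # The likelihood model is looked up by name from Loglik.py: e.g. a feature typed
    # 'real' dispatches to Loglik.loglik_test_real and 'cat' to Loglik.loglik_test_cat,
    # mirroring the 'loglik_<type>' getattr pattern used in loglik_evaluation above.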
38 | 39 | for i, d in enumerate(batch_data_list): 40 | 41 | # Select the likelihood for the types of variables 42 | # For that we need to import 'loglik_models_missing_normalize' as function 43 | loglik_function = getattr(Loglik, 'loglik_test_' + list_type[i]['type']) 44 | 45 | out = loglik_function(theta[i], normalization_params[i], list_type[i]) 46 | 47 | samples_x_perturbed.append(out['samples']) 48 | params_x_perturbed.append(out['params']) 49 | 50 | return samples_x_perturbed, params_x_perturbed 51 | 52 | 53 | 54 | 55 | def cost_function(log_p_x, p_params, q_params, types_list, z_dim, y_dim, s_dim): 56 | # KL(q(s|x)|p(s)) 57 | log_pi = q_params['s'] 58 | pi_param = tf.nn.softmax(log_pi) 59 | KL_s = -tf.nn.softmax_cross_entropy_with_logits(logits=log_pi, labels=pi_param) + tf.log(float(s_dim)) 60 | 61 | # KL(q(z|s,x)|p(z|s)) 62 | mean_pz, log_var_pz = p_params['z'] 63 | mean_qz, log_var_qz = q_params['z'] 64 | KL_z = -0.5 * z_dim + 0.5 * tf.reduce_sum( 65 | tf.exp(log_var_qz - log_var_pz) + tf.square(mean_pz - mean_qz) / tf.exp(log_var_pz) - log_var_qz + log_var_pz, 66 | 1) 67 | 68 | # Eq[log_p(x|y)] 69 | loss_reconstruction = tf.reduce_sum(log_p_x, 0) 70 | 71 | # Complete ELBO 72 | #ELBO = tf.reduce_mean(loss_reconstruction - KL_z - KL_s, 0) 73 | ELBO = tf.reduce_mean(1.20*loss_reconstruction - (KL_z + KL_s), 0) 74 | 75 | return ELBO, loss_reconstruction, KL_z, KL_s 76 | 77 | 78 | def kl_z_diff(p_params, q_params, degree_active, batch_size, z_dim): 79 | # method to check whether one is within the polarized regime 80 | 81 | # parameters 82 | mean_pz, log_var_pz = p_params['z'] 83 | mean_qz, log_var_qz = q_params['z'] 84 | 85 | ones = tf.ones([batch_size, z_dim]) 86 | 87 | # index according to global importance 88 | index = tf.greater(degree_active*ones, tf.reduce_mean(tf.exp(log_var_qz), 0)) 89 | 90 | mean_qz_approx = tf.reshape(tf.boolean_mask(mean_qz, index), [batch_size, -1]) 91 | mean_pz_approx = tf.reshape(tf.boolean_mask(mean_pz, index), [batch_size, -1]) 92 | log_var_qz_approx = tf.reshape(tf.boolean_mask(log_var_qz, index), [batch_size, -1]) 93 | log_var_pz_approx = tf.reshape(tf.boolean_mask(log_var_pz, index), [batch_size, -1]) 94 | 95 | kl_approx = tf.reduce_mean(tf.reduce_sum(tf.exp(log_var_qz_approx - log_var_pz_approx) + tf.square(mean_pz_approx - mean_qz_approx) / tf.exp(log_var_pz_approx) - log_var_qz_approx + log_var_pz_approx, 1), 0) 96 | kl = tf.reduce_mean(tf.reduce_sum(tf.exp(log_var_qz - log_var_pz) + tf.square(mean_pz - mean_qz) / tf.exp(log_var_pz) - log_var_qz + log_var_pz, 1), 0) 97 | 98 | delta_kl = tf.divide(tf.abs(tf.subtract(kl_approx, kl)), kl) 99 | 100 | return [delta_kl, kl_approx, kl, index] 101 | 102 | -------------------------------------------------------------------------------- /code/Generator.py: -------------------------------------------------------------------------------- 1 | 2 | import tensorflow as tf 3 | import Encoder 4 | import Decoder 5 | import Evaluation 6 | 7 | def samples_generator(batch_data_list, X_list, types_list, batch_size, z_dim, y_dim, y_dim_partition, s_dim, tau, normalization_params): 8 | 9 | samples_test = dict.fromkeys(['s' ,'z' ,'y' ,'x'] ,[]) 10 | test_params = dict() 11 | X = tf.concat(X_list ,1) 12 | 13 | # Create the proposal of q(s|x^o) 14 | _, params = Encoder.s_proposal_multinomial(X, batch_size, s_dim, tau, reuse=True) 15 | samples_test['s'] = tf.one_hot(tf.argmax(params ,1) ,depth=s_dim) 16 | 17 | # Create the proposal of q(z|s,x^o) 18 | _, params = Encoder.z_proposal_GMM_factorized(X_list, samples_test['s'], 
batch_size, z_dim, reuse=True) 19 | samples_test['z'] = params[0] 20 | 21 | # Create deterministic layer y 22 | samples_test['y'] = tf.layers.dense(inputs=samples_test['z'], 23 | units=y_dim, 24 | activation=None, 25 | kernel_initializer=tf.random_normal_initializer(stddev=0.05), 26 | trainable=True, 27 | name= 'layer_h1_', reuse=True) 28 | 29 | grouped_samples_y = Decoder.y_partition(samples_test['y'], types_list, y_dim_partition) 30 | 31 | # Compute the parameters h_y 32 | theta = Decoder.theta_estimation_from_y(grouped_samples_y, types_list, batch_size, reuse=True) 33 | 34 | # Compute loglik and output of the VAE 35 | log_p_x, samples_test['x'], test_params['x'] = Evaluation.loglik_evaluation(batch_data_list, 36 | types_list, 37 | theta, 38 | normalization_params, 39 | reuse=True) 40 | 41 | return samples_test, test_params, log_p_x, theta 42 | 43 | 44 | 45 | def samples_generator_c(batch_data_list, X_list, X_list_c, types_list, batch_size, z_dim, y_dim, y_dim_partition, s_dim, tau, normalization_params): 46 | 47 | samples_test = dict.fromkeys(['s' ,'z' ,'y' ,'x'] ,[]) 48 | test_params = dict() 49 | X = tf.concat(X_list ,1) 50 | X_c = tf.concat(X_list_c, 1) 51 | 52 | # Create the proposal of q(s|x^o) 53 | _, params = Encoder.s_proposal_multinomial_c(X, X_c, batch_size, s_dim, tau, reuse=True) 54 | samples_test['s'] = tf.one_hot(tf.argmax(params, 1), depth=s_dim) 55 | 56 | # Create the proposal of q(z|s,x^o) 57 | _, params = Encoder.z_proposal_GMM_factorized_c(X_list, X_c, samples_test['s'], batch_size, z_dim, reuse=True) 58 | samples_test['z'] = params[0] 59 | 60 | # Create deterministic layer y 61 | samples_test['y'] = tf.layers.dense(inputs=samples_test['z'], 62 | units=y_dim, 63 | activation=None, 64 | kernel_initializer=tf.random_normal_initializer(stddev=0.05), 65 | trainable=True, 66 | name='layer_h1_', reuse=True) 67 | 68 | grouped_samples_y = Decoder.y_partition(samples_test['y'], types_list, y_dim_partition) 69 | 70 | # Compute the parameters h_y 71 | theta = Decoder.theta_estimation_from_y(grouped_samples_y, types_list, batch_size, reuse=True) 72 | 73 | # Compute loglik and output of the VAE 74 | log_p_x, samples_test['x'], test_params['x'] = Evaluation.loglik_evaluation(batch_data_list, 75 | types_list, 76 | theta, 77 | normalization_params, 78 | reuse=True) 79 | 80 | return samples_test, test_params, log_p_x, theta 81 | 82 | 83 | 84 | 85 | def samples_perturbation_z(batch_data_list, X_list, types_list, z_dim, y_dim, y_dim_partition, s_dim, tau, 86 | normalization_params, nsamples, batch_size, p, l, h): 87 | # I ended up not using this one 88 | # should be: batch_size size = nsamples 89 | 90 | samples_test = dict.fromkeys(['s', 'z', 'y_tilde', 'z_tilde', 'x_tilde'], []) 91 | test_params = dict() 92 | X = tf.concat(X_list, 1) 93 | 94 | # -----------------------------------------------------------------------------------# 95 | # Encoder: Test Time 96 | 97 | # Create the proposal of q(s|x^o) 98 | _, params = Encoder.s_proposal_multinomial(X, batch_size, s_dim, tau, reuse=True) 99 | samples_test['s'] = tf.one_hot(tf.argmax(params, 1), depth=s_dim) 100 | 101 | # Create the proposal of q(z|s,x^o) 102 | _, params = Encoder.z_proposal_GMM_factorized(X_list, samples_test['s'], batch_size, z_dim, reuse=True) 103 | samples_test['z'] = params[0] 104 | 105 | # -----------------------------------------------------------------------------------# 106 | # counterfactual step 107 | 108 | # z = samples_test['z'] 109 | delta_z = tf.random_normal((nsamples, z_dim), 0, 1, 110 | dtype=tf.float32) # 
http://mathworld.wolfram.com/HyperspherePointPicking.html 111 | d = tf.add(tf.multiply(tf.random_uniform((nsamples, 1), 0, 1, dtype=tf.float32), (h - l)), l) # length range [l, h) 112 | norm_p = tf.norm(delta_z, ord=p, axis=1) 113 | norm_p = tf.reshape(norm_p, [-1, 1]) # right format 114 | d_norm = tf.div(d, norm_p) # rescale/normalize factor 115 | delta_z = tf.multiply(delta_z, d_norm) # shape: (nsamples x z_dim) 116 | 117 | # -----------------------------------------------------------------------------------# 118 | # Decoder: Test Time 119 | 120 | # during counterfactual search 121 | z_tilde = tf.add(samples_test['z'], delta_z) # gives (nsamples x z_dim) vector 122 | samples_test['z_tilde'] = tf.reshape(z_tilde, [-1, z_dim]) # use reshape to avoid rank error 123 | 124 | # Create deterministic layer y 125 | samples_test['y_tilde'] = tf.layers.dense(inputs=samples_test['z_tilde'], 126 | units=y_dim, 127 | activation=None, 128 | kernel_initializer=tf.random_normal_initializer(stddev=0.05), 129 | trainable=True, 130 | name='layer_h1_', reuse=True) 131 | 132 | grouped_samples_y = Decoder.y_partition(samples_test['y_tilde'], types_list, y_dim_partition) 133 | 134 | # Compute the parameters h_y 135 | theta = Decoder.theta_estimation_from_y(grouped_samples_y, types_list, batch_size, reuse=True) 136 | 137 | # Compute loglik and output of the VAE 138 | log_p_x, samples_test['x_tilde'], test_params['x'] = Evaluation.loglik_evaluation(batch_data_list, 139 | types_list, 140 | theta, 141 | normalization_params, 142 | reuse=True) 143 | 144 | return samples_test, delta_z, d, theta -------------------------------------------------------------------------------- /code/Graph.py: -------------------------------------------------------------------------------- 1 | 2 | import tensorflow as tf 3 | import numpy as np 4 | import Helpers 5 | import Encoder 6 | import Decoder 7 | import Evaluation 8 | import Generator 9 | 10 | # MASTER of Disaster 11 | 12 | def C_HVAE_graph(types_file, learning_rate=1e-4, z_dim=1, y_dim=1, s_dim=1, y_dim_partition=[], nsamples=1000, p=2): 13 | 14 | # -----------------------------------------------------------------------------------# 15 | # Preliminaries 16 | 17 | # Load remaining placeholders 18 | print('[*] Defining placeholders') 19 | 20 | 21 | # Placeholder for batch_size (required for counterfactual search loop) 22 | batch_size = tf.placeholder(dtype=tf.int32) 23 | # Placeholder for Gumbel-softmax parameter 24 | tau = tf.placeholder(tf.float32, shape=()) 25 | batch_data_list, types_list = Helpers.place_holder_types(types_file, batch_size) 26 | 27 | # Batch normalization of the data 28 | X_list, normalization_params, X_list_noisy = Helpers.batch_normalization(batch_data_list, types_list, batch_size) 29 | 30 | 31 | # Set dimensionality of Y 32 | if y_dim_partition: 33 | y_dim_output = np.sum(y_dim_partition) 34 | else: 35 | y_dim_partition = y_dim * np.ones(len(types_list), dtype=int) 36 | y_dim_output = np.sum(y_dim_partition) 37 | 38 | # -----------------------------------------------------------------------------------# 39 | # (HVAE) Encoder and Decoder for training time 40 | 41 | # Encoder 42 | print('[*] Defining Encoder...') 43 | samples, q_params = Encoder.encoder(X_list_noisy, batch_size, z_dim, s_dim, tau) 44 | 45 | samples_s = samples['s'] 46 | samples_z = samples['z'] 47 | p_params = dict() 48 | 49 | # Create the distribution of p(z|s) 50 | p_params['z'] = Encoder.z_distribution_GMM(samples['s'], z_dim, reuse=None) 51 | 52 | # Decoder 53 | print('[*] Defining 
Decoder...') 54 | theta, samples, gradient_decoder = Decoder.decoder(samples_z, z_dim, y_dim_output, y_dim_partition, batch_size, types_list) 55 | 56 | samples['s'] = samples_s 57 | # Compute loglik and output of the VAE 58 | log_p_x, samples['x'], p_params['x'] = Evaluation.loglik_evaluation(batch_data_list, 59 | types_list, 60 | theta, 61 | normalization_params, 62 | reuse=None) 63 | 64 | # Evaluate active vs passive variables 65 | degree_active = 0.95# must be less than 1 (not used in paper) 66 | delta_kl = Evaluation.kl_z_diff(p_params, q_params, degree_active, batch_size, z_dim) 67 | 68 | 69 | # -----------------------------------------------------------------------------------# 70 | # optimize ELBO 71 | 72 | print('[*] Defining Cost function...') 73 | ELBO, loss_reconstruction, KL_z, KL_s = Evaluation.cost_function(log_p_x, 74 | p_params, 75 | q_params, 76 | types_list, 77 | z_dim, 78 | y_dim_output, 79 | s_dim) 80 | 81 | optim = tf.train.AdamOptimizer(learning_rate).minimize(-ELBO) 82 | 83 | # -----------------------------------------------------------------------------------# 84 | # Generator function for test time sample generation 85 | samples_test, test_params, log_p_x_test, theta_test = Generator.samples_generator(batch_data_list, 86 | X_list, 87 | types_list, 88 | batch_size, 89 | z_dim, 90 | y_dim_output, 91 | y_dim_partition, 92 | s_dim, 93 | tau, 94 | normalization_params) 95 | 96 | # -----------------------------------------------------------------------------------# 97 | # Decoder for test time counterfactuals 98 | # 'samples_perturbed': does not contain 'x' samples 99 | 100 | print('[*] Defining Test Time Decoder...') 101 | theta_perturbed, samples_perturbed = Decoder.decoder_test_time(samples_z, 102 | z_dim, 103 | y_dim_output, 104 | y_dim_partition, 105 | batch_size, 106 | types_list) 107 | 108 | # Evaluation Function not necessary here 109 | '''log_p_x, samples_perturbed['x'], p_params_x_perturbed = Evaluation.loglik_evaluation(batch_data_list, 110 | types_list, 111 | theta_perturbed, 112 | normalization_params, 113 | reuse=True)''' 114 | 115 | # -----------------------------------------------------------------------------------# 116 | # Packing results 117 | 118 | tf_nodes = {'batch_size': batch_size,#feed 119 | 'ground_batch': batch_data_list,#feed 120 | 'tau_GS': tau,#feed, 121 | #'predict_proba': predict_proba,#feed 122 | 'samples_z': samples_z,#feed 123 | 'samples': samples, 124 | 'log_p_x': log_p_x, 125 | 'loss_re': loss_reconstruction, 126 | 'loss': -ELBO, 127 | 'optim': optim, 128 | 'KL_s': KL_s, 129 | 'KL_z': KL_z, 130 | 'X': X_list, 131 | 'p_params': p_params, 132 | 'q_params': q_params, 133 | 'samples_test': samples_test, 134 | 'test_params': test_params, 135 | 'log_p_x_test': log_p_x_test, 136 | 'samples_perturbed': samples_perturbed, 137 | 'theta_test': theta_test, 138 | 'theta_perturbed': theta_perturbed, 139 | 'normalization_params': normalization_params, 140 | 'gradient_decoder': gradient_decoder, 141 | 'delta_kl': delta_kl} 142 | 143 | return tf_nodes 144 | 145 | 146 | # MASTER of Disaster for conditional density approximations 147 | 148 | def C_CHVAE_graph(types_file, types_file_c, learning_rate=1e-3, z_dim=1, y_dim=1, s_dim=1, y_dim_partition=[], nsamples=1000, p=2, degree_active=0.95): 149 | 150 | # -----------------------------------------------------------------------------------# 151 | # Preliminaries 152 | 153 | # Load placeholders 154 | print('[*] Defining placeholders') 155 | 156 | # c: short for 'conditional' 157 | # Placeholder for 
batch_size (required for counterfactual search loop) 158 | batch_size = tf.placeholder(dtype=tf.int32) 159 | # Placeholder for Gumbel-softmax parameter 160 | tau = tf.placeholder(tf.float32, shape=()) 161 | batch_data_list, types_list = Helpers.place_holder_types(types_file, batch_size) 162 | batch_data_list_c, types_list_c = Helpers.place_holder_types(types_file_c, batch_size) 163 | 164 | 165 | # Batch normalization of the data 166 | X_list, normalization_params, X_list_noisy = Helpers.batch_normalization(batch_data_list, types_list, batch_size) 167 | # Batch normalization of the data 168 | X_list_c, _, X_list_noisy_c = Helpers.batch_normalization(batch_data_list_c, types_list, batch_size) 169 | 170 | 171 | # Set dimensionality of Y 172 | if y_dim_partition: 173 | y_dim_output = np.sum(y_dim_partition) 174 | else: 175 | y_dim_partition = y_dim * np.ones(len(types_list), dtype=int) 176 | y_dim_output = np.sum(y_dim_partition) 177 | 178 | # -----------------------------------------------------------------------------------# 179 | # (HVAE) Encoder and Decoder for training time 180 | 181 | # Encoder 182 | print('[*] Defining Encoder...') 183 | samples, q_params = Encoder.encoder_c(X_list, X_list_c, batch_size, z_dim, s_dim, tau) 184 | 185 | samples_s = samples['s'] 186 | samples_z = samples['z'] 187 | p_params = dict() 188 | 189 | # Create the distribution of p(z|s) 190 | p_params['z'] = Encoder.z_distribution_GMM(samples['s'], z_dim, reuse=None) 191 | 192 | # Decoder 193 | print('[*] Defining Decoder...') 194 | theta, samples, gradient_decoder = Decoder.decoder(samples_z, z_dim, y_dim_output, y_dim_partition, batch_size, types_list) 195 | 196 | samples['s'] = samples_s 197 | # Compute loglik and output of the VAE 198 | log_p_x, samples['x'], p_params['x'] = Evaluation.loglik_evaluation(batch_data_list, 199 | types_list, 200 | theta, 201 | normalization_params, 202 | reuse=None) 203 | 204 | # -----------------------------------------------------------------------------------# 205 | # optimize ELBO 206 | 207 | print('[*] Defining Cost function...') 208 | ELBO, loss_reconstruction, KL_z, KL_s = Evaluation.cost_function(log_p_x, 209 | p_params, 210 | q_params, 211 | types_list, 212 | z_dim, 213 | y_dim_output, 214 | s_dim) 215 | 216 | optim = tf.train.AdamOptimizer(learning_rate).minimize(-ELBO) 217 | 218 | # -----------------------------------------------------------------------------------# 219 | # Generator function for test time sample generation 220 | samples_test, test_params, log_p_x_test, theta_test = Generator.samples_generator_c(batch_data_list, 221 | X_list, X_list_c, 222 | types_list, 223 | batch_size, 224 | z_dim, 225 | y_dim_output, 226 | y_dim_partition, 227 | s_dim, 228 | tau, 229 | normalization_params) 230 | 231 | # -----------------------------------------------------------------------------------# 232 | # Decoder for test time counterfactuals 233 | # 'samples_perturbed': does not contain 'x' samples 234 | 235 | print('[*] Defining Test Time Decoder...') 236 | theta_perturbed, samples_perturbed = Decoder.decoder_test_time(samples_z, 237 | z_dim, 238 | y_dim_output, 239 | y_dim_partition, 240 | batch_size, 241 | types_list) 242 | 243 | # Evaluation Function not necessary here 244 | degree_active = degree_active# must be less than 1 245 | delta_kl = Evaluation.kl_z_diff(p_params, q_params, degree_active, batch_size, z_dim) 246 | 247 | # -----------------------------------------------------------------------------------# 248 | # Packing results 249 | 250 | tf_nodes = 
{'batch_size': batch_size, #feed 251 | 'ground_batch': batch_data_list, #feed 252 | 'ground_batch_c': batch_data_list_c, #feed 253 | 'tau_GS': tau, #feed, 254 | 'samples_z': samples_z, #feed 255 | 'samples': samples, 256 | 'log_p_x': log_p_x, 257 | 'loss_re': loss_reconstruction, 258 | 'loss': -ELBO, 259 | 'optim': optim, 260 | 'KL_s': KL_s, 261 | 'KL_z': KL_z, 262 | 'X': X_list, 263 | 'p_params': p_params, 264 | 'q_params': q_params, 265 | 'samples_test': samples_test, 266 | 'test_params': test_params, 267 | 'log_p_x_test': log_p_x_test, 268 | 'samples_perturbed': samples_perturbed, 269 | 'theta_test': theta_test, 270 | 'theta_perturbed': theta_perturbed, 271 | 'normalization_params': normalization_params, 272 | 'gradient_decoder': gradient_decoder, 273 | 'delta_kl': delta_kl} 274 | 275 | return tf_nodes -------------------------------------------------------------------------------- /code/Helpers.py: -------------------------------------------------------------------------------- 1 | 2 | import tensorflow as tf 3 | import numpy as np 4 | from sklearn.metrics import mean_squared_error 5 | from sklearn import preprocessing 6 | from sklearn.model_selection import train_test_split 7 | from sklearn.linear_model import LogisticRegression 8 | from sklearn.ensemble import RandomForestClassifier 9 | from sklearn.svm import SVC 10 | from sklearn.model_selection import GridSearchCV 11 | from sklearn.neighbors import LocalOutlierFactor 12 | from sklearn.decomposition import PCA 13 | from sklearn.preprocessing import StandardScaler 14 | from sklearn.cluster import DBSCAN 15 | from scipy.stats import moment 16 | import csv 17 | import argparse 18 | 19 | 20 | 21 | 22 | # Argument Parser 23 | def getArgs(argv=None): 24 | parser = argparse.ArgumentParser(description='Default parameters of the models', formatter_class=argparse.ArgumentDefaultsHelpFormatter) 25 | parser.add_argument('--batch_size', type=int, default=100, help='Size of the batches') 26 | parser.add_argument('--epochs', type=int, default=80, help='Number of epochs of the simulations') 27 | parser.add_argument('--train', type=int, default=1, help='Training model flag') 28 | parser.add_argument('--display', type=int, default=1, help='Display option flag') 29 | parser.add_argument('--save', type=int, default=1000, help='Save variables every save iterations') 30 | parser.add_argument('--restore', type=int, default=0, help='To restore session, to keep training or evaluation') 31 | parser.add_argument('--dim_latent_s', type=int, default=3, help='Dimension of the categorical space') 32 | parser.add_argument('--dim_latent_z', type=int, default=2, help='Dimension of the Z latent space') 33 | parser.add_argument('--dim_latent_y', type=int, default=5, help='Dimension of the Y latent space') 34 | parser.add_argument('--dim_latent_y_partition', type=int, nargs='+', help='Partition of the Y latent space') 35 | parser.add_argument('--save_file', type=str, default='new_mnist_zdim5_ydim10_4images_', help='Save file name') 36 | parser.add_argument('--data_file', type=str, default='MNIST_data', help='File with the data') 37 | parser.add_argument('--data_file_c', type=str, default='MNIST_data', help='File with the conditioning data') 38 | parser.add_argument('--types_file', type=str, default='mnist_train_types2.csv', help='File with the types of the data') 39 | parser.add_argument('--types_file_c', type=str, default='mnist_train_types2.csv', help='File with the types of the conditioning data') 40 | parser.add_argument('--classifier', type=str, 
default='RLinearR', help='Classification model (RandomForest, SVM or else RLinearR)') 41 | parser.add_argument('--classifier_two', type=str, default='RandomForest', help='Classification model (RandomForest, SVM or else RLinearR)') 42 | parser.add_argument('--norm_latent_space', type=int, default=2, help='To measure distance between latent variables') 43 | parser.add_argument('--step_size', type=float, default=0.5, help='Step size for Random Search') 44 | parser.add_argument('--search_samples', type=int, default=1000, help='Nunber search samples for counterfactual search') 45 | parser.add_argument('--data_y_file', type=str, default='cs_y_training', help='File with the y data') 46 | parser.add_argument('--ncounterfactuals', type=int, default=25, help='First #counterf. test data points for which we find counterf.') 47 | parser.add_argument('--boundary', type=float, default=-0.5, help='Boundary y = def. for simple classifier') 48 | parser.add_argument('--degree_active', type=float, default=1, help='active latent variable threshold') 49 | 50 | return parser.parse_args(argv) 51 | 52 | 53 | def next_batch(data, types_dict, batch_size, index_batch): 54 | 55 | # Create minibath 56 | batch_xs = data[index_batch * batch_size:(index_batch + 1) * batch_size, :] 57 | 58 | # Slipt variables of the batches 59 | data_list = [] 60 | initial_index = 0 61 | for d in types_dict: 62 | dim = int(d['dim']) 63 | data_list.append(batch_xs[:, initial_index:initial_index + dim]) 64 | initial_index += dim 65 | 66 | return data_list 67 | 68 | def next_batch_y(y, batch_size, index_batch): 69 | return y[index_batch * batch_size:(index_batch + 1) * batch_size, :] 70 | 71 | 72 | 73 | 74 | 75 | def samples_concatenation(samples): 76 | for i, batch in enumerate(samples): 77 | if i == 0: 78 | samples_x = np.concatenate(batch['x'], 1) 79 | samples_y = batch['y'] 80 | samples_z = batch['z'] 81 | samples_s = batch['s'] 82 | else: 83 | samples_x = np.concatenate([samples_x, np.concatenate(batch['x'], 1)], 0) 84 | samples_y = np.concatenate([samples_y, batch['y']], 0) 85 | samples_z = np.concatenate([samples_z, batch['z']], 0) 86 | samples_s = np.concatenate([samples_s, batch['s']], 0) 87 | 88 | return samples_s, samples_z, samples_y, samples_x 89 | 90 | 91 | def discrete_variables_transformation(data, types_dict): 92 | ind_ini = 0 93 | output = [] 94 | for d in range(len(types_dict)): 95 | ind_end = ind_ini + int(types_dict[d]['dim']) 96 | if types_dict[d]['type'] == 'cat': 97 | output.append(np.reshape(np.argmax(data[:, ind_ini:ind_end], 1), [-1, 1])) 98 | elif types_dict[d]['type'] == 'ordinal': 99 | output.append(np.reshape(np.sum(data[:, ind_ini:ind_end], 1) - 1, [-1, 1])) 100 | else: 101 | output.append(data[:, ind_ini:ind_end]) 102 | ind_ini = ind_end 103 | 104 | return np.concatenate(output, 1) 105 | 106 | 107 | def read_data(data_file, types_file): 108 | # Read types of data from data file 109 | with open(data_file, 'r') as f: 110 | data = [[float(x) for x in rec] for rec in csv.reader(f, delimiter=',')] 111 | data = np.array(data) 112 | 113 | # Read types of data from data file 114 | with open(types_file) as f: 115 | types_dict = [{k: v for k, v in row.items()} 116 | for row in csv.DictReader(f, skipinitialspace=True)] 117 | 118 | # Construct the data matrices 119 | data_complete = [] 120 | for i in range(np.shape(data)[1]): 121 | 122 | if types_dict[i]['type'] == 'cat': 123 | # Get categories 124 | cat_data = [int(x) for x in data[:, i]] 125 | categories, indexes = np.unique(cat_data, return_inverse=True) 126 | # 
Transform categories to a vector of 0:n_categories 127 | new_categories = np.arange(int(types_dict[i]['dim'])) 128 | cat_data = new_categories[indexes] 129 | # Create one hot encoding for the categories 130 | aux = np.zeros([np.shape(data)[0], len(new_categories)]) 131 | aux[np.arange(np.shape(data)[0]), cat_data] = 1 132 | data_complete.append(aux) 133 | 134 | elif types_dict[i]['type'] == 'ordinal': 135 | # Get categories 136 | cat_data = [int(x) for x in data[:, i]] 137 | categories, indexes = np.unique(cat_data, return_inverse=True) 138 | # Transform categories to a vector of 0:n_categories 139 | new_categories = np.arange(int(types_dict[i]['dim'])) 140 | cat_data = new_categories[indexes] 141 | # Create thermometer encoding for the categories 142 | aux = np.zeros([np.shape(data)[0], 1 + len(new_categories)]) 143 | aux[:, 0] = 1 144 | aux[np.arange(np.shape(data)[0]), 1 + cat_data] = -1 145 | aux = np.cumsum(aux, 1) 146 | data_complete.append(aux[:, :-1]) 147 | 148 | else: 149 | data_complete.append(np.transpose([data[:, i]])) 150 | 151 | n_samples = np.shape(data)[0] 152 | # n_variables = len(types_dict) 153 | 154 | data = np.concatenate(data_complete, 1) 155 | 156 | return data, types_dict, n_samples 157 | 158 | 159 | def p_distribution_params_concatenation(params, types_dict, z_dim, s_dim): 160 | keys = params[0].keys() 161 | out_dict = {key: [] for key in keys} 162 | 163 | for i, batch in enumerate(params): 164 | 165 | for d, k in enumerate(keys): 166 | 167 | if k == 'z' or k == 'y': 168 | if i == 0: 169 | out_dict[k] = batch[k] 170 | else: 171 | out_dict[k] = np.concatenate([out_dict[k], batch[k]], 1) 172 | 173 | elif k == 'x': 174 | if i == 0: 175 | out_dict[k] = batch[k] 176 | else: 177 | for v in range(len(types_dict)): 178 | if types_dict[v]['type'] == 'pos' or types_dict[v]['type'] == 'real': 179 | out_dict[k][v] = np.concatenate([out_dict[k][v], batch[k][v]], 1) 180 | else: 181 | out_dict[k][v] = np.concatenate([out_dict[k][v], batch[k][v]], 0) 182 | 183 | return out_dict 184 | 185 | 186 | def q_distribution_params_concatenation(params, z_dim, s_dim): 187 | keys = params[0].keys() 188 | out_dict = {key: [] for key in keys} 189 | 190 | for i, batch in enumerate(params): 191 | for d, k in enumerate(keys): 192 | out_dict[k].append(batch[k]) 193 | 194 | out_dict['z'] = np.concatenate(out_dict['z'], 1) 195 | out_dict['s'] = np.concatenate(out_dict['s'], 0) 196 | 197 | return out_dict 198 | 199 | 200 | def statistics(loglik_params, types_dict): 201 | loglik_mean = [] 202 | loglik_mode = [] 203 | 204 | for d, attrib in enumerate(loglik_params): 205 | if types_dict[d]['type'] == 'real': 206 | # Normal distribution (mean, sigma) 207 | loglik_mean.append(attrib[0]) 208 | loglik_mode.append(attrib[0]) 209 | # Only for log-normal 210 | elif types_dict[d]['type'] == 'pos': 211 | # Log-normal distribution (mean, sigma) 212 | loglik_mean.append(np.exp(attrib[0] + 0.5 * attrib[1]) - 1.0) 213 | loglik_mode.append(np.exp(attrib[0] - attrib[1]) - 1.0) 214 | elif types_dict[d]['type'] == 'count': 215 | # Poisson distribution (lambda) 216 | loglik_mean.append(attrib) 217 | loglik_mode.append(np.floor(attrib)) 218 | 219 | else: 220 | # Categorical and ordinal (mode imputation for both) 221 | loglik_mean.append(np.reshape(np.argmax(attrib, 1), [-1, 1])) 222 | loglik_mode.append(np.reshape(np.argmax(attrib, 1), [-1, 1])) 223 | 224 | return np.transpose(np.squeeze(loglik_mean)), np.transpose(np.squeeze(loglik_mode)) 225 | 226 | 227 | def error_computation(x_train, x_hat, types_dict): 228 | 
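    # Per-feature reconstruction error, chosen by type:
    #   'cat'     -> mean misclassification rate,
    #   'ordinal' -> mean absolute shift divided by the number of categories,
    #   otherwise -> root mean squared error normalised by the feature's range.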
error_observed = [] 229 | ind_ini = 0 230 | for dd in range(len(types_dict)): 231 | 232 | # Mean classification error 233 | if types_dict[dd]['type'] == 'cat': 234 | ind_end = ind_ini + 1 235 | error_observed.append(np.mean(x_train[:, ind_ini:ind_end] != x_hat[:, ind_ini:ind_end])) 236 | 237 | # Mean "shift" error 238 | elif types_dict[dd]['type'] == 'ordinal': 239 | ind_end = ind_ini + 1 240 | error_observed.append( 241 | np.mean(np.abs(x_train[:, ind_ini:ind_end] - x_hat[:, ind_ini:ind_end])) / int(types_dict[dd]['dim'])) 242 | 243 | # Normalized root mean square error 244 | else: 245 | ind_end = ind_ini + int(types_dict[dd]['dim']) 246 | norm_term = np.max(x_train[:, dd]) - np.min(x_train[:, dd]) 247 | error_observed.append( 248 | np.sqrt(mean_squared_error(x_train[:, ind_ini:ind_end], x_hat[:, ind_ini:ind_end])) / norm_term) 249 | 250 | ind_ini = ind_end 251 | 252 | return error_observed 253 | 254 | 255 | def place_holder_types(types_file, batch_size): 256 | # Read the types of the data from the files 257 | with open(types_file) as f: 258 | types_list = [{k: v for k, v in row.items()} 259 | for row in csv.DictReader(f, skipinitialspace=True)] 260 | 261 | # Create placeholders for every data type, with appropriate dimensions 262 | batch_data_list = [] 263 | for i in range(len(types_list)): 264 | batch_data_list.append(tf.placeholder(tf.float32, shape=(None, types_list[i]['dim']))) 265 | tf.concat(batch_data_list, axis=1) 266 | 267 | return batch_data_list, types_list 268 | 269 | 270 | def batch_normalization(batch_data_list, types_list, batch_size): 271 | normalized_data = [] 272 | normalization_parameters = [] 273 | noisy_data = [] 274 | 275 | for i, d in enumerate(batch_data_list): 276 | 277 | observed_data = d 278 | 279 | if types_list[i]['type'] == 'real': 280 | # We transform the data to a gaussian with mean 0 and std 1 281 | data_mean, data_var = tf.nn.moments(observed_data, 0) 282 | data_var = tf.clip_by_value(data_var, 1e-6, 1e20) # Avoid zero values 283 | aux_X = tf.nn.batch_normalization(observed_data, data_mean, data_var, offset=0.0, scale=1.0, 284 | variance_epsilon=1e-6) 285 | 286 | aux_X_noisy = aux_X + tf.random_normal((batch_size, 1), 0, 0.05, dtype=tf.float32) 287 | 288 | normalized_data.append(aux_X) 289 | noisy_data.append(aux_X_noisy) 290 | normalization_parameters.append([data_mean, data_var]) 291 | 292 | # When using log-normal 293 | elif types_list[i]['type'] == 'pos': 294 | 295 | # We transform the log of the data to a gaussian with mean 0 and std 1 296 | observed_data_log = tf.log(1 + observed_data) 297 | data_mean_log, data_var_log = tf.nn.moments(observed_data_log, 0) 298 | data_var_log = tf.clip_by_value(data_var_log, 1e-6, 1e20) # Avoid zero values 299 | aux_X = tf.nn.batch_normalization(observed_data_log, data_mean_log, data_var_log, offset=0.0, scale=1.0, 300 | variance_epsilon=1e-6) 301 | 302 | normalized_data.append(aux_X) 303 | normalization_parameters.append([data_mean_log, data_var_log]) 304 | 305 | elif types_list[i]['type'] == 'count': 306 | 307 | # We transform the log of the data to a gaussian with mean 0 and std 1 308 | observed_data_log = tf.log(1 + observed_data) 309 | data_mean_log, data_var_log = tf.nn.moments(observed_data_log, 0) 310 | data_var_log = tf.clip_by_value(data_var_log, 1e-6, 1e20) # Avoid zero values 311 | aux_X = tf.nn.batch_normalization(observed_data_log, data_mean_log, data_var_log, offset=0.0, scale=1.0, 312 | variance_epsilon=1e-6) 313 | 314 | normalized_data.append(aux_X) 315 | 
normalization_parameters.append([data_mean_log, data_var_log]) 316 | 317 | 318 | else: 319 | # Don't normalize the categorical and ordinal variables 320 | normalized_data.append(d) 321 | normalization_parameters.append(tf.convert_to_tensor([0.0, 1.0], dtype=tf.float32)) # No normalization here 322 | 323 | aux_X_noisy = d + tf.random_normal((batch_size, 1), 0, 0.05, dtype=tf.float32) 324 | noisy_data.append(aux_X_noisy) 325 | 326 | 327 | return normalized_data, normalization_parameters, noisy_data 328 | 329 | 330 | # normalization function 331 | 332 | def normalization_classification(batch_data_list, types_list): 333 | normalized_data = [] 334 | normalization_parameters = [] 335 | 336 | for i in range(len(types_list)): 337 | 338 | observed_data = batch_data_list[:, i] 339 | 340 | if types_list[i]['type'] == 'real': 341 | # We transform the data to a gaussian with mean 0 and std 1 342 | data_mean = np.mean(observed_data) 343 | data_var = moment(observed_data, 2) 344 | data_var = np.clip(data_var, 1e-6, 1e20) 345 | data_std = np.sqrt(data_var) 346 | aux_X = preprocessing.scale(observed_data) 347 | 348 | normalized_data.append(aux_X) 349 | normalization_parameters.append([data_mean, data_std]) 350 | 351 | # When using log-normal 352 | elif types_list[i]['type'] == 'pos': 353 | # #We transform the log of the data to a gaussian with mean 0 and std 1 354 | observed_data = observed_data 355 | data_mean = np.mean(observed_data) 356 | data_var = moment(observed_data, 2) 357 | data_var = np.clip(data_var, 1e-6, 1e20) # Avoid zero values 358 | data_std = np.sqrt(data_var) 359 | 360 | aux_X = preprocessing.scale(observed_data) 361 | 362 | normalized_data.append(aux_X) 363 | normalization_parameters.append([data_mean, data_std]) 364 | 365 | elif types_list[i]['type'] == 'count': 366 | 367 | # Input log of the data 368 | observed_data = observed_data 369 | data_mean = np.mean(observed_data) 370 | data_var = moment(observed_data, 2) 371 | data_var = np.clip(data_var, 1e-6, 1e20) # Avoid zero values 372 | data_std = np.sqrt(data_var) 373 | 374 | aux_X = preprocessing.scale(observed_data) 375 | 376 | normalized_data.append(aux_X) 377 | normalization_parameters.append([data_mean, data_std]) 378 | 379 | else: 380 | # Don't normalize the categorical and ordinal variables 381 | normalized_data.append(observed_data) 382 | normalization_parameters.append([0.0, 1.0]) # No normalization here 383 | 384 | return normalized_data, normalization_parameters 385 | 386 | 387 | 388 | def replicate_data_list(data_list, num_replications): 389 | # data_list: expected to have 1 row 390 | # num_replications: expected to have #rows = nsamples 391 | new_data_list = [] 392 | 393 | for i in range(len(data_list)): 394 | if i == 0: 395 | new_data_list = [np.repeat(data_list[i], num_replications, axis=0)] 396 | else: 397 | new_data_list.append(np.repeat(data_list[i], num_replications, axis=0)) 398 | 399 | return new_data_list 400 | 401 | 402 | # stylised classifier 403 | def f_star(x_tilde, boundary): 404 | y = x_tilde[:,1] > boundary 405 | y = y*1 406 | return y 407 | 408 | 409 | def indices_to_one_hot(data, nb_classes): 410 | """Convert an iterable of indices to one-hot encoded labels.""" 411 | targets = np.array(data).reshape(-1) 412 | return np.eye(nb_classes)[targets] 413 | 414 | 415 | def sequence_mask(pseudo_cat, dim_ord, batch_size): 416 | x = np.linspace(1, dim_ord, dim_ord).reshape(1, -1) 417 | x = ~(np.repeat(x, batch_size, axis=0).T > pseudo_cat).T 418 | x = x * 1 419 | return x 420 | 421 | 422 | def cat_sample(logits): 423 
| u = np.random.uniform(0, 1, logits.shape) 424 | return np.argmax(logits - np.log(-np.log(u)), axis=1) 425 | 426 | 427 | def Compute_LOF(neighbors, x_train, x_test): 428 | # x_test: - np array 429 | # x_test_counterfactual: - np array 430 | # x_train: train data - np array 431 | 432 | clf = LocalOutlierFactor(n_neighbors=neighbors, contamination=0.01, novelty=True) 433 | clf.fit(x_train) 434 | 435 | X_outlier = clf.predict(x_test) 436 | 437 | return X_outlier 438 | 439 | 440 | def Connectedness(x_train, x_counter, number, epsilon, min_samples): 441 | x_counter.shape 442 | 443 | dbscan_list = [] 444 | n, _ = x_counter.shape 445 | 446 | for i in range(n): 447 | density_control = np.r_[x_train[0:number, :], x_counter[i, :].reshape(1, -1)] 448 | density_pred = DBSCAN(eps=epsilon, min_samples=min_samples).fit(density_control) 449 | dbscan_list.append(density_pred.labels_[-1]) 450 | 451 | not_connected = np.array(dbscan_list.count(-1)) / n #count occurcene of (-1) labels & divide by number of test set 452 | 453 | return not_connected, np.array(dbscan_list) 454 | 455 | 456 | def Read_Split_Data(test_size, classifier, data_total, data_total_c, y_true, types_dict, types_dict_c, normalization): 457 | out = dict() 458 | 459 | # out_training: training x and y 460 | # out_test: test x and y 461 | # out_train_pos: x with corresponding positive predicted label on train set 462 | # out_test_counter: x with corresponding negative predicted label on test set 463 | 464 | 465 | # Split into test and train data 466 | train_data, test_data, train_data_c, test_data_c, y_train, y_test = train_test_split(data_total, 467 | data_total_c, 468 | y_true, 469 | random_state=619, 470 | test_size=test_size) 471 | 472 | n_train, _ = np.shape(train_data) 473 | df = np.r_[train_data, test_data] 474 | df_c = np.r_[train_data_c, test_data_c] 475 | 476 | df_norm, df_param = normalization_classification(df, types_dict) 477 | df_norm = np.transpose(np.array(df_norm)) 478 | df_c_norm, df_c_param = normalization_classification(df_c, types_dict_c) 479 | df_c_norm = np.transpose(np.array(df_c_norm)) 480 | 481 | train_data_norm = df_norm[0:n_train, :] 482 | test_data_norm = df_norm[n_train::, :] 483 | train_data_c_norm = df_c_norm[0:n_train, :] 484 | test_data_c_norm = df_c_norm[n_train::, :] 485 | 486 | # Concatenate free and conditioning features 487 | train_concat = np.c_[train_data_c, train_data] 488 | test_concat = np.c_[test_data_c, test_data] 489 | train_concat_norm = np.c_[train_data_c_norm, train_data_norm] 490 | test_concat_norm = np.c_[test_data_c_norm, test_data_norm] 491 | 492 | 493 | if normalization == True: 494 | train_concat_x = train_concat_norm 495 | test_concat_x = test_concat_norm 496 | 497 | # not normalized 498 | train_data_not = train_data 499 | train_data_c_not = train_data_c 500 | train_data_concat_not = np.c_[train_data_c_not, train_data_not] 501 | 502 | test_data_not = test_data 503 | test_data_c_not = test_data_c 504 | test_data_concat_not = np.c_[test_data_c_not, test_data_not] 505 | 506 | # normalized data 507 | train_data = train_data_norm 508 | train_data_c = train_data_c_norm 509 | test_data = test_data_norm 510 | test_data_c = test_data_c_norm 511 | 512 | else: 513 | train_concat_x = train_concat 514 | test_concat_x = test_concat 515 | 516 | 517 | # classifcation model training: Random forest or LR model: use default values 518 | if classifier == 'RandomForest': 519 | clf = RandomForestClassifier(random_state=619) 520 | 521 | param_grid = {'bootstrap': [True], 522 | 'max_depth': [3, 5, 7], 523 | 
'min_samples_leaf': [5], 524 | 'min_samples_split': [4, 10], 525 | 'n_estimators': [50, 100]} 526 | 527 | grid = GridSearchCV(estimator=clf, 528 | param_grid=param_grid, 529 | scoring='roc_auc', 530 | cv=3, 531 | n_jobs=-1, 532 | verbose=2) 533 | 534 | grid.fit(train_concat_x, y_train.reshape(-1)) 535 | clf = grid.best_estimator_ 536 | 537 | 538 | inv_y_train = 1 - y_train 539 | ## grid search 540 | clf_ar = LogisticRegression(max_iter=1000, solver='lbfgs', random_state=619) 541 | grid_ar = GridSearchCV( 542 | clf_ar, param_grid={'C': np.logspace(-4, 3)}, 543 | cv=5, 544 | scoring='roc_auc', 545 | return_train_score=True) 546 | grid_ar.fit(train_concat_x, inv_y_train.reshape(-1)) 547 | clf_ar = grid_ar.best_estimator_ 548 | 549 | 550 | elif classifier == 'SVM': 551 | 552 | clf = SVC(random_state=619) 553 | 554 | tuned_parameters = [{'kernel': ['rbf'], 'C': [0.01, 1, 10]}] 555 | # tuned_parameters = [{'alpha': [0.0001, 0.001]}] 556 | 557 | grid = GridSearchCV(clf, tuned_parameters, cv=3, n_jobs=-1) 558 | grid.fit(train_concat_x, y_train.reshape(-1)) 559 | clf = grid.best_estimator_ 560 | print(grid.cv_results_) 561 | 562 | # for AR algorithm as placeholder 563 | inv_y_train = 1 - y_train 564 | ## grid search 565 | clf_ar = LogisticRegression(max_iter=1000, solver='lbfgs', random_state=619) 566 | grid_ar = GridSearchCV( 567 | clf_ar, param_grid={'C': np.logspace(-4, 3)}, 568 | cv=5, 569 | scoring='roc_auc', 570 | return_train_score=True) 571 | grid_ar.fit(train_concat_x, inv_y_train.reshape(-1)) 572 | clf_ar = grid_ar.best_estimator_ 573 | 574 | 575 | else: 576 | 577 | ## grid search 578 | clf = LogisticRegression(max_iter=1000, solver='lbfgs', random_state=619) 579 | grid = GridSearchCV( 580 | clf, param_grid={'C': np.logspace(-4, 3)}, 581 | cv=5, 582 | scoring='roc_auc', 583 | return_train_score=True) 584 | grid.fit(train_concat_x, y_train.reshape(-1)) 585 | clf = grid.best_estimator_ 586 | 587 | # for AR algorithm 588 | inv_y_train = 1 - y_train 589 | ## grid search 590 | clf_ar = LogisticRegression(max_iter=1000, solver='lbfgs', random_state=619) 591 | grid_ar = GridSearchCV( 592 | clf_ar, param_grid={'C': np.logspace(-4, 3)}, 593 | cv=5, 594 | scoring='roc_auc', 595 | return_train_score=True) 596 | grid_ar.fit(train_concat_x, inv_y_train.reshape(-1)) 597 | clf_ar = grid_ar.best_estimator_ 598 | 599 | 600 | 601 | # THESE GUYS/GURLS WILL NEED OUR HELP (TEST SET) 602 | index_predicted_denied = np.where(clf.predict(test_concat_x) == 1)[0] 603 | 604 | test_data_c_denied = test_data_c[index_predicted_denied, :] 605 | test_data_denied = test_data[index_predicted_denied, :] 606 | y_test_denied = y_test[index_predicted_denied] 607 | test_concat_x_denied = np.c_[test_data_c_denied, test_data_denied] 608 | ncounterfactuals, _ = test_concat_x_denied.shape 609 | 610 | # FROM THESE GUYS WE HAVE POSTIVE RECORD & they have predicted positive record (TRAINING SET) 611 | index_predicted_nodefault = (clf.predict(train_concat_x) == 0) 612 | index_true_nodefault = (y_train.reshape(-1) == 0) 613 | intersection_no = (index_predicted_nodefault * 1 + index_true_nodefault * 1) 614 | index_intersection_no = (intersection_no == 2) #(nodefault + predicted nodefault) index 615 | 616 | train_data_c_pos = train_data_c[index_intersection_no, :] 617 | train_data_pos = train_data[index_intersection_no, :] 618 | train_concat_x_pos = np.c_[train_data_c_pos, train_data_pos] 619 | y_train_pos = y_train[index_intersection_no] 620 | 621 | if normalization == True: 622 | 623 | test_data_denied_not = 
test_data_not[index_predicted_denied, :] 624 | test_data_c_denied_not = test_data_c_not[index_predicted_denied, :] 625 | test_concat_x_denied_not = np.c_[test_data_c_denied_not, test_data_denied_not] 626 | 627 | train_data_c_pos_not = train_data_c_not[index_intersection_no, :] 628 | train_data_pos_not = train_data_not[index_intersection_no, :] 629 | train_concat_x_pos_not = np.c_[train_data_c_pos_not, train_data_pos_not] 630 | 631 | else: 632 | 633 | test_concat_x_denied_not = _ 634 | 635 | test_data_denied_not = _ 636 | test_data_c_denied_not = _ 637 | train_concat_x_pos_not = _ 638 | 639 | test_data_concat_not = _ 640 | train_data_not = _ 641 | train_data_c_not = _ 642 | train_data_concat_not = _ 643 | 644 | 645 | # return 646 | out['training'] = [train_concat_x, train_data, train_data_c, y_train] 647 | out['training_not'] = [train_data_concat_not, train_data_not, train_data_c_not, y_train] 648 | out['test'] = [test_concat_x, y_test] 649 | out['test_not'] = [test_data_concat_not, y_test] 650 | out['test_counter'] = [test_concat_x_denied, test_data_denied, test_data_c_denied, y_test_denied] 651 | out['test_counter_not'] = [test_concat_x_denied_not, test_data_denied_not, test_data_c_denied_not, y_test_denied] 652 | out['train_pos'] = [train_concat_x_pos, y_train_pos] 653 | out['train_pos_not'] = [train_concat_x_pos_not, y_train_pos] 654 | out['normalization_parameters'] = [df_param, df_c_param] 655 | 656 | return ncounterfactuals, clf, out, clf_ar, grid 657 | 658 | 659 | def compute_cdf(data): 660 | # per free feature 661 | # relies on computing histogram first 662 | # num_bins: # bins in histogram 663 | # you can use bin_edges & norm_cdf to plot cdf 664 | 665 | n, p = np.shape(data) 666 | # num_bins = n 667 | norm_cdf = np.zeros((n, p)) 668 | 669 | for j in range(p): 670 | counts, bin_edges = np.histogram(data[:, j], bins=n, normed=True) 671 | cdf = np.cumsum(counts) 672 | norm_cdf[:, j] = cdf / cdf[-1] 673 | # plt.plot (bin_edges[1:], norm_cdf) 674 | 675 | return bin_edges[1:], norm_cdf 676 | 677 | 678 | def max_percentile_shift(norm_cdfs, norm_cdfs_counterfactual): 679 | # (3) in ustun et al 680 | delta_cdfs = np.abs(norm_cdfs - norm_cdfs_counterfactual) 681 | cost = np.max(delta_cdfs, 1) 682 | return cost 683 | 684 | 685 | def total_percentile_shift(norm_cdfs, norm_cdfs_counterfactual): 686 | inv_counterfactual = norm_cdfs_counterfactual 687 | inv = norm_cdfs 688 | ratio = np.abs(inv_counterfactual - inv) 689 | cost = np.sum(ratio, 1) 690 | return cost 691 | 692 | 693 | def total_log_percentile_shift(norm_cdfs, norm_cdfs_counterfactual): 694 | # (4) in ustun et al 695 | inv_counterfactual = np.clip(1-norm_cdfs_counterfactual, 0.01, 0.99) 696 | inv = np.clip(1-norm_cdfs, 0.01, 0.99) 697 | ratio = np.abs(np.log(np.clip((inv_counterfactual/inv), 0.01, 10))) 698 | cost = np.sum(ratio, 1) 699 | return cost 700 | 701 | 702 | def denormalization(norm_para, norm_para_c, samples, samples_c): 703 | 704 | # norm_para & norm_para_c: numpy arrays 705 | # samples: numpy arrays 706 | 707 | n, p = np.shape(samples) 708 | n_c, p_c = np.shape(samples_c) 709 | 710 | norm_samples = np.zeros((n, p)) 711 | norm_samples_c = np.zeros((n_c, p_c)) 712 | 713 | for i in range(p): 714 | norm_samples[:, i] = (samples[:, i] - norm_para[i, 0])/norm_para[i, 1] 715 | 716 | for i in range(p_c): 717 | norm_samples_c[:, i] = (samples_c[:, i] - norm_para_c[i, 0])/norm_para_c[i, 1] 718 | 719 | return norm_samples, norm_samples_c 720 | 721 | 722 | # standardize data 723 | def standardize(data): 724 | scaler = 
StandardScaler() 725 | a = scaler.fit(data) 726 | a = scaler.transform(data) 727 | 728 | return a, scaler 729 | 730 | # reduce dim of data 731 | def reduce_dim(data, dim): 732 | pca = PCA(n_components= dim) 733 | components = pca.fit_transform(data) 734 | return components -------------------------------------------------------------------------------- /code/LaugelEtAl.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from numpy import linalg as LA 3 | from scipy.spatial.distance import cdist 4 | 5 | # rejection sampling algorithm comes from LSE lecture notes 6 | # alternatively see WOLFRAM: http://mathworld.wolfram.com/CirclePointPicking.html 7 | # # http://mathworld.wolfram.com/HyperspherePointPicking.html 8 | 9 | def unit_circumference_coordinates(r, n, coordinates): 10 | # r: radius 11 | # n: number of samples 12 | 13 | x1 = np.random.uniform(-1, 1, n) 14 | x2 = np.random.uniform(-1, 1, n) 15 | index = np.where((x1 ** 2 + x2 ** 2) < 1) # accepted samples 16 | x1 = x1[index] 17 | x2 = x2[index] 18 | # coordinates 19 | x = ((x1) ** 2 - (x2) ** 2) / ((x1) ** 2 + (x2) ** 2) * r 20 | y = (2 * (x1) * (x2)) / ((x1) ** 2 + (x2) ** 2) * r 21 | 22 | a = coordinates[0] 23 | b = coordinates[1] # 1x2 vector 24 | a = a + x 25 | b = b + y 26 | 27 | return a, b 28 | 29 | 30 | def hyper_sphere_coordindates(n_search_samples, x, h, l, p): 31 | 32 | delta_x = np.random.randn(n_search_samples, x.shape[1]) # http://mathworld.wolfram.com/HyperspherePointPicking.html 33 | d = np.random.rand(n_search_samples) * (h - l) + l # length range [l, h) 34 | norm_p = np.linalg.norm(delta_x, ord=p, axis=1) 35 | d_norm = np.divide(d, norm_p).reshape(-1, 1) # rescale/normalize factor 36 | delta_x = np.multiply(delta_x, d_norm) 37 | x_tilde = x + delta_x # x tilde 38 | 39 | return x_tilde, d 40 | 41 | 42 | def Laugel_Search(ncounterfactuals, out, search_samples, clf): 43 | 44 | # this function IS NOT GENERAL: works for "give me credit" 45 | x_tilde_star_list = [] 46 | 47 | # Set parameters 48 | p = 2 49 | 50 | threshold = 200 51 | 52 | for i in range(ncounterfactuals): 53 | 54 | # Test data 55 | test_data_replicated = np.repeat(out['test_counter'][1][i, :].reshape(1, -1), search_samples, axis=0) 56 | test_data_c_replicated = np.repeat(out['test_counter'][2][i, :].reshape(1, -1), search_samples, axis=0) 57 | 58 | l = 0 59 | step = 0.5 60 | h = l + step 61 | 62 | # counter to stop 63 | count = 0 64 | counter_step = 1 65 | 66 | 67 | while True: 68 | 69 | count = count + counter_step 70 | 71 | if (count > threshold) is True: 72 | x_tilde_star = None 73 | break 74 | 75 | # STEP 1 of Algorithm 76 | # sample points on hyper sphere around test point 77 | x_tilde, _ = hyper_sphere_coordindates(search_samples, test_data_replicated, h, l, p) 78 | # one way: #x_tilde = np.ceil(x_tilde); another x_tilde = np.around(x_tilde,1) 79 | x_tilde = np.c_[test_data_c_replicated, x_tilde] 80 | 81 | # STEP 2 of Algorithm 82 | # compute l_1 distance 83 | distances = np.abs((x_tilde - np.c_[test_data_c_replicated, test_data_replicated])).sum(axis=1) 84 | 85 | # counterfactual labels 86 | y_tilde = clf.predict(x_tilde) 87 | cla_index = np.where(y_tilde != 1) 88 | 89 | x_tilde_candidates = x_tilde[cla_index] 90 | candidates_dist = distances[cla_index] 91 | 92 | if len(candidates_dist) == 0: # no candidate generated 93 | l = h 94 | h = l + step 95 | else: # certain candidates generated 96 | min_index = np.argmin(candidates_dist) 97 | x_tilde_star = x_tilde_candidates[min_index] 98 | break 
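                # Growing-sphere style search: the annulus [l, h) around the test point is widened
                # step by step; the first ring containing at least one sample whose prediction flips
                # yields the candidate with minimal l1 distance, and the loop gives up
                # (x_tilde_star = None) once `count` exceeds `threshold`.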
99 | 100 | x_tilde_star_list.append(x_tilde_star) 101 | X_test_counterfactual = np.array(x_tilde_star_list) 102 | 103 | return X_test_counterfactual -------------------------------------------------------------------------------- /code/Loglik.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on 27 05 2019 5 | 6 | @based on Nazabal et al 2018 7 | 8 | List of loglikelihoods for the types of variables considered in this paper. 9 | Basically, we create the different layers needed in the decoder and during the 10 | generation of new samples 11 | 12 | The variable reuse indicates the mode of this functions 13 | - reuse = None -> Decoder implementation 14 | - reuse = True -> Samples generator implementation 15 | 16 | """ 17 | 18 | import tensorflow as tf 19 | import numpy as np 20 | import Helpers 21 | from scipy.special import softmax 22 | from scipy.special import expit 23 | 24 | 25 | def loglik_real(batch_data, list_type, theta, normalization_params, kernel_initializer, name, reuse): 26 | 27 | output=dict() 28 | epsilon = tf.constant(1e-6, dtype=tf.float32) 29 | 30 | #Data outputs 31 | data = batch_data 32 | 33 | data_mean, data_var = normalization_params 34 | data_var = tf.clip_by_value(data_var, epsilon, np.inf) 35 | 36 | est_mean, est_var = theta 37 | est_var = tf.clip_by_value(tf.nn.softplus(est_var), epsilon, 1.0) #Must be positive 38 | 39 | # Affine transformation of the parameters 40 | est_mean = tf.sqrt(data_var)*est_mean + data_mean 41 | est_var = data_var*est_var 42 | 43 | #Compute loglik 44 | log_p_x = -0.5 * tf.reduce_sum(tf.squared_difference(data, est_mean)/est_var, 1) - int(list_type['dim'])*0.5*tf.log(2* np.pi) - 0.5*tf.reduce_sum(tf.log(est_var),1) 45 | 46 | #Outputs 47 | output['log_p_x'] = log_p_x 48 | output['params'] = [est_mean, est_var] 49 | output['samples'] = tf.contrib.distributions.Normal(est_mean, tf.sqrt(est_var)).sample() 50 | 51 | return output 52 | 53 | def loglik_pos(batch_data, list_type, theta, normalization_params, kernel_initializer, name, reuse): 54 | 55 | #Log-normal distribution 56 | output = dict() 57 | epsilon = tf.constant(1e-6, dtype=tf.float32) 58 | 59 | #Data outputs 60 | data_mean_log, data_var_log = normalization_params 61 | data_var_log = tf.clip_by_value(data_var_log, epsilon, np.inf) 62 | 63 | data = batch_data 64 | data_log = tf.log(1.0 + data) 65 | 66 | est_mean, est_var = theta 67 | est_var = tf.clip_by_value(tf.nn.softplus(est_var), epsilon, 1.0) 68 | 69 | # Affine transformation of the parameters 70 | est_mean = tf.sqrt(data_var_log)*est_mean + data_mean_log 71 | est_var = data_var_log*est_var 72 | 73 | #Compute loglik 74 | log_p_x = -0.5 * tf.reduce_sum(tf.squared_difference(data_log,est_mean)/est_var,1) \ 75 | - 0.5*tf.reduce_sum(tf.log(2*np.pi*est_var),1) - tf.reduce_sum(data_log,1) 76 | 77 | output['log_p_x'] = log_p_x 78 | output['params'] = [est_mean, est_var] 79 | output['samples'] = tf.exp(tf.contrib.distributions.Normal(est_mean,tf.sqrt(est_var)).sample()) - 1.0 80 | 81 | return output 82 | 83 | def loglik_cat(batch_data, list_type, theta, normalization_params, kernel_initializer, name, reuse): 84 | 85 | output=dict() 86 | 87 | #Data outputs 88 | data = batch_data 89 | 90 | log_pi = theta 91 | 92 | #Compute loglik 93 | log_p_x = -tf.nn.softmax_cross_entropy_with_logits(logits=log_pi,labels=data) 94 | 95 | output['log_p_x'] = log_p_x 96 | output['params'] = log_pi 97 | output['samples'] = 
tf.one_hot(tf.contrib.distributions.Categorical(probs=tf.nn.softmax(log_pi)).sample(),depth=int(list_type['dim'])) 98 | 99 | return output 100 | 101 | def loglik_ordinal(batch_data, list_type, theta, normalization_params, kernel_initializer, name, reuse): 102 | 103 | output=dict() 104 | epsilon = tf.constant(1e-6, dtype=tf.float32) 105 | 106 | #Data outputs 107 | data = batch_data 108 | batch_size = tf.shape(data)[0] 109 | 110 | # We need to force that the outputs of the network increase with the categories 111 | partition_param, mean_param = theta 112 | mean_value = tf.reshape(mean_param,[-1,1]) 113 | theta_values = tf.cumsum(tf.clip_by_value(tf.nn.softplus(partition_param), epsilon, 1e20),1) 114 | sigmoid_est_mean = tf.nn.sigmoid(theta_values - mean_value) 115 | mean_probs = tf.concat([sigmoid_est_mean,tf.ones([batch_size,1],tf.float32)],1) - tf.concat([tf.zeros([batch_size,1],tf.float32),sigmoid_est_mean],1) 116 | 117 | #Code needed to compute samples from an ordinal distribution 118 | true_values = tf.one_hot(tf.reduce_sum(tf.cast(data,tf.int32),1)-1,int(list_type['dim'])) 119 | 120 | #Compute loglik 121 | log_p_x = tf.log(tf.clip_by_value(tf.reduce_sum(mean_probs*true_values,1),epsilon,1e20)) 122 | 123 | output['log_p_x'] = log_p_x 124 | output['params'] = mean_probs 125 | output['samples'] = tf.sequence_mask(1+tf.contrib.distributions.Categorical(logits=tf.log(tf.clip_by_value(mean_probs,epsilon,1e20))).sample(), int(list_type['dim']),dtype=tf.float32) 126 | 127 | return output 128 | 129 | def loglik_count(batch_data, list_type, theta, normalization_params, kernel_initializer, name, reuse): 130 | 131 | output=dict() 132 | epsilon = tf.constant(1e-6, dtype=tf.float32) 133 | 134 | #Data outputs 135 | data = batch_data 136 | 137 | est_lambda = theta 138 | est_lambda = tf.clip_by_value(tf.nn.softplus(est_lambda), epsilon, 1e20) 139 | 140 | log_p_x = -tf.reduce_sum(tf.nn.log_poisson_loss(targets=data, log_input=tf.log(est_lambda), compute_full_loss=True), 1) 141 | 142 | output['log_p_x'] = log_p_x 143 | output['params'] = est_lambda 144 | output['samples'] = tf.contrib.distributions.Poisson(est_lambda).sample() 145 | 146 | return output 147 | 148 | 149 | def loglik_test_real(theta, normalization_params, list_type): 150 | 151 | output = dict() 152 | epsilon = 1e-6 153 | 154 | # Data outputs 155 | data_mean, data_var = normalization_params 156 | data_var = np.clip(data_var, epsilon, np.inf) 157 | 158 | # Estimated parameters 159 | est_mean, est_var = theta 160 | soft_plus_est_var = np.log(1 + np.exp(-np.abs(est_var))) + np.maximum(est_var, 0) 161 | est_var = np.clip(soft_plus_est_var, epsilon, 1.0) # Must be positive 162 | 163 | # Affine transformation of the parameters 164 | est_mean = np.sqrt(data_var) * est_mean + data_mean 165 | est_var = data_var * est_var 166 | 167 | # Outputs 168 | output['samples'] = np.random.normal(est_mean, np.sqrt(est_var)) 169 | output['params'] = [est_mean, est_var] 170 | 171 | return output 172 | 173 | 174 | def loglik_test_pos(theta, normalization_params, list_type): 175 | 176 | # Log-normal distribution 177 | output = dict() 178 | epsilon = 1e-6 179 | 180 | # Data outputs 181 | data_mean_log, data_var_log = normalization_params 182 | data_var_log = np.clip(data_var_log, epsilon, np.inf) 183 | 184 | est_mean, est_var = theta 185 | soft_plus_est_var = np.log(1 + np.exp(-np.abs(est_var))) + np.maximum(est_var, 0) 186 | est_var = np.clip(soft_plus_est_var, epsilon, 1.0) 187 | 188 | # Affine transformation of the parameters 189 | est_mean = np.sqrt(data_var_log) 
* est_mean + data_mean_log 190 | est_var = data_var_log * est_var 191 | 192 | output['samples'] = np.exp(np.random.normal(est_mean, np.sqrt(est_var))) - 1.0 193 | output['params'] = [est_mean, est_var] 194 | 195 | return output 196 | 197 | 198 | def loglik_test_cat(theta, normalization_params, list_type): 199 | output = dict() 200 | 201 | # Data outputs 202 | log_pi = theta 203 | 204 | est_cat = Helpers.cat_sample(log_pi) 205 | estimated_samples = Helpers.indices_to_one_hot(est_cat, int(list_type['dim'])) 206 | 207 | output['samples'] = estimated_samples 208 | output['params'] = log_pi 209 | 210 | return output 211 | 212 | 213 | def loglik_test_ordinal(theta, normalization_params, list_type): 214 | output = dict() 215 | epsilon = 1e-6 216 | 217 | # We need to force that the outputs of the network increase with the categories 218 | partition_param, mean_param = theta 219 | 220 | batch_size = mean_param.shape[0] 221 | 222 | mean_value = mean_param.reshape(-1, 1) 223 | soft_plus_partition_param = np.log(1 + np.exp(-np.abs(partition_param))) + np.maximum(partition_param, 0) 224 | 225 | theta_values = np.cumsum(np.clip(soft_plus_partition_param, epsilon, 1e20), axis=1) 226 | sigmoid_est_mean = expit(theta_values - mean_value) 227 | mean_probs = np.c_[sigmoid_est_mean, np.ones(batch_size)] - np.c_[np.zeros(batch_size), sigmoid_est_mean] 228 | mean_probs = np.clip(mean_probs, epsilon, 1e20) 229 | 230 | mean_logits = np.log(mean_probs/(1-mean_probs)) 231 | 232 | pseudo_cat = 1 + Helpers.cat_sample(mean_logits) 233 | 234 | output['samples'] = Helpers.sequence_mask(pseudo_cat, batch_size, int(list_type['dim'])) 235 | output['params'] = mean_probs 236 | 237 | return output 238 | 239 | 240 | 241 | def loglik_test_count(theta, normalization_params, list_type): 242 | output = dict() 243 | epsilon = 1e-6 244 | 245 | est_lambda = theta 246 | soft_plus_lambda = np.log(1 + np.exp(-np.abs(est_lambda))) + np.maximum(est_lambda, 0) 247 | est_lambda = np.clip(soft_plus_lambda, epsilon, 1e20) 248 | 249 | output['samples'] = np.random.poisson(est_lambda) 250 | output['params'] = est_lambda 251 | 252 | return output 253 | 254 | -------------------------------------------------------------------------------- /code/Sampling.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import time 4 | #import csv 5 | #import pandas as pd 6 | #import matplotlib 7 | #import random 8 | #from matplotlib import pyplot as plt 9 | #import seaborn as sns 10 | #from numpy import linalg as LA 11 | #from scipy.spatial.distance import cdist 12 | #from sklearn.model_selection import train_test_split 13 | #from sklearn.neighbors import LocalOutlierFactor 14 | #from sklearn.cluster import DBSCAN 15 | #from sklearn import preprocessing 16 | #from sklearn.linear_model import LogisticRegression 17 | #from sklearn.ensemble import RandomForestClassifier 18 | from sklearn.preprocessing import StandardScaler 19 | 20 | from tqdm import tqdm, tqdm_notebook 21 | 22 | from recourse.builder import RecourseBuilder 23 | from recourse.builder import ActionSet 24 | 25 | # import functions 26 | import Helpers 27 | import Evaluation 28 | import Graph 29 | 30 | #import Encoder 31 | #import Decoder 32 | #import Generator 33 | #import Loglik 34 | #import LaugelEtAl 35 | np.random.seed(619) 36 | 37 | def print_loss(epoch, start_time, avg_loss, avg_KL_s, avg_KL_z): 38 | print("Epoch: [%2d] time: %4.4f, train_loglik: %.8f, KL_z: %.8f, KL_s: %.8f, ELBO: %.8f" 39 | % (epoch, 
time.time() - start_time, avg_loss, avg_KL_z, avg_KL_s, avg_loss - avg_KL_z - avg_KL_s)) 40 | 41 | # -----------------------------------------------------------------------------------# 42 | ############################# Running the C-CHVAE search ########################## 43 | # -----------------------------------------------------------------------------------# 44 | 45 | def sampling(settings, types_dict, types_dict_c, out, ncounterfactuals, clf, n_batches_train, n_samples_train, k, n_input, degree_active): 46 | 47 | argvals = settings.split() 48 | args = Helpers.getArgs(argvals) 49 | 50 | # Creating graph 51 | sess_HVAE = tf.Graph() 52 | 53 | with sess_HVAE.as_default(): 54 | # args.model_name: excluded 55 | tf_nodes = Graph.C_CHVAE_graph(args.types_file, args.types_file_c, 56 | learning_rate=1e-3, z_dim=args.dim_latent_z, 57 | y_dim=args.dim_latent_y, s_dim=args.dim_latent_s, 58 | y_dim_partition=args.dim_latent_y_partition, nsamples=1000, p=2) 59 | 60 | # start session 61 | with tf.Session(graph=sess_HVAE) as session: 62 | # Add ops to save and restore all the variables. 63 | saver = tf.train.Saver() 64 | print('Initizalizing Variables ...') 65 | tf.global_variables_initializer().run() 66 | 67 | # -----------------------------------------------------------------------------------# 68 | # Apply on training data 69 | 70 | print('Training the CHVAE ...') 71 | if (args.train == 1): 72 | 73 | start_time = time.time() 74 | # Training cycle 75 | 76 | loglik_epoch = [] 77 | KL_s_epoch = [] 78 | KL_z_epoch = [] 79 | for epoch in tqdm(range(args.epochs)): 80 | avg_loss = 0. 81 | avg_KL_s = 0. 82 | avg_KL_z = 0. 83 | samples_list = [] 84 | p_params_list = [] 85 | q_params_list = [] 86 | log_p_x_total = [] 87 | 88 | # Annealing of Gumbel-Softmax parameter 89 | tau = np.max([1.0 - 0.001 * epoch, 1e-3]) 90 | 91 | # Randomize the data in the mini-batches 92 | train_data = out['training'][1] 93 | train_data_c = out['training'][2] 94 | random_perm = np.random.permutation(range(np.shape(train_data)[0])) 95 | train_data_aux = train_data[random_perm, :] 96 | train_data_aux_c = train_data_c[random_perm, :] 97 | 98 | for i in range(n_batches_train): 99 | # Create inputs for the feed_dict 100 | data_list = Helpers.next_batch(train_data_aux, types_dict, args.batch_size, index_batch=i) # DONE 101 | data_list_c = Helpers.next_batch(train_data_aux_c, types_dict_c, args.batch_size, index_batch=i) # DONE 102 | 103 | # Create feed dictionary 104 | feedDict = {i: d for i, d in zip(tf_nodes['ground_batch'], data_list)} 105 | feedDict.update({i: d for i, d in zip(tf_nodes['ground_batch_c'], data_list_c)}) 106 | feedDict[tf_nodes['tau_GS']] = tau 107 | feedDict[tf_nodes['batch_size']] = args.batch_size 108 | 109 | # Running VAE 110 | _, X_list, loss, KL_z, KL_s, samples, log_p_x, p_params, q_params = session.run( 111 | [tf_nodes['optim'], 112 | tf_nodes['X'], 113 | tf_nodes['loss_re'], 114 | tf_nodes['KL_z'], 115 | tf_nodes['KL_s'], 116 | tf_nodes['samples'], 117 | tf_nodes['log_p_x'], 118 | tf_nodes['p_params'], 119 | tf_nodes['q_params']], 120 | feed_dict=feedDict) 121 | 122 | # Collect all samples, distirbution parameters and logliks in lists 123 | if i == 0: 124 | samples_list = [samples] 125 | p_params_list = [p_params] 126 | q_params_list = [q_params] 127 | log_p_x_total = [log_p_x] 128 | else: 129 | samples_list.append(samples) 130 | p_params_list.append(p_params) 131 | q_params_list.append(q_params) 132 | log_p_x_total.append(log_p_x) 133 | 134 | # Compute average loss 135 | avg_loss += np.mean(loss) 136 | 
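                    # Running sums over the mini-batches; print_loss later divides by n_batches_train
                    # and reports the ELBO as loglik - KL_z - KL_s.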
avg_KL_s += np.mean(KL_s) 137 | avg_KL_z += np.mean(KL_z) 138 | 139 | # Concatenate samples in arrays 140 | s_total, z_total, y_total, est_data = Helpers.samples_concatenation(samples_list) 141 | 142 | # Transform discrete variables back to the original values 143 | train_data_transformed = Helpers.discrete_variables_transformation( 144 | train_data_aux[:n_batches_train * args.batch_size, :], types_dict) 145 | est_data_transformed = Helpers.discrete_variables_transformation(est_data, types_dict) 146 | 147 | # Create global dictionary of the distribution parameters 148 | p_params_complete = Helpers.p_distribution_params_concatenation(p_params_list, # DONE 149 | types_dict, 150 | args.dim_latent_z, 151 | args.dim_latent_s) 152 | 153 | q_params_complete = Helpers.q_distribution_params_concatenation(q_params_list, # DONE 154 | args.dim_latent_z, 155 | args.dim_latent_s) 156 | 157 | # Compute mean and mode of our loglik models: these correspond to the estimated values 158 | loglik_mean, loglik_mode = Helpers.statistics(p_params_complete['x'], types_dict) # DONE 159 | 160 | # Try this for the errors 161 | error_train_mean = Helpers.error_computation(train_data_transformed, loglik_mean, types_dict) 162 | error_train_mode = Helpers.error_computation(train_data_transformed, loglik_mode, types_dict) 163 | error_train_samples = Helpers.error_computation(train_data_transformed, est_data_transformed, types_dict) 164 | 165 | # Display logs per epoch step 166 | if epoch % args.display == 0: 167 | print_loss(epoch, start_time, avg_loss / n_batches_train, avg_KL_s / n_batches_train, 168 | avg_KL_z / n_batches_train) 169 | print("") 170 | 171 | # Plot evolution of test loglik 172 | loglik_per_variable = np.sum(np.concatenate(log_p_x_total, 1), 1) / n_samples_train 173 | 174 | loglik_epoch.append(loglik_per_variable) 175 | 176 | # -----------------------------------------------------------------------------------# 177 | # Apply on test data 178 | 179 | for i in range(1): 180 | samples_test_list = [] 181 | test_params_list = [] 182 | log_p_x_test_list = [] 183 | data_c_list = [] 184 | 185 | test_data_counter = out['test_counter'][1] 186 | test_data_c_counter = out['test_counter'][2] 187 | y_test_counter = out['test_counter'][3] 188 | n_samples_test = test_data_counter.shape[0] 189 | 190 | # Create test minibatch 191 | data_list = Helpers.next_batch(test_data_counter, types_dict, n_samples_test, index_batch=i) 192 | data_list_c = Helpers.next_batch(test_data_c_counter, types_dict_c, n_samples_test, index_batch=i) # DONE 193 | 194 | # Constant Gumbel-Softmax parameter (where we have finished the annealing 195 | tau = 1e-3 196 | 197 | # Create feed dictionary 198 | feedDict = {i: d for i, d in zip(tf_nodes['ground_batch'], data_list)} 199 | feedDict.update({i: d for i, d in zip(tf_nodes['ground_batch_c'], data_list_c)}) 200 | feedDict[tf_nodes['tau_GS']] = tau 201 | feedDict[tf_nodes['batch_size']] = ncounterfactuals # n_samples_test 202 | 203 | # Get samples from the generator function (computing the mode of all distributions) 204 | samples_test, log_p_x_test, test_params, theta_test, normalization_params_test, X, delta_kl = session.run( 205 | [tf_nodes['samples_test'], 206 | tf_nodes['log_p_x_test'], 207 | tf_nodes['test_params'], 208 | tf_nodes['theta_test'], 209 | tf_nodes['normalization_params'], 210 | tf_nodes['X'], 211 | tf_nodes['delta_kl']], 212 | feed_dict=feedDict) 213 | 214 | samples_test_list.append(samples_test) 215 | test_params_list.append(test_params) 216 | log_p_x_test_list.append(log_p_x_test) 
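            # All denied test observations are pushed through the generator in a single batch; the
            # drawn samples are concatenated and mapped back to the original discrete encoding below.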
217 | data_c_list.append(data_list_c) 218 | 219 | # Concatenate samples in arrays 220 | s_total_test, z_total_test, y_total_test, samples_total_test = Helpers.samples_concatenation(samples_test_list) 221 | 222 | # Transform discrete variables back to the original values 223 | est_samples_transformed = Helpers.discrete_variables_transformation(samples_total_test, types_dict) 224 | 225 | # -----------------------------------------------------------------------------------# 226 | # Find k Attainable Counterfactuals 227 | print('[*] Find Attainable Counterfactuals...') 228 | 229 | counter_batch_size = 1 # counterfactual batch size (i.e. look for counterfactuals one by one) 230 | data_concat = [] 231 | data_concat_c = [] 232 | counterfactuals = [] 233 | latent_tilde = [] 234 | latent = [] 235 | 236 | search_samples = args.search_samples 237 | p = args.norm_latent_space 238 | 239 | for i in tqdm(range(ncounterfactuals)): 240 | 241 | s = (k, n_input) # preallocate k spots; # inputs 242 | sz = (k, args.dim_latent_z) 243 | s = np.zeros(s) 244 | sz = np.zeros(sz) 245 | ik = 0 # counter 246 | 247 | l = 0 248 | step = args.step_size 249 | 250 | x_adv, y_adv, z_adv, d_adv = None, None, None, None 251 | 252 | 253 | #scale test observations 254 | scaled_test, scaler_test = Helpers.standardize(test_data_counter) 255 | 256 | # get one test observation 257 | data_list = Helpers.next_batch(test_data_counter, types_dict, counter_batch_size, index_batch=i) 258 | data_list_c = Helpers.next_batch(test_data_c_counter, types_dict_c, counter_batch_size, index_batch=i) 259 | hat_y_test = np.repeat(y_test_counter[i] * 1, search_samples, axis=0) 260 | test_data_c_replicated = np.repeat(test_data_c_counter[i, :].reshape(1, -1), search_samples, axis=0) 261 | replicated_scaled_test = np.repeat(scaled_test[i, :].reshape(1, -1), search_samples, axis=0) 262 | 263 | 264 | # get replicated observations (observation replicated nsamples times) 265 | #replicated_scaled_test = Helpers.replicate_data_list(data_list_scaled, search_samples) 266 | replicated_data_list = Helpers.replicate_data_list(data_list, search_samples) 267 | replicated_data_list_c = Helpers.replicate_data_list(data_list_c, search_samples) 268 | replicated_z = np.repeat(z_total_test[i].reshape(-1, args.dim_latent_z), search_samples, axis=0) 269 | 270 | h = l + step 271 | # counter to stop 272 | count = 0 273 | counter_step = 1 274 | max_step = 500 275 | 276 | while True: 277 | 278 | count = count + counter_step 279 | 280 | if (count > max_step) == True: 281 | sz = None 282 | s = None 283 | z = z_total_test[i].reshape(-1, args.dim_latent_z) 284 | break 285 | 286 | if degree_active == 1: #choose all latent features for search 287 | 288 | delta_z = np.random.randn(search_samples, replicated_z.shape[1]) # http://mathworld.wolfram.com/HyperspherePointPicking.html 289 | d = np.random.rand(search_samples) * (h - l) + l # length range [l, h) 290 | norm_p = np.linalg.norm(delta_z, ord=p, axis=1) 291 | d_norm = np.divide(d, norm_p).reshape(-1, 1) # rescale/normalize factor 292 | delta_z = np.multiply(delta_z, d_norm) 293 | z_tilde = replicated_z + delta_z # z tilde 294 | 295 | else: 296 | 297 | delta_z = np.random.randn(search_samples, replicated_z.shape[1]) # http://mathworld.wolfram.com/HyperspherePointPicking.html 298 | d = np.random.rand(search_samples) * (h - l) + l # length range [l, h) 299 | norm_p = np.linalg.norm(delta_z, ord=p, axis=1) 300 | d_norm = np.divide(d, norm_p).reshape(-1, 1) # rescale/normalize factor 301 | delta_z = np.multiply(delta_z, d_norm) 302 | 
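                        # Only the most informative latent coordinates are perturbed in this branch:
                        # delta_kl[3] (returned by the graph) acts as a 0/1 mask over the latent
                        # dimensions, so the multiplication below zeroes the remaining entries of delta_z.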
303 | mask = np.tile(delta_kl[3][0, :] * 1, 304 | (search_samples, 1)) # only alter most important latent features 305 | delta_z = np.multiply(delta_z, mask) 306 | 307 | z_tilde = replicated_z + delta_z 308 | 309 | 310 | # create feed dictionary 311 | feedDict = {i: d for i, d in zip(tf_nodes['ground_batch'], replicated_data_list)} 312 | feedDict.update({i: d for i, d in zip(tf_nodes['ground_batch_c'], replicated_data_list_c)}) 313 | feedDict[tf_nodes['samples_z']] = z_tilde 314 | feedDict[tf_nodes['tau_GS']] = tau 315 | feedDict[tf_nodes['batch_size']] = search_samples 316 | 317 | theta_perturbed, samples_perturbed = session.run([tf_nodes['theta_perturbed'], 318 | tf_nodes['samples_perturbed']], feed_dict=feedDict) 319 | 320 | x_tilde, params_x_perturbed = Evaluation.loglik_evaluation_test(X_list, 321 | theta_perturbed, 322 | normalization_params_test, 323 | types_dict) 324 | x_tilde = np.concatenate(x_tilde, axis=1) 325 | scaled_tilde = scaler_test.transform(x_tilde) 326 | d_scale = np.sum(np.abs(scaled_tilde - replicated_scaled_test), axis=1) 327 | 328 | x_tilde = np.c_[test_data_c_replicated, x_tilde] 329 | y_tilde = clf.predict(x_tilde) 330 | 331 | indices_adv = np.where(y_tilde == 0)[0] 332 | 333 | if len(indices_adv) == 0: # no candidate generated 334 | l = h 335 | h = l + step 336 | elif all(s[k - 1, :] == 0): # not k candidates generated 337 | 338 | indx = indices_adv[np.argmin(d_scale[indices_adv])] 339 | assert (y_tilde[indx] != 1) 340 | 341 | s[ik, :] = x_tilde[indx, :] 342 | sz[ik, :] = z_tilde[indx, :] 343 | z = z_total_test[i].reshape(-1, args.dim_latent_z) 344 | 345 | ik = ik + 1 # up the count 346 | l = h 347 | h = l + step 348 | else: # k candidates genereated 349 | break 350 | 351 | data_concat.append(np.concatenate(data_list, axis=1)) 352 | data_concat_c.append(np.concatenate(data_list_c, axis=1)) 353 | counterfactuals.append(s) 354 | latent_tilde.append(sz) 355 | latent.append(z) 356 | 357 | cchvae_counterfactuals = np.array(counterfactuals) 358 | return cchvae_counterfactuals 359 | 360 | -------------------------------------------------------------------------------- /data/givme/give_me_types.csv: -------------------------------------------------------------------------------- 1 | type,dim,nclass 2 | pos,1, 3 | count,1, 4 | pos,1, 5 | pos,1, 6 | count,1, 7 | count,1, 8 | count,1, 9 | count,1, 10 | -------------------------------------------------------------------------------- /data/givme/give_me_types_c.csv: -------------------------------------------------------------------------------- 1 | type,dim,nclass 2 | count,1, 3 | count,1, 4 | -------------------------------------------------------------------------------- /data/heloc/heloc_types.csv: -------------------------------------------------------------------------------- 1 | type,dim,nclass 2 | real,1, 3 | real,1, 4 | real,1, 5 | real,1, 6 | real,1, 7 | real,1, 8 | real,1, 9 | real,1, 10 | real,1, 11 | real,1, 12 | real,1, 13 | real,1, 14 | real,1, 15 | real,1, 16 | real,1, 17 | real,1, 18 | real,1, 19 | -------------------------------------------------------------------------------- /data/heloc/heloc_types_alt.csv: -------------------------------------------------------------------------------- 1 | type,dim,nclass 2 | count,1, 3 | count,1, 4 | count,1, 5 | count,1, 6 | count,1, 7 | count,1, 8 | count,1, 9 | count,1, 10 | count,1, 11 | count,1, 12 | count,1, 13 | count,1, 14 | count,1, 15 | count,1, 16 | count,1, 17 | count,1, 18 | count,1, 19 | 
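Throughout the code above, each entry of `types_list` is indexed with the keys `'type'`, `'dim'` and `'nclass'`, which mirror the header row of these *types*.csv files. Below is a minimal sketch of loading such a file into that structure, assuming a hypothetical `read_types` helper (the repository's own loader is not shown in this excerpt):

```python
import csv

def read_types(path):
    # Every CSV row becomes a dict such as {'type': 'count', 'dim': '1', 'nclass': ''},
    # matching accesses like types_list[i]['type'] and int(list_type['dim']) above.
    with open(path) as f:
        return [dict(row) for row in csv.DictReader(f)]

types_list = read_types('data/heloc/heloc_types_alt.csv')
print(types_list[0]['type'], int(types_list[0]['dim']))
```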
-------------------------------------------------------------------------------- /data/heloc/heloc_types_c_alt.csv: -------------------------------------------------------------------------------- 1 | type,dim,nclass 2 | real,1, 3 | real,1, 4 | real,1, 5 | -------------------------------------------------------------------------------- /preprocessing/Preprocessing_GiveMeSomeCredit.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 11, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "import pandas as pd\n", 11 | "import matplotlib\n", 12 | "from matplotlib import pyplot as plt" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 17, 18 | "metadata": {}, 19 | "outputs": [ 20 | { 21 | "name": "stdout", 22 | "output_type": "stream", 23 | "text": [ 24 | "Index(['SeriousDlqin2yrs', 'RevolvingUtilizationOfUnsecuredLines', 'age',\n", 25 | " 'NumberOfTime30-59DaysPastDueNotWorse', 'DebtRatio', 'MonthlyIncome',\n", 26 | " 'NumberOfOpenCreditLinesAndLoans', 'NumberOfTimes90DaysLate',\n", 27 | " 'NumberRealEstateLoansOrLines', 'NumberOfTime60-89DaysPastDueNotWorse',\n", 28 | " 'NumberOfDependents'],\n", 29 | " dtype='object')\n" 30 | ] 31 | } 32 | ], 33 | "source": [ 34 | "# Credit Data Processing\n", 35 | "# For further info on constraints imposed below: see also appendix in Ustun et al (2018)\n", 36 | "raw_df = pd.read_csv('cs-training.txt')\n", 37 | "processed_df = raw_df\n", 38 | "\n", 39 | "# drop NAs & unnamed column & convert boolean to numeric\n", 40 | "processed_df = processed_df.dropna()\n", 41 | "processed_df = processed_df.drop(columns='Unnamed: 0')\n", 42 | "processed_df = processed_df + 0 \n", 43 | "processed_df = processed_df.loc[processed_df['age']<88]\n", 44 | "\n", 45 | "# look at column names\n", 46 | "print(processed_df.columns)" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 18, 52 | "metadata": {}, 53 | "outputs": [ 54 | { 55 | "name": "stdout", 56 | "output_type": "stream", 57 | "text": [ 58 | "Index(['RevolvingUtilizationOfUnsecuredLines',\n", 59 | " 'NumberOfTime30-59DaysPastDueNotWorse', 'DebtRatio', 'MonthlyIncome',\n", 60 | " 'NumberOfOpenCreditLinesAndLoans', 'NumberOfTimes90DaysLate',\n", 61 | " 'NumberRealEstateLoansOrLines', 'NumberOfTime60-89DaysPastDueNotWorse'],\n", 62 | " dtype='object')\n" 63 | ] 64 | } 65 | ], 66 | "source": [ 67 | "# Labels, protected & free featuers\n", 68 | "# labels\n", 69 | "epsilon = 1e-4\n", 70 | "# we clip 0 to avoid evaluation errors when using log normal likelihood\n", 71 | "\n", 72 | "labels = processed_df[processed_df.columns[0]]\n", 73 | "labels.columns = [processed_df.columns[0]]\n", 74 | "# conditioning set/protected set\n", 75 | "conditionals = processed_df[[processed_df.columns[2], processed_df.columns[10]]]\n", 76 | "conditionals.columns = [processed_df.columns[2], processed_df.columns[10]]\n", 77 | "# free features\n", 78 | "free = processed_df.drop(columns=[processed_df.columns[0], processed_df.columns[2], processed_df.columns[10]])\n", 79 | "free[free.columns[0]] = np.clip(free.values[:,0], epsilon, 1e20)\n", 80 | "free[free.columns[2]] = np.clip(free.values[:,2], epsilon, 1e20)\n", 81 | "free[free.columns[3]] = np.clip(free.values[:,3], epsilon, 1e20)\n", 82 | "print(free.columns)" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": 19, 88 | "metadata": {}, 89 | "outputs": [], 90 | "source": [ 91 | "# Save to 
CSV\n", 92 | "#free.to_csv('give_me_x_35.csv', header = False, index = False)\n", 93 | "#conditionals.to_csv('give_me_x_c_35.csv', header = False, index = False)\n", 94 | "#labels.to_csv('give_me_y_35.csv', header = False, index = False)" 95 | ] 96 | } 97 | ], 98 | "metadata": { 99 | "kernelspec": { 100 | "display_name": "Python 3", 101 | "language": "python", 102 | "name": "python3" 103 | }, 104 | "language_info": { 105 | "codemirror_mode": { 106 | "name": "ipython", 107 | "version": 3 108 | }, 109 | "file_extension": ".py", 110 | "mimetype": "text/x-python", 111 | "name": "python", 112 | "nbconvert_exporter": "python", 113 | "pygments_lexer": "ipython3", 114 | "version": "3.6.8" 115 | } 116 | }, 117 | "nbformat": 4, 118 | "nbformat_minor": 2 119 | } 120 | -------------------------------------------------------------------------------- /preprocessing/Preprocessing_Heloc.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "import pandas as pd\n", 11 | "import matplotlib\n", 12 | "from matplotlib import pyplot as plt" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 2, 18 | "metadata": {}, 19 | "outputs": [], 20 | "source": [ 21 | "# Credit Data Processing\n", 22 | "# For further info on constraints imposed below: see also appendix in Ustun et al (2018)\n", 23 | "raw_df = pd.read_csv('heloc.csv')\n", 24 | "processed_df = raw_df\n", 25 | "# drop NAs & unnamed column & convert boolean to numeric & only keep positive records\n", 26 | "processed_df = processed_df.dropna()\n", 27 | "processed_df.columns\n", 28 | "d = {'Good':0,'Bad':1}\n", 29 | "processed_df['RiskPerformance'] = processed_df['RiskPerformance'].replace(d)\n", 30 | "prcessed_df = processed_df + 0" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 3, 36 | "metadata": {}, 37 | "outputs": [], 38 | "source": [ 39 | "# dropped due to many missing (aka negative) observations\n", 40 | "#processed_df = processed_df.drop(columns=['MSinceMostRecentDelq', 'NetFractionInstallBurden', 'MSinceMostRecentTradeOpen', 'NumTradesOpeninLast12M', 'NumInstallTradesWBalance', 'NumInqLast6Mexcl7days', 'MaxDelq2PublicRecLast12M', 'MaxDelqEver']) \n", 41 | "processed_df = processed_df.drop(columns=['MSinceMostRecentDelq', 'MSinceMostRecentInqexcl7days', 'NetFractionInstallBurden'])" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 4, 47 | "metadata": {}, 48 | "outputs": [ 49 | { 50 | "data": { 51 | "text/plain": [ 52 | "Index(['RiskPerformance', 'ExternalRiskEstimate', 'MSinceOldestTradeOpen',\n", 53 | " 'MSinceMostRecentTradeOpen', 'AverageMInFile', 'NumSatisfactoryTrades',\n", 54 | " 'NumTrades60Ever2DerogPubRec', 'NumTrades90Ever2DerogPubRec',\n", 55 | " 'PercentTradesNeverDelq', 'MaxDelq2PublicRecLast12M', 'MaxDelqEver',\n", 56 | " 'NumTotalTrades', 'NumTradesOpeninLast12M', 'PercentInstallTrades',\n", 57 | " 'NumInqLast6M', 'NumInqLast6Mexcl7days', 'NetFractionRevolvingBurden',\n", 58 | " 'NumRevolvingTradesWBalance', 'NumInstallTradesWBalance',\n", 59 | " 'NumBank2NatlTradesWHighUtilization', 'PercentTradesWBalance'],\n", 60 | " dtype='object')" 61 | ] 62 | }, 63 | "execution_count": 4, 64 | "metadata": {}, 65 | "output_type": "execute_result" 66 | } 67 | ], 68 | "source": [ 69 | "col = processed_df.columns\n", 70 | "col" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | 
"execution_count": 5, 76 | "metadata": {}, 77 | "outputs": [], 78 | "source": [ 79 | "#processed_df = processed_df.loc[processed_df['MSinceOldestTradeOpen']<300]\n", 80 | "#processed_df = processed_df.loc[processed_df['AverageMInFile']<200]\n", 81 | "#processed_df = processed_df.loc[processed_df['NumSatisfactoryTrades']<60]\n", 82 | "#processed_df = processed_df.loc[processed_df['NumTrades60Ever2DerogPubRec']<8]\n", 83 | "#processed_df = processed_df.loc[processed_df['NumTrades90Ever2DerogPubRec']<8]\n", 84 | "#processed_df = processed_df.loc[processed_df['NumTotalTrades']<60]\n", 85 | "#processed_df = processed_df.loc[processed_df['PercentInstallTrades']<60]\n", 86 | "#processed_df = processed_df.loc[processed_df['MSinceMostRecentInqexcl7days']<8]\n", 87 | "#processed_df = processed_df.loc[processed_df['NumInqLast6M']<10]\n", 88 | "#processed_df = processed_df.loc[processed_df['NetFractionRevolvingBurden']<100]\n", 89 | "#processed_df = processed_df.loc[processed_df['NumRevolvingTradesWBalance']<20]\n", 90 | "#processed_df = processed_df.loc[processed_df['NumBank2NatlTradesWHighUtilization']<8]" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": 6, 96 | "metadata": {}, 97 | "outputs": [ 98 | { 99 | "name": "stdout", 100 | "output_type": "stream", 101 | "text": [ 102 | "598\n", 103 | "827\n", 104 | "588\n", 105 | "588\n", 106 | "588\n" 107 | ] 108 | } 109 | ], 110 | "source": [ 111 | "print((processed_df.values[:,1] < 0).sum())\n", 112 | "print((processed_df.values[:,2] < 0).sum())\n", 113 | "print((processed_df.values[:,3] < 0).sum())\n", 114 | "print((processed_df.values[:,4] < 0).sum())\n", 115 | "print((processed_df.values[:,5] < 0).sum())" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": 7, 121 | "metadata": {}, 122 | "outputs": [ 123 | { 124 | "name": "stdout", 125 | "output_type": "stream", 126 | "text": [ 127 | "588\n", 128 | "588\n", 129 | "588\n", 130 | "588\n", 131 | "588\n" 132 | ] 133 | } 134 | ], 135 | "source": [ 136 | "print((processed_df.values[:,6] < 0).sum())\n", 137 | "print((processed_df.values[:,7] < 0).sum())\n", 138 | "print((processed_df.values[:,8] < 0).sum())\n", 139 | "print((processed_df.values[:,9] < 0).sum())\n", 140 | "print((processed_df.values[:,10] < 0).sum())" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": 8, 146 | "metadata": {}, 147 | "outputs": [ 148 | { 149 | "name": "stdout", 150 | "output_type": "stream", 151 | "text": [ 152 | "588\n", 153 | "588\n", 154 | "588\n", 155 | "588\n", 156 | "588\n" 157 | ] 158 | } 159 | ], 160 | "source": [ 161 | "print((processed_df.values[:,11] < 0).sum())\n", 162 | "print((processed_df.values[:,12] < 0).sum())\n", 163 | "print((processed_df.values[:,13] < 0).sum())\n", 164 | "print((processed_df.values[:,14] < 0).sum())\n", 165 | "print((processed_df.values[:,15] < 0).sum())" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": 9, 171 | "metadata": {}, 172 | "outputs": [ 173 | { 174 | "data": { 175 | "text/plain": [ 176 | "(array([5.880e+02, 0.000e+00, 0.000e+00, 9.296e+03, 3.940e+02, 1.400e+02,\n", 177 | " 2.700e+01, 9.000e+00, 3.000e+00, 2.000e+00]),\n", 178 | " array([-9. , -6.2, -3.4, -0.6, 2.2, 5. , 7.8, 10.6, 13.4, 16.2, 19. 
]),\n", 179 | " )" 180 | ] 181 | }, 182 | "execution_count": 9, 183 | "metadata": {}, 184 | "output_type": "execute_result" 185 | }, 186 | { 187 | "data": { 188 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYcAAAD8CAYAAACcjGjIAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvOIA7rQAAEsVJREFUeJzt3X+s3XV9x/Hne638VGmBC8G22jIbBXVK10CRxRDqyo8Zyx+QlJhRTZdGxzbUJQ62ZBWVTBcjSKK4RuqKcRSGbDRYx5oCWbZo4RaQXwV7BUavIL2upToNzup7f5zPpYd+Ttvbc+6931Pu85GcnO/38/18z/d97/m2r/v9fL/neyIzkSSp3e80XYAkqf8YDpKkiuEgSaoYDpKkiuEgSaoYDpKkiuEgSaoYDpKkiuEgSapMb7qAbp144ok5d+7cpsuQpMPGli1bfpqZA2Ppe9iGw9y5cxkcHGy6DEk6bETEf4+1r8NKkqSK4SBJqhgOkqSK4SBJqhgOkqTKQcMhItZExI6IeKyt7fiI2BgR28rzzNIeEXFDRAxFxCMRsaBtneWl/7aIWN7W/vsR8WhZ54aIiPH+ISVJh2YsRw7/CFywT9tVwKbMnA9sKvMAFwLzy2MlcCO0wgRYBZwFnAmsGg2U0mdl23r7bkuSNMkOGg6Z+R/Azn2alwJry/Ra4OK29puz5fvAjIg4BTgf2JiZOzNzF7ARuKAse2Nmfi9b31d6c9trSZIa0u05h5Mz8wWA8nxSaZ8FbG/rN1zaDtQ+3KFdktSg8f6EdKfzBdlFe+cXj1hJawiKN7/5zd3UB8C71r6rq/UeXf5o19uUpMNJt0cOL5YhIcrzjtI+DMxp6zcbeP4g7bM7tHeUmaszc2FmLhwYGNPtQSRJXeg2HNYDo1ccLQfubGu/vFy1tAjYXYad7gaWRMTMciJ6CXB3WfbziFhUrlK6vO21JEkNOeiwUkTcApwLnBgRw7SuOvo8cFtErACeAy4t3TcAFwFDwC+BjwBk5s6I+CzwQOn3mcwcPcn9MVpXRB0NfLc8JEkNOmg4ZOZl+1m0uEPfBK7Yz+usAdZ0aB8E3nmwOiRJk8dPSEuSKoaDJKliOEiSKoaDJKliOEiSKoaDJKliOEiSKoaDJKliOEiSKlM+HD684S1NlyBJfWfKh4MkqWY4SJIqhoMkqWI4SJIqhoMkqWI4SJIqhoMkqWI4SJIqhoMkqWI4SJIqhoMkqWI4SJIqhoMkqWI4SJIqhoMkqWI4SJIqhoMkqWI4SJIqhoMkqWI4SJIqhoMkqWI4SJIqhoMkqdJTOETEJyLi8Yh4LCJuiYijImJeRGyOiG0RcWtEHFH6Hlnmh8ryuW2vc3Vpfyoizu/tR5Ik9arrcIiIWcBfAAsz853ANGAZ8AXgusycD+wCVpRVVgC7MvOtwHWlHxFxelnvHcAFwFcjYlq3dUmSetfrsNJ04OiImA4cA7wAnAfcXpavBS4u00vLPGX54oiI0r4uM3+Vmc8AQ8CZPdYlSepB1+GQmT8Gvgg8RysUdgNbgJcyc0/pNgzMKtOzgO1l3T2l/wnt7R3WeZWIWBkRgxExODIy0m3pkqSD6GVYaSatv/rnAW8CjgUu7NA1R1fZz7L9tdeNmaszc2FmLhwYGDj0oiVJY9LLsNL7gWcycyQzfw3cAbwXmFGGmQBmA8+X6WFgDkBZfhyws729wzqSpAb0Eg7PAYsi4phy7mAx8ARwL3BJ6bMcuLNMry/zlOX3ZGaW9mXlaqZ5wHzg/h7qkiT1aPrBu3SWmZsj4nbgQWAP8BCwGvgOsC4iPlfabiqr3AR8MyKGaB0xLCuv83hE3EYrWPYAV2Tmb7qtS5LUu67DASAzVwGr9ml+mg5XG2Xmy8Cl+3mda4Fre6lFkjR+/IS0JKliOEiSKoaDJKliOEiSKoaDJKliOEiSKoaDJKliOEiSKoaDJKliOEiSKoaDJKliOEiSKoaDJKliOEiSKoaDJKliOEiSKoaDJKliOEiSKoaDJKliOEiSKoaDJKliOEiSKoaDJKliOEiSKoaDJKliOEiSKoaDJKliOEiSKoaDJKliOEiSKoaDJKliOEiSKj2FQ0TMiIjbI+LJiNgaEWdHxPERsTEitpXnmaVvRMQNETEUEY9ExIK211le+m+LiOW9/lCSpN70euTwZeDfMvPtwLuBrcBVwKbMnA9sKvMAFwLzy2MlcCNARBwPrALOAs4EVo0GiiSpGV2HQ0S8EXgfcBNAZv5fZr4ELAXWlm5rgYvL9FLg5mz5PjAjIk4Bzgc2ZubOzNwFbAQu6LYuSVLvejlyOBUYAb4REQ9FxNcj4ljg5Mx8AaA8n1T6zwK2t60/XNr21y5Jakgv4TAdWADcmJlnAL9g7xBSJ9GhLQ/QXr9AxMqIGIyIwZGRkUOtV5I0Rr2EwzAwnJmby/zttMLixTJcRHne0dZ/Ttv6s4HnD9BeyczVmbkwMxcODAz0ULok6UC6DofM/AmwPSLeVpoWA08A64HRK46WA3eW6fXA5eWqpUXA7jLsdDewJCJmlhPRS0qbJKkh03tc/8+Bb0XEEcDTwEdoBc5tEbECeA64tPTdAFwEDAG/LH3JzJ0R8VnggdLvM5m5s8e6JEk96CkcMvNhYGGHRYs79E3giv28zhpgTS+1SJLGj5+QliRVDAdJUsVwkCRVDAdJUsVwkCRVDAdJUsVwkCRVDAdJUsVwkCRVDAdJUsVwkCRVDAdJUsVwkCRVDAdJUsVwkCRVDAdJUsVwkCRVDAdJUsVwkCRVDAdJUsVwkCRVDAdJUsVwkCRVDAdJUsVwkCRVDAdJUsVwkCRVDAdJUsVwkCRVDAdJUsVwkCRVpjddwFSx9e2ndb3uaU9uHcdKJOngej5yiIhpEfFQRNxV5udFxOaI2BYRt0bEEaX9yDI/VJbPbXuNq0v7UxFxfq81SZJ6Mx7DSlcC7X/afgG4LjPnA7uAFaV9BbArM98KXFf6ERGnA8uAdwAXAF+NiGnjUJckqUs9hUNEzAb+CPh6mQ/gPOD20mUtcHGZXlrmKcsXl/5LgXWZ+avMfAYYAs7spS5JUm96PXK4HvgU8NsyfwLwUmbuKfPDwKwyPQvYDlCW7y79X2nvsI4kqQFdh0NEfADYkZlb2ps7dM2DLDvQOvtuc2VEDEbE4MjIyCHVK0kau16OHM4BPhgRzwLraA0nXQ/MiIjRq6BmA8+X6WFgDkBZfhyws729wzqvkpmrM3NhZi4cGBjooXRJ0oF0HQ6ZeXVmzs7MubROKN+TmR8C7gUuKd2WA3eW6fVlnrL8nszM0r6sXM00D5gP3N9tXZKk3k3E5xz+ClgXEZ8DHgJuKu03Ad+MiCFaRwzLADLz8Y
i4DXgC2ANckZm/mYC6JEljNC7hkJn3AfeV6afpcLVRZr4MXLqf9a8Frh2PWiRJvfP2GZKkiuEgSaoYDpKkiuEgSaoYDpKkiuEgSar4fQ6HoJfvZJCkw4nhMFV8+riGtru7me1K6onDSpKkiuEgSaoYDpKkiuEgSaoYDpKkiuEgSaoYDpKkiuEgSar4Ibg2H/3el/e77GtnXzmJlUhSszxykCRVDAdJUsVwkCRVDAdJUsVwkCRVDAdJUsVwkCRVDAdJUsVwkCRV/IT04aCpr/iUNGV55CBJqhgOkqSK4SBJqhgOkqSK4SBJqhgOkqRK1+EQEXMi4t6I2BoRj0fElaX9+IjYGBHbyvPM0h4RcUNEDEXEIxGxoO21lpf+2yJiee8/liSpF70cOewB/jIzTwMWAVdExOnAVcCmzJwPbCrzABcC88tjJXAjtMIEWAWcBZwJrBoNFElSM7oOh8x8ITMfLNM/B7YCs4ClwNrSbS1wcZleCtycLd8HZkTEKcD5wMbM3JmZu4CNwAXd1iVJ6t24nHOIiLnAGcBm4OTMfAFaAQKcVLrNAra3rTZc2vbX3mk7KyNiMCIGR0ZGxqN0SVIHPYdDRLwe+Dbw8cz82YG6dmjLA7TXjZmrM3NhZi4cGBg49GIlSWPSUzhExOtoBcO3MvOO0vxiGS6iPO8o7cPAnLbVZwPPH6BdktSQXq5WCuAmYGtmfqlt0Xpg9Iqj5cCdbe2Xl6uWFgG7y7DT3cCSiJhZTkQvKW2SpIb0clfWc4A/Bh6NiIdL218Dnwdui4gVwHPApWXZBuAiYAj4JfARgMzcGRGfBR4o/T6TmTt7qEuS1KOuwyEz/5PO5wsAFnfon8AV+3mtNcCabmuRJI0vPyEtSar4ZT+aWE1+UdGndze3bekw55GDJKnikcMYffR7X+aec1vT593X8dSJJL1meOQgSaoYDpKkiuEgSaoYDpKkiuEgSaoYDpKkipeyHga2rntT1+uetswb3Eo6dB45SJIqU/LI4ba/2/PK9IZ3tz7gJknayyMHSVLFcJAkVQwHSVLFcJAkVQwHSVJlSl6tpCmiqS8a8kuG9BpgOHThnnO/8qp5v99B0muNw0qSpIrhIEmqGA6SpIrhIEmqTMkT0hve/bscNfOTrZldX2q2GEnqQx45SJIqhoMkqWI4SJIqhgPwsucdJOlVpuQJaWlCedsOvQYYDuOg/XYa/XYrjW6/f9rvnpamNoeVJEmVvgmHiLggIp6KiKGIuKrpeiRpKuuLcIiIacBXgAuB04HLIuL0yazBk9KStFe/nHM4ExjKzKcBImIdsBR4otGqutDP5x8ORbfnKsDzFY1p6kQ4eDL8NahfwmEWsL1tfhg4a7KLeHnXl/beVmMcvFaCQtLU0y/hEB3asuoUsRJYWWb/NyKe6n6T3xmdOBH4aYf2pu1TV18YW03XTHwh+zh8f1eTb2LquqbTP+Exm1q/q970WtNbxtqxX8JhGJjTNj8bqMYmMnM1sHo8NxwRg5m5cDxfczz0Y139WBP0Z139WBP0Z139WBP0Z12TWVNfnJAGHgDmR8S8iDgCWAasb7gmSZqy+uLIITP3RMSfAXcD04A1mfl4w2VJ0pTVF+EAkJkbgA0NbHpch6nGUT/W1Y81QX/W1Y81QX/W1Y81QX/WNWk1RWZ13leSNMX1yzkHSVIfmdLh0NQtOyJiTUTsiIjH2tqOj4iNEbGtPM8s7RERN5QaH4mIBRNU05yIuDcitkbE4xFxZZ/UdVRE3B8RPyh1XVPa50XE5lLXreVCBiLiyDI/VJbPnYi6yramRcRDEXFXH9X0bEQ8GhEPR8RgaWv6PZwREbdHxJNl/zq7D2p6W/kdjT5+FhEf74O6PlH288ci4pay/zezX2XmlHzQOvH9I+BU4AjgB8Dpk7Tt9wELgMfa2v4euKpMXwV8oUxfBHyX1mdBFgGbJ6imU4AFZfoNwA9p3cqk6boCeH2Zfh2wuWzvNmBZaf8a8LEy/afA18r0MuDWCXwfPwn8E3BXme+Hmp4FTtynren3cC3wJ2X6CGBG0zXtU9804Ce0PgPQWF20Pgz8DHB02/704ab2qwn9pffzAzgbuLtt/mrg6knc/lxeHQ5PAaeU6VOAp8r0PwCXdeo3wfXdCfxhP9UFHAM8SOvT8z8Fpu/7XtK64u3sMj299IsJqGU2sAk4D7ir/KfRaE3l9Z+lDofG3kPgjeU/vOiXmjrUuAT4r6brYu+dIo4v+8ldwPlN7VdTeVip0y07ZjVUC8DJmfkCQHk+qbRPep3l8PQMWn+lN15XGb55GNgBbKR1xPdSZu7psO1X6irLdwMnTEBZ1wOfAn5b5k/og5qgdWeBf4+ILdG6owA0+x6eCowA3yhDcF+PiGMbrmlfy4BbynRjdWXmj4EvAs8BL9DaT7bQ0H41lcNhTLfs6AOTWmdEvB74NvDxzPzZgbp2aJuQujLzN5n5Hlp/rZ8JnHaAbU94XRHxAWBHZm5pb26ypjbnZOYCWnc4viIi3neAvpNR13RaQ6g3ZuYZwC9oDdc0WdPejbXG7z8I/PPBunZoG+/9aiatG47OA94EHEvrfdzfdie0pqkcDmO6ZcckejEiTgEozztK+6TVGRGvoxUM38rMO/qlrlGZ+RJwH60x3xkRMfo5nfZtv1JXWX4csHOcSzkH+GBEPAusozW0dH3DNQGQmc+X5x3Av9AK0ybfw2FgODM3l/nbaYVFv+xXFwIPZuaLZb7Jut4PPJOZI5n5a+AO4L00tF9N5XDot1t2rAeWl+nltMb8R9svL1dLLAJ2jx72jqeICOAmYGtmtn+5RdN1DUTEjDJ9NK1/QFuBe4FL9lPXaL2XAPdkGZQdL5l5dWbOzsy5tPabezLzQ03WBBARx0bEG0anaY2lP0aD72Fm/gTYHhFvK02Lad2Kv9H9qs1l7B1SGt1+U3U9ByyKiGPKv8fR31Uz+9VEnujp9wetKxB+SGsM+28mcbu30BpT/DWt9F9Ba6xwE7CtPB9f+gatL0L6EfAosHCCavoDWoekjwAPl8dFfVDX7wEPlboeA/62tJ8K3A8M0RoSOLK0H1Xmh8ryUyf4vTyXvVcrNVpT2f4PyuPx0X26D97D9wCD5T38V2Bm0zWVbR0D/A9wXFtb07+ra4Any77+TeDIpvYrPyEtSapM5WElSdJ+GA6SpIrhIEmqGA6SpIrhIEmqGA6SpIrhIEmqGA6SpMr/A8xkpSYy/c9MAAAAAElFTkSuQmCC\n", 189 | "text/plain": [ 190 | "
" 191 | ] 192 | }, 193 | "metadata": { 194 | "needs_background": "light" 195 | }, 196 | "output_type": "display_data" 197 | } 198 | ], 199 | "source": [ 200 | "plt.hist(processed_df.values[:,1])\n", 201 | "plt.hist(processed_df.values[:,2])\n", 202 | "plt.hist(processed_df.values[:,3])\n", 203 | "plt.hist(processed_df.values[:,4])\n", 204 | "plt.hist(processed_df.values[:,5])\n", 205 | "plt.hist(processed_df.values[:,6])" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": 10, 211 | "metadata": {}, 212 | "outputs": [ 213 | { 214 | "data": { 215 | "text/plain": [ 216 | "(array([5.880e+02, 0.000e+00, 0.000e+00, 7.005e+03, 2.046e+03, 7.020e+02,\n", 217 | " 9.900e+01, 1.400e+01, 3.000e+00, 2.000e+00]),\n", 218 | " array([-9. , -6.2, -3.4, -0.6, 2.2, 5. , 7.8, 10.6, 13.4, 16.2, 19. ]),\n", 219 | " )" 220 | ] 221 | }, 222 | "execution_count": 10, 223 | "metadata": {}, 224 | "output_type": "execute_result" 225 | }, 226 | { 227 | "data": { 228 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYcAAAD8CAYAAACcjGjIAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvOIA7rQAAEYpJREFUeJzt3X+s3XV9x/Hna1TEH9EWqU5bXDE2KrpsshusuhgCGwIayxLJWNxoCEvj0vl7Udw/9RdREyNKhhgiaDEGJOhGM9lIw4+4JRMpYkSspg1scKVCTQGdxh+d7/1xPlcP/dzblnsOPffQ5yNpzvm+z+d7zvubb9vX/Xy/3/O9qSokSRr2e5NuQJK09BgOkqSO4SBJ6hgOkqSO4SBJ6hgOkqSO4SBJ6hw0HJJcmeShJN8dqh2bZFuSne1xRasnySVJdiX5TpKThtbZ0MbvTLJhqP4nSe5q61ySJOPeSEnS43MoM4cvAGfsV7sQuKmq1gI3tWWAM4G17c9G4DIYhAmwGXgVcDKweS5Q2piNQ+vt/1mSpMNs2cEGVNXXk6zZr7weOKU93wLcCryv1a+qwdeuv5FkeZLnt7HbqmovQJJtwBlJbgWeVVX/1epXAWcD/3awvo477rhas2b/tiRJC7njjjt+XFUrD2XsQcNhAc+rqt0AVbU7yXNbfRVw/9C42VY7UH12nvq8kmxkMMvghS98Idu3b19k+5J05EnyP4c6dtwnpOc7X1CLqM+rqi6vqpmqmlm58pDCT5K0CIsNhwfb4SLa40OtPgscPzRuNfDAQeqr56lLkiZoseGwFZi74mgDcP1Q/bx21dI64NF2+OlG4PQkK9qJ6NOBG9trP02yrl2ldN7Qe0mSJuSg5xySXM3ghPJxSWYZXHX0MeDaJBcA9wHntOE3AGcBu4CfA+cDVNXeJB8Gbm/jPjR3chr4OwZXRD2NwYnog56MliQ9sTKtv89hZmamPCEtSYcuyR1VNXMoY/2GtCSpYzhIkjqGgySpYzhIkjqL/Ya05rHmwq8d8PX//tgbDlMnkjQaZw6SpI7hIEnqGA6SpI7hIEnqGA6SpI7hIEnqGA6SpI7hIEnqGA6SpI7hIEnqGA6SpI7hIEnqGA6SpI7hIEnqGA6SpI7hIEnqGA6SpI7hIEnqGA6SpI7hIEnqGA6SpI7hIEnqGA6SpI7hIEnqGA6SpI7hIEnqGA6SpI7hIEnqGA6SpM5I4ZDkXUnuTvLdJFcnOSbJCUluS7IzyZeTHN3GPrUt72qvrxl6n/e3+g+SvH60TZIkjWrR4ZBkFfB2YKaqXgEcBZwLfBy4uKrWAg8DF7RVLgAerqoXAxe3cSQ5sa33cuAM4DNJjlpsX5Kk0Y16WGkZ8LQky4CnA7uBU4Hr2utbgLPb8/Vtmfb6aUnS6tdU1S+r6l5gF3DyiH1Jkkaw6HCoqh8CnwDuYxAKjwJ3AI9U1b42bBZY1Z6vAu5v6+5r458zXJ9nHUnSBIxyWGkFg5/6TwBeADwDOHOeoTW3ygKvLVSf7zM3JtmeZPuePXsef9OSpEMyymGlPwPurao9VfVr4KvAa4Dl7TATwGrggfZ8FjgeoL3+bGDvcH2edR6jqi6vqpmqmlm5cuUIrUuSDmSUcLgPWJfk6e3cwWnA94BbgDe3MRuA69vzrW2Z9vrNVVWtfm67mukEYC3wzRH6kiSNaNnBh8yvqm5Lch3wLWAfcCdwOfA14JokH2m1K9oqVwBfTLKLwYzh3PY+dye5lkGw7AM2VdX/LbYvSdLoFh0OAFW1Gdi8X/ke5rnaqKp+AZyzwPtcBFw0Si+SpPHxG9KSpI7hIEnqGA6SpI7hIEnqGA6SpI7hIEnqGA6SpI7hIEnqGA6SpM5I35DW+P3hlj98zPJdG+6aUCfSEeoDz57AZz56+D/zIJw5SJI6hoMkqWM4SJI6hoMkqWM4SJI6hoMkqeOlrGP0tnsvO8iINxyWPiRpVM4cJEkdw0GS1DEcJEkdw0GS1DEcJEkdw0GS1DEcJEkdw0GS1DEcJEkdw0GS1DEcJEkdw0GS1DEcJEkdw0GS1DEcJEkdw0GS1DEcJEmdkcIhyfIk1yX5fpIdSV6d5Ngk25LsbI8r2tgkuSTJriTfSXLS0PtsaON3Jtkw6kZJkkYz6szh08C/V9VLgT8CdgAXAjdV1VrgprYMcCawtv3ZCFwGkORYYDPwKuBkYPNcoEiSJmPR4ZDkWcDrgCsAqupXVfUIsB7Y0oZtAc5uz9cDV9XAN4DlSZ4PvB7YVlV7q+phYBtwxmL7kiSNbpSZw4uAPcDnk9yZ5HNJngE8r6p2A7TH57bxq4D7h9afbbWF6pKkCRklHJYBJwGXVdUrgZ/xu0NI88k8tTpAvX+DZGOS7Um279mz5/H2K0k6RKOEwywwW1W3teXrGITFg+1wEe3xoaHxxw+tvxp44AD1TlVdXlUzVTWzcuXKEVqXJB3IosOhqn4E3J/kJa10GvA9YCswd8XRBuD69nwrcF67amkd8Gg77HQjcHqSFe1E9OmtJkmakGUjrv824EtJjgbuAc5n
EDjXJrkAuA84p429ATgL2AX8vI2lqvYm+TBwexv3oaraO2JfkqQRjBQOVfVtYGael06bZ2wBmxZ4nyuBK0fpRZI0Pn5DWpLUMRwkSR3DQZLUMRwkSR3DQZLUMRwkSR3DQZLUMRwkSR3DQZLUMRwkSR3DQZLUMRwkSR3DQZLUMRwkSR3DQZLUMRwkSR3DQZLUMRwkSR3DQZLUMRwkSR3DQZLUMRwkSR3DQZLUMRwkSZ1lk25A87v2o/sA2PHRl/229rLv75hUO5KOMM4cJEkdw0GS1DEcJEkdw0GS1DEcJEkdw0GS1DEcJEkdw0GS1DEcJEkdw0GS1Bk5HJIcleTOJP/alk9IcluSnUm+nOToVn9qW97VXl8z9B7vb/UfJHn9qD1JkkYzjpnDO4Dhm/58HLi4qtYCDwMXtPoFwMNV9WLg4jaOJCcC5wIvB84APpPkqDH0JUlapJHCIclq4A3A59pygFOB69qQLcDZ7fn6tkx7/bQ2fj1wTVX9sqruBXYBJ4/SlyRpNKPOHD4FvBf4TVt+DvBIVe1ry7PAqvZ8FXA/QHv90Tb+t/V51nmMJBuTbE+yfc+ePSO2LklayKLDIckbgYeq6o7h8jxD6yCvHWidxxarLq+qmaqaWbly5ePqV5J06Eb5fQ6vBd6U5CzgGOBZDGYSy5Msa7OD1cADbfwscDwwm2QZ8Gxg71B9zvA6kqQJWPTMoareX1Wrq2oNgxPKN1fVW4BbgDe3YRuA69vzrW2Z9vrNVVWtfm67mukEYC3wzcX2JUka3RPxm+DeB1yT5CPAncAVrX4F8MUkuxjMGM4FqKq7k1wLfA/YB2yqqv97Avpasna89He/7e3aCfYhSXPGEg5VdStwa3t+D/NcbVRVvwDOWWD9i4CLxtGLJGl0fkNaktQxHCRJnSfinIOWoEvfevNEPnfTZ0+dyOdKGo0zB0lSx3CQJHUMB0lSx3CQJHUMB0lSx3CQJHUMB0lSx3CQJHUMB0lSx3CQJHUMB0lSx3CQJHUMB0lSx3CQJHUMB0lSx3CQJHUMB0lSx3CQJHUMB0lSx3CQJHUMB0lSx3CQJHUMB0lSx3CQJHUMB0lSZ9mkG9CT26VvvXlin73ps6dO7LOlaefMQZLUMRwkSR3DQZLUMRwkSR1PSB9GC56cPeXSQ1r/5gme3JV0ZFn0zCHJ8UluSbIjyd1J3tHqxybZlmRne1zR6klySZJdSb6T5KSh99rQxu9MsmH0zZIkjWKUw0r7gPdU1cuAdcCmJCcCFwI3VdVa4Ka2DHAmsLb92QhcBoMwATYDrwJOBjbPBYokaTIWfVipqnYDu9vznybZAawC1gOntGFbgFuB97X6VVVVwDeSLE/y/DZ2W1XtBUiyDTgDuHqxvS1Vv3j4kwd8/ZgV7z5MnUjSgY3lhHSSNcArgduA57XgmAuQ57Zhq4D7h1abbbWF6vN9zsYk25Ns37NnzzhalyTNY+RwSPJM4CvAO6vqJwcaOk+tDlDvi1WXV9VMVc2sXLny8TcrSTokI4VDkqcwCIYvVdVXW/nBdriI9vhQq88Cxw+tvhp44AB1SdKEjHK1UoArgB1VNXwwfSswd8XRBuD6ofp57aqldcCj7bDTjcDpSVa0E9Gnt5okaUJG+Z7Da4G/Ae5K8u1W+0fgY8C1SS4A7gPOaa/dAJwF7AJ+DpwPUFV7k3wYuL2N+9DcyWlJ0mSMcrXSfzL/+QKA0+YZX8CmBd7rSuDKxfYiSRovb58hSeoYDpKkjuEgSeoYDpKkjuEgSeoYDpKkjr/PQdLS9YFnT7qDI5YzB0lSx3CQJHUMB0lSx3CQJHUMB0lSx3CQJHUMB0lSx3CQJHUMB0lSx3CQJHUMB0lSx3sr6Unr0rfePJHP3fTZUyfyudI4OXOQJHWOyJnDpH6ilKRp4cxBktQxHCRJnSPysNJS9YuHP7nga8esePdh7ETSkc6ZgySpc0TOHPwJXXqc/HWdRxxnDpKkjuEgSeoYDpKkjuEgSeoYDpKkjuEgSeockZeySk+kid0N9vf/YiKfqycnZw6SpM6SCYckZyT5QZJdSS6cdD+SdCRbEoeVkhwFXAr8OTAL3J5ka1V9b7KdSdPj0h/988Q+20NaTz5LIhyAk4FdVXUPQJJrgPWA4dAc6JYf4G0/JI3XUgmHVcD9Q8uzwKsm1MtU8n5RmqRJzVqcsTxxlko4ZJ5adYOSjcDGtvi/SX4w/la+No43OQ748TjeaDzGsk2w5LZrbNyu6fLb7fr7CTcyNh/M4dpXf3CoA5dKOMwCxw8trwYe2H9QVV0OXH64mlqsJNurambSfYyb2zVd3K7psRS3aalcrXQ7sDbJCUmOBs4Ftk64J0k6Yi2JmUNV7Uvy98CNwFHAlVV194TbkqQj1pIIB4CqugG4YdJ9jMmSP/S1SG7XdHG7pseS26ZUded9JUlHuKVyzkGStIQYDmP2ZLgNSJLjk9ySZEeSu5O8o9WPTbItyc72uGLSvS5GkqOS3JnkX9vyCUlua9v15XZRxFRJsjzJdUm+3/bbq58M+yvJu9rfwe8muTrJMdO4v5JcmeShJN8dqs27fzJwSfs/5DtJTppEz4bDGA3dBuRM4ETgr5KcONmuFmUf8J6qehmwDtjUtuNC4KaqWgvc1Jan0TuAHUPLHwcubtv1MHDBRLoazaeBf6+qlwJ/xGD7pnp/JVkFvB2YqapXMLhY5Vymc399AThjv9pC++dMYG37sxG47DD1+BiGw3j99jYgVfUrYO42IFOlqnZX1bfa858y+I9mFYNt2dKGbQHOnkyHi5dkNfAG4HNtOcCpwHVtyNRtV5JnAa8DrgCoql9V1SM8CfYXg4tmnpZkGfB0YDdTuL+q6uvA3v3KC+2f9cBVNfANYHmS5x+eTn/HcBiv+W4DsmpCvYxFkjXAK4HbgOdV1W4YBAjw3Ml1tmifAt4L/KYtPwd4pKr2teVp3GcvAvYAn2+Hyz6X5BlM+f6qqh8CnwDuYxAKjwJ3MP37a85C+2dJ/D9iOIzXId0GZFokeSbwFeCdVfWTSfczqiRvBB6qqjuGy/MMnbZ9tgw4Cbisql4J/IwpO4Q0n3YMfj1wAvAC4BkMDrnsb9r218Esib+ThsN4HdJtQKZBkqcwCIYvVdVXW/nBuelte3xoUv0t0muBNyX5bwaH/E5lMJNY3g5bwHTus1lgtqpua8vXMQiLad9ffwbcW1V7qurXwFeB1zD9+2vOQvtnSfw/YjiM15PiNiDtOPwVwI6qGr7d61ZgQ3u+Abj+cPc2iqp6f1Wtrqo1DPbNzVX1FuAW4M1t2DRu14+A+5O8pJVOY3C7+6neXwwOJ61L8vT2d3Juu6Z6fw1ZaP9sBc5rVy2tAx6dO/x0OPkluDFLchaDn0bnbgNy0YRbetyS/CnwH8Bd/O7Y/D8yOO9wLfBCBv9wz6mq/U+yTYUkpwD/UFVvTPIiBjOJY4E7gb+uql9Osr/HK8kfMzjJfjRwD3A+gx/+pnp/Jfkg8JcMrqC7E/hbBsffp2p/JbkaOIXBHWUfBDYD/8I8+6cF4T8xuLrp58D5VbX9sPd
sOEiS9udhJUlSx3CQJHUMB0lSx3CQJHUMB0lSx3CQJHUMB0lSx3CQJHX+H6Fw0PqKkMWvAAAAAElFTkSuQmCC\n", 229 | "text/plain": [ 230 | "
" 231 | ] 232 | }, 233 | "metadata": { 234 | "needs_background": "light" 235 | }, 236 | "output_type": "display_data" 237 | } 238 | ], 239 | "source": [ 240 | "plt.hist(processed_df.values[:,7])\n", 241 | "plt.hist(processed_df.values[:,8])\n", 242 | "plt.hist(processed_df.values[:,9])\n", 243 | "plt.hist(processed_df.values[:,10])\n", 244 | "plt.hist(processed_df.values[:,11])\n", 245 | "plt.hist(processed_df.values[:,12])" 246 | ] 247 | }, 248 | { 249 | "cell_type": "code", 250 | "execution_count": 11, 251 | "metadata": {}, 252 | "outputs": [], 253 | "source": [ 254 | "df = processed_df.values\n", 255 | "n, p = np.shape(df)\n", 256 | "\n", 257 | "# keep only pos values\n", 258 | "for i in range (p):\n", 259 | " df = df[df[:,i] >= 0] " 260 | ] 261 | }, 262 | { 263 | "cell_type": "code", 264 | "execution_count": 12, 265 | "metadata": {}, 266 | "outputs": [], 267 | "source": [ 268 | "# get label\n", 269 | "y = df[:,0]\n", 270 | "# get df\n", 271 | "df_ = df[:,1::]" 272 | ] 273 | }, 274 | { 275 | "cell_type": "code", 276 | "execution_count": 13, 277 | "metadata": {}, 278 | "outputs": [], 279 | "source": [ 280 | "col[1::]\n", 281 | "free = pd.DataFrame(df_, dtype='float')\n", 282 | "free.columns = [col[1::]]\n", 283 | "free = free + 1\n", 284 | "\n", 285 | "labels = pd.DataFrame(y, dtype='float')\n", 286 | "labels.columns = [col[0]]" 287 | ] 288 | }, 289 | { 290 | "cell_type": "code", 291 | "execution_count": 14, 292 | "metadata": {}, 293 | "outputs": [], 294 | "source": [ 295 | "conditionals = free[['ExternalRiskEstimate', 'MSinceOldestTradeOpen', 'AverageMInFile']]" 296 | ] 297 | }, 298 | { 299 | "cell_type": "code", 300 | "execution_count": 15, 301 | "metadata": {}, 302 | "outputs": [ 303 | { 304 | "data": { 305 | "text/html": [ 306 | "
\n", 307 | "\n", 320 | "\n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | "
ExternalRiskEstimateMSinceOldestTradeOpenAverageMInFile
056.0145.085.0
168.067.025.0
267.0170.074.0
382.0334.0133.0
460.0138.079.0
\n", 362 | "
" 363 | ], 364 | "text/plain": [ 365 | " ExternalRiskEstimate MSinceOldestTradeOpen AverageMInFile\n", 366 | "0 56.0 145.0 85.0\n", 367 | "1 68.0 67.0 25.0\n", 368 | "2 67.0 170.0 74.0\n", 369 | "3 82.0 334.0 133.0\n", 370 | "4 60.0 138.0 79.0" 371 | ] 372 | }, 373 | "execution_count": 15, 374 | "metadata": {}, 375 | "output_type": "execute_result" 376 | } 377 | ], 378 | "source": [ 379 | "conditionals.head()" 380 | ] 381 | }, 382 | { 383 | "cell_type": "code", 384 | "execution_count": 16, 385 | "metadata": {}, 386 | "outputs": [ 387 | { 388 | "data": { 389 | "text/html": [ 390 | "
\n", 391 | "\n", 404 | "\n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | "
ExternalRiskEstimateMSinceOldestTradeOpenMSinceMostRecentTradeOpenAverageMInFileNumSatisfactoryTradesNumTrades60Ever2DerogPubRecNumTrades90Ever2DerogPubRecPercentTradesNeverDelqMaxDelq2PublicRecLast12MMaxDelqEverNumTotalTradesNumTradesOpeninLast12MPercentInstallTradesNumInqLast6MNumInqLast6Mexcl7daysNetFractionRevolvingBurdenNumRevolvingTradesWBalanceNumInstallTradesWBalanceNumBank2NatlTradesWHighUtilizationPercentTradesWBalance
count8291.0000008291.0000008291.0000008291.0000008291.0000008291.0000008291.0000008291.0000008291.0000008291.0000008291.000008291.0000008291.0000008291.0000008291.0000008291.0000008291.0000008291.0000008291.0000008291.000000
mean72.783018202.6205529.28404378.44602623.5808711.5560251.35761793.8310226.7415277.38318725.293212.98757735.6733812.4734052.41587336.9232905.3274643.5133282.12507567.845495
std9.66510894.4787828.66124330.96353911.0593561.2185110.96141910.7081371.6282001.80989212.595461.86355215.5534472.1200662.07983228.4469313.0384341.6569941.54706220.508632
min37.0000003.0000001.0000005.0000002.0000001.0000001.00000021.0000001.0000003.0000002.000001.0000003.0000001.0000001.0000001.0000001.0000002.0000001.0000008.000000
25%65.000000139.0000004.00000059.00000016.0000001.0000001.00000091.0000006.0000007.00000016.000002.00000024.0000001.0000001.00000011.5000003.0000002.0000001.00000051.000000
50%73.000000188.0000007.00000076.00000022.0000001.0000001.00000098.0000007.0000007.00000023.000003.00000034.0000002.0000002.00000032.0000005.0000003.0000002.00000068.000000
75%81.000000257.50000012.00000096.00000030.0000002.0000001.000000101.0000008.0000009.00000032.000004.00000046.0000003.0000003.00000058.0000007.0000004.0000003.00000084.000000
max95.000000804.000000107.000000245.00000080.00000020.00000020.000000101.00000010.0000009.000000105.0000020.00000094.00000067.00000067.000000233.00000033.00000024.00000019.000000101.000000
\n", 617 | "
" 618 | ], 619 | "text/plain": [ 620 | " ExternalRiskEstimate MSinceOldestTradeOpen MSinceMostRecentTradeOpen \\\n", 621 | "count 8291.000000 8291.000000 8291.000000 \n", 622 | "mean 72.783018 202.620552 9.284043 \n", 623 | "std 9.665108 94.478782 8.661243 \n", 624 | "min 37.000000 3.000000 1.000000 \n", 625 | "25% 65.000000 139.000000 4.000000 \n", 626 | "50% 73.000000 188.000000 7.000000 \n", 627 | "75% 81.000000 257.500000 12.000000 \n", 628 | "max 95.000000 804.000000 107.000000 \n", 629 | "\n", 630 | " AverageMInFile NumSatisfactoryTrades NumTrades60Ever2DerogPubRec \\\n", 631 | "count 8291.000000 8291.000000 8291.000000 \n", 632 | "mean 78.446026 23.580871 1.556025 \n", 633 | "std 30.963539 11.059356 1.218511 \n", 634 | "min 5.000000 2.000000 1.000000 \n", 635 | "25% 59.000000 16.000000 1.000000 \n", 636 | "50% 76.000000 22.000000 1.000000 \n", 637 | "75% 96.000000 30.000000 2.000000 \n", 638 | "max 245.000000 80.000000 20.000000 \n", 639 | "\n", 640 | " NumTrades90Ever2DerogPubRec PercentTradesNeverDelq \\\n", 641 | "count 8291.000000 8291.000000 \n", 642 | "mean 1.357617 93.831022 \n", 643 | "std 0.961419 10.708137 \n", 644 | "min 1.000000 21.000000 \n", 645 | "25% 1.000000 91.000000 \n", 646 | "50% 1.000000 98.000000 \n", 647 | "75% 1.000000 101.000000 \n", 648 | "max 20.000000 101.000000 \n", 649 | "\n", 650 | " MaxDelq2PublicRecLast12M MaxDelqEver NumTotalTrades \\\n", 651 | "count 8291.000000 8291.000000 8291.00000 \n", 652 | "mean 6.741527 7.383187 25.29321 \n", 653 | "std 1.628200 1.809892 12.59546 \n", 654 | "min 1.000000 3.000000 2.00000 \n", 655 | "25% 6.000000 7.000000 16.00000 \n", 656 | "50% 7.000000 7.000000 23.00000 \n", 657 | "75% 8.000000 9.000000 32.00000 \n", 658 | "max 10.000000 9.000000 105.00000 \n", 659 | "\n", 660 | " NumTradesOpeninLast12M PercentInstallTrades NumInqLast6M \\\n", 661 | "count 8291.000000 8291.000000 8291.000000 \n", 662 | "mean 2.987577 35.673381 2.473405 \n", 663 | "std 1.863552 15.553447 2.120066 \n", 664 | "min 1.000000 3.000000 1.000000 \n", 665 | "25% 2.000000 24.000000 1.000000 \n", 666 | "50% 3.000000 34.000000 2.000000 \n", 667 | "75% 4.000000 46.000000 3.000000 \n", 668 | "max 20.000000 94.000000 67.000000 \n", 669 | "\n", 670 | " NumInqLast6Mexcl7days NetFractionRevolvingBurden \\\n", 671 | "count 8291.000000 8291.000000 \n", 672 | "mean 2.415873 36.923290 \n", 673 | "std 2.079832 28.446931 \n", 674 | "min 1.000000 1.000000 \n", 675 | "25% 1.000000 11.500000 \n", 676 | "50% 2.000000 32.000000 \n", 677 | "75% 3.000000 58.000000 \n", 678 | "max 67.000000 233.000000 \n", 679 | "\n", 680 | " NumRevolvingTradesWBalance NumInstallTradesWBalance \\\n", 681 | "count 8291.000000 8291.000000 \n", 682 | "mean 5.327464 3.513328 \n", 683 | "std 3.038434 1.656994 \n", 684 | "min 1.000000 2.000000 \n", 685 | "25% 3.000000 2.000000 \n", 686 | "50% 5.000000 3.000000 \n", 687 | "75% 7.000000 4.000000 \n", 688 | "max 33.000000 24.000000 \n", 689 | "\n", 690 | " NumBank2NatlTradesWHighUtilization PercentTradesWBalance \n", 691 | "count 8291.000000 8291.000000 \n", 692 | "mean 2.125075 67.845495 \n", 693 | "std 1.547062 20.508632 \n", 694 | "min 1.000000 8.000000 \n", 695 | "25% 1.000000 51.000000 \n", 696 | "50% 2.000000 68.000000 \n", 697 | "75% 3.000000 84.000000 \n", 698 | "max 19.000000 101.000000 " 699 | ] 700 | }, 701 | "execution_count": 16, 702 | "metadata": {}, 703 | "output_type": "execute_result" 704 | } 705 | ], 706 | "source": [ 707 | "free.describe()" 708 | ] 709 | }, 710 | { 711 | "cell_type": "code", 712 | "execution_count": 17, 713 | 
"metadata": {}, 714 | "outputs": [ 715 | { 716 | "data": { 717 | "text/html": [ 718 | "
\n", 719 | "\n", 732 | "\n", 733 | " \n", 734 | " \n", 735 | " \n", 736 | " \n", 737 | " \n", 738 | " \n", 739 | " \n", 740 | " \n", 741 | " \n", 742 | " \n", 743 | " \n", 744 | " \n", 745 | " \n", 746 | " \n", 747 | " \n", 748 | " \n", 749 | " \n", 750 | " \n", 751 | " \n", 752 | " \n", 753 | " \n", 754 | " \n", 755 | " \n", 756 | " \n", 757 | " \n", 758 | " \n", 759 | " \n", 760 | " \n", 761 | " \n", 762 | " \n", 763 | " \n", 764 | " \n", 765 | " \n", 766 | " \n", 767 | " \n", 768 | " \n", 769 | " \n", 770 | " \n", 771 | " \n", 772 | " \n", 773 | " \n", 774 | " \n", 775 | " \n", 776 | " \n", 777 | " \n", 778 | " \n", 779 | " \n", 780 | " \n", 781 | " \n", 782 | " \n", 783 | " \n", 784 | " \n", 785 | " \n", 786 | " \n", 787 | " \n", 788 | " \n", 789 | " \n", 790 | " \n", 791 | "
ExternalRiskEstimateMSinceOldestTradeOpenAverageMInFile
count8291.0000008291.0000008291.000000
mean72.783018202.62055278.446026
std9.66510894.47878230.963539
min37.0000003.0000005.000000
25%65.000000139.00000059.000000
50%73.000000188.00000076.000000
75%81.000000257.50000096.000000
max95.000000804.000000245.000000
\n", 792 | "
" 793 | ], 794 | "text/plain": [ 795 | " ExternalRiskEstimate MSinceOldestTradeOpen AverageMInFile\n", 796 | "count 8291.000000 8291.000000 8291.000000\n", 797 | "mean 72.783018 202.620552 78.446026\n", 798 | "std 9.665108 94.478782 30.963539\n", 799 | "min 37.000000 3.000000 5.000000\n", 800 | "25% 65.000000 139.000000 59.000000\n", 801 | "50% 73.000000 188.000000 76.000000\n", 802 | "75% 81.000000 257.500000 96.000000\n", 803 | "max 95.000000 804.000000 245.000000" 804 | ] 805 | }, 806 | "execution_count": 17, 807 | "metadata": {}, 808 | "output_type": "execute_result" 809 | } 810 | ], 811 | "source": [ 812 | "conditionals.describe()" 813 | ] 814 | }, 815 | { 816 | "cell_type": "code", 817 | "execution_count": 18, 818 | "metadata": {}, 819 | "outputs": [ 820 | { 821 | "name": "stderr", 822 | "output_type": "stream", 823 | "text": [ 824 | "C:\\Users\\fred0\\Anaconda3\\lib\\site-packages\\pandas\\core\\generic.py:3812: PerformanceWarning: dropping on a non-lexsorted multi-index without a level parameter may impact performance.\n", 825 | " new_axis = axis.drop(labels, errors=errors)\n" 826 | ] 827 | }, 828 | { 829 | "data": { 830 | "text/html": [ 831 | "
\n", 832 | "\n", 845 | "\n", 846 | " \n", 847 | " \n", 848 | " \n", 849 | " \n", 850 | " \n", 851 | " \n", 852 | " \n", 853 | " \n", 854 | " \n", 855 | " \n", 856 | " \n", 857 | " \n", 858 | " \n", 859 | " \n", 860 | " \n", 861 | " \n", 862 | " \n", 863 | " \n", 864 | " \n", 865 | " \n", 866 | " \n", 867 | " \n", 868 | " \n", 869 | " \n", 870 | " \n", 871 | " \n", 872 | " \n", 873 | " \n", 874 | " \n", 875 | " \n", 876 | " \n", 877 | " \n", 878 | " \n", 879 | " \n", 880 | " \n", 881 | " \n", 882 | " \n", 883 | " \n", 884 | " \n", 885 | " \n", 886 | " \n", 887 | " \n", 888 | " \n", 889 | " \n", 890 | " \n", 891 | " \n", 892 | " \n", 893 | " \n", 894 | " \n", 895 | " \n", 896 | " \n", 897 | " \n", 898 | " \n", 899 | " \n", 900 | " \n", 901 | " \n", 902 | " \n", 903 | " \n", 904 | " \n", 905 | " \n", 906 | " \n", 907 | " \n", 908 | " \n", 909 | " \n", 910 | " \n", 911 | " \n", 912 | " \n", 913 | " \n", 914 | " \n", 915 | " \n", 916 | " \n", 917 | " \n", 918 | " \n", 919 | " \n", 920 | " \n", 921 | " \n", 922 | " \n", 923 | " \n", 924 | " \n", 925 | " \n", 926 | " \n", 927 | " \n", 928 | " \n", 929 | " \n", 930 | " \n", 931 | " \n", 932 | " \n", 933 | " \n", 934 | " \n", 935 | " \n", 936 | " \n", 937 | " \n", 938 | " \n", 939 | " \n", 940 | " \n", 941 | " \n", 942 | " \n", 943 | " \n", 944 | " \n", 945 | " \n", 946 | " \n", 947 | " \n", 948 | " \n", 949 | " \n", 950 | " \n", 951 | " \n", 952 | " \n", 953 | " \n", 954 | " \n", 955 | " \n", 956 | " \n", 957 | " \n", 958 | " \n", 959 | " \n", 960 | " \n", 961 | " \n", 962 | " \n", 963 | " \n", 964 | " \n", 965 | " \n", 966 | " \n", 967 | " \n", 968 | " \n", 969 | " \n", 970 | " \n", 971 | " \n", 972 | " \n", 973 | " \n", 974 | " \n", 975 | " \n", 976 | " \n", 977 | " \n", 978 | " \n", 979 | " \n", 980 | " \n", 981 | " \n", 982 | " \n", 983 | " \n", 984 | " \n", 985 | " \n", 986 | " \n", 987 | " \n", 988 | " \n", 989 | " \n", 990 | " \n", 991 | " \n", 992 | " \n", 993 | " \n", 994 | " \n", 995 | " \n", 996 | " \n", 997 | " \n", 998 | " \n", 999 | " \n", 1000 | " \n", 1001 | " \n", 1002 | " \n", 1003 | " \n", 1004 | " \n", 1005 | " \n", 1006 | " \n", 1007 | " \n", 1008 | " \n", 1009 | " \n", 1010 | " \n", 1011 | " \n", 1012 | " \n", 1013 | " \n", 1014 | " \n", 1015 | " \n", 1016 | " \n", 1017 | " \n", 1018 | " \n", 1019 | " \n", 1020 | " \n", 1021 | " \n", 1022 | " \n", 1023 | " \n", 1024 | " \n", 1025 | " \n", 1026 | " \n", 1027 | " \n", 1028 | " \n", 1029 | " \n", 1030 | "
MSinceMostRecentTradeOpenNumSatisfactoryTradesNumTrades60Ever2DerogPubRecNumTrades90Ever2DerogPubRecPercentTradesNeverDelqMaxDelq2PublicRecLast12MMaxDelqEverNumTotalTradesNumTradesOpeninLast12MPercentInstallTradesNumInqLast6MNumInqLast6Mexcl7daysNetFractionRevolvingBurdenNumRevolvingTradesWBalanceNumInstallTradesWBalanceNumBank2NatlTradesWHighUtilizationPercentTradesWBalance
count8291.0000008291.0000008291.0000008291.0000008291.0000008291.0000008291.0000008291.000008291.0000008291.0000008291.0000008291.0000008291.0000008291.0000008291.0000008291.0000008291.000000
mean9.28404323.5808711.5560251.35761793.8310226.7415277.38318725.293212.98757735.6733812.4734052.41587336.9232905.3274643.5133282.12507567.845495
std8.66124311.0593561.2185110.96141910.7081371.6282001.80989212.595461.86355215.5534472.1200662.07983228.4469313.0384341.6569941.54706220.508632
min1.0000002.0000001.0000001.00000021.0000001.0000003.0000002.000001.0000003.0000001.0000001.0000001.0000001.0000002.0000001.0000008.000000
25%4.00000016.0000001.0000001.00000091.0000006.0000007.00000016.000002.00000024.0000001.0000001.00000011.5000003.0000002.0000001.00000051.000000
50%7.00000022.0000001.0000001.00000098.0000007.0000007.00000023.000003.00000034.0000002.0000002.00000032.0000005.0000003.0000002.00000068.000000
75%12.00000030.0000002.0000001.000000101.0000008.0000009.00000032.000004.00000046.0000003.0000003.00000058.0000007.0000004.0000003.00000084.000000
max107.00000080.00000020.00000020.000000101.00000010.0000009.000000105.0000020.00000094.00000067.00000067.000000233.00000033.00000024.00000019.000000101.000000
\n", 1031 | "
" 1032 | ], 1033 | "text/plain": [ 1034 | " MSinceMostRecentTradeOpen NumSatisfactoryTrades \\\n", 1035 | "count 8291.000000 8291.000000 \n", 1036 | "mean 9.284043 23.580871 \n", 1037 | "std 8.661243 11.059356 \n", 1038 | "min 1.000000 2.000000 \n", 1039 | "25% 4.000000 16.000000 \n", 1040 | "50% 7.000000 22.000000 \n", 1041 | "75% 12.000000 30.000000 \n", 1042 | "max 107.000000 80.000000 \n", 1043 | "\n", 1044 | " NumTrades60Ever2DerogPubRec NumTrades90Ever2DerogPubRec \\\n", 1045 | "count 8291.000000 8291.000000 \n", 1046 | "mean 1.556025 1.357617 \n", 1047 | "std 1.218511 0.961419 \n", 1048 | "min 1.000000 1.000000 \n", 1049 | "25% 1.000000 1.000000 \n", 1050 | "50% 1.000000 1.000000 \n", 1051 | "75% 2.000000 1.000000 \n", 1052 | "max 20.000000 20.000000 \n", 1053 | "\n", 1054 | " PercentTradesNeverDelq MaxDelq2PublicRecLast12M MaxDelqEver \\\n", 1055 | "count 8291.000000 8291.000000 8291.000000 \n", 1056 | "mean 93.831022 6.741527 7.383187 \n", 1057 | "std 10.708137 1.628200 1.809892 \n", 1058 | "min 21.000000 1.000000 3.000000 \n", 1059 | "25% 91.000000 6.000000 7.000000 \n", 1060 | "50% 98.000000 7.000000 7.000000 \n", 1061 | "75% 101.000000 8.000000 9.000000 \n", 1062 | "max 101.000000 10.000000 9.000000 \n", 1063 | "\n", 1064 | " NumTotalTrades NumTradesOpeninLast12M PercentInstallTrades NumInqLast6M \\\n", 1065 | "count 8291.00000 8291.000000 8291.000000 8291.000000 \n", 1066 | "mean 25.29321 2.987577 35.673381 2.473405 \n", 1067 | "std 12.59546 1.863552 15.553447 2.120066 \n", 1068 | "min 2.00000 1.000000 3.000000 1.000000 \n", 1069 | "25% 16.00000 2.000000 24.000000 1.000000 \n", 1070 | "50% 23.00000 3.000000 34.000000 2.000000 \n", 1071 | "75% 32.00000 4.000000 46.000000 3.000000 \n", 1072 | "max 105.00000 20.000000 94.000000 67.000000 \n", 1073 | "\n", 1074 | " NumInqLast6Mexcl7days NetFractionRevolvingBurden \\\n", 1075 | "count 8291.000000 8291.000000 \n", 1076 | "mean 2.415873 36.923290 \n", 1077 | "std 2.079832 28.446931 \n", 1078 | "min 1.000000 1.000000 \n", 1079 | "25% 1.000000 11.500000 \n", 1080 | "50% 2.000000 32.000000 \n", 1081 | "75% 3.000000 58.000000 \n", 1082 | "max 67.000000 233.000000 \n", 1083 | "\n", 1084 | " NumRevolvingTradesWBalance NumInstallTradesWBalance \\\n", 1085 | "count 8291.000000 8291.000000 \n", 1086 | "mean 5.327464 3.513328 \n", 1087 | "std 3.038434 1.656994 \n", 1088 | "min 1.000000 2.000000 \n", 1089 | "25% 3.000000 2.000000 \n", 1090 | "50% 5.000000 3.000000 \n", 1091 | "75% 7.000000 4.000000 \n", 1092 | "max 33.000000 24.000000 \n", 1093 | "\n", 1094 | " NumBank2NatlTradesWHighUtilization PercentTradesWBalance \n", 1095 | "count 8291.000000 8291.000000 \n", 1096 | "mean 2.125075 67.845495 \n", 1097 | "std 1.547062 20.508632 \n", 1098 | "min 1.000000 8.000000 \n", 1099 | "25% 1.000000 51.000000 \n", 1100 | "50% 2.000000 68.000000 \n", 1101 | "75% 3.000000 84.000000 \n", 1102 | "max 19.000000 101.000000 " 1103 | ] 1104 | }, 1105 | "execution_count": 18, 1106 | "metadata": {}, 1107 | "output_type": "execute_result" 1108 | } 1109 | ], 1110 | "source": [ 1111 | "free = free.drop(columns=['ExternalRiskEstimate', 'MSinceOldestTradeOpen', 'AverageMInFile'])\n", 1112 | "free.describe()" 1113 | ] 1114 | }, 1115 | { 1116 | "cell_type": "code", 1117 | "execution_count": 19, 1118 | "metadata": {}, 1119 | "outputs": [ 1120 | { 1121 | "data": { 1122 | "text/html": [ 1123 | "
\n", 1124 | "\n", 1137 | "\n", 1138 | " \n", 1139 | " \n", 1140 | " \n", 1141 | " \n", 1142 | " \n", 1143 | " \n", 1144 | " \n", 1145 | " \n", 1146 | " \n", 1147 | " \n", 1148 | " \n", 1149 | " \n", 1150 | " \n", 1151 | " \n", 1152 | " \n", 1153 | " \n", 1154 | " \n", 1155 | " \n", 1156 | " \n", 1157 | " \n", 1158 | " \n", 1159 | " \n", 1160 | " \n", 1161 | " \n", 1162 | " \n", 1163 | " \n", 1164 | " \n", 1165 | " \n", 1166 | "
RiskPerformance
01.0
11.0
21.0
31.0
41.0
\n", 1167 | "
" 1168 | ], 1169 | "text/plain": [ 1170 | " RiskPerformance\n", 1171 | "0 1.0\n", 1172 | "1 1.0\n", 1173 | "2 1.0\n", 1174 | "3 1.0\n", 1175 | "4 1.0" 1176 | ] 1177 | }, 1178 | "execution_count": 19, 1179 | "metadata": {}, 1180 | "output_type": "execute_result" 1181 | } 1182 | ], 1183 | "source": [ 1184 | "labels.head()" 1185 | ] 1186 | }, 1187 | { 1188 | "cell_type": "code", 1189 | "execution_count": 20, 1190 | "metadata": {}, 1191 | "outputs": [ 1192 | { 1193 | "data": { 1194 | "text/html": [ 1195 | "
\n", 1196 | "\n", 1209 | "\n", 1210 | " \n", 1211 | " \n", 1212 | " \n", 1213 | " \n", 1214 | " \n", 1215 | " \n", 1216 | " \n", 1217 | " \n", 1218 | " \n", 1219 | " \n", 1220 | " \n", 1221 | " \n", 1222 | " \n", 1223 | " \n", 1224 | " \n", 1225 | " \n", 1226 | " \n", 1227 | " \n", 1228 | " \n", 1229 | " \n", 1230 | " \n", 1231 | " \n", 1232 | " \n", 1233 | " \n", 1234 | " \n", 1235 | " \n", 1236 | " \n", 1237 | " \n", 1238 | " \n", 1239 | " \n", 1240 | " \n", 1241 | " \n", 1242 | " \n", 1243 | " \n", 1244 | " \n", 1245 | " \n", 1246 | " \n", 1247 | " \n", 1248 | " \n", 1249 | " \n", 1250 | "
ExternalRiskEstimateMSinceOldestTradeOpenAverageMInFile
056.0145.085.0
168.067.025.0
267.0170.074.0
382.0334.0133.0
460.0138.079.0
\n", 1251 | "
" 1252 | ], 1253 | "text/plain": [ 1254 | " ExternalRiskEstimate MSinceOldestTradeOpen AverageMInFile\n", 1255 | "0 56.0 145.0 85.0\n", 1256 | "1 68.0 67.0 25.0\n", 1257 | "2 67.0 170.0 74.0\n", 1258 | "3 82.0 334.0 133.0\n", 1259 | "4 60.0 138.0 79.0" 1260 | ] 1261 | }, 1262 | "execution_count": 20, 1263 | "metadata": {}, 1264 | "output_type": "execute_result" 1265 | } 1266 | ], 1267 | "source": [ 1268 | "conditionals.head()" 1269 | ] 1270 | }, 1271 | { 1272 | "cell_type": "code", 1273 | "execution_count": 21, 1274 | "metadata": {}, 1275 | "outputs": [], 1276 | "source": [ 1277 | "#Save to CSV\n", 1278 | "#pd.DataFrame(free.values).to_csv('heloc_x.csv', header = False, index = False)\n", 1279 | "#pd.DataFrame(conditionals.values).to_csv(\"heloc_x_c.csv\", header = False, index = False)\n", 1280 | "#labels.to_csv('heloc_y.csv', header = False, index = False)" 1281 | ] 1282 | } 1283 | ], 1284 | "metadata": { 1285 | "kernelspec": { 1286 | "display_name": "Python 3", 1287 | "language": "python", 1288 | "name": "python3" 1289 | }, 1290 | "language_info": { 1291 | "codemirror_mode": { 1292 | "name": "ipython", 1293 | "version": 3 1294 | }, 1295 | "file_extension": ".py", 1296 | "mimetype": "text/x-python", 1297 | "name": "python", 1298 | "nbconvert_exporter": "python", 1299 | "pygments_lexer": "ipython3", 1300 | "version": "3.6.8" 1301 | } 1302 | }, 1303 | "nbformat": 4, 1304 | "nbformat_minor": 2 1305 | } 1306 | --------------------------------------------------------------------------------