├── GAIN.py ├── MIWAE.py ├── README.md ├── data ├── Index ├── Sensorless_drive_diagnosis.txt ├── data_drive.py ├── data_text.py ├── drive_x ├── drive_y ├── page-blocks.data ├── page-blocks.names ├── text_x └── text_y ├── mice.py ├── mice_NN.py ├── missforest.py └── utils.py /GAIN.py: -------------------------------------------------------------------------------- 1 | """ 2 | This code is adapted from https://github.com/jsyoon0823/GAIN 3 | Information about GAIN: 4 | Reference: J. Yoon, J. Jordon, M. van der Schaar, "GAIN: Missing Data Imputation using Generative Adversarial Nets," ICML, 2018. 5 | Paper Link: http://medianetlab.ee.ucla.edu/papers/ICML_GAIN.pdf 6 | Appendix Link: http://medianetlab.ee.ucla.edu/papers/ICML_GAIN_Supp.pdf 7 | """ 8 | 9 | import tensorflow as tf 10 | from tqdm import tqdm 11 | import numpy as np 12 | from utils import mse, load_data 13 | 14 | 15 | def main(p_miss = 0.5, p_hint=0.3, alpha=800, num_epochs=2000, dataset="text", 16 | mode="mcar", para=0.5, train=None, rand_seed=42): 17 | 18 | np.random.seed(rand_seed) 19 | tf.set_random_seed(rand_seed) 20 | 21 | n, p, xmiss, xhat_0, mask, data_x, data_y = load_data(p_miss, dataset=dataset, mode=mode, para=para, train=train, rand_seed=rand_seed) 22 | 23 | # Mini batch size 24 | mb_size = 64 25 | # Imput Dim (Fixed) 26 | train_rate = 1 27 | 28 | Data = data_x 29 | # Parameters 30 | No = n 31 | Dim = p 32 | 33 | # Hidden state dimensions 34 | H_Dim1 = Dim 35 | H_Dim2 = Dim 36 | 37 | # %% Missing introducing 38 | Missing = mask*1 39 | 40 | # %% Train Test Division 41 | 42 | idx = np.random.permutation(No) 43 | 44 | Train_No = int(No * train_rate) 45 | Test_No = No - Train_No 46 | 47 | # Train / Test Features 48 | trainX = Data[idx[:Train_No], :] 49 | testX = Data[idx[Train_No:], :] 50 | 51 | # Train / Test Missing Indicators 52 | trainM = Missing[idx[:Train_No], :] 53 | testM = Missing[idx[Train_No:], :] 54 | 55 | 56 | # %% Necessary Functions 57 | 58 | # 1. Xavier Initialization Definition 59 | def xavier_init(size): 60 | in_dim = size[0] 61 | xavier_stddev = 1. / tf.sqrt(in_dim / 2.) 62 | return tf.random_normal(shape=size, stddev=xavier_stddev) 63 | 64 | 65 | # Hint Vector Generation 66 | def sample_M(m, n, p): 67 | A = np.random.uniform(0., 1., size=[m, n]) 68 | B = A > p 69 | C = 1. * B 70 | return C 71 | 72 | 73 | ''' 74 | GAIN Consists of 3 Components 75 | - Generator 76 | - Discriminator 77 | - Hint Mechanism 78 | ''' 79 | 80 | # %% GAIN Architecture 81 | 82 | # %% 1. Input Placeholders 83 | # 1.1. Data Vector 84 | X = tf.placeholder(tf.float32, shape=[None, Dim]) 85 | # 1.2. Mask Vector 86 | M = tf.placeholder(tf.float32, shape=[None, Dim]) 87 | # 1.3. Hint vector 88 | H = tf.placeholder(tf.float32, shape=[None, Dim]) 89 | # 1.4. X with missing values 90 | New_X = tf.placeholder(tf.float32, shape=[None, Dim]) 91 | 92 | # %% 2. Discriminator 93 | D_W1 = tf.Variable(xavier_init([Dim * 2, H_Dim1])) # Data + Hint as inputs 94 | D_b1 = tf.Variable(tf.zeros(shape=[H_Dim1])) 95 | 96 | D_W2 = tf.Variable(xavier_init([H_Dim1, H_Dim2])) 97 | D_b2 = tf.Variable(tf.zeros(shape=[H_Dim2])) 98 | 99 | D_W3 = tf.Variable(xavier_init([H_Dim2, Dim])) 100 | D_b3 = tf.Variable(tf.zeros(shape=[Dim])) # Output is multi-variate 101 | 102 | theta_D = [D_W1, D_W2, D_W3, D_b1, D_b2, D_b3] 103 | 104 | # %% 3. 
Generator 105 | G_W1 = tf.Variable(xavier_init([Dim * 2, H_Dim1])) # Data + Mask as inputs (Random Noises are in Missing Components) 106 | G_b1 = tf.Variable(tf.zeros(shape=[H_Dim1])) 107 | 108 | G_W2 = tf.Variable(xavier_init([H_Dim1, H_Dim2])) 109 | G_b2 = tf.Variable(tf.zeros(shape=[H_Dim2])) 110 | 111 | G_W3 = tf.Variable(xavier_init([H_Dim2, Dim])) 112 | G_b3 = tf.Variable(tf.zeros(shape=[Dim])) 113 | 114 | theta_G = [G_W1, G_W2, G_W3, G_b1, G_b2, G_b3] 115 | 116 | 117 | # %% GAIN Function 118 | 119 | # %% 1. Generator 120 | def generator(new_x, m): 121 | inputs = tf.concat(axis=1, values=[new_x, m]) # Mask + Data Concatenate 122 | G_h1 = tf.nn.relu(tf.matmul(inputs, G_W1) + G_b1) 123 | G_h2 = tf.nn.relu(tf.matmul(G_h1, G_W2) + G_b2) 124 | G_prob = tf.nn.sigmoid(tf.matmul(G_h2, G_W3) + G_b3) # [0,1] normalized Output 125 | 126 | return G_prob 127 | 128 | 129 | # %% 2. Discriminator 130 | def discriminator(new_x, h): 131 | inputs = tf.concat(axis=1, values=[new_x, h]) # Hint + Data Concatenate 132 | D_h1 = tf.nn.relu(tf.matmul(inputs, D_W1) + D_b1) 133 | D_h2 = tf.nn.relu(tf.matmul(D_h1, D_W2) + D_b2) 134 | D_logit = tf.matmul(D_h2, D_W3) + D_b3 135 | D_prob = tf.nn.sigmoid(D_logit) # [0,1] Probability Output 136 | 137 | return D_prob 138 | 139 | 140 | # %% 3. Other functions 141 | # Random sample generator for Z 142 | def sample_Z(m, n): 143 | return np.random.uniform(0., 0.01, size=[m, n]) 144 | 145 | 146 | # Mini-batch generation 147 | def sample_idx(m, n): 148 | A = np.random.permutation(m) 149 | idx = A[:n] 150 | return idx 151 | 152 | 153 | # %% Structure 154 | # Generator 155 | G_sample = generator(New_X, M) 156 | 157 | # Combine with original data 158 | Hat_New_X = New_X * M + G_sample * (1 - M) 159 | 160 | # Discriminator 161 | D_prob = discriminator(Hat_New_X, H) 162 | 163 | # %% Loss 164 | D_loss1 = -tf.reduce_mean(M * tf.log(D_prob + 1e-8) + (1 - M) * tf.log(1. 
- D_prob + 1e-8)) 165 | G_loss1 = -tf.reduce_mean((1 - M) * tf.log(D_prob + 1e-8)) 166 | 167 | MSE_train_loss = tf.reduce_mean((M * New_X - M * G_sample) ** 2) / tf.reduce_mean(M) 168 | 169 | D_loss = D_loss1 170 | G_loss = G_loss1 + alpha * MSE_train_loss 171 | 172 | # %% MSE Performance metric 173 | MSE_test_loss = tf.reduce_mean(((1 - M) * X - (1 - M) * G_sample) ** 2) / tf.reduce_mean(1 - M) 174 | 175 | # %% Solver 176 | D_solver = tf.train.AdamOptimizer().minimize(D_loss, var_list=theta_D) 177 | G_solver = tf.train.AdamOptimizer().minimize(G_loss, var_list=theta_G) 178 | 179 | # Sessions 180 | sess = tf.Session() 181 | sess.run(tf.global_variables_initializer()) 182 | 183 | # %% Iterations 184 | errors = [] 185 | # %% Start Iterations 186 | for it in tqdm(range(num_epochs)): 187 | 188 | # %% Inputs 189 | mb_idx = sample_idx(Train_No, mb_size) 190 | X_mb = trainX[mb_idx, :] 191 | 192 | Z_mb = sample_Z(mb_size, Dim) 193 | M_mb = trainM[mb_idx, :] 194 | H_mb1 = sample_M(mb_size, Dim, 1 - p_hint) 195 | H_mb = M_mb * H_mb1 196 | 197 | New_X_mb = M_mb * X_mb + (1 - M_mb) * Z_mb # Missing Data Introduce 198 | 199 | _, D_loss_curr = sess.run([D_solver, D_loss1], feed_dict={M: M_mb, New_X: New_X_mb, H: H_mb}) 200 | _, G_loss_curr, MSE_train_loss_curr, MSE_test_loss_curr = sess.run( 201 | [G_solver, G_loss1, MSE_train_loss, MSE_test_loss], 202 | feed_dict={X: X_mb, M: M_mb, New_X: New_X_mb, H: H_mb}) 203 | 204 | # %% Intermediate Losses 205 | if it % 50 == 0: 206 | Z_mb = sample_Z(n, p) 207 | New_X_mb = Missing * data_x + (1 - Missing) * Z_mb 208 | 209 | x_filled = sess.run(G_sample, feed_dict={X: data_x, M: Missing, New_X: New_X_mb}) 210 | 211 | print('Iter: {}'.format(it)) 212 | print('Train_loss: {:.4}'.format(np.sqrt(MSE_train_loss_curr))) 213 | print('Test_loss: {:.4}'.format(np.sqrt(MSE_test_loss_curr))) 214 | errors.append(mse(x_filled, data_x, mask)) 215 | print("Real MSE: ", errors[-1]) 216 | 217 | # %% Final Loss 218 | if train_rate != 1: 219 | Z_mb = sample_Z(Test_No, Dim) 220 | M_mb = testM 221 | X_mb = testX 222 | 223 | New_X_mb = M_mb * X_mb + (1 - M_mb) * Z_mb # Missing Data Introduce 224 | 225 | MSE_final, Sample = sess.run([MSE_test_loss, G_sample], feed_dict={X: testX, M: testM, New_X: New_X_mb}) 226 | 227 | print('Final Test MSE: ' + str(MSE_final)) 228 | # Real Error 229 | 230 | Z_mb = sample_Z(n, p) 231 | New_X_mb = Missing * data_x + (1 - Missing) * Z_mb 232 | 233 | x_filled = sess.run(G_sample, feed_dict={X: data_x, M: Missing, New_X: New_X_mb}) 234 | real_mse = mse(x_filled, data_x, mask) 235 | print("Real final MSE: " + str(real_mse)) 236 | 237 | return x_filled, real_mse 238 | 239 | 240 | if __name__ == "__main__": 241 | main() -------------------------------------------------------------------------------- /MIWAE.py: -------------------------------------------------------------------------------- 1 | """ 2 | This code is adapted from https://github.com/pamattei/miwae 3 | For more information on MIWAE see: 4 | http://proceedings.mlr.press/v97/mattei19a.html 5 | """ 6 | import tensorflow as tf 7 | import numpy as np 8 | 9 | import tensorflow_probability as tfp 10 | tfd = tfp.distributions 11 | tfk = tf.keras 12 | tfkl = tf.keras.layers 13 | from utils import mse 14 | from utils import load_data 15 | 16 | 17 | def main(p_miss=0.5, hidden_units=50, lr=0.001, epochs=500, dataset="drive", 18 | mode="mcar", para=0.5, train=None, rand_seed=42): 19 | 20 | np.random.seed(rand_seed) 21 | tf.set_random_seed(rand_seed) 22 | 23 | n, p, xmiss, xhat_0, mask, data_x, data_y = 
load_data(p_miss, dataset=dataset, mode=mode, para=para, train=train, rand_seed=rand_seed) 24 | 25 | x = tf.placeholder(tf.float32, shape=[None, p]) # Placeholder for xhat_0 26 | learning_rate = tf.placeholder(tf.float32, shape=[]) 27 | batch_size = tf.shape(x)[0] 28 | xmask = tf.placeholder(tf.bool, shape=[None, p]) 29 | K= tf.placeholder(tf.int32, shape=[]) # Placeholder for the number of importance weights 30 | 31 | d = np.floor(p/2).astype(int) # dimension of the latent space 32 | 33 | p_z = tfd.MultivariateNormalDiag(loc=tf.zeros(d, tf.float32)) 34 | 35 | h = hidden_units # number of hidden units (same for all MLPs) 36 | 37 | sigma = "relu" 38 | 39 | decoder = tfk.Sequential([ 40 | tfkl.InputLayer(input_shape=[d,]), 41 | tfkl.Dense(h, activation=sigma,kernel_initializer="orthogonal"), 42 | tfkl.Dense(h, activation=sigma,kernel_initializer="orthogonal"), 43 | tfkl.Dense(3*p,kernel_initializer="orthogonal") # the decoder will output both the mean, the scale, and the number of degrees of freedoms (hence the 3*p) 44 | ]) 45 | 46 | tiledmask = tf.tile(xmask,[K,1]) 47 | tiledmask_float = tf.cast(tiledmask,tf.float32) 48 | mask_not_float = tf.abs(-tf.cast(xmask,tf.float32)) 49 | 50 | iota = tf.Variable(np.zeros([1,p]),dtype=tf.float32) 51 | tilediota = tf.tile(iota,[batch_size,1]) 52 | iotax = x + tf.multiply(tilediota,mask_not_float) 53 | 54 | encoder = tfk.Sequential([ 55 | tfkl.InputLayer(input_shape=[p,]), 56 | tfkl.Dense(h, activation=sigma,kernel_initializer="orthogonal"), 57 | tfkl.Dense(h, activation=sigma,kernel_initializer="orthogonal"), 58 | tfkl.Dense(3*d,kernel_initializer="orthogonal") 59 | ]) 60 | 61 | out_encoder = encoder(iotax) 62 | q_zgivenxobs = tfd.Independent(distribution=tfd.StudentT(loc=out_encoder[..., :d], scale=tf.nn.softplus(out_encoder[..., d:(2*d)]), df=3 + tf.nn.softplus(out_encoder[..., (2*d):(3*d)]))) 63 | zgivenx = q_zgivenxobs.sample(K) 64 | zgivenx_flat = tf.reshape(zgivenx,[K*batch_size,d]) 65 | data_flat = tf.reshape(tf.tile(x,[K,1]),[-1,1]) 66 | 67 | out_decoder = decoder(zgivenx_flat) 68 | all_means_obs_model = out_decoder[..., :p] 69 | all_scales_obs_model = tf.nn.softplus(out_decoder[..., p:(2*p)]) + 0.001 70 | all_degfreedom_obs_model = tf.nn.softplus(out_decoder[..., (2*p):(3*p)]) + 3 71 | all_log_pxgivenz_flat = tfd.StudentT(loc=tf.reshape(all_means_obs_model,[-1,1]),scale=tf.reshape(all_scales_obs_model,[-1,1]),df=tf.reshape(all_degfreedom_obs_model,[-1,1])).log_prob(data_flat) 72 | all_log_pxgivenz = tf.reshape(all_log_pxgivenz_flat,[K*batch_size,p]) 73 | 74 | logpxobsgivenz = tf.reshape(tf.reduce_sum(tf.multiply(all_log_pxgivenz,tiledmask_float),1),[K,batch_size]) 75 | logpz = p_z.log_prob(zgivenx) 76 | logq = q_zgivenxobs.log_prob(zgivenx) 77 | 78 | miwae_loss = -tf.reduce_mean(tf.reduce_logsumexp(logpxobsgivenz + logpz - logq,0)) +tf.log(tf.cast(K,tf.float32)) 79 | train_miss = tf.train.AdamOptimizer(learning_rate = learning_rate).minimize(miwae_loss) 80 | 81 | xgivenz = tfd.Independent( 82 | distribution=tfd.StudentT(loc=all_means_obs_model, scale=all_scales_obs_model, df=all_degfreedom_obs_model)) 83 | 84 | imp_weights = tf.nn.softmax(logpxobsgivenz + logpz - logq,0) # these are w_1,....,w_L for all observations in the batch 85 | xms = tf.reshape(xgivenz.mean(),[K,batch_size,p]) 86 | xm=tf.einsum('ki,kij->ij', imp_weights, xms) 87 | 88 | miwae_loss_train=np.array([]) 89 | 90 | mse_train=np.array([]) 91 | bs = 64 # batch size 92 | n_epochs = epochs 93 | xhat = np.copy(xhat_0) # This will be out imputed data matrix 94 | 95 | with tf.Session() 
as sess: 96 | sess.run(tf.global_variables_initializer()) 97 | for ep in range(1,n_epochs): 98 | perm = np.random.permutation(n) # We use the "random reshuffling" version of SGD 99 | batches_data = np.array_split(xhat_0[perm,], n/bs) 100 | batches_mask = np.array_split(mask[perm,], n/bs) 101 | for it in range(len(batches_data)): 102 | train_miss.run(feed_dict={x: batches_data[it], learning_rate: lr, K:20, xmask: batches_mask[it]}) # Gradient step 103 | if ep % 50 == 1 or ep == (n_epochs -1): 104 | losstrain = np.array([miwae_loss.eval(feed_dict={x: xhat_0, K:20, xmask: mask})]) # MIWAE bound evaluation 105 | miwae_loss_train = np.append(miwae_loss_train,-losstrain,axis=0) 106 | print('Epoch %g' %ep) 107 | print('MIWAE likelihood bound %g' %-losstrain) 108 | for i in range(n): # We impute the observations one at a time for memory reasons 109 | xhat[i,:][~mask[i,:]]=xm.eval(feed_dict={x: xhat_0[i,:].reshape([1,p]), K:1000, xmask: mask[i,:].reshape([1,p])})[~mask[i,:].reshape([1,p])] 110 | err = np.array(mse(xhat,data_x,mask)) 111 | print('Imputation MSE %g' %err) 112 | print('-----') 113 | 114 | return xhat, err 115 | 116 | if __name__ == "__main__": 117 | main() -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Missing-Data-Imputation-Methods-Performance-Comparison 2 | The data imputation methods [MissForest](https://cran.r-project.org/web/packages/missForest/missForest.pdf), [GAIN](https://arxiv.org/abs/1806.02920), [MICE](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3074241/), MICE-NN and [MIWAE](http://proceedings.mlr.press/v97/mattei19a.html) are tested on two UCI datasets (Sensorless Drive Diagnosis, Page Blocks Classification). MICE-NN is a modified version of MICE that uses fully connected neural networks instead of linear regression. The tests are done by taking the complete dataset (without missing values), introducing either MCAR or MAR missingness at the desired missing rate, and then using the imputation methods to fill in the missing values. Since the true values are known, the real MSE can be computed. To test other datasets, pickle the features and labels as 2-dimensional numpy arrays named "name_x" and "name_y" in the folder [data](data), then set dataset = "name" when calling an imputation method. 3 | 4 | MCAR missing values are introduced by dropping each value in the data independently with probability "p_miss". MAR missing values are introduced by taking the mean of the first third of each observation's features, scaling it by the variable "para" and passing it through a sigmoid; each value in the remaining features is then dropped independently with this probability (for details see load_data in utils.py). 5 | 6 | 7 | 8 | ## Requirements 9 | The code requires Python 3.6 or later.
10 | Required packages are: 11 | 12 | * fancyimpute >= 0.5.3 13 | * matplotlib >= 2.2.2 14 | * missingpy >= 0.2.0 15 | * numpy >= 1.16.2 16 | * pathlib >= 2.3.3 17 | * pickle 18 | * Pillow >= 5.4.1 19 | * pylab 20 | * scipy >= 1.2.1 21 | * sklearn 22 | * tensorflow >= 1.14 23 | * tensorflow_probability >= 0.7.0 24 | * torch >= 1.0.1 25 | * torchvision >= 0.2.2 26 | * tqdm >= 4.31.1 27 | 28 | -------------------------------------------------------------------------------- /data/Index: -------------------------------------------------------------------------------- 1 | Index of page-blocks 2 | 3 | 02 Dec 1996 128 Index 4 | 20 Jul 1995 104579 page-blocks.data.Z 5 | 20 Jul 1995 3900 page-blocks.names 6 | -------------------------------------------------------------------------------- /data/data_drive.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pickle 3 | import pandas as pd 4 | from sklearn import preprocessing 5 | 6 | data = pd.read_csv("Sensorless_drive_diagnosis.txt", header=None, sep=" ") 7 | data = data.values 8 | print(data[:2,:]) 9 | np.random.shuffle(data) 10 | 11 | y = data[:, -1:].astype(np.int) - 1 12 | x = data[:, :-1] 13 | 14 | print(y[:20]) 15 | 16 | scaler = preprocessing.MinMaxScaler() 17 | x_numpy = scaler.fit_transform(x) 18 | 19 | with open("drive_x", "wb") as file: 20 | pickle.dump(x_numpy, file) 21 | 22 | with open("drive_y", "wb") as file: 23 | pickle.dump(y, file) 24 | -------------------------------------------------------------------------------- /data/data_text.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pickle 3 | import pandas as pd 4 | from sklearn import preprocessing 5 | 6 | data = pd.read_csv("page-blocks.data", header=None, sep="[ ]+") 7 | data = data.values 8 | 9 | np.random.shuffle(data) 10 | 11 | y = data[:, -1:].astype(np.int) - 1 12 | x = data[:, :-1] 13 | 14 | scaler = preprocessing.MinMaxScaler() 15 | x_numpy = scaler.fit_transform(x) 16 | 17 | with open("text_x", "wb") as file: 18 | pickle.dump(x_numpy, file) 19 | 20 | with open("text_y", "wb") as file: 21 | pickle.dump(y, file) 22 | -------------------------------------------------------------------------------- /data/drive_x: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fschur/Missing-Data-Imputation-Methods-Performance-Comparison/246cd58b339c56704d745df1d8691b44803ca49e/data/drive_x -------------------------------------------------------------------------------- /data/drive_y: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fschur/Missing-Data-Imputation-Methods-Performance-Comparison/246cd58b339c56704d745df1d8691b44803ca49e/data/drive_y -------------------------------------------------------------------------------- /data/page-blocks.names: -------------------------------------------------------------------------------- 1 | 1. Title of Database: Blocks Classification 2 | 2. Sources: 3 | (a) Donato Malerba 4 | Dipartimento di Informatica 5 | University of Bari 6 | via Orabona 4 7 | 70126 Bari - Italy 8 | phone: +39 - 80 - 5443269 9 | fax: +39 - 80 - 5443196 10 | malerbad@vm.csata.it 11 | (b) Donor: Donato Malerba 12 | (c) Date: July 1995 13 | 3. Past Usage: 14 | This data set have been used to try different simplification methods 15 | for decision trees.
A summary of the results can be found in: 16 | 17 | Malerba, D., Esposito, F., and Semeraro, G. 18 | "A Further Comparison of Simplification Methods for Decision-Tree Induction." 19 | In D. Fisher and H. Lenz (Eds.), "Learning from Data: 20 | Artificial Intelligence and Statistics V", Lecture Notes in Statistics, 21 | Springer Verlag, Berlin, 1995. 22 | 23 | The problem consists in classifying all the blocks of the page 24 | layout of a document that has been detected by a segmentation 25 | process. This is an essential step in document analysis 26 | in order to separate text from graphic areas. Indeed, 27 | the five classes are: text (1), horizontal line (2), 28 | picture (3), vertical line (4) and graphic (5). 29 | For a detailed presentation of the problem see: 30 | 31 | Esposito F., Malerba D., & Semeraro G. 32 | Multistrategy Learning for Document Recognition 33 | Applied Artificial Intelligence, 8, pp. 33-84, 1994 34 | 35 | All instances have been personally checked so that 36 | low noise is present in the data. 37 | 38 | 4. Relevant Information Paragraph: 39 | 40 | The 5473 examples comes from 54 distinct documents. 41 | Each observation concerns one block. 42 | All attributes are numeric. 43 | Data are in a format readable by C4.5. 44 | 45 | 5. Number of Instances: 5473. 46 | 47 | 6. Number of Attributes 48 | 49 | height: integer. | Height of the block. 50 | lenght: integer. | Length of the block. 51 | area: integer. | Area of the block (height * lenght); 52 | eccen: continuous. | Eccentricity of the block (lenght / height); 53 | p_black: continuous. | Percentage of black pixels within the block (blackpix / area); 54 | p_and: continuous. | Percentage of black pixels after the application of the Run Length Smoothing Algorithm (RLSA) (blackand / area); 55 | mean_tr: continuous. | Mean number of white-black transitions (blackpix / wb_trans); 56 | blackpix: integer. | Total number of black pixels in the original bitmap of the block. 57 | blackand: integer. | Total number of black pixels in the bitmap of the block after the RLSA. 58 | wb_trans: integer. | Number of white-black transitions in the original bitmap of the block. 59 | 60 | 61 | 62 | 7. Missing Attribute Values: No missing value. 63 | 64 | 8. Class Distribution: 65 | 66 | Valid Cum 67 | Class Frequency Percent Percent Percent 68 | 69 | text 4913 89.8 89.8 89.8 70 | horiz. line 329 6.0 6.0 95.8 71 | graphic 28 .5 .5 96.3 72 | vert. 
line 88 1.6 1.6 97.9 73 | picture 115 2.1 2.1 100.0 74 | ------- ------- ------- 75 | TOTAL 5473 100.0 100.0 76 | 77 | Summary Statistics: 78 | 79 | Variable Mean Std Dev Minimum Maximum Correlation 80 | 81 | HEIGHT 10.47 18.96 1 804 .3510 82 | LENGTH 89.57 114.72 1 553 -.0045 83 | AREA 1198.41 4849.38 7 143993 .2343 84 | ECCEN 13.75 30.70 .007 537.00 .0992 85 | P_BLACK .37 .18 .052 1.00 .2130 86 | P_AND .79 .17 .062 1.00 -.1771 87 | MEAN_TR 6.22 69.08 1.00 4955.00 .0723 88 | BLACKPIX 365.93 1270.33 7 33017 .1656 89 | BLACKAND 741.11 1881.50 7 46133 .1565 90 | WB_TRANS 106.66 167.31 1 3212 .0337 91 | 92 | -------------------------------------------------------------------------------- /data/text_x: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fschur/Missing-Data-Imputation-Methods-Performance-Comparison/246cd58b339c56704d745df1d8691b44803ca49e/data/text_x -------------------------------------------------------------------------------- /data/text_y: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fschur/Missing-Data-Imputation-Methods-Performance-Comparison/246cd58b339c56704d745df1d8691b44803ca49e/data/text_y -------------------------------------------------------------------------------- /mice.py: -------------------------------------------------------------------------------- 1 | from fancyimpute import IterativeImputer 2 | import numpy as np 3 | from utils import mse as mse_own 4 | from utils import load_data 5 | 6 | 7 | def main(p_miss=0.5, dataset="drive", mode="mcar", para=0.5, train=None, rand_seed=42): 8 | np.random.seed(rand_seed) 9 | 10 | n, p, xmiss, xhat_0, mask, data_x, data_y = load_data(p_miss, dataset=dataset, mode=mode, para=para, train=train, rand_seed=rand_seed) 11 | 12 | x_filled = IterativeImputer().fit_transform(xmiss) 13 | 14 | mse = mse_own(x_filled, data_x, mask) 15 | 16 | print("MSE for MICE: " + str(mse)) 17 | 18 | return x_filled, mse 19 | 20 | if __name__ == "__main__": 21 | main() -------------------------------------------------------------------------------- /mice_NN.py: -------------------------------------------------------------------------------- 1 | # This code is adapted from https://github.com/Ouwen/scikit-mice/blob/master/skmice.py 2 | 3 | from utils import load_data, mse 4 | from sklearn.preprocessing import Imputer 5 | 6 | from sklearn.model_selection import train_test_split 7 | from sklearn.neural_network import MLPRegressor 8 | import numpy as np 9 | 10 | 11 | class MiceImputer(object): 12 | 13 | def __init__(self, missing_values="NaN", strategy="mean", axis=0, verbose=0, copy=True): 14 | self.missing_values = missing_values 15 | self.strategy = strategy 16 | self.axis = axis 17 | self.verbose = verbose 18 | self.copy = copy 19 | self.imp = Imputer(missing_values=self.missing_values, strategy=self.strategy, axis=self.axis, 20 | verbose=self.verbose, copy=self.copy) 21 | 22 | def _seed_values(self, X): 23 | self.imp.fit(X) 24 | return self.imp.transform(X) 25 | 26 | def _get_mask(self, X, value_to_mask): 27 | if value_to_mask == "NaN" or np.isnan(value_to_mask): 28 | return np.isnan(X) 29 | else: 30 | return np.array(X == value_to_mask) 31 | 32 | def _process(self, X, column, sizes, activation, epochs, mask, lr): 33 | # Remove values that are in mask 34 | mask_col = mask[:, column] 35 | #!!mask = np.array(self._get_mask(X)[:, column].T)[0] 36 | mask_indices = np.where(mask_col == True)[0] 37 | X_data = 
np.delete(X, mask_indices, 0) 38 | 39 | # Instantiate the model 40 | model = MLPRegressor(hidden_layer_sizes=sizes, activation=activation, 41 | solver='adam', learning_rate_init=lr,max_iter=epochs) 42 | 43 | # Slice out the column to predict and delete the column. 44 | y_data = X_data[:, column] 45 | X_data = np.delete(X_data, column, 1) 46 | 47 | # Split training and test data 48 | X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.1, random_state=42) 49 | 50 | # Fit the model 51 | model.fit(X_train, y_train) 52 | 53 | # Score the model 54 | scores = model.score(X_test, y_test) 55 | 56 | # Predict missing vars 57 | X_predict = np.delete(X, column, 1) 58 | y = model.predict(X_predict) 59 | 60 | # Replace values in X with their predictions 61 | X[mask_indices, column] = np.take(y, mask_indices) 62 | #np.put(X, predict_indicies, np.take(y, mask_col)) 63 | # Return model and scores 64 | return X, scores 65 | 66 | def transform(self, X, sizes, activation='relu', epochs=500, iterations=10, lr=0.001): 67 | X = np.array(X) 68 | mask = self._get_mask(X, self.missing_values) 69 | X = self._seed_values(X) 70 | specs = np.zeros((iterations, X.shape[1])) 71 | 72 | for i in range(iterations): 73 | print(i) 74 | for c in range(X.shape[1]): 75 | X, specs[i][c] = self._process(X, c, sizes, activation, epochs, mask, lr) 76 | 77 | # Return X matrix with imputed values 78 | return X, specs 79 | 80 | 81 | def main(p_miss=0.5, hidden_size=100, epochs=70, lr=0.001, 82 | dataset="drive", mode="mcar", para=0.5, train=None): 83 | 84 | n, p, xmiss, xhat_0, mask, data_x, data_y = load_data(p_miss, dataset=dataset, mode=mode, para=para, train=train) 85 | 86 | imputer = MiceImputer(np.nan) 87 | X = xmiss 88 | 89 | X_filled, specs = imputer.transform(np.array(X), (hidden_size, hidden_size, hidden_size), 90 | epochs=epochs, lr=lr, iterations=10) 91 | 92 | mse_nn = mse(X_filled, data_x, mask) 93 | print("MSE MICE_NN : ", mse_nn) 94 | 95 | return X_filled, mse_nn 96 | 97 | if __name__ == "__main__": 98 | main() 99 | -------------------------------------------------------------------------------- /missforest.py: -------------------------------------------------------------------------------- 1 | """ 2 | For more information about MissForest see: 3 | https://academic.oup.com/bioinformatics/article/28/1/112/219101 4 | """ 5 | from missingpy import MissForest 6 | import numpy as np 7 | from utils import load_data 8 | from utils import mse as mse_own 9 | 10 | 11 | def main(p_miss=0.5, dataset="drive", mode="mcar", para=0.5, train=None, rand_seed=42): 12 | np.random.seed(rand_seed) 13 | 14 | n, p, xmiss, xhat_0, mask, data_x, data_y = load_data(p_miss, dataset=dataset, mode=mode, para=para, train=train, rand_seed=rand_seed) 15 | 16 | imputer = MissForest(decreasing=True, random_state=rand_seed, verbose=True) 17 | x_filled = imputer.fit_transform(xmiss) 18 | 19 | mse = mse_own(x_filled, data_x, mask) 20 | 21 | print("MSE for MissForest: ", mse) 22 | 23 | return x_filled, mse 24 | 25 | if __name__ == "__main__": 26 | main() 27 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | import matplotlib.gridspec as gridspec 5 | from fancyimpute import SimpleFill 6 | 7 | def sigmoid(x, para=0.5): 8 | s = 1/(1+np.exp(-15*(x-para))) 9 | return s 10 | 11 | def load_data(p_miss, dataset="drive", mode="mcar", 
para=0.5, train=None, rand_seed=42): 12 | np.random.seed(rand_seed) 13 | 14 | with open("data/" + dataset + "_x", "rb") as file: 15 | data_x = pickle.load(file) 16 | with open("data/" + dataset + "_y", "rb") as file: 17 | data_y = pickle.load(file) 18 | 19 | n = data_x.shape[0] 20 | p = data_x.shape[1] 21 | 22 | perc_miss = p_miss 23 | xmiss = np.copy(data_x) 24 | 25 | if mode == "mcar": 26 | xmiss_flat = xmiss.flatten() 27 | miss_pattern = np.random.choice(n*p, np.floor(n*p*perc_miss).astype(np.int), replace=False) 28 | xmiss_flat[miss_pattern] = np.nan 29 | xmiss = xmiss_flat.reshape([n, p]) # in xmiss, the missing values are represented by nans 30 | elif mode == "mar": 31 | fixed_len = int(np.floor(p/3)) 32 | prob = para*np.mean(data_x[:, :fixed_len], 1) 33 | prob = sigmoid(prob, 0.5) # per-observation drop probability for the remaining features 34 | for i in range(n): 35 | mask_tmp = np.random.choice([1, 0], size=p, p=[1 - prob[i], prob[i]]) 36 | for j in range(fixed_len, p): 37 | if mask_tmp[j] == 0: 38 | xmiss[i, j] = np.nan 39 | print("missing rate: ", np.sum(np.isnan(xmiss.flatten()))/(n*p)) 40 | else: 41 | raise Exception("mode is not valid") 42 | 43 | mask = np.isfinite(xmiss) # binary mask: True where a value is observed, False where it is missing 44 | 45 | xhat_0 = np.copy(xmiss) 46 | xhat_0[np.isnan(xmiss)] = 0 47 | 48 | x_filled = SimpleFill().fit_transform(xmiss) 49 | 50 | print("MSE mean imputation full data: " + str(mse(x_filled, data_x, mask))) 51 | 52 | if train == True: 53 | part = int(np.floor(n/2)) 54 | return (n-part), p, xmiss[part:,:], xhat_0[part:,:], mask[part:,:], data_x[part:,:], data_y[part:,:] 55 | elif train == False: 56 | part = int(np.floor(n/2)) 57 | return part, p, xmiss[:part,:], xhat_0[:part,:], mask[:part,:], data_x[:part,:], data_y[:part,:] 58 | elif train == None: 59 | return n, p, xmiss, xhat_0, mask, data_x, data_y 60 | 61 | def mse(xhat,xtrue,mask): 62 | xhat = np.array(xhat) 63 | xtrue = np.array(xtrue) 64 | t = np.power(xhat-xtrue, 2) 65 | return np.mean(t[~mask]) # average squared error over the originally missing entries only 66 | --------------------------------------------------------------------------------
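For orientation, here is a minimal usage sketch of how the modules above fit together. The parameter values are illustrative only (not tuned recommendations), and it assumes the pickled data files have already been created in data/ by data_drive.py and data_text.py:

# Usage sketch (illustrative parameter values, repository layout as above).
import mice
import missforest
import GAIN

# MICE on the Page Blocks dataset with 30% MCAR missingness.
x_mice, mse_mice = mice.main(p_miss=0.3, dataset="text", mode="mcar")

# MissForest on the Sensorless Drive dataset with MAR missingness controlled by "para".
x_mf, mse_mf = missforest.main(p_miss=0.5, dataset="drive", mode="mar", para=0.5)

# GAIN on the Sensorless Drive dataset with 50% MCAR missingness.
x_gain, mse_gain = GAIN.main(p_miss=0.5, dataset="drive", mode="mcar", num_epochs=2000)

# Each main() returns the imputed data matrix and the real MSE on the held-out (missing) entries.
print("MICE:", mse_mice, "MissForest:", mse_mf, "GAIN:", mse_gain)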