├── GAIN.py ├── MIWAE.py ├── README.md ├── data ├── Index ├── Sensorless_drive_diagnosis.txt ├── data_drive.py ├── data_text.py ├── drive_x ├── drive_y ├── page-blocks.data ├── page-blocks.names ├── text_x └── text_y ├── mice.py ├── mice_NN.py ├── missforest.py └── utils.py /GAIN.py: -------------------------------------------------------------------------------- 1 | """ 2 | This code is adapted from https://github.com/jsyoon0823/GAIN 3 | Information about GAIN: 4 | Reference: J. Yoon, J. Jordon, M. van der Schaar, "GAIN: Missing Data Imputation using Generative Adversarial Nets," ICML, 2018. 5 | Paper Link: http://medianetlab.ee.ucla.edu/papers/ICML_GAIN.pdf 6 | Appendix Link: http://medianetlab.ee.ucla.edu/papers/ICML_GAIN_Supp.pdf 7 | """ 8 | 9 | import tensorflow as tf 10 | from tqdm import tqdm 11 | import numpy as np 12 | from utils import mse, load_data 13 | 14 | 15 | def main(p_miss = 0.5, p_hint=0.3, alpha=800, num_epochs=2000, dataset="text", 16 | mode="mcar", para=0.5, train=None, rand_seed=42): 17 | 18 | np.random.seed(rand_seed) 19 | tf.set_random_seed(rand_seed) 20 | 21 | n, p, xmiss, xhat_0, mask, data_x, data_y = load_data(p_miss, dataset=dataset, mode=mode, para=para, train=train, rand_seed=rand_seed) 22 | 23 | # Mini batch size 24 | mb_size = 64 25 | # Imput Dim (Fixed) 26 | train_rate = 1 27 | 28 | Data = data_x 29 | # Parameters 30 | No = n 31 | Dim = p 32 | 33 | # Hidden state dimensions 34 | H_Dim1 = Dim 35 | H_Dim2 = Dim 36 | 37 | # %% Missing introducing 38 | Missing = mask*1 39 | 40 | # %% Train Test Division 41 | 42 | idx = np.random.permutation(No) 43 | 44 | Train_No = int(No * train_rate) 45 | Test_No = No - Train_No 46 | 47 | # Train / Test Features 48 | trainX = Data[idx[:Train_No], :] 49 | testX = Data[idx[Train_No:], :] 50 | 51 | # Train / Test Missing Indicators 52 | trainM = Missing[idx[:Train_No], :] 53 | testM = Missing[idx[Train_No:], :] 54 | 55 | 56 | # %% Necessary Functions 57 | 58 | # 1. Xavier Initialization Definition 59 | def xavier_init(size): 60 | in_dim = size[0] 61 | xavier_stddev = 1. / tf.sqrt(in_dim / 2.) 62 | return tf.random_normal(shape=size, stddev=xavier_stddev) 63 | 64 | 65 | # Hint Vector Generation 66 | def sample_M(m, n, p): 67 | A = np.random.uniform(0., 1., size=[m, n]) 68 | B = A > p 69 | C = 1. * B 70 | return C 71 | 72 | 73 | ''' 74 | GAIN Consists of 3 Components 75 | - Generator 76 | - Discriminator 77 | - Hint Mechanism 78 | ''' 79 | 80 | # %% GAIN Architecture 81 | 82 | # %% 1. Input Placeholders 83 | # 1.1. Data Vector 84 | X = tf.placeholder(tf.float32, shape=[None, Dim]) 85 | # 1.2. Mask Vector 86 | M = tf.placeholder(tf.float32, shape=[None, Dim]) 87 | # 1.3. Hint vector 88 | H = tf.placeholder(tf.float32, shape=[None, Dim]) 89 | # 1.4. X with missing values 90 | New_X = tf.placeholder(tf.float32, shape=[None, Dim]) 91 | 92 | # %% 2. Discriminator 93 | D_W1 = tf.Variable(xavier_init([Dim * 2, H_Dim1])) # Data + Hint as inputs 94 | D_b1 = tf.Variable(tf.zeros(shape=[H_Dim1])) 95 | 96 | D_W2 = tf.Variable(xavier_init([H_Dim1, H_Dim2])) 97 | D_b2 = tf.Variable(tf.zeros(shape=[H_Dim2])) 98 | 99 | D_W3 = tf.Variable(xavier_init([H_Dim2, Dim])) 100 | D_b3 = tf.Variable(tf.zeros(shape=[Dim])) # Output is multi-variate 101 | 102 | theta_D = [D_W1, D_W2, D_W3, D_b1, D_b2, D_b3] 103 | 104 | # %% 3. 
Generator 105 | G_W1 = tf.Variable(xavier_init([Dim * 2, H_Dim1])) # Data + Mask as inputs (Random Noises are in Missing Components) 106 | G_b1 = tf.Variable(tf.zeros(shape=[H_Dim1])) 107 | 108 | G_W2 = tf.Variable(xavier_init([H_Dim1, H_Dim2])) 109 | G_b2 = tf.Variable(tf.zeros(shape=[H_Dim2])) 110 | 111 | G_W3 = tf.Variable(xavier_init([H_Dim2, Dim])) 112 | G_b3 = tf.Variable(tf.zeros(shape=[Dim])) 113 | 114 | theta_G = [G_W1, G_W2, G_W3, G_b1, G_b2, G_b3] 115 | 116 | 117 | # %% GAIN Function 118 | 119 | # %% 1. Generator 120 | def generator(new_x, m): 121 | inputs = tf.concat(axis=1, values=[new_x, m]) # Mask + Data Concatenate 122 | G_h1 = tf.nn.relu(tf.matmul(inputs, G_W1) + G_b1) 123 | G_h2 = tf.nn.relu(tf.matmul(G_h1, G_W2) + G_b2) 124 | G_prob = tf.nn.sigmoid(tf.matmul(G_h2, G_W3) + G_b3) # [0,1] normalized Output 125 | 126 | return G_prob 127 | 128 | 129 | # %% 2. Discriminator 130 | def discriminator(new_x, h): 131 | inputs = tf.concat(axis=1, values=[new_x, h]) # Hint + Data Concatenate 132 | D_h1 = tf.nn.relu(tf.matmul(inputs, D_W1) + D_b1) 133 | D_h2 = tf.nn.relu(tf.matmul(D_h1, D_W2) + D_b2) 134 | D_logit = tf.matmul(D_h2, D_W3) + D_b3 135 | D_prob = tf.nn.sigmoid(D_logit) # [0,1] Probability Output 136 | 137 | return D_prob 138 | 139 | 140 | # %% 3. Other functions 141 | # Random sample generator for Z 142 | def sample_Z(m, n): 143 | return np.random.uniform(0., 0.01, size=[m, n]) 144 | 145 | 146 | # Mini-batch generation 147 | def sample_idx(m, n): 148 | A = np.random.permutation(m) 149 | idx = A[:n] 150 | return idx 151 | 152 | 153 | # %% Structure 154 | # Generator 155 | G_sample = generator(New_X, M) 156 | 157 | # Combine with original data 158 | Hat_New_X = New_X * M + G_sample * (1 - M) 159 | 160 | # Discriminator 161 | D_prob = discriminator(Hat_New_X, H) 162 | 163 | # %% Loss 164 | D_loss1 = -tf.reduce_mean(M * tf.log(D_prob + 1e-8) + (1 - M) * tf.log(1. 
- D_prob + 1e-8)) 165 | G_loss1 = -tf.reduce_mean((1 - M) * tf.log(D_prob + 1e-8)) 166 | 167 | MSE_train_loss = tf.reduce_mean((M * New_X - M * G_sample) ** 2) / tf.reduce_mean(M) 168 | 169 | D_loss = D_loss1 170 | G_loss = G_loss1 + alpha * MSE_train_loss 171 | 172 | # %% MSE Performance metric 173 | MSE_test_loss = tf.reduce_mean(((1 - M) * X - (1 - M) * G_sample) ** 2) / tf.reduce_mean(1 - M) 174 | 175 | # %% Solver 176 | D_solver = tf.train.AdamOptimizer().minimize(D_loss, var_list=theta_D) 177 | G_solver = tf.train.AdamOptimizer().minimize(G_loss, var_list=theta_G) 178 | 179 | # Sessions 180 | sess = tf.Session() 181 | sess.run(tf.global_variables_initializer()) 182 | 183 | # %% Iterations 184 | errors = [] 185 | # %% Start Iterations 186 | for it in tqdm(range(num_epochs)): 187 | 188 | # %% Inputs 189 | mb_idx = sample_idx(Train_No, mb_size) 190 | X_mb = trainX[mb_idx, :] 191 | 192 | Z_mb = sample_Z(mb_size, Dim) 193 | M_mb = trainM[mb_idx, :] 194 | H_mb1 = sample_M(mb_size, Dim, 1 - p_hint) 195 | H_mb = M_mb * H_mb1 196 | 197 | New_X_mb = M_mb * X_mb + (1 - M_mb) * Z_mb # Missing Data Introduce 198 | 199 | _, D_loss_curr = sess.run([D_solver, D_loss1], feed_dict={M: M_mb, New_X: New_X_mb, H: H_mb}) 200 | _, G_loss_curr, MSE_train_loss_curr, MSE_test_loss_curr = sess.run( 201 | [G_solver, G_loss1, MSE_train_loss, MSE_test_loss], 202 | feed_dict={X: X_mb, M: M_mb, New_X: New_X_mb, H: H_mb}) 203 | 204 | # %% Intermediate Losses 205 | if it % 50 == 0: 206 | Z_mb = sample_Z(n, p) 207 | New_X_mb = Missing * data_x + (1 - Missing) * Z_mb 208 | 209 | x_filled = sess.run(G_sample, feed_dict={X: data_x, M: Missing, New_X: New_X_mb}) 210 | 211 | print('Iter: {}'.format(it)) 212 | print('Train_loss: {:.4}'.format(np.sqrt(MSE_train_loss_curr))) 213 | print('Test_loss: {:.4}'.format(np.sqrt(MSE_test_loss_curr))) 214 | errors.append(mse(x_filled, data_x, mask)) 215 | print("Real MSE: ", errors[-1]) 216 | 217 | # %% Final Loss 218 | if train_rate != 1: 219 | Z_mb = sample_Z(Test_No, Dim) 220 | M_mb = testM 221 | X_mb = testX 222 | 223 | New_X_mb = M_mb * X_mb + (1 - M_mb) * Z_mb # Missing Data Introduce 224 | 225 | MSE_final, Sample = sess.run([MSE_test_loss, G_sample], feed_dict={X: testX, M: testM, New_X: New_X_mb}) 226 | 227 | print('Final Test MSE: ' + str(MSE_final)) 228 | # Real Error 229 | 230 | Z_mb = sample_Z(n, p) 231 | New_X_mb = Missing * data_x + (1 - Missing) * Z_mb 232 | 233 | x_filled = sess.run(G_sample, feed_dict={X: data_x, M: Missing, New_X: New_X_mb}) 234 | real_mse = mse(x_filled, data_x, mask) 235 | print("Real final MSE: " + str(real_mse)) 236 | 237 | return x_filled, real_mse 238 | 239 | 240 | if __name__ == "__main__": 241 | main() -------------------------------------------------------------------------------- /MIWAE.py: -------------------------------------------------------------------------------- 1 | """ 2 | This code is adapted from https://github.com/pamattei/miwae 3 | For more information on MIWAE see: 4 | http://proceedings.mlr.press/v97/mattei19a.html 5 | """ 6 | import tensorflow as tf 7 | import numpy as np 8 | 9 | import tensorflow_probability as tfp 10 | tfd = tfp.distributions 11 | tfk = tf.keras 12 | tfkl = tf.keras.layers 13 | from utils import mse 14 | from utils import load_data 15 | 16 | 17 | def main(p_miss=0.5, hidden_units=50, lr=0.001, epochs=500, dataset="drive", 18 | mode="mcar", para=0.5, train=None, rand_seed=42): 19 | 20 | np.random.seed(rand_seed) 21 | tf.set_random_seed(rand_seed) 22 | 23 | n, p, xmiss, xhat_0, mask, data_x, data_y = 
load_data(p_miss, dataset=dataset, mode=mode, para=para, train=train, rand_seed=rand_seed) 24 | 25 | x = tf.placeholder(tf.float32, shape=[None, p]) # Placeholder for xhat_0 26 | learning_rate = tf.placeholder(tf.float32, shape=[]) 27 | batch_size = tf.shape(x)[0] 28 | xmask = tf.placeholder(tf.bool, shape=[None, p]) 29 | K= tf.placeholder(tf.int32, shape=[]) # Placeholder for the number of importance weights 30 | 31 | d = np.floor(p/2).astype(int) # dimension of the latent space 32 | 33 | p_z = tfd.MultivariateNormalDiag(loc=tf.zeros(d, tf.float32)) 34 | 35 | h = hidden_units # number of hidden units (same for all MLPs) 36 | 37 | sigma = "relu" 38 | 39 | decoder = tfk.Sequential([ 40 | tfkl.InputLayer(input_shape=[d,]), 41 | tfkl.Dense(h, activation=sigma,kernel_initializer="orthogonal"), 42 | tfkl.Dense(h, activation=sigma,kernel_initializer="orthogonal"), 43 | tfkl.Dense(3*p,kernel_initializer="orthogonal") # the decoder will output both the mean, the scale, and the number of degrees of freedoms (hence the 3*p) 44 | ]) 45 | 46 | tiledmask = tf.tile(xmask,[K,1]) 47 | tiledmask_float = tf.cast(tiledmask,tf.float32) 48 | mask_not_float = tf.abs(-tf.cast(xmask,tf.float32)) 49 | 50 | iota = tf.Variable(np.zeros([1,p]),dtype=tf.float32) 51 | tilediota = tf.tile(iota,[batch_size,1]) 52 | iotax = x + tf.multiply(tilediota,mask_not_float) 53 | 54 | encoder = tfk.Sequential([ 55 | tfkl.InputLayer(input_shape=[p,]), 56 | tfkl.Dense(h, activation=sigma,kernel_initializer="orthogonal"), 57 | tfkl.Dense(h, activation=sigma,kernel_initializer="orthogonal"), 58 | tfkl.Dense(3*d,kernel_initializer="orthogonal") 59 | ]) 60 | 61 | out_encoder = encoder(iotax) 62 | q_zgivenxobs = tfd.Independent(distribution=tfd.StudentT(loc=out_encoder[..., :d], scale=tf.nn.softplus(out_encoder[..., d:(2*d)]), df=3 + tf.nn.softplus(out_encoder[..., (2*d):(3*d)]))) 63 | zgivenx = q_zgivenxobs.sample(K) 64 | zgivenx_flat = tf.reshape(zgivenx,[K*batch_size,d]) 65 | data_flat = tf.reshape(tf.tile(x,[K,1]),[-1,1]) 66 | 67 | out_decoder = decoder(zgivenx_flat) 68 | all_means_obs_model = out_decoder[..., :p] 69 | all_scales_obs_model = tf.nn.softplus(out_decoder[..., p:(2*p)]) + 0.001 70 | all_degfreedom_obs_model = tf.nn.softplus(out_decoder[..., (2*p):(3*p)]) + 3 71 | all_log_pxgivenz_flat = tfd.StudentT(loc=tf.reshape(all_means_obs_model,[-1,1]),scale=tf.reshape(all_scales_obs_model,[-1,1]),df=tf.reshape(all_degfreedom_obs_model,[-1,1])).log_prob(data_flat) 72 | all_log_pxgivenz = tf.reshape(all_log_pxgivenz_flat,[K*batch_size,p]) 73 | 74 | logpxobsgivenz = tf.reshape(tf.reduce_sum(tf.multiply(all_log_pxgivenz,tiledmask_float),1),[K,batch_size]) 75 | logpz = p_z.log_prob(zgivenx) 76 | logq = q_zgivenxobs.log_prob(zgivenx) 77 | 78 | miwae_loss = -tf.reduce_mean(tf.reduce_logsumexp(logpxobsgivenz + logpz - logq,0)) +tf.log(tf.cast(K,tf.float32)) 79 | train_miss = tf.train.AdamOptimizer(learning_rate = learning_rate).minimize(miwae_loss) 80 | 81 | xgivenz = tfd.Independent( 82 | distribution=tfd.StudentT(loc=all_means_obs_model, scale=all_scales_obs_model, df=all_degfreedom_obs_model)) 83 | 84 | imp_weights = tf.nn.softmax(logpxobsgivenz + logpz - logq,0) # these are w_1,....,w_L for all observations in the batch 85 | xms = tf.reshape(xgivenz.mean(),[K,batch_size,p]) 86 | xm=tf.einsum('ki,kij->ij', imp_weights, xms) 87 | 88 | miwae_loss_train=np.array([]) 89 | 90 | mse_train=np.array([]) 91 | bs = 64 # batch size 92 | n_epochs = epochs 93 | xhat = np.copy(xhat_0) # This will be out imputed data matrix 94 | 95 | with tf.Session() 
as sess: 96 | sess.run(tf.global_variables_initializer()) 97 | for ep in range(1,n_epochs): 98 | perm = np.random.permutation(n) # We use the "random reshuffling" version of SGD 99 | batches_data = np.array_split(xhat_0[perm,], n/bs) 100 | batches_mask = np.array_split(mask[perm,], n/bs) 101 | for it in range(len(batches_data)): 102 | train_miss.run(feed_dict={x: batches_data[it], learning_rate: lr, K:20, xmask: batches_mask[it]}) # Gradient step 103 | if ep % 50 == 1 or ep == (n_epochs -1): 104 | losstrain = np.array([miwae_loss.eval(feed_dict={x: xhat_0, K:20, xmask: mask})]) # MIWAE bound evaluation 105 | miwae_loss_train = np.append(miwae_loss_train,-losstrain,axis=0) 106 | print('Epoch %g' %ep) 107 | print('MIWAE likelihood bound %g' %-losstrain) 108 | for i in range(n): # We impute the observations one at a time for memory reasons 109 | xhat[i,:][~mask[i,:]]=xm.eval(feed_dict={x: xhat_0[i,:].reshape([1,p]), K:1000, xmask: mask[i,:].reshape([1,p])})[~mask[i,:].reshape([1,p])] 110 | err = np.array(mse(xhat,data_x,mask)) 111 | print('Imputation MSE %g' %err) 112 | print('-----') 113 | 114 | return xhat, err 115 | 116 | if __name__ == "__main__": 117 | main() -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Missing-Data-Imputation-Methods-Performance-Comparison 2 | The data imputation methods [MissForest](https://cran.r-project.org/web/packages/missForest/missForest.pdf), [GAIN](https://arxiv.org/abs/1806.02920), [MICE](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3074241/), MICE-NN and [MIWAE](http://proceedings.mlr.press/v97/mattei19a.html) are tested on two UCI datasets (Sensorless Drive Diagnosis, Page Blocks Classification). MICE-NN is a modified version of MICE that uses fully connected neural networks instead of linear regression. The tests are done by taking the complete dataset (without missing values), introducing either MCAR or MAR missingness at the desired missing rate, and then using the imputation methods to fill in the missing values. Since the true values are known, the real MSE can be computed. To test other datasets, pickle the features and labels as 2-dimensional numpy arrays named "name_x" and "name_y" in the folder [data](data), then set dataset = "name" when calling an imputation method. 3 | 4 | MCAR missing values are introduced by dropping each value in the data independently with probability "p_miss". MAR missing values are introduced by taking the mean of the first third of each observation's features, scaling it by the variable "para" and passing it through a sigmoid; each value in the remaining features is then dropped independently with this probability (for details see load_data in utils.py). 5 | 6 | 7 | 8 | ## Requirements 9 | The code requires Python 3.6 or later.
10 | Required packages are: 11 | 12 | * fancyimpute >= 0.5.3 13 | * matplotlib >= 2.2.2 14 | * missingpy >= 0.2.0 15 | * numpy >= 1.16.2 16 | * pathlib >= 2.3.3 17 | * pickle 18 | * Pillow >= 5.4.1 19 | * pylab 20 | * scipy >= 1.2.1 21 | * sklearn 22 | * tensorflow >= 1.14 23 | * tensorflow_probability >= 0.7.0 24 | * torch >= 1.0.1 25 | * torchvision >= 0.2.2 26 | * tqdm >= 4.31.1 27 | 28 | -------------------------------------------------------------------------------- /data/Index: -------------------------------------------------------------------------------- 1 | Index of page-blocks 2 | 3 | 02 Dec 1996 128 Index 4 | 20 Jul 1995 104579 page-blocks.data.Z 5 | 20 Jul 1995 3900 page-blocks.names 6 | -------------------------------------------------------------------------------- /data/data_drive.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pickle 3 | import pandas as pd 4 | from sklearn import preprocessing 5 | 6 | data = pd.read_csv("Sensorless_drive_diagnosis.txt", header=None, sep=" ") 7 | data = data.values 8 | print(data[:2,:]) 9 | np.random.shuffle(data) 10 | 11 | y = data[:, -1:].astype(np.int) - 1 12 | x = data[:, :-1] 13 | 14 | print(y[:20]) 15 | 16 | scaler = preprocessing.MinMaxScaler() 17 | x_numpy = scaler.fit_transform(x) 18 | 19 | with open("drive_x", "wb") as file: 20 | pickle.dump(x_numpy, file) 21 | 22 | with open("drive_y", "wb") as file: 23 | pickle.dump(y, file) 24 | -------------------------------------------------------------------------------- /data/data_text.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pickle 3 | import pandas as pd 4 | from sklearn import preprocessing 5 | 6 | data = pd.read_csv("page-blocks.data", header=None, sep="[ ]+") 7 | data = data.values 8 | 9 | np.random.shuffle(data) 10 | 11 | y = data[:, -1:].astype(np.int) - 1 12 | x = data[:, :-1] 13 | 14 | scaler = preprocessing.MinMaxScaler() 15 | x_numpy = scaler.fit_transform(x) 16 | 17 | with open("text_x", "wb") as file: 18 | pickle.dump(x_numpy, file) 19 | 20 | with open("text_y", "wb") as file: 21 | pickle.dump(y, file) 22 | -------------------------------------------------------------------------------- /data/drive_x: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fschur/Missing-Data-Imputation-Methods-Performance-Comparison/246cd58b339c56704d745df1d8691b44803ca49e/data/drive_x -------------------------------------------------------------------------------- /data/drive_y: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fschur/Missing-Data-Imputation-Methods-Performance-Comparison/246cd58b339c56704d745df1d8691b44803ca49e/data/drive_y -------------------------------------------------------------------------------- /data/page-blocks.names: -------------------------------------------------------------------------------- 1 | 1. Title of Database: Blocks Classification 2 | 2. Sources: 3 | (a) Donato Malerba 4 | Dipartimento di Informatica 5 | University of Bari 6 | via Orabona 4 7 | 70126 Bari - Italy 8 | phone: +39 - 80 - 5443269 9 | fax: +39 - 80 - 5443196 10 | malerbad@vm.csata.it 11 | (b) Donor: Donato Malerba 12 | (c) Date: July 1995 13 | 3. Past Usage: 14 | This data set have been used to try different simplification methods 15 | for decision trees.
A summary of the results can be found in: 16 | 17 | Malerba, D., Esposito, F., and Semeraro, G. 18 | "A Further Comparison of Simplification Methods for Decision-Tree Induction." 19 | In D. Fisher and H. Lenz (Eds.), "Learning from Data: 20 | Artificial Intelligence and Statistics V", Lecture Notes in Statistics, 21 | Springer Verlag, Berlin, 1995. 22 | 23 | The problem consists in classifying all the blocks of the page 24 | layout of a document that has been detected by a segmentation 25 | process. This is an essential step in document analysis 26 | in order to separate text from graphic areas. Indeed, 27 | the five classes are: text (1), horizontal line (2), 28 | picture (3), vertical line (4) and graphic (5). 29 | For a detailed presentation of the problem see: 30 | 31 | Esposito F., Malerba D., & Semeraro G. 32 | Multistrategy Learning for Document Recognition 33 | Applied Artificial Intelligence, 8, pp. 33-84, 1994 34 | 35 | All instances have been personally checked so that 36 | low noise is present in the data. 37 | 38 | 4. Relevant Information Paragraph: 39 | 40 | The 5473 examples comes from 54 distinct documents. 41 | Each observation concerns one block. 42 | All attributes are numeric. 43 | Data are in a format readable by C4.5. 44 | 45 | 5. Number of Instances: 5473. 46 | 47 | 6. Number of Attributes 48 | 49 | height: integer. | Height of the block. 50 | lenght: integer. | Length of the block. 51 | area: integer. | Area of the block (height * lenght); 52 | eccen: continuous. | Eccentricity of the block (lenght / height); 53 | p_black: continuous. | Percentage of black pixels within the block (blackpix / area); 54 | p_and: continuous. | Percentage of black pixels after the application of the Run Length Smoothing Algorithm (RLSA) (blackand / area); 55 | mean_tr: continuous. | Mean number of white-black transitions (blackpix / wb_trans); 56 | blackpix: integer. | Total number of black pixels in the original bitmap of the block. 57 | blackand: integer. | Total number of black pixels in the bitmap of the block after the RLSA. 58 | wb_trans: integer. | Number of white-black transitions in the original bitmap of the block. 59 | 60 | 61 | 62 | 7. Missing Attribute Values: No missing value. 63 | 64 | 8. Class Distribution: 65 | 66 | Valid Cum 67 | Class Frequency Percent Percent Percent 68 | 69 | text 4913 89.8 89.8 89.8 70 | horiz. line 329 6.0 6.0 95.8 71 | graphic 28 .5 .5 96.3 72 | vert. 
line 88 1.6 1.6 97.9 73 | picture 115 2.1 2.1 100.0 74 | ------- ------- ------- 75 | TOTAL 5473 100.0 100.0 76 | 77 | Summary Statistics: 78 | 79 | Variable Mean Std Dev Minimum Maximum Correlation 80 | 81 | HEIGHT 10.47 18.96 1 804 .3510 82 | LENGTH 89.57 114.72 1 553 -.0045 83 | AREA 1198.41 4849.38 7 143993 .2343 84 | ECCEN 13.75 30.70 .007 537.00 .0992 85 | P_BLACK .37 .18 .052 1.00 .2130 86 | P_AND .79 .17 .062 1.00 -.1771 87 | MEAN_TR 6.22 69.08 1.00 4955.00 .0723 88 | BLACKPIX 365.93 1270.33 7 33017 .1656 89 | BLACKAND 741.11 1881.50 7 46133 .1565 90 | WB_TRANS 106.66 167.31 1 3212 .0337 91 | 92 | -------------------------------------------------------------------------------- /data/text_x: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fschur/Missing-Data-Imputation-Methods-Performance-Comparison/246cd58b339c56704d745df1d8691b44803ca49e/data/text_x -------------------------------------------------------------------------------- /data/text_y: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fschur/Missing-Data-Imputation-Methods-Performance-Comparison/246cd58b339c56704d745df1d8691b44803ca49e/data/text_y -------------------------------------------------------------------------------- /mice.py: -------------------------------------------------------------------------------- 1 | from fancyimpute import IterativeImputer 2 | import numpy as np 3 | from utils import mse as mse_own 4 | from utils import load_data 5 | 6 | 7 | def main(p_miss=0.5, dataset="drive", mode="mcar", para=0.5, train=None, rand_seed=42): 8 | np.random.seed(rand_seed) 9 | 10 | n, p, xmiss, xhat_0, mask, data_x, data_y = load_data(p_miss, dataset=dataset, mode=mode, para=para, train=train, rand_seed=rand_seed) 11 | 12 | x_filled = IterativeImputer().fit_transform(xmiss) 13 | 14 | mse = mse_own(x_filled, data_x, mask) 15 | 16 | print("MSE for MICE: " + str(mse)) 17 | 18 | return x_filled, mse 19 | 20 | if __name__ == "__main__": 21 | main() -------------------------------------------------------------------------------- /mice_NN.py: -------------------------------------------------------------------------------- 1 | # This code is adapted from https://github.com/Ouwen/scikit-mice/blob/master/skmice.py 2 | 3 | from utils import load_data, mse 4 | from sklearn.preprocessing import Imputer 5 | 6 | from sklearn.model_selection import train_test_split 7 | from sklearn.neural_network import MLPRegressor 8 | import numpy as np 9 | 10 | 11 | class MiceImputer(object): 12 | 13 | def __init__(self, missing_values="NaN", strategy="mean", axis=0, verbose=0, copy=True): 14 | self.missing_values = missing_values 15 | self.strategy = strategy 16 | self.axis = axis 17 | self.verbose = verbose 18 | self.copy = copy 19 | self.imp = Imputer(missing_values=self.missing_values, strategy=self.strategy, axis=self.axis, 20 | verbose=self.verbose, copy=self.copy) 21 | 22 | def _seed_values(self, X): 23 | self.imp.fit(X) 24 | return self.imp.transform(X) 25 | 26 | def _get_mask(self, X, value_to_mask): 27 | if value_to_mask == "NaN" or np.isnan(value_to_mask): 28 | return np.isnan(X) 29 | else: 30 | return np.array(X == value_to_mask) 31 | 32 | def _process(self, X, column, sizes, activation, epochs, mask, lr): 33 | # Remove values that are in mask 34 | mask_col = mask[:, column] 35 | #!!mask = np.array(self._get_mask(X)[:, column].T)[0] 36 | mask_indices = np.where(mask_col == True)[0] 37 | X_data = 
np.delete(X, mask_indices, 0) 38 | 39 | # Instantiate the model 40 | model = MLPRegressor(hidden_layer_sizes=sizes, activation=activation, 41 | solver='adam', learning_rate_init=lr,max_iter=epochs) 42 | 43 | # Slice out the column to predict and delete the column. 44 | y_data = X_data[:, column] 45 | X_data = np.delete(X_data, column, 1) 46 | 47 | # Split training and test data 48 | X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.1, random_state=42) 49 | 50 | # Fit the model 51 | model.fit(X_train, y_train) 52 | 53 | # Score the model 54 | scores = model.score(X_test, y_test) 55 | 56 | # Predict missing vars 57 | X_predict = np.delete(X, column, 1) 58 | y = model.predict(X_predict) 59 | 60 | # Replace values in X with their predictions 61 | X[mask_indices, column] = np.take(y, mask_indices) 62 | #np.put(X, predict_indicies, np.take(y, mask_col)) 63 | # Return model and scores 64 | return X, scores 65 | 66 | def transform(self, X, sizes, activation='relu', epochs=500, iterations=10, lr=0.001): 67 | X = np.array(X) 68 | mask = self._get_mask(X, self.missing_values) 69 | X = self._seed_values(X) 70 | specs = np.zeros((iterations, X.shape[1])) 71 | 72 | for i in range(iterations): 73 | print(i) 74 | for c in range(X.shape[1]): 75 | X, specs[i][c] = self._process(X, c, sizes, activation, epochs, mask, lr) 76 | 77 | # Return X matrix with imputed values 78 | return X, specs 79 | 80 | 81 | def main(p_miss=0.5, hidden_size=100, epochs=70, lr=0.001, 82 | dataset="drive", mode="mcar", para=0.5, train=None): 83 | 84 | n, p, xmiss, xhat_0, mask, data_x, data_y = load_data(p_miss, dataset=dataset, mode=mode, para=para, train=train) 85 | 86 | imputer = MiceImputer(np.nan) 87 | X = xmiss 88 | 89 | X_filled, specs = imputer.transform(np.array(X), (hidden_size, hidden_size, hidden_size), 90 | epochs=epochs, lr=lr, iterations=10) 91 | 92 | mse_nn = mse(X_filled, data_x, mask) 93 | print("MSE MICE_NN : ", mse_nn) 94 | 95 | return X_filled, mse_nn 96 | 97 | if __name__ == "__main__": 98 | main() 99 | -------------------------------------------------------------------------------- /missforest.py: -------------------------------------------------------------------------------- 1 | """ 2 | For more information about MissForest see: 3 | https://academic.oup.com/bioinformatics/article/28/1/112/219101 4 | """ 5 | from missingpy import MissForest 6 | import numpy as np 7 | from utils import load_data 8 | from utils import mse as mse_own 9 | 10 | 11 | def main(p_miss=0.5, dataset="drive", mode="mcar", para=0.5, train=None, rand_seed=42): 12 | np.random.seed(rand_seed) 13 | 14 | n, p, xmiss, xhat_0, mask, data_x, data_y = load_data(p_miss, dataset=dataset, mode=mode, para=para, train=train, rand_seed=rand_seed) 15 | 16 | imputer = MissForest(decreasing=True, random_state=rand_seed, verbose=True) 17 | x_filled = imputer.fit_transform(xmiss) 18 | 19 | mse = mse_own(x_filled, data_x, mask) 20 | 21 | print("MSE for MissForest: ", mse) 22 | 23 | return x_filled, mse 24 | 25 | if __name__ == "__main__": 26 | main() 27 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | import matplotlib.gridspec as gridspec 5 | from fancyimpute import SimpleFill 6 | 7 | def sigmoid(x, para=0.5): 8 | s = 1/(1+np.exp(-15*(x-para))) 9 | return s 10 | 11 | def load_data(p_miss, dataset="drive", mode="mcar", 
para=0.5, train=None, rand_seed=42): 12 | np.random.seed(rand_seed) 13 | 14 | with open("data/" + dataset + "_x", "rb") as file: 15 | data_x = pickle.load(file) 16 | with open("data/" + dataset + "_y", "rb") as file: 17 | data_y = pickle.load(file) 18 | 19 | n = data_x.shape[0] 20 | p = data_x.shape[1] 21 | 22 | perc_miss = p_miss 23 | xmiss = np.copy(data_x) 24 | 25 | if mode == "mcar": 26 | xmiss_flat = xmiss.flatten() 27 | miss_pattern = np.random.choice(n*p, np.floor(n*p*perc_miss).astype(np.int), replace=False) 28 | xmiss_flat[miss_pattern] = np.nan 29 | xmiss = xmiss_flat.reshape([n, p]) # in xmiss, the missing values are represented by nans 30 | elif mode == "mar": 31 | fixed_len = int(np.floor(p/3)) 32 | prob = para*np.mean(data_x[:, :fixed_len], 1) 33 | prob = sigmoid(prob, 0.5) # per-observation drop probability for the remaining features 34 | for i in range(n): 35 | mask_tmp = np.random.choice([1, 0], size=p, p=[1 - prob[i], prob[i]]) 36 | for j in range(fixed_len, p): 37 | if mask_tmp[j] == 0: 38 | xmiss[i, j] = np.nan 39 | print("missing rate: ", np.sum(np.isnan(xmiss.flatten()))/(n*p)) 40 | else: 41 | raise Exception("mode is not valid") 42 | 43 | mask = np.isfinite(xmiss) # binary mask: True where a value is observed, False where it is missing 44 | 45 | xhat_0 = np.copy(xmiss) 46 | xhat_0[np.isnan(xmiss)] = 0 47 | 48 | x_filled = SimpleFill().fit_transform(xmiss) 49 | 50 | print("MSE mean imputation full data: " + str(mse(x_filled, data_x, mask))) 51 | 52 | if train == True: 53 | part = int(np.floor(n/2)) 54 | return (n-part), p, xmiss[part:,:], xhat_0[part:,:], mask[part:,:], data_x[part:,:], data_y[part:,:] 55 | elif train == False: 56 | part = int(np.floor(n/2)) 57 | return part, p, xmiss[:part,:], xhat_0[:part,:], mask[:part,:], data_x[:part,:], data_y[:part,:] 58 | elif train == None: 59 | return n, p, xmiss, xhat_0, mask, data_x, data_y 60 | 61 | def mse(xhat,xtrue,mask): 62 | xhat = np.array(xhat) 63 | xtrue = np.array(xtrue) 64 | t = np.power(xhat-xtrue, 2) 65 | return np.mean(t[~mask]) # average squared error over the originally missing entries only 66 | --------------------------------------------------------------------------------
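For orientation, here is a minimal usage sketch of how the modules above fit together. The parameter values are illustrative only (not tuned recommendations), and it assumes the pickled data files have already been created in data/ by data_drive.py and data_text.py:

# Usage sketch (illustrative parameter values, repository layout as above).
import mice
import missforest
import GAIN

# MICE on the Page Blocks dataset with 30% MCAR missingness.
x_mice, mse_mice = mice.main(p_miss=0.3, dataset="text", mode="mcar")

# MissForest on the Sensorless Drive dataset with MAR missingness controlled by "para".
x_mf, mse_mf = missforest.main(p_miss=0.5, dataset="drive", mode="mar", para=0.5)

# GAIN on the Sensorless Drive dataset with 50% MCAR missingness.
x_gain, mse_gain = GAIN.main(p_miss=0.5, dataset="drive", mode="mcar", num_epochs=2000)

# Each main() returns the imputed data matrix and the real MSE on the held-out (missing) entries.
print("MICE:", mse_mice, "MissForest:", mse_mf, "GAIN:", mse_gain)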