├── DIMVC.py ├── Load_data.py ├── Nmetrics.py ├── README.txt ├── data └── README.txt ├── main.py └── results └── Caltech └── Model here.txt /DIMVC.py: -------------------------------------------------------------------------------- 1 | from time import time 2 | import numpy as np 3 | import platform 4 | from sklearn.metrics import log_loss 5 | from sklearn.utils.sparsefuncs import mean_variance_axis 6 | import tensorflow.keras.backend as K 7 | from tensorflow.keras.layers import Conv1D, Conv2D, Conv2DTranspose, Flatten, Reshape, Conv3D, Conv3DTranspose, MaxPooling2D, Dropout, GlobalMaxPooling2D 8 | from tensorflow.keras.layers import Layer, InputSpec, Input, Dense, Multiply, concatenate 9 | from tensorflow.keras.models import Model 10 | from tensorflow.keras import callbacks 11 | from tensorflow.keras.initializers import VarianceScaling, Zeros, Constant, GlorotNormal, GlorotUniform, \ 12 | LecunUniform, LecunNormal, Orthogonal, RandomNormal, RandomUniform, TruncatedNormal, HeNormal, HeUniform, Identity, Initializer 13 | from tensorflow.keras.regularizers import Regularizer, l1, l2, l1_l2 14 | from tensorflow.keras.preprocessing.image import ImageDataGenerator 15 | from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering, SpectralClustering 16 | from sklearn.decomposition import PCA, SparsePCA 17 | from math import log 18 | import Nmetrics 19 | import matplotlib.pyplot as plt 20 | 21 | 22 | def FAE(dims, act='relu', view=1, data='data'): 23 | n_stacks = len(dims) - 1 24 | init = VarianceScaling(scale=1. / 3., mode='fan_in', distribution='uniform') 25 | input_name = 'v'+str(view)+'_' 26 | # input 27 | x = Input(shape=(dims[0],), name='input' + str(view)) 28 | h = x 29 | # internal layers in encoder 30 | for i in range(n_stacks-1): 31 | h = Dense(dims[i + 1], activation=act, kernel_initializer=init, name=input_name+'encoder_%d' % i)(h) 32 | 33 | # hidden layer 34 | h = Dense(dims[-1], kernel_initializer=init, name='embedding' + str(view))(h) # hidden layer, features are extracted from here 35 | 36 | y = h 37 | # internal layers in decoder 38 | for i in range(n_stacks-1, 0, -1): 39 | y = Dense(dims[i], activation=act, kernel_initializer=init, name=input_name+'decoder_%d' % i)(y) 40 | 41 | # output 42 | y = Dense(dims[0], kernel_initializer=init, name=input_name+'decoder_0')(y) 43 | 44 | return Model(inputs=x, outputs=y, name=input_name+'Fae'), Model(inputs=x, outputs=h, name=input_name+'Fencoder') 45 | 46 | 47 | def MAE(view=2, view_shape=[], dim=10, data='data'): 48 | ae = [] 49 | encoder = [] 50 | for v in range(view): 51 | ae_tmp, encoder_tmp = FAE(dims=[view_shape[v][0], 500, 500, 2000, dim], view=v + 1, data=data) 52 | ae.append(ae_tmp) 53 | encoder.append(encoder_tmp) 54 | 55 | return ae, encoder 56 | 57 | 58 | class ClusteringLayer(Layer): 59 | """ 60 | Clustering layer converts input sample (feature) to soft label, i.e. a vector that represents the probability of the 61 | sample belonging to each cluster. The probability is calculated with student's t-distribution. 62 | 63 | # Example 64 | ``` 65 | model.add(ClusteringLayer(n_clusters=10)) 66 | ``` 67 | # Arguments 68 | n_clusters: number of clusters. 69 | weights: list of Numpy array with shape `(n_clusters, n_features)` witch represents the initial cluster centers. 70 | alpha: parameter in Student's t-distribution. Default to 1.0. 71 | # Input shape 72 | 2D tensor with shape: `(n_samples, n_features)`. 73 | # Output shape 74 | 2D tensor with shape: `(n_samples, n_clusters)`. 
75 | """ 76 | 77 | def __init__(self, n_clusters, weights=None, alpha=1.0, **kwargs): 78 | if 'input_shape' not in kwargs and 'input_dim' in kwargs: 79 | kwargs['input_shape'] = (kwargs.pop('input_dim'),) 80 | super(ClusteringLayer, self).__init__(**kwargs) 81 | self.n_clusters = n_clusters 82 | self.alpha = alpha 83 | self.initial_weights = weights 84 | self.input_spec = InputSpec(ndim=2) 85 | 86 | def build(self, input_shape): 87 | assert len(input_shape) == 2 88 | input_dim = input_shape.as_list()[1] 89 | self.input_spec = InputSpec(dtype=K.floatx(), shape=(None, input_dim)) 90 | self.clusters = self.add_weight(shape=(self.n_clusters, input_dim), initializer='glorot_uniform', name='clusters') 91 | if self.initial_weights is not None: 92 | self.set_weights(self.initial_weights) 93 | del self.initial_weights 94 | self.built = True 95 | 96 | def call(self, inputs, **kwargs): 97 | """ student t-distribution, as same as used in t-SNE algorithm. 98 | q_ij = 1/(1+dist(x_i, u_j)^2), then normalize it. 99 | Arguments: 100 | inputs: the variable containing data, shape=(n_samples, n_features) 101 | Return: 102 | q: student's t-distribution, or soft labels for each sample. shape=(n_samples, n_clusters) 103 | """ 104 | q = 1.0 / (1.0 + (K.sum(K.square(K.expand_dims(inputs, axis=1) - self.clusters), axis=2) / self.alpha)) 105 | q **= (self.alpha + 1.0) / 2.0 106 | q = K.transpose(K.transpose(q) / K.sum(q, axis=1)) 107 | return q 108 | 109 | def compute_output_shape(self, input_shape): 110 | assert input_shape and len(input_shape) == 2 111 | return input_shape[0], self.n_clusters 112 | 113 | def get_config(self): 114 | config = {'n_clusters': self.n_clusters} 115 | base_config = super(ClusteringLayer, self).get_config() 116 | return dict(list(base_config.items()) + list(config.items())) 117 | 118 | 119 | class MvDEC(object): 120 | def __init__(self, 121 | n_clusters=10, 122 | alpha=1.0, view_shape=[], dim=10, data='data'): 123 | 124 | super(MvDEC, self).__init__() 125 | 126 | self.view_shape = view_shape 127 | self.n_clusters = n_clusters 128 | self.alpha = alpha 129 | self.pretrained = False 130 | # prepare model 131 | self.view = len(view_shape) 132 | # print(len(view_shape)) 133 | 134 | self.AEs, self.encoders = MAE(view=self.view, view_shape=self.view_shape, dim=dim, data=data) 135 | 136 | Input = [] 137 | Output = [] 138 | Input_e = [] 139 | Output_e = [] 140 | clustering_layer = [] 141 | 142 | for v in range(self.view): 143 | Input.append(self.AEs[v].input) 144 | Output.append(self.AEs[v].output) 145 | Input_e.append(self.encoders[v].input) 146 | Output_e.append(self.encoders[v].output) 147 | clustering_layer.append(ClusteringLayer(self.n_clusters, name='clustering'+str(v+1))(self.encoders[v].output)) 148 | 149 | self.autoencoder = Model(inputs=Input, outputs=Output) # xin _ xout 150 | 151 | self.encoder = Model(inputs=Input_e, outputs=Output_e) # xin _ q 152 | 153 | Output_m = [] 154 | for v in range(self.view): 155 | Output_m.append(clustering_layer[v]) 156 | Output_m.append(Output[v]) 157 | self.model = Model(inputs=Input, outputs=Output_m) # xin _ q _ xout 158 | 159 | def pretrain(self, x, y, optimizer='adam', epochs=200, batch_size=256, 160 | save_dir='results/temp', verbose=0): 161 | print('Begin pretraining: ', '-' * 60) 162 | multi_loss = [] 163 | for view in range(len(x)): 164 | multi_loss.append('mse') 165 | self.autoencoder.compile(optimizer=optimizer, loss=multi_loss) 166 | save = '/ae_weights.h5' 167 | # begin pretraining 168 | self.autoencoder.fit(x, x, batch_size=batch_size, 
epochs=epochs, verbose=verbose) 169 | self.autoencoder.save_weights(save_dir + save) 170 | print('Pretrained weights are saved to ' + save_dir + save) 171 | # self.pretrained = True 172 | print('End pretraining: ', '-' * 60) 173 | 174 | def load_weights(self, weights): # load weights of models 175 | self.model.load_weights(weights) 176 | 177 | def predict_label(self, x): # predict cluster labels using the output of clustering layer 178 | input_dic = {} 179 | for view in range(len(x)): 180 | input_dic.update({'input' + str(view+1): x[view]}) 181 | Q_and_X = self.model.predict(input_dic, verbose=0) 182 | y_pred = [] 183 | for view in range(len(x)): 184 | # print(view) 185 | y_pred.append(Q_and_X[view*2].argmax(1)) 186 | 187 | y_q = Q_and_X[(len(x)-1)*2] 188 | for view in range(len(x) - 1): 189 | y_q += Q_and_X[view*2] 190 | 191 | # y_q = y_q/len(x) 192 | y_mean_pred = y_q.argmax(1) 193 | return y_pred, y_mean_pred 194 | 195 | @staticmethod 196 | def target_distribution(q): 197 | t = 2 198 | weight = q ** t 199 | return (weight.T / weight.sum(1)).T 200 | 201 | def compile(self, optimizer='sgd', loss=['kld', 'mse'], loss_weights=[0.1, 1.0]): 202 | self.model.compile(optimizer=optimizer, loss=loss, loss_weights=loss_weights) 203 | 204 | def train_on_batch(self, xin, yout, sample_weight=None): 205 | return self.model.train_on_batch(xin, yout, sample_weight) 206 | 207 | def new_fit(self, arg, x, y, maxiter=2e4, batch_size=256, tol=1e-3, 208 | UpdateCoo=200, save_dir='./results/tmp'): 209 | print('Begin clustering:', '-' * 60) 210 | print('Update Coo:', UpdateCoo) 211 | save_interval = int(maxiter) # only save the initial and final model 212 | print('Save interval', save_interval) 213 | # Step 1: initialize cluster centers using k-means 214 | t1 = time() 215 | ting = time() - t1 216 | # print(ting) 217 | 218 | time_record = [] 219 | time_record.append(int(ting)) 220 | # print(time_record) 221 | kmeans = KMeans(n_clusters=self.n_clusters, n_init=100) 222 | 223 | input_dic = {} 224 | for view in range(len(x)): 225 | input_dic.update({'input' + str(view + 1): x[view]}) 226 | features = self.encoder.predict(input_dic) 227 | 228 | y_pred = [] 229 | center = [] 230 | 231 | from numpy import hstack 232 | from sklearn import preprocessing 233 | min_max_scaler = preprocessing.MinMaxScaler() 234 | # -------------------------------------------- 235 | for view in range(len(x)): 236 | y_pred.append(kmeans.fit_predict(features[view])) 237 | center.append([kmeans.cluster_centers_]) 238 | # -------------------------------------------- 239 | 240 | for view in range(len(x)): 241 | print('Start-' + str(view + 1) + ':') 242 | from Nmetrics import test 243 | test(y[view], y_pred[view]) 244 | y_pred_last = [] 245 | y_pred_sp = [] 246 | for view in range(len(x)): 247 | y_pred_last.append(y_pred[view]) 248 | y_pred_sp.append(y_pred[view]) 249 | 250 | print('Initializing cluster centers with K-means.') 251 | for view in range(len(x)): 252 | self.model.get_layer(name='clustering' + str(view + 1)).set_weights(center[view]) 253 | 254 | # Step 2: deep clustering 255 | 256 | index_array = np.arange(x[0].shape[0]) 257 | index = 0 258 | 259 | Loss = [] 260 | avg_loss = [] 261 | kl_loss = [] 262 | for view in range(len(x)): 263 | Loss.append(0) 264 | avg_loss.append(0) 265 | kl_loss.append(100000) 266 | 267 | update_interval = arg.UpdateCoo 268 | 269 | ACC = [] 270 | NMI = [] 271 | ARI = [] 272 | vACC = [] 273 | vNMI = [] 274 | vARI = [] 275 | MVKLLoss = [] 276 | ite = 0 277 | 278 | initial_flag = 0 279 | 280 | while True: 281 | 
if ite % update_interval == 0: 282 | print('\n') 283 | for view in range(len(x)): 284 | avg_loss[view] = Loss[view] / update_interval 285 | kl_loss[view] = kl_loss[view] / update_interval 286 | 287 | Q_and_X = self.model.predict(input_dic) 288 | 289 | for view in range(len(x)): 290 | # print(Q_and_X[view * 2][0]) 291 | y_pred_sp[view] = Q_and_X[view * 2].argmax(1) 292 | 293 | features = self.encoder.predict(input_dic) 294 | 295 | uuu = [] 296 | for view in range(len(x)): 297 | muu = self.model.get_layer(name='clustering' + str(view + 1)).get_weights() 298 | # print(muu) 299 | uuu.append(muu) 300 | # np.save(save_dir + '/Features/' + str(ite) + '.npy', features) 301 | # np.save(save_dir + '/Mu/' + str(ite) + '.npy', uuu) 302 | 303 | n_features = [] 304 | weights = [] 305 | sum = 0 306 | for view in range(len(x)): 307 | MU = min_max_scaler.fit_transform( 308 | self.model.get_layer(name='clustering' + str(view + 1)).get_weights()[0]) 309 | # print(MU.shape) 310 | # print(MU.var()) 311 | sum += MU.var() 312 | weights.append(MU.var()) 313 | 314 | weights = 1 + np.log2(1 + weights / sum) 315 | 316 | # print(weights) 317 | 318 | for view in range(len(x)): 319 | # n_features.append(features[view]) 320 | if arg.dataset == "Caltech": 321 | features_tmp = min_max_scaler.fit_transform(features[view]) 322 | n_features.append(features_tmp * (weights[view])) 323 | else: 324 | features_tmp = features[view] 325 | n_features.append(features_tmp * (weights[view])) 326 | z = hstack(n_features) 327 | 328 | kmean = KMeans(n_clusters=self.n_clusters, n_init=10) 329 | # kmean = KMeans(n_clusters=self.n_clusters, n_init=500, init='random', max_iter=1000, algorithm='full') 330 | # kmean = KMeans(n_clusters=self.n_clusters, n_init=10, max_iter=1000) 331 | # kmean = KMeans(n_clusters=self.n_clusters, n_init=1, max_iter=1000) 332 | # kmean = KMeans(n_clusters=self.n_clusters, n_init=500, max_iter=1000, algorithm='full') 333 | print('Iteration: %d' % (int(ite / update_interval))) 334 | print("P-step: update {P, C, A} with fixed {Z, U}.") 335 | y_pred = kmean.fit_predict(z) # k-means on global features 336 | 337 | if initial_flag == 0: 338 | y_pred_global = np.copy(y_pred) 339 | initial_flag = 1 340 | 341 | # print("Update A with fixed C, {Z, U}.") 342 | new_y, row_ind, col_ind, matrix = self.Match(y_pred, y_pred_global) 343 | y_pred_global = np.copy(new_y) 344 | print(matrix) 345 | 346 | # print(kmeans.cluster_centers_.shape) 347 | # print(y_pred_global[0:9]) 348 | # print(y_pred[0:9]) 349 | 350 | acc = np.round(Nmetrics.acc(y[view], y_pred), 5) 351 | nmi = np.round(Nmetrics.nmi(y[view], y_pred), 5) 352 | ari = np.round(Nmetrics.ari(y[view], y_pred), 5) 353 | from Nmetrics import test 354 | test(y[view], y_pred) 355 | ACC.append(acc) 356 | NMI.append(nmi) 357 | ARI.append(ari) 358 | # print(kl_loss) 359 | # print(Loss) 360 | # print(np.sum(kl_loss), np.sum(Loss)) 361 | print("Z-step: update {Z, U} with fixed {P, C, A}.") 362 | if y is not None: 363 | tmpACC = [] 364 | tmpNMI = [] 365 | tmpARI = [] 366 | for view in range(len(x)): 367 | acc = np.round(Nmetrics.acc(y[view], y_pred_sp[view]), 5) 368 | nmi = np.round(Nmetrics.nmi(y[view], y_pred_sp[view]), 5) 369 | ari = np.round(Nmetrics.ari(y[view], y_pred_sp[view]), 5) 370 | from Nmetrics import test 371 | test(y[view], y_pred_sp[view]) 372 | tmpACC.append(acc) 373 | tmpNMI.append(nmi) 374 | tmpARI.append(ari) 375 | vACC.append(tmpACC) 376 | vNMI.append(tmpNMI) 377 | vARI.append(tmpARI) 378 | 379 | Center_init = kmean.cluster_centers_ # k-means on global features 380 | 
new_P = self.new_P(z, Center_init) # similarity measure 381 | p = self.target_distribution(new_P) # enhance 382 | # p = np.dot(p, matrix) # P = E(S(H, C))A, adjust the arrangement of S 383 | p = np.dot(p, matrix.T) # P = E(S(H, C))A, arrangement of S is aligned with last iteration 384 | P = [] 385 | # unify P of supervision loss 386 | for view in range(len(x)): 387 | P.append(p) 388 | 389 | # evaluate the clustering performance 390 | for view in range(len(x)): 391 | Loss[view] = 0. 392 | kl_loss[view] = 0. 393 | 394 | # train on batch 395 | idx = index_array[index * batch_size: min((index + 1) * batch_size, x[0].shape[0])] 396 | x_batch = [] 397 | y_batch = [] 398 | for view in range(len(x)): 399 | x_batch.append(x[view][idx]) 400 | y_batch.append(P[view][idx]) 401 | y_batch.append(x[view][idx]) 402 | tmp = self.train_on_batch(xin=x_batch, yout=y_batch) # [sum, q, xn, q, x] 403 | # print(tmp) 404 | KLLoss = [] 405 | for view in range(len(x)): 406 | Loss[view] += tmp[2 * view + 2] # lr 407 | kl_loss[view] += tmp[2 * view + 1] # lc 408 | KLLoss.append(tmp[2 * view + 1]) 409 | # MVKLLoss.append(KLLoss) 410 | MVKLLoss.append(tmp[0]) 411 | index = index + 1 if (index + 1) * batch_size <= x[0].shape[0] else 0 412 | # print(ite) 413 | ite += 1 414 | if ite >= int(maxiter): 415 | break 416 | 417 | # save the trained model 418 | # logfile.close() 419 | print('Saving model to:', save_dir + '/model_final.h5') 420 | self.model.save_weights(save_dir + '/model_final.h5') 421 | # self.autoencoder.save_weights(save_dir + '/pre_model.h5') 422 | # np.save(save_dir + '/AccNmiAriRate/ACC.npy', ACC) 423 | # np.save(save_dir + '/AccNmiAriRate/NMI.npy', NMI) 424 | # np.save(save_dir + '/AccNmiAriRate/ARI.npy', ARI) 425 | # np.save(save_dir + '/AccNmiAriRate/vACC.npy', vACC) 426 | # np.save(save_dir + '/AccNmiAriRate/vNMI.npy', vNMI) 427 | # np.save(save_dir + '/AccNmiAriRate/vARI.npy', vARI) 428 | # np.save(save_dir + '/AccNmiAriRate/TotalLoss.npy', MVKLLoss) 429 | print('Clustering time: %ds' % (time() - t1)) 430 | print('End clustering:', '-' * 60) 431 | 432 | Q_and_X = self.model.predict(input_dic) 433 | y_pred = [] 434 | y_softlabels = [] 435 | for view in range(len(x)): 436 | y_pred.append(Q_and_X[view*2].argmax(1)) 437 | y_softlabels.append(Q_and_X[view*2]) 438 | 439 | y_q = Q_and_X[(len(x) - 1) * 2] 440 | for view in range(len(x) - 1): 441 | y_q += Q_and_X[view * 2] 442 | # y_q = y_q/len(x) 443 | y_mean_pred = y_q.argmax(1) 444 | return y_pred, y_mean_pred, y_softlabels 445 | 446 | def test_fit(self, arg, x, y, maxiter=2e4, batch_size=256, tol=1e-3, 447 | UpdateCoo=200, save_dir='./results/tmp'): 448 | input_dic = {} 449 | for view in range(len(x)): 450 | input_dic.update({'input' + str(view + 1): x[view]}) 451 | Q_and_X = self.model.predict(input_dic) 452 | y_pred = [] 453 | y_softlabels = [] 454 | for view in range(len(x)): 455 | y_pred.append(Q_and_X[view * 2].argmax(1)) 456 | y_softlabels.append(Q_and_X[view * 2]) 457 | 458 | y_q = Q_and_X[(len(x) - 1) * 2] 459 | for view in range(len(x) - 1): 460 | y_q += Q_and_X[view * 2] 461 | # y_q = y_q/len(x) 462 | y_mean_pred = y_q.argmax(1) 463 | return y_pred, y_mean_pred, y_softlabels, self.encoder.predict(input_dic) 464 | 465 | def new_P(self, inputs, centers): 466 | alpha = 1 467 | q = 1.0 / (1.0 + (np.sum(np.square(np.expand_dims(inputs, axis=1) - centers), axis=2) / alpha)) 468 | q **= (alpha + 1.0) / 2.0 469 | q = np.transpose(np.transpose(q) / np.sum(q, axis=1)) 470 | return q 471 | 472 | def Match(self, y_true, y_pred): 473 | # y_modified = 
Match(y_modified_before, y_modified_target) 474 | y_true = y_true.astype(np.int64) 475 | y_pred = y_pred.astype(np.int64) 476 | assert y_pred.size == y_true.size 477 | D = max(y_pred.max(), y_true.max()) + 1 478 | w = np.zeros((D, D), dtype=np.int64) 479 | for i in range(y_pred.size): 480 | w[y_pred[i], y_true[i]] += 1 481 | from scipy.optimize import linear_sum_assignment 482 | row_ind, col_ind = linear_sum_assignment(w.max() - w) 483 | new_y = np.zeros(y_true.shape[0]) 484 | 485 | matrix = np.zeros((D, D), dtype=np.int64) 486 | matrix[row_ind, col_ind] = 1 487 | for i in range(y_pred.size): 488 | for j in row_ind: 489 | if y_true[i] == col_ind[j]: 490 | new_y[i] = row_ind[j] 491 | return new_y, row_ind, col_ind, matrix 492 | -------------------------------------------------------------------------------- /Load_data.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from keras_preprocessing import image 3 | # from PIL import Image 4 | from numpy import hstack 5 | from scipy import misc 6 | import matplotlib 7 | # matplotlib.use('Agg') 8 | import matplotlib.pyplot as plt 9 | import scipy.io as scio 10 | from sklearn.preprocessing import normalize 11 | # from tensorflow.keras.preprocessing.image import ImageDataGenerator 12 | import warnings 13 | warnings.filterwarnings("ignore") 14 | 15 | path = './data' 16 | 17 | 18 | def Caltech(missrate=0.5): 19 | # Different IMVC methods might utilize different data preprocessing functions, 20 | # including but not limited to standardization, regularization, and min-max normalization, etc. 21 | Data = scio.loadmat(path + "/Caltech.mat") # Upload the dataset, which is already be pre-processed. 22 | x1 = Data['X1'] 23 | x2 = Data['X2'] 24 | Y = Data['Y'] 25 | Y = Y.reshape(Y.shape[0]) 26 | size = Y.shape[0] 27 | X, Y, index = Form_Incomplete_Data(missrate=missrate, X=[x1, x2], Y=[Y, Y]) 28 | return X, Y, size, index 29 | 30 | 31 | def Form_Incomplete_Data(missrate=0.5, X = [], Y = []): 32 | size = len(Y[0]) 33 | view_num = len(X) 34 | t = np.linspace(0, size - 1, size, dtype=int) 35 | import random 36 | random.shuffle(t) 37 | Xtmp = [] 38 | Ytmp = [] 39 | for i in range(view_num): 40 | xtmp = np.copy(X[i]) 41 | Xtmp.append(xtmp) 42 | ytmp = np.copy(Y[i]) 43 | Ytmp.append(ytmp) 44 | for v in range(view_num): 45 | for i in range(size): 46 | Xtmp[v][i] = X[v][t[i]] 47 | Ytmp[v][i] = Y[v][t[i]] 48 | X = Xtmp 49 | Y = Ytmp 50 | 51 | # complete data index 52 | index0 = np.linspace(0, (1 - missrate) * size - 1, num=int((1 - missrate) * size), dtype=int) 53 | missindex = np.ones((int(missrate * size), view_num)) 54 | print(missindex.shape) 55 | # incomplete data index 56 | index = [] 57 | for i in range(missindex.shape[0]): 58 | missdata = np.random.randint(0, high=view_num, size=view_num - 1) 59 | # print(missdata) 60 | missindex[i, missdata] = 0 61 | # print(missindex) 62 | for i in range(view_num): 63 | index.append([]) 64 | miss_begain = (1 - missrate) * size 65 | for i in range(missindex.shape[0]): 66 | for j in range(view_num): 67 | if missindex[i, j] == 1: 68 | index[j].append(int(miss_begain + i)) 69 | # print(index) 70 | maxmissview = 0 71 | for j in range(view_num): 72 | if maxmissview < len(index[j]): 73 | print(len(index[j])) 74 | maxmissview = len(index[j]) 75 | print(maxmissview) 76 | # add some incomplete views' data index to equal for convenience 77 | for j in range(view_num): 78 | flag = np.random.randint(0, high=size, size=maxmissview - len(index[j])) 79 | index[j] = index[j] + 
list(flag) 80 | # to form complete and incomplete views' data 81 | for j in range(view_num): 82 | index[j] = list(index0) + index[j] 83 | X[j] = X[j][index[j]] 84 | print(X[j].shape) 85 | Y[j] = Y[j][index[j]] 86 | print(Y[j].shape) 87 | print("----------------generate incomplete multi-view data-----------------------") 88 | return X, Y, index 89 | 90 | 91 | def load_data_conv(dataset, missrate): 92 | print("load:", dataset) 93 | if dataset == 'Caltech': # Caltech 94 | return Caltech(missrate=missrate) 95 | else: 96 | raise ValueError('Not defined for loading %s' % dataset) 97 | -------------------------------------------------------------------------------- /Nmetrics.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.metrics import normalized_mutual_info_score, adjusted_rand_score, v_measure_score, accuracy_score 3 | 4 | nmi = normalized_mutual_info_score 5 | vmeasure = v_measure_score 6 | ari = adjusted_rand_score 7 | 8 | 9 | def acc(y_true, y_pred): 10 | """ 11 | Calculate clustering accuracy. Require scikit-learn installed 12 | 13 | # Arguments 14 | y: true labels, numpy.array with shape `(n_samples,)` 15 | y_pred: predicted labels, numpy.array with shape `(n_samples,)` 16 | 17 | # Return 18 | accuracy, in [0,1] 19 | """ 20 | y_true = y_true.astype(np.int64) 21 | assert y_pred.size == y_true.size 22 | D = max(y_pred.max(), y_true.max()) + 1 23 | w = np.zeros((D, D), dtype=np.int64) 24 | for i in range(y_pred.size): 25 | w[y_pred[i], y_true[i]] += 1 26 | # from scipy.optimize import linear_sum_assignment 27 | from sklearn.utils.linear_assignment_ import linear_assignment 28 | ind = linear_assignment(w.max() - w) 29 | # ind = linear_sum_assignment(w.max() - w) 30 | return sum([w[i, j] for i, j in ind]) * 1.0 / y_pred.size 31 | 32 | 33 | def purity(y_true, y_pred): 34 | """Purity score 35 | Args: 36 | y_true(np.ndarray): n*1 matrix Ground truth labels 37 | y_pred(np.ndarray): n*1 matrix Predicted clusters 38 | 39 | Returns: 40 | float: Purity score 41 | """ 42 | # matrix which will hold the majority-voted labels 43 | y_voted_labels = np.zeros(y_true.shape) 44 | # Ordering labels 45 | ## Labels might be missing e.g with set like 0,2 where 1 is missing 46 | ## First find the unique labels, then map the labels to an ordered set 47 | ## 0,2 should become 0,1 48 | labels = np.unique(y_true) 49 | ordered_labels = np.arange(labels.shape[0]) 50 | for k in range(labels.shape[0]): 51 | y_true[y_true==labels[k]] = ordered_labels[k] 52 | # Update unique labels 53 | labels = np.unique(y_true) 54 | # We set the number of bins to be n_classes+2 so that 55 | # we count the actual occurence of classes between two consecutive bins 56 | # the bigger being excluded [bin_i, bin_i+1[ 57 | bins = np.concatenate((labels, [np.max(labels)+1]), axis=0) 58 | 59 | for cluster in np.unique(y_pred): 60 | hist, _ = np.histogram(y_true[y_pred==cluster], bins=bins) 61 | # Find the most present label in the cluster 62 | winner = np.argmax(hist) 63 | y_voted_labels[y_pred==cluster] = winner 64 | 65 | return accuracy_score(y_true, y_voted_labels) 66 | 67 | 68 | def test(y_true, y_pred): 69 | print("ACC:%.4f, NMI:%.4f, VME:%.4f, ARI:%.4f, PUR:%.4f" % (acc(y_true, y_pred), 70 | nmi(y_true, y_pred), 71 | vmeasure(y_true, y_pred), 72 | ari(y_true, y_pred), 73 | purity(y_true, y_pred))) 74 | return acc(y_true, y_pred), nmi(y_true, y_pred), vmeasure(y_true, y_pred), ari(y_true, y_pred), purity(y_true, y_pred) 75 | 
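Note on Nmetrics.py: acc() resolves the cluster-to-class mapping with the Hungarian algorithm via sklearn.utils.linear_assignment_, which exists in the pinned scikit-learn==0.22.2.post1 but was removed in scikit-learn 0.23. The commented lines already point at scipy.optimize.linear_sum_assignment; be aware it returns a pair of index arrays rather than an n-by-2 array, so the final summation must change too. A minimal drop-in sketch (the name acc_scipy is illustrative, not part of the repo):

import numpy as np
from scipy.optimize import linear_sum_assignment

def acc_scipy(y_true, y_pred):
    # Same contingency-matrix construction as Nmetrics.acc().
    y_true = y_true.astype(np.int64)
    y_pred = y_pred.astype(np.int64)
    assert y_pred.size == y_true.size
    D = max(y_pred.max(), y_true.max()) + 1
    w = np.zeros((D, D), dtype=np.int64)
    for i in range(y_pred.size):
        w[y_pred[i], y_true[i]] += 1  # rows: predicted clusters, cols: true classes
    # Minimum-cost assignment on (max - w) maximizes the matched counts.
    row_ind, col_ind = linear_sum_assignment(w.max() - w)
    return w[row_ind, col_ind].sum() * 1.0 / y_pred.size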
-------------------------------------------------------------------------------- /README.txt: -------------------------------------------------------------------------------- 1 | # Accepted by AAAI 2022. 2 | 3 | # Settings in main.py 4 | data = 'Caltech' 5 | 6 | # The following settings are adopted to all datasets 7 | Lc = 1.0 8 | Lr = 1.0 9 | lrate = 0.001 10 | epochs = 500 11 | Update_epoch = 1000 12 | Max_iteration = 10 13 | Batch = 256 14 | 15 | # The datasets with different miss rates are generated each time 16 | for missrate in [0.1, 0.3, 0.5, 0.7]: 17 | 18 | # Run the code by 19 | python main.py 20 | 21 | # Requirement 22 | python==3.7.10 23 | scikit-learn==0.22.2.post1 24 | scipy==1.4.1 25 | tensorflow-gpu==2.5.0 26 | 27 | # BibTex 28 | @InProceedings{Xu_2022_AAAI, 29 | author = {Xu, Jie and Li, Chao and Ren, Yazhou and Peng, Liang and Mo, Yujie and Shi, Xiaoshuang and Zhu, Xiaofeng}, 30 | title = {Deep Incomplete Multi-View Clustering via Mining Cluster Complementarity}, 31 | booktitle = {Proceedings of the AAAI Conference on Artificial Intelligence (AAAI)}, 32 | year = {2022}, 33 | pages = {8761-8769} 34 | } 35 | -------------------------------------------------------------------------------- /data/README.txt: -------------------------------------------------------------------------------- 1 | link:https://pan.baidu.com/s/1_3iZabF0lHF9gxgZilubJg 2 | pass code:data 3 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | from Load_data import load_data_conv 2 | from tensorflow.keras.optimizers import SGD, Adam 3 | import numpy as np 4 | from sklearn.manifold import TSNE 5 | import os 6 | from time import time 7 | import Nmetrics 8 | from DIMVC import MvDEC 9 | import matplotlib.pyplot as plt 10 | import warnings 11 | warnings.filterwarnings("ignore") 12 | from sklearn import metrics 13 | import tensorflow as tf 14 | import tensorflow.keras.backend as K 15 | from sklearn.cluster import KMeans 16 | from tensorflow.keras.layers import Layer, InputSpec, Input, Dense, Multiply, concatenate 17 | 18 | 19 | def _make_data_and_model(args, missrate): 20 | # prepare dataset 21 | x, y, size, index = load_data_conv(args.dataset, missrate=missrate) 22 | view = len(x) 23 | view_shapes = [] 24 | Loss = [] 25 | Loss_weights = [] 26 | shap_max = 0 27 | for v in range(view): 28 | view_shapes.append(x[v].shape[1:]) 29 | if shap_max < x[v].shape[1:][0]: 30 | shap_max = x[v].shape[1:][0] 31 | print(shap_max) 32 | for v in range(view): 33 | Loss.append('categorical_crossentropy') 34 | Loss.append('mse') 35 | Loss_weights.append(args.lc) 36 | Loss_weights.append(args.Idec) 37 | print(view_shapes) 38 | print(Loss) 39 | print(Loss_weights) 40 | # prepare optimizer 41 | optimizer = Adam(lr=args.lr) 42 | # prepare the model 43 | n_clusters = len(np.unique(y[0])) 44 | print("n_clusters:" + str(n_clusters)) 45 | 46 | model = MvDEC(n_clusters=n_clusters, view_shape=view_shapes, data=args.dataset) 47 | 48 | model.compile(optimizer=optimizer, loss=Loss, loss_weights=Loss_weights) 49 | return x, y, model, size, index 50 | 51 | 52 | def train(args): 53 | # get data and model 54 | missrate = args.missrate 55 | x, y, model, size, index_data = _make_data_and_model(args, missrate=missrate) 56 | model.model.summary() 57 | # pretraining 58 | t0 = time() 59 | if not os.path.exists(args.save_dir): 60 | os.makedirs(args.save_dir) 61 | # if args.pretrain_dir is not None and 
os.path.exists(args.pretrain_dir): # load pretrained weights 62 | if not args.pretrain: 63 | model.autoencoder.load_weights(args.pretrain_dir) 64 | # model.load_weights(args.pretrain_dir) 65 | else: # train 66 | optimizer = Adam(lr=args.lr) 67 | model.pretrain(x, y, optimizer=optimizer, epochs=args.pretrain_epochs, 68 | batch_size=args.batch_size, save_dir=args.save_dir, verbose=0) 69 | args.pretrain_dir = args.save_dir + '/ae_weights.h5' 70 | t1 = time() 71 | print("Time for pretraining: %ds" % (t1 - t0)) 72 | 73 | # clustering 74 | print('Data size:' + str(size)) 75 | view_num = len(x) 76 | index = np.linspace(0, (1-missrate)*size-1, num=int((1-missrate)*size), dtype=int) 77 | args.centerinit = 0 78 | args.maxAR = 0 79 | # print(index_data) 80 | for i in ['DIMVC', 'TEST']: 81 | print(args.maxAR) 82 | if i == 'DIMVC': 83 | x_t = [] 84 | y_t = [] 85 | for v in range(len(x)): 86 | x_t.append(x[v][index]) 87 | y_t.append(y[v][index]) 88 | y_pred, y_mean_pred, _ = model.new_fit(arg=args, x=x_t, y=y_t, maxiter=args.maxiter, 89 | batch_size=args.batch_size, UpdateCoo=args.UpdateCoo, 90 | save_dir=args.save_dir) 91 | else: 92 | y_pred, y_mean_pred, y_softlabels, z = model.test_fit(arg=args, x=x, y=y, maxiter=args.maxiter, 93 | batch_size=args.batch_size, UpdateCoo=args.UpdateCoo, 94 | save_dir=args.save_dir) 95 | y_prediction = [] 96 | y_true = [] 97 | y_pre_nomean = [] 98 | y_ture_nomean = [] 99 | if y is not None: 100 | for view in range(len(x)): 101 | print(len(y_pred[view])) 102 | Nmetrics.test(y[view], y_pred[view]) # each view 103 | y_prediction = y_prediction + list(y_pred[view][int((1-missrate)*size):]) 104 | y_true = y_true + list(y[view][int((1-missrate)*size):]) 105 | 106 | y_pre_nomean = y_pre_nomean + list(y_pred[view]) 107 | y_ture_nomean = y_ture_nomean + list(y[view]) 108 | 109 | y_prediction = y_prediction + list(y_mean_pred[index]) 110 | y_true = y_true + list(y[0][index]) 111 | print(len(y_prediction)) 112 | Nmetrics.test(np.array(y_true), np.array(y_prediction)) # com mean, incom no mean 113 | print(len(y_pre_nomean)) 114 | Nmetrics.test(np.array(y_ture_nomean), np.array(y_pre_nomean)) # no mean 115 | # print(y_prediction) 116 | true_labels = np.zeros((size, )) 117 | # print(index_data) 118 | # print(y) 119 | for i in range(len(y[0])): 120 | for v in range(view_num): 121 | true_labels[index_data[v][i]] = y[v][i] 122 | # print(true_labels) 123 | pre_soft_labels = [] 124 | n_clusters = len(np.unique(y[0])) 125 | for v in range(view_num): 126 | pre_soft_labels.append(np.zeros((size, n_clusters))) 127 | # print(pre_soft_labels) 128 | for i in range(y_softlabels[0].shape[0]): 129 | for v in range(view_num): 130 | pre_soft_labels[v][index_data[v][i]] = y_softlabels[v][i] 131 | # print(pre_soft_labels) 132 | 133 | y_q = np.copy(pre_soft_labels[view_num-1]) 134 | for v in range(view_num-1): 135 | y_q += pre_soft_labels[v] 136 | # print(pre_soft_labels) 137 | # print(y_q) 138 | y_mean_pred = y_q.argmax(1) 139 | # print(y_mean_pred) 140 | t2 = time() 141 | print("Time for pretaining, clustering and total: (%ds, %ds, %ds)" % (t1 - t0, t2 - t1, t2 - t0)) 142 | print(len(y_mean_pred)) 143 | Nmetrics.test(true_labels, y_mean_pred) # mean 144 | print('=' * 60) 145 | # return Nmetrics.test(true_labels, y_mean_pred) # mean 146 | return Nmetrics.test(np.array(y_true), np.array(y_prediction)) # com mean, incom no mean 147 | # return Nmetrics.test(np.array(y_ture_nomean), np.array(y_pre_nomean)) # no mean 148 | 149 | 150 | def test(args): 151 | assert args.weights is not None 152 | # x, y, model 
= _make_data_and_model(args) 153 | x, y, model, size, index_data = _make_data_and_model(args, missrate=args.missrate) 154 | model.model.summary() 155 | print('Begin testing:', '-' * 60) 156 | model.load_weights(args.weights) 157 | y_pred, y_mean_pred = model.predict_label(x=x) 158 | y = y[0] 159 | if y is not None: 160 | for view in range(len(x)): 161 | print('Final: acc=%.4f, nmi=%.4f, ari=%.4f' % 162 | (Nmetrics.acc(y, y_pred[view]), Nmetrics.nmi(y, y_pred[view]), Nmetrics.ari(y, y_pred[view]))) 163 | print('Final: acc=%.4f, nmi=%.4f, ari=%.4f' % 164 | (Nmetrics.acc(y, y_mean_pred), Nmetrics.nmi(y, y_mean_pred), Nmetrics.ari(y, y_mean_pred))) 165 | Nmetrics.test(y, y_mean_pred) 166 | print('End testing:', '-' * 60) 167 | 168 | 169 | if __name__ == "__main__": 170 | # settings 171 | data = 'Caltech' 172 | Lc = 1.0 173 | Lr = 1.0 174 | lrate = 0.001 175 | epochs = 500 176 | Update_epoch = 1000 177 | Max_iteration = 10 178 | Batch = 256 179 | 180 | run_times = 1 181 | results = [] 182 | for missrate in [0.0, 0.1, 0.3, 0.5, 0.7]: 183 | print("----------------------------missrate-----------------------------------") 184 | print(missrate) 185 | print("-----------------------------------------------------------------------") 186 | import argparse 187 | parser = argparse.ArgumentParser(description='main') 188 | parser.add_argument('--dataset', default=data, 189 | help="Dataset name to train") 190 | PATH = './results/' 191 | path = PATH + data 192 | train_ae = True 193 | if train_ae: 194 | load = None 195 | else: 196 | load = path + '/ae_weights.h5' 197 | TEST = False 198 | if TEST: 199 | load_test = path + '/model_final.h5' 200 | else: 201 | load_test = None 202 | 203 | parser.add_argument('-d', '--save-dir', default=path, 204 | help="Dir to save the model") 205 | # Parameters for pretraining 206 | parser.add_argument('--pretrain_dir', default=load, type=str, 207 | help="Pretrained weights of the autoencoder") 208 | parser.add_argument('--pretrain', default=train_ae, type=bool, 209 | help="Pretrain the autoencoder?") 210 | parser.add_argument('--pretrain-epochs', default=epochs, type=int, # 500 211 | help="Number of epochs for pretraining") 212 | parser.add_argument('-v', '--verbose', default=1, type=int, 213 | help="Verbose for pretraining") 214 | # Parameters for clustering 215 | parser.add_argument('--testing', default=TEST, type=bool, 216 | help="Testing the clustering performance with provided weights") 217 | parser.add_argument('--weights', default=load_test, type=str, 218 | help="Model weights, used for testing") 219 | parser.add_argument('--lr', default=lrate, type=float, 220 | help="learning rate during clustering") 221 | parser.add_argument('--batch-size', default=Batch, type=int, 222 | help="Batch size") 223 | parser.add_argument('--missrate', default=missrate, type=float, 224 | help="Miss rate") 225 | parser.add_argument('--maxiter', default=Max_iteration * Update_epoch, type=int, 226 | help="Maximum number of iterations") 227 | parser.add_argument('-uc', '--UpdateCoo', default=Update_epoch, type=int, 228 | help="Number of iterations to update the target distribution") 229 | parser.add_argument('--Idec', default=Lr, type=float, 230 | help="weight of AEs?") 231 | parser.add_argument('--lc', default=Lc, type=float, 232 | help="weight of clustering") 233 | args = parser.parse_args() 234 | print('+' * 30, ' Parameters ', '+' * 30) 235 | print(args) 236 | print('+' * 75) 237 | # testing 238 | if args.testing: 239 | test(args) 240 | else: 241 | performance = np.zeros(shape=(run_times, 5)) 
242 | for i in range(run_times): 243 | print("---------------------------run_times------------------------------------") 244 | print(i) 245 | print("------------------------------------------------------------------------") 246 | ACC, NMI, V_measure, ARI, Purity = train(args) 247 | performance[i][0] = ACC 248 | performance[i][1] = NMI 249 | performance[i][2] = V_measure 250 | performance[i][3] = ARI 251 | performance[i][4] = Purity 252 | means_per = np.around(np.mean(performance, axis=0), 4) 253 | results.append(list(means_per)) 254 | # np.save(data + '.npy', results) 255 | print(results) 256 | -------------------------------------------------------------------------------- /results/Caltech/Model here.txt: -------------------------------------------------------------------------------- 1 | 2 | --------------------------------------------------------------------------------
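results/Caltech/ ("Model here.txt" is only a placeholder) is where main.py writes ae_weights.h5 after pretraining and model_final.h5 after clustering. A minimal sketch of re-loading the final model for evaluation, mirroring the TEST branch of main.py; it assumes model_final.h5 already exists, that missrate matches the training run, and (as in main.py's own test()) that Load_data regenerates a random incomplete split on each call, so scores are indicative only:

import numpy as np
import Nmetrics
from Load_data import load_data_conv
from DIMVC import MvDEC

# Rebuild the multi-view model exactly as in main.py, then load the trained weights.
x, y, size, index = load_data_conv('Caltech', missrate=0.5)
view_shapes = [xv.shape[1:] for xv in x]
n_clusters = len(np.unique(y[0]))
model = MvDEC(n_clusters=n_clusters, view_shape=view_shapes, data='Caltech')
model.load_weights('./results/Caltech/model_final.h5')

# Per-view predictions from each clustering layer, plus the fused (summed soft-label) prediction.
y_pred, y_mean_pred = model.predict_label(x=x)
for v in range(len(x)):
    Nmetrics.test(y[v], y_pred[v])
Nmetrics.test(y[0], y_mean_pred)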