├── Baselines ├── 2D_representation.py ├── Readme ├── auroc_draw.py ├── base_classifiers.py ├── baseline_OCC.py ├── baseline_OCC_utils.py ├── baseline_VEWS_utils.py ├── data_generation.py ├── latent_repre_explore.py ├── libs.py ├── model_components.py ├── representation_libs.py ├── run_baseline.m ├── splitData.m └── utils.py ├── README.md ├── bg_dataset.py ├── bg_utils.py ├── data ├── credit_card │ ├── ben_hid_repre_r2.npy │ └── van_hid_repre_r2.npy ├── raw_credit_card │ ├── ben_raw_r0.npy │ └── van_raw_r0.npy └── wiki │ ├── X_v8_4_50_Ben.npy │ ├── X_v8_4_50_Van.npy │ ├── ben_hid_emd_4_50_8_200_r0.npy │ └── val_hid_emd_4_50_8_200_r0.npy └── oc_gan.py /Baselines/2D_representation.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Author: Panpan Zheng 3 | Date created: 1/15/2018 4 | Python Version: 2.7 5 | ''' 6 | 7 | import os 8 | import sys 9 | sys.path.append(os.getcwd() + "\\..\\..\\") 10 | from baseline_OCC_utils import * 11 | from base_classifiers import LSTM_Autoencoder 12 | from model_components import train_gan 13 | 14 | # Load data and preprocess. 15 | samples_path = os.getcwd() + "\\..\\..\\sampleData\\" 16 | f_ben, f_van = "X_v8_4_50_Ben", "X_v8_4_50_Van" 17 | x_ben, x_van = load_data(samples_path, f_ben, f_van) 18 | 19 | train_ratio = .7 20 | max_len = 50 21 | 22 | # Construct the LSTM-AE 23 | input_dim = 8 24 | time_step = max_len 25 | hid_dim = [200] 26 | 27 | sampling_ratio = train_ratio 28 | 29 | x_train_P, x_ben_P, x_van_P, weights_P, seq_len_ben, seq_len_van = sampling_preprocessing_LSTM_AE(x_ben, x_van, train_ratio, \ 30 | max_len) 31 | 32 | lstm_ae = LSTM_Autoencoder(input_dim, time_step, hid_dim) 33 | lstm_ae.compile() 34 | lstm_ae.fit(x_train_P, weights_P) 35 | 36 | test_ben_P = x_ben_P[len(x_train_P):] 37 | test_van_P = x_van_P[0:len(test_ben_P)] 38 | 39 | test_seq_len_ben = np.array(seq_len_ben[len(x_train_P):]) 40 | test_seq_len_van = np.array(seq_len_van[0:len(test_ben_P)]) 41 | 42 | lstm_ae.get_hidden_layer_sequence() 43 | 44 | ben_hid_repre_P = lstm_ae.get_hidden_representation(test_ben_P) 45 | van_hid_repre_P = lstm_ae.get_hidden_representation(test_van_P) 46 | 47 | ben_hid_last_4 = ben_hid_repre_P[:,-4:] 48 | van_hid_last_4 = van_hid_repre_P[:,-4:] 49 | 50 | a = ben_hid_last_4.shape 51 | b = van_hid_last_4.shape 52 | 53 | print a 54 | -------------------------------------------------------------------------------- /Baselines/Readme: -------------------------------------------------------------------------------- 1 | OCNN and OCGP are implemented with the NDtool package, which can be downloaded from http://www.robots.ox.ac.uk/~davidc/publications_NDtool.php. 2 | 3 | 1. Create a folder named "MATLAB", then download the package "NDtoolv0.12" into "MATLAB". 4 | 2. Move "run_baseline.m" into the "MATLAB" folder, then replace "splitData.m" in "MATLAB/NDtoolv0.12" with the "splitData.m" provided in "Baselines/". 5 | 6 | OCSVM is implemented with sklearn. 7 | 8 | The credit_card dataset needs to be downloaded from https://www.kaggle.com/dalpozz/creditcardfraud and placed under 'OCAN/data/credit_card'.
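For reference, once the CSV is in place the baseline scripts read it through getDataCCFD() in baseline_OCC_utils.py (baseline_OCC.py passes the file name "creditcard.csv.zip"); a minimal sketch of that entry point, assuming the file sits where the script is run:

    from baseline_OCC_utils import getDataCCFD
    # returns min-max-scaled benign (Class == 0) and fraudulent (Class == 1) transactions,
    # each row a 30-dim vector (hour-of-day Time, V1..V28, Amount)
    x_ben, x_van = getDataCCFD("creditcard.csv.zip")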
9 | -------------------------------------------------------------------------------- /Baselines/auroc_draw.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | from mpl_toolkits.mplot3d import axes3d 4 | from sklearn.preprocessing import MinMaxScaler 5 | from bg_utils import sample_shuffle_uspv 6 | from sklearn.manifold import TSNE, Isomap, LocallyLinearEmbedding 7 | from sklearn import metrics 8 | from sklearn.metrics import classification_report 9 | 10 | 11 | # Create data 12 | # N = 60 13 | # g1 = (0.6 + 0.6 * np.random.rand(N), np.random.rand(N), 0.4 + 0.1 * np.random.rand(N)) 14 | # g2 = (0.4 + 0.3 * np.random.rand(N), 0.5 * np.random.rand(N), 0.1 * np.random.rand(N)) 15 | # g3 = (0.3 * np.random.rand(N), 0.3 * np.random.rand(N), 0.3 * np.random.rand(N)) 16 | 17 | 18 | def draw_3D(X,y): 19 | 20 | colors = ("blue", "magenta", "cyan") 21 | groups = ("Benign", "Fake", "Vandal") 22 | markers = ("*", "o", "v") 23 | # Create plot 24 | 25 | 26 | # fig = plt.figure() 27 | fig = plt.figure(facecolor='white') 28 | ax = fig.add_subplot(1, 1, 1, axisbg="1.0", projection='3d') 29 | 30 | for i in range(3): 31 | ax.scatter(X[y == i][:, 0], X[y == i][:, 1], X[y == i][:, 2], marker=markers[i], alpha=0.8, c=colors[i], edgecolors='face', s=5, 32 | label=groups[i]) 33 | 34 | # plt.axis('off') 35 | plt.title('Matplot 3d scatter plot') 36 | plt.legend(loc=2) 37 | plt.xlim(0, 1) 38 | plt.ylim(0, 1) 39 | # ax.set_zticklabels([]) 40 | # ax.set_yticklabels([]) 41 | # ax.set_xticklabels([]) 42 | plt.show() 43 | 44 | def draw_2D(X, y): 45 | 46 | colors = ("blue", "c") 47 | groups = ("Benign", "Vandal") 48 | markers = ("*", "v") 49 | 50 | # Create plot 51 | 52 | # fig = plt.figure() 53 | fig = plt.figure(facecolor='white') 54 | ax = fig.add_subplot(1, 1, 1, axisbg="1.0") 55 | 56 | for i in range(2): 57 | ax.scatter(X[y == i][:, 0], X[y == i][:, 1], marker=markers[i], alpha=0.8, c=colors[i], edgecolors='face', s=5, label=groups[i]) 58 | 59 | plt.axis('off') 60 | # plt.title('Matplot 3d scatter plot') 61 | # plt.legend(loc=2) 62 | # ax.set_zticklabels([]) 63 | # ax.set_yticklabels([]) 64 | # ax.set_xticklabels([]) 65 | plt.show() 66 | 67 | def roc_curve(y, pred, title): 68 | 69 | fpr, tpr, thresholds = metrics.roc_curve(y, pred, pos_label=1) 70 | auc_val = metrics.auc(fpr, tpr) 71 | 72 | plt.figure() 73 | lw = 2 74 | plt.plot(fpr, tpr, color='darkorange', 75 | lw=lw, label='ROC curve (area = %0.4f)'%auc_val) 76 | plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--') 77 | plt.xlim([0.0, 1.0]) 78 | plt.ylim([0.0, 1.05]) 79 | plt.xlabel('False Positive Rate') 80 | plt.ylabel('True Positive Rate') 81 | plt.title('%s'%title) 82 | plt.legend(loc="lower right") 83 | plt.tight_layout() 84 | plt.show() 85 | 86 | 87 | def roc_curve_two(y, pred, y2, pred2, title): 88 | 89 | fpr, tpr, thresholds = metrics.roc_curve(y, pred, pos_label=1) 90 | auc_val = metrics.auc(fpr, tpr) 91 | 92 | fpr2, tpr2, thresholds2 = metrics.roc_curve(y2, pred2, pos_label=1) 93 | auc_val2 = metrics.auc(fpr2, tpr2) 94 | 95 | 96 | plt.figure() 97 | lw = 2 98 | plt.plot(fpr, tpr, color='c', 99 | lw=lw, label='representation (area = %0.4f)'%auc_val) 100 | 101 | plt.plot(fpr2, tpr2, color='darkorange', linestyle=":", 102 | lw=lw, label='raw feature (area = %0.4f)'%auc_val2) 103 | 104 | plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--') 105 | plt.xlim([0.0, 1.0]) 106 | plt.ylim([0.0, 1.05]) 107 | plt.xlabel('False Positive Rate') 108 | 
plt.ylabel('True Positive Rate') 109 | plt.title('%s'%title) 110 | plt.legend(loc="lower right") 111 | plt.tight_layout() 112 | plt.show() 113 | 114 | 115 | 116 | #y_test_wiki = np.load("y_test_wiki.npy")[0:3300] 117 | #y_pred_wiki = np.load("y_prob_wiki.npy")[0:3300,1] 118 | 119 | 120 | y_test_credit = np.load("y_test_credit.npy")[0:1100] 121 | y_pred_credit = np.load("y_prob_credit.npy")[0:1100,1] 122 | 123 | y_test_credit_noencoding = np.load("y_test_credit_noencoding.npy")[0:1100] 124 | y_pred_credit_noencoding = np.load("y_prob_credit_noencoding.npy")[0:1100,1] 125 | 126 | roc_curve_two(y_test_credit, y_pred_credit, y_test_credit_noencoding, y_pred_credit_noencoding, "") 127 | 128 | #roc_curve(y_test_wiki, y_pred_wiki, "") 129 | #roc_curve(y_test_credit, y_pred_credit, "") 130 | 131 | # y_test_wiki = np.load("y_test_wiki.npy")[0:3300] 132 | # y_pred_wiki = (np.load("y_prob_wiki.npy")[0:3300,1] > 0.5).astype(int) 133 | 134 | # y_test_credit = np.load("y_test_credit.npy")[0:1300] 135 | # y_pred_credit = (np.load("y_prob_credit.npy")[0:1300,1] > 0.5).astype(int) 136 | 137 | 138 | 139 | 140 | # conf_mat_wiki = classification_report(y_test_wiki, y_pred_wiki, target_names=['benign', 'vandal'], digits=4) 141 | # conf_mat_cred = classification_report(y_test_credit, y_pred_credit, target_names=['benign', 'vandal'], digits=4) 142 | 143 | # print conf_mat_wiki 144 | #print conf_mat_cred 145 | 146 | 147 | exit(0) 148 | 149 | 150 | min_max_scaler = MinMaxScaler() 151 | x_benign = min_max_scaler.fit_transform(np.load("./hidden_repre/ben_hid_emd_4_50_8_200_r0.npy")) 152 | x_vandal = min_max_scaler.fit_transform(np.load("./hidden_repre/val_hid_emd_4_50_8_200_r0.npy")) 153 | 154 | x_benign = sample_shuffle_uspv(x_benign) 155 | x_vandal = sample_shuffle_uspv(x_vandal) 156 | 157 | X = x_benign[0:3000].tolist() + x_vandal[0:3000].tolist() 158 | y = np.zeros(3000).tolist() + np.ones(3000).tolist() 159 | X, y = np.array(X), np.array(y) 160 | 161 | model_2D = Isomap(n_components=2) 162 | X_2D = model_2D.fit_transform(X) 163 | 164 | 165 | draw_2D(X_2D, y) 166 | 167 | 168 | 169 | exit(0) 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | min_max_scaler = MinMaxScaler() 194 | 195 | # if en_ae == 1: 196 | # x_benign = min_max_scaler.fit_transform(np.load("./hidden_repre/ben_hid_emd_4_50_8_200_r0.npy")) 197 | # x_vandal = min_max_scaler.transform(np.load("./hidden_repre/val_hid_emd_4_50_8_200_r0.npy")) 198 | # elif en_ae == 2: 199 | # x_benign = min_max_scaler.fit_transform(np.load("./hidden_repre/credit_card/ben_hid_repre_r2.npy")) 200 | # x_vandal = min_max_scaler.transform(np.load("./hidden_repre/credit_card/van_hid_repre_r2.npy")) 201 | # else: 202 | # x_benign = min_max_scaler.fit_transform(np.load("./raw_credit_card/ben_raw_r0.npy")) 203 | # x_vandal = min_max_scaler.transform(np.load("./raw_credit_card/van_raw_r0.npy")) 204 | 205 | 206 | #x_benign = min_max_scaler.fit_transform(np.load("./hidden_output/ben_hid_emd_4_50_8_200.npy")) 207 | #x_vandal = min_max_scaler.transform(np.load("./hidden_output/val_hid_emd_4_50_8_200.npy")) 208 | 209 | 210 | def gen_circle_data(num_samples=11000): 211 | 212 | # make a simple unit circle 213 | theta = np.linspace(0, 2*np.pi, num_samples) 214 | a, b = 1 * np.cos(theta), 1 * np.sin(theta) 215 | r = np.random.rand((num_samples)) 216 | x, y = r * np.cos(theta), r * np.sin(theta) 217 | 218 | real_data = list() 219 | for i, e in enumerate(y): 220 | real_data.append([x[i], e]) 221 | 
return np.array(real_data) 222 | 223 | 224 | x_benign = gen_circle_data() 225 | x_benign = sample_shuffle_uspv(x_benign) 226 | # x_vandal = sample_shuffle_uspv(x_vandal) 227 | 228 | x_benign = x_benign[0:10000] 229 | x_pre = x_benign[0:7000] 230 | 231 | # exit(0) 232 | # print x_benign.shape, x_pre.shape 233 | # exit(0) 234 | 235 | 236 | # if en_ae == 1: 237 | # x_benign = x_benign[0:10000] 238 | # # x_vandal = x_vandal[0:10000] 239 | # x_pre = x_benign[0:7000] 240 | # else: 241 | # x_pre = x_benign[0:700] 242 | 243 | y_pre = np.zeros(len(x_pre)) 244 | y_pre = one_hot(y_pre, 2) 245 | 246 | x_train = x_pre 247 | 248 | y_real_mb = one_hot(np.zeros(mb_size), 2) 249 | y_fake_mb = one_hot(np.ones(mb_size), 2) 250 | 251 | # if en_ae == 1: 252 | # x_test = x_benign[-3000:].tolist() + x_vandal[-3000:].tolist() 253 | # else: 254 | # x_test = x_benign[-490:].tolist() + x_vandal[-490:].tolist() 255 | # x_test = np.array(x_test) 256 | 257 | 258 | # y_test = np.zeros(len(x_test)) 259 | # if en_ae == 1: 260 | # y_test[3000:] = 1 261 | # else: 262 | # y_test[490:] = 1 263 | 264 | 265 | sess = tf.Session() 266 | sess.run(tf.global_variables_initializer()) 267 | 268 | # pre-training for target distribution 269 | 270 | _ = sess.run(T_solver, 271 | feed_dict={ 272 | X_tar:x_pre, 273 | y_tar:y_pre 274 | }) 275 | 276 | q = np.divide(len(x_train), mb_size) 277 | 278 | # n_epoch = 1 279 | # 280 | # while n_epoch: 281 | 282 | d_ben_pro, d_fake_pro, fm_loss_coll = list(), list(), list() 283 | f1_score = list() 284 | d_val_pro = list() 285 | 286 | n_round = 200 287 | 288 | # if en_ae == 1: 289 | # n_round = 50 290 | # else: 291 | # n_round = 200 292 | 293 | 294 | # plt.scatter(x_train[0:2000,0], x_train[0:2000,1], c="r") 295 | # 296 | # plt.ylim([-1.5,1.5]) 297 | # plt.xlim([-1.5,1.5]) 298 | # plt.show() 299 | # exit(0) 300 | 301 | for n_epoch in range(n_round): 302 | 303 | X_mb_oc = sample_shuffle_uspv(x_train) 304 | 305 | for n_batch in range(q): 306 | 307 | _, D_loss_curr, ent_real_curr = sess.run([D_solver, D_loss, ent_real_loss], 308 | feed_dict={ 309 | X_oc: X_mb_oc[n_batch*mb_size:(n_batch+1)*mb_size], 310 | Z: sample_Z(mb_size, Z_dim), 311 | y_real: y_real_mb, 312 | y_gen: y_fake_mb 313 | }) 314 | 315 | _, G_loss_curr, fm_loss_curr = sess.run([G_solver, G_loss, fm_loss], 316 | # _, G_loss_curr, fm_loss_, kld_ = sess.run([G_solver, G_loss, fm_loss, pt_loss + G_ent_loss], 317 | feed_dict={Z: sample_Z(mb_size, Z_dim), 318 | X_oc: X_mb_oc[n_batch*mb_size:(n_batch+1)*mb_size], 319 | }) 320 | 321 | D_prob_real_, D_prob_gen_ = sess.run([D_prob_real, D_prob_gen], 322 | feed_dict={X_oc: x_train, 323 | Z: sample_Z(len(x_train), Z_dim)}) 324 | 325 | # if en_ae == 1: 326 | # D_prob_vandal_ = sess.run(D_prob_real, 327 | # feed_dict={X_oc: x_vandal[0:7000]}) 328 | # # feed_dict={X_oc:x_vandal[-490:]}) 329 | # else: 330 | # D_prob_vandal_ = sess.run(D_prob_real, 331 | # #feed_dict={X_oc: x_vandal[0:7000]}) 332 | # feed_dict={X_oc:x_vandal[-490:]}) 333 | 334 | d_ben_pro.append(np.mean(D_prob_real_[:, 0])) 335 | d_fake_pro.append(np.mean(D_prob_gen_[:, 0])) 336 | # d_val_pro.append(np.mean(D_prob_vandal_[:, 0])) 337 | fm_loss_coll.append(fm_loss_curr) 338 | print "epoch %s"%n_epoch, np.mean(fm_loss_coll) 339 | 340 | 341 | 342 | 343 | 344 | bg_gen = sess.run([G_sample], 345 | feed_dict={Z:sample_Z(2000, Z_dim)}) 346 | 347 | 348 | plt.scatter(bg_gen[:,0], bg_gen[:,1], c="r") 349 | plt.ylim([-1.5,1.5]) 350 | plt.xlim([-1.5,1.5]) 351 | plt.show() 352 | 353 | # prob, _ = sess.run([D_prob_real, D_logit_real], feed_dict={X_oc: 
x_test}) 354 | # y_pred = np.argmax(prob, axis=1) 355 | # conf_mat = classification_report(y_test, y_pred, target_names=['benign', 'vandal'], digits=4) 356 | # f1_score.append(float(filter(None, conf_mat.strip().split(" "))[12])) 357 | # print conf_mat 358 | 359 | # if not dra_tra_pro: 360 | # acc = np.sum(y_pred == y_test)/float(len(y_pred)) 361 | # print conf_mat 362 | # print "acc:%s"%acc 363 | # 364 | # if dra_tra_pro: 365 | # draw_trend(d_ben_pro, d_fake_pro, d_val_pro, fm_loss_coll, f1_score) 366 | 367 | exit(0) 368 | 369 | 370 | 371 | 372 | 373 | 374 | 375 | 376 | 377 | 378 | 379 | 380 | 381 | 382 | 383 | 384 | 385 | 386 | 387 | 388 | 389 | 390 | 391 | 392 | 393 | 394 | 395 | 396 | 397 | 398 | 399 | 400 | 401 | 402 | 403 | 404 | 405 | 406 | 407 | 408 | -------------------------------------------------------------------------------- /Baselines/base_classifiers.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Author: Panpan Zheng 3 | Date created: 1/15/2018 4 | Python Version: 2.7 5 | ''' 6 | 7 | import sys 8 | sys.path.append("..\\..\\") 9 | import os 10 | import numpy as np 11 | from libs import Autoencoder 12 | from keras.models import Sequential, Model 13 | from keras.layers import Input, LSTM 14 | from keras.layers.core import Masking 15 | 16 | from sklearn import tree, ensemble, neighbors, svm, covariance 17 | 18 | def k_NN(X,y): 19 | clf = neighbors.KNeighborsClassifier(n_neighbors=3) 20 | return clf.fit(X,y) 21 | 22 | def decision_tree(X,y): 23 | clf = tree.DecisionTreeClassifier() 24 | return clf.fit(X, y) 25 | 26 | def random_forest(X,y): 27 | clf = ensemble.RandomForestClassifier(n_estimators=10) 28 | return clf.fit(X,y) 29 | 30 | def svm_svc(X,y): 31 | clf = svm.SVC() 32 | return clf.fit(X,y) 33 | 34 | def svm_nusvc(X,y): 35 | clf = svm.NuSVC() 36 | return clf.fit(X,y) 37 | 38 | def svm_linearsvc(X,y): 39 | clf = svm.LinearSVC() 40 | return clf.fit(X,y) 41 | 42 | def svm_oneclass(X): 43 | clf = svm.OneClassSVM() 44 | return clf.fit(X) 45 | 46 | def elliptic_envelope(X): 47 | clf = covariance.EllipticEnvelope() 48 | return clf.fit(X) 49 | 50 | def iso_forest(X): 51 | clf = ensemble.IsolationForest(max_samples=X.shape[0], random_state=None) 52 | return clf.fit(X) 53 | 54 | class LSTM_Autoencoder(object): 55 | """docstring for LSTM_Autoencoder""" 56 | def __init__(self, input_dim, time_step, hidden_dim): 57 | self.input_dim = input_dim 58 | self.time_step = time_step 59 | self.hidden_dim = hidden_dim 60 | self.autoencoder = Autoencoder() 61 | self.autoencoder.modelMasking('lstm', [self.time_step, self.input_dim], self.hidden_dim) 62 | 63 | def compile(self): 64 | self.autoencoder.compile('temporal') 65 | 66 | def fit(self, data, weights): 67 | self.autoencoder.fit(data, 'rev', weights) 68 | 69 | def get_hidden_layer_last_step(self): 70 | # print "net summary: ", self.autoencoder.model.summary() 71 | self.hidden_representation = Sequential() 72 | self.hidden_representation.add(self.autoencoder.model.layers[0]) 73 | self.hidden_representation.add(self.autoencoder.model.layers[1]) 74 | self.hidden_representation.add(self.autoencoder.model.layers[2]) 75 | 76 | def get_hidden_layer_sequence(self): 77 | inputData = Input(shape=(self.time_step, self.input_dim)) 78 | mask = Masking(mask_value=0.)(inputData) 79 | encoded = LSTM(self.hidden_dim[0], return_sequences=True, weights=self.autoencoder.model.layers[2].get_weights())(mask) 80 | self.hidden_representation = Model(inputData, encoded) 81 | 82 | def get_hidden_representation(self, 
data): 83 | return self.hidden_representation.predict(data) 84 | 85 | class Dense_Autoencoder(object): 86 | """docstring for LSTM_Autoencoder""" 87 | def __init__(self, input_dim, hidden_dim): 88 | self.input_dim = input_dim 89 | self.hidden_dim = hidden_dim 90 | self.autoencoder = Autoencoder() 91 | self.autoencoder.modelMasking('dense', [self.input_dim], self.hidden_dim) 92 | 93 | def compile(self): 94 | self.autoencoder.compile() 95 | 96 | def fit(self, data): 97 | self.autoencoder.fit(data, 'nor') 98 | 99 | def get_hidden_layer(self): 100 | # print "net summary: ", self.autoencoder.model.summary() 101 | self.hidden_representation = Sequential() 102 | self.hidden_representation.add(self.autoencoder.model.layers[0]) 103 | self.hidden_representation.add(self.autoencoder.model.layers[1]) 104 | self.hidden_representation.add(self.autoencoder.model.layers[2]) 105 | 106 | def get_hidden_representation(self, data): 107 | return self.hidden_representation.predict(data) 108 | 109 | 110 | 111 | 112 | -------------------------------------------------------------------------------- /Baselines/baseline_OCC.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Author: Panpan Zheng 3 | Date created: 1/15/2018 4 | Python Version: 2.7 5 | ''' 6 | 7 | import os 8 | import sys 9 | sys.path.append(os.getcwd() + "\\..\\..\\") 10 | import numpy as np 11 | from utils import sample_shuffle 12 | from base_classifiers import svm_oneclass, elliptic_envelope, iso_forest 13 | from sklearn.metrics import classification_report 14 | from sklearn.metrics import f1_score, accuracy_score 15 | import matplotlib.pyplot as plt 16 | 17 | from baseline_OCC_utils import * 18 | from base_classifiers import LSTM_Autoencoder, svm_oneclass, Dense_Autoencoder 19 | from model_components import train_gan, run_Gan,run_one_svm 20 | from utils import sample_shuffle, draw_trend, plot_decision_boundary, TSNE_2D_show_tri 21 | 22 | 23 | matlab_script_path = "C:\\Users\\Panpan_user\\Documents\\MATLAB\\" 24 | matlab_eng = matlab_engine_setup(matlab_script_path) 25 | 26 | # try_num = sys.argv[1:] 27 | # Load data and preprocess. 28 | en_ae = 1 # 1 for wiki; 2 for credit card with encoding; 3 for credit card without encoding. 29 | dra_tra_pro = True # Observe the training process along epochs, or run training then test it. 30 | 31 | if en_ae == 1: 32 | samples_path = os.getcwd() + "\\..\\..\\sampleData\\" 33 | f_ben, f_van = "X_v8_4_50_Ben", "X_v8_4_50_Van" 34 | x_ben, x_van = load_data(samples_path, f_ben, f_van) 35 | input_dim = 8 36 | hid_dim = [200] 37 | d_in = [200] 38 | epochs = 150 39 | elif en_ae == 2: 40 | x_ben, x_van = getDataCCFD("creditcard.csv.zip") 41 | x_ben = sample_shuffle(x_ben)[0:2000] 42 | input_dim = 30 43 | hid_dim = [100] 44 | d_in = [50] #autoencoding. 45 | epochs = 200 46 | else: 47 | x_ben, x_van = getDataCCFD("creditcard.csv.zip") 48 | x_ben = sample_shuffle(x_ben)[0:2000] 49 | input_dim = 30 50 | d_in = [input_dim] # without autoencoding. 51 | epochs = 200 52 | 53 | train_ratio = .7 54 | max_len = 50 55 | time_step = max_len 56 | g_in = [50] 57 | gan_in = [50] 58 | sampling_ratio = train_ratio 59 | neg_label_OCC = 2 60 | neg_label_GAN = 0 61 | iter_num = 10 62 | 63 | prec_coll = list() 64 | reca_coll = list() 65 | f1_score_coll = list() 66 | accuracy_coll = list() 67 | 68 | for i in range(iter_num): 69 | if en_ae == 1: # LSTM-autoencoder for wiki data. 
70 | x_train_P, x_ben_P, x_van_P, weights_P, __, __ = sampling_preprocessing_LSTM_AE(x_ben, x_van, train_ratio, max_len) 71 | lstm_ae = LSTM_Autoencoder(input_dim, time_step, hid_dim) 72 | lstm_ae.compile() 73 | lstm_ae.fit(x_train_P, weights_P) 74 | lstm_ae.get_hidden_layer_last_step() 75 | ben_hid_repre, van_hid_repre = map(lambda x: lstm_ae.get_hidden_representation(x), [x_ben_P, x_van_P]) 76 | ben_hid_repre, van_hid_repre = map(lambda x: preprocess_minus_1_and_pos_1(x), [ben_hid_repre, van_hid_repre]) 77 | elif en_ae == 2: # Dense encoder for Credit Card data. 78 | dense_ae = Dense_Autoencoder(input_dim, hid_dim) 79 | dense_ae.compile() 80 | dense_ae.fit(x_ben[0:700]) 81 | dense_ae.get_hidden_layer() 82 | ben_hid_repre, van_hid_repre = map(lambda x: dense_ae.get_hidden_representation(x), [x_ben, x_van]) 83 | ben_hid_repre, van_hid_repre = map(lambda x: preprocess_minus_1_and_pos_1(x), [ben_hid_repre, van_hid_repre]) 84 | # np.save("ben_hid_repre_r%s"%i, ben_hid_repre) 85 | # np.save("van_hid_repre_r%s"%i, van_hid_repre) 86 | else: 87 | ben_hid_repre, van_hid_repre = map(lambda x: preprocess_minus_1_and_pos_1(x), [x_ben, x_van]) 88 | np.save("ben_raw_r%s"%i, ben_hid_repre) 89 | np.save("van_raw_r%s"%i, van_hid_repre) 90 | 91 | x_train, x_test, y_train_OCC, y_test_OCC, y_test_GAN = \ 92 | sampling_data_for_OCC(ben_hid_repre, van_hid_repre, sampling_ratio, neg_label_OCC, neg_label_GAN, en_ae) 93 | 94 | GAN, D, G = get_GAN(g_in, d_in, gan_in) 95 | if dra_tra_pro: 96 | D, X_fake, D_real_prob, D_fake_prob, D_val_prob, fake_real_mse, f1_score = \ 97 | train_and_test(GAN, G, D, x_train, x_test, y_test_GAN, en_ae, epochs) 98 | x_test_ben = x_test[y_test_GAN == 1] 99 | x_test_van = x_test[y_test_GAN != 1] 100 | x_test_ben = sample_shuffle(x_test_ben) 101 | x_test_van = sample_shuffle(x_test_van) 102 | X_fake = sample_shuffle(X_fake) 103 | X = x_test_ben[0:1000].tolist() + X_fake[0:1000].tolist() + x_test_van[0:1000].tolist() 104 | y = np.ones(1000).tolist() + np.zeros(1000).tolist() + (np.ones(1000)+1).tolist() 105 | X, y = np.array(X), np.array(y) 106 | TSNE_2D_show_tri(X, y) 107 | draw_trend(D_real_prob, D_fake_prob, D_val_prob, fake_real_mse, f1_score) 108 | exit(0) 109 | else: 110 | discriminator = train_gan(GAN, G, D, x_train, epochs, en_ae) 111 | prec_gan, reca_gan, f1_gan, acc_gan = run_Gan(x_test, y_test_GAN, discriminator, en_ae) 112 | 113 | prec_OCC, reca_OCC, f1_OCC, acc_OCC = run_OCC(x_train, x_test, y_train_OCC, y_test_OCC, matlab_eng, i, en_ae) 114 | if en_ae == 1: 115 | clf = svm_oneclass(x_train[0:7000]) 116 | else: 117 | clf = svm_oneclass(x_train[0:700]) 118 | prec_svm, reca_svm, f1_svm, acc_svm = run_one_svm(x_test, y_test_OCC, clf, en_ae) 119 | 120 | prec_coll.append([prec_gan] + prec_OCC + [prec_svm]) 121 | reca_coll.append([reca_gan] + reca_OCC + [reca_svm]) 122 | f1_score_coll.append([f1_gan] + f1_OCC + [f1_svm]) 123 | accuracy_coll.append([acc_gan] + acc_OCC + [acc_svm]) 124 | 125 | prec_coll, reca_coll, f1_score_coll, accuracy_coll = \ 126 | np.array(prec_coll), np.array(reca_coll), np.array(f1_score_coll), np.array(accuracy_coll) 127 | 128 | print "====================== precision =================================" 129 | 130 | print "prec_gan: ", map(lambda x: decimal_precision(x, 4), [np.mean(prec_coll[:,0]), 131 | np.std(prec_coll[:,0])]) 132 | print "prec_gpoc: ", map(lambda x: decimal_precision(x, 4), [np.mean(prec_coll[:,1]), 133 | np.std(prec_coll[:,1])]) 134 | 135 | print "prec_nn: ", map(lambda x: decimal_precision(x, 4), [np.mean(prec_coll[:,2]), 136 | 
np.std(prec_coll[:,2])]) 137 | 138 | print "prec_scikit_svm: ", map(lambda x: decimal_precision(x, 4), [np.mean(prec_coll[:,3]), 139 | np.std(prec_coll[:,3])]) 140 | 141 | print "====================== recall =================================" 142 | 143 | print "reca_gan: ", map(lambda x: decimal_precision(x, 4), [np.mean(reca_coll[:,0]), 144 | np.std(reca_coll[:,0])]) 145 | print "reca_gpoc: ", map(lambda x: decimal_precision(x, 4), [np.mean(reca_coll[:,1]), 146 | np.std(reca_coll[:,1])]) 147 | print "reca_nn: ", map(lambda x: decimal_precision(x, 4), [np.mean(reca_coll[:,2]), 148 | np.std(reca_coll[:,2])]) 149 | print "reca_scikit_svm: ", map(lambda x: decimal_precision(x, 4), [np.mean(reca_coll[:,3]), 150 | np.std(reca_coll[:,3])]) 151 | 152 | print "===================== f1 score ================================" 153 | print "f1_score_gan: ", map(lambda x: decimal_precision(x, 4), [np.mean(f1_score_coll[:,0]), 154 | np.std(f1_score_coll[:,0])]) 155 | print "f1_score_gpoc: ", map(lambda x: decimal_precision(x, 4), [np.mean(f1_score_coll[:,1]), 156 | np.std(f1_score_coll[:,1])]) 157 | print "f1_score_nn: ", map(lambda x: decimal_precision(x, 4), [np.mean(f1_score_coll[:,2]), 158 | np.std(f1_score_coll[:,2])]) 159 | print "f1_scikit_svm: ", map(lambda x: decimal_precision(x, 4), [np.mean(f1_score_coll[:,3]), 160 | np.std(f1_score_coll[:,3])]) 161 | 162 | print "====================== accuracy =================================" 163 | 164 | print "acc_gan: ", map(lambda x: decimal_precision(x, 4), [np.mean(accuracy_coll[:,0]), 165 | np.std(accuracy_coll[:,0])]) 166 | print "acc_gpoc: ", map(lambda x: decimal_precision(x, 4), [np.mean(accuracy_coll[:,1]), 167 | np.std(accuracy_coll[:,1])]) 168 | print "acc_nn: ", map(lambda x: decimal_precision(x, 4), [np.mean(accuracy_coll[:,2]), 169 | np.std(accuracy_coll[:,2])]) 170 | print "acc_scikit_svm: ", map(lambda x: decimal_precision(x, 4), [np.mean(accuracy_coll[:,3]), 171 | np.std(accuracy_coll[:,3])]) 172 | exit(0) 173 | 174 | 175 | 176 | 177 | -------------------------------------------------------------------------------- /Baselines/baseline_OCC_utils.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Author: Panpan Zheng 3 | Date created: 1/15/2018 4 | Python Version: 2.7 5 | ''' 6 | 7 | import os 8 | import sys 9 | sys.path.append("..\\..\\") 10 | sys.path.append(os.getcwd() + "\\..\\..\\") 11 | 12 | 13 | import numpy as np 14 | from data_generation import bw_one_and_minus_one 15 | from model_components import get_generator, get_discriminator, make_gan, train_and_test 16 | from utils import sample_shuffle 17 | from sklearn.metrics import classification_report 18 | from sklearn.metrics import accuracy_score 19 | import matplotlib.pyplot as plt 20 | 21 | from keras.preprocessing.sequence import pad_sequences 22 | from base_classifiers import LSTM_Autoencoder 23 | from keras.layers import Input 24 | import scipy.io as sio 25 | import matlab.engine 26 | import pandas as pd 27 | from sklearn import preprocessing 28 | 29 | 30 | def draw_f1_accuracy(f1_score, accuracy, ind): 31 | fig = plt.figure() 32 | axes = plt.gca() 33 | plt.subplot(2, 1, 1) 34 | plt.plot(ind, f1_score, "ro-") 35 | plt.ylabel('f1_score') 36 | axes.set_xlim([1., 20.]) 37 | plt.subplot(2, 1, 2) 38 | plt.plot(ind, accuracy, "bo-") 39 | plt.ylabel('accuracy') 40 | plt.xlabel('Round #') 41 | axes.set_xlim([1., 20.]) 42 | plt.show() 43 | 44 | def load_data(data_path, f_ben, f_van): 45 | data_ben = np.load(data_path + "%s.npy"%f_ben) 46 | 
data_van = np.load(data_path + "%s.npy"%f_van) 47 | return data_ben, data_van 48 | 49 | def preprocess_minus_1_and_pos_1(X): 50 | return np.array(map(lambda x: bw_one_and_minus_one(x), X)) 51 | 52 | def sampling_preprocessing_LSTM_AE(x_ben, x_van, train_ratio, max_len): 53 | n_samples_train = int(x_ben.shape[0] * train_ratio) 54 | # x_train = sample_shuffle(x_ben)[0:n_samples_train] # shuffle and sampling data 55 | x_ben = sample_shuffle(x_ben) 56 | x_van = sample_shuffle(x_van) 57 | x_train = x_ben[0:n_samples_train] 58 | weights = get_sample_weights(x_train) # contruct the sample weights for LSTM-AE output 59 | 60 | return seq_padding(x_train, max_len, 'pre'), \ 61 | seq_padding(x_ben, max_len, 'pre'), \ 62 | seq_padding(x_van, max_len, 'pre'), \ 63 | seq_padding(weights, max_len, 'post'), \ 64 | map(lambda x: len(x), x_ben),\ 65 | map(lambda x: len(x), x_van)# padding sequence, 66 | # 'pre' for editting sequence 67 | # 'post' for weights sequence 68 | 69 | 70 | def sampling_data_for_OCC(x_ben, x_van, sampling_ratio, neg_label1, neg_label2, en_ae): 71 | n_samples_train = int(len(x_ben) * sampling_ratio) 72 | if en_ae == 1: 73 | n_samples_test = len(x_ben) - n_samples_train 74 | else: 75 | n_samples_test = len(x_van) 76 | # n_samples_train = int(x_ben.shape[0] * sampling_ratio) 77 | # n_samples_test = x_ben.shape[0] - n_samples_train 78 | # assert n_samples_test <= x_van.shape[0] 79 | # assert n_samples_test <= len(x_van) 80 | x_ben, x_van = sample_shuffle(x_ben), sample_shuffle(x_van) 81 | x_train = x_ben[0:n_samples_train] 82 | x_test = x_ben[-n_samples_test:].tolist() + x_van[-n_samples_test:].tolist() 83 | x_test = np.array(x_test) 84 | y_train_OCC = np.ones(n_samples_train) 85 | y_test_OCC = np.ones(2 * n_samples_test) 86 | y_test_OCC[n_samples_test:] = neg_label1 87 | y_test_GAN = np.ones(2 * n_samples_test) 88 | y_test_GAN[n_samples_test:] = neg_label2 89 | return x_train, x_test, \ 90 | y_train_OCC, y_test_OCC, y_test_GAN 91 | 92 | def decimal_precision(x, digit_num): 93 | if "e" in str(x): 94 | x_decimal = x 95 | else: 96 | itgr_part, frac_part = str(x).split(".") 97 | if len(frac_part) > digit_num: 98 | x_decimal = itgr_part + "." + frac_part[0:digit_num] 99 | else: 100 | x_decimal = itgr_part + "." 
+ frac_part 101 | return float(x_decimal) 102 | 103 | def conf_mat_f1_accuracy(y_test, y_pred, tgt_nam1, tgt_nam2): 104 | conf_mat = classification_report(y_test, y_pred, target_names=[tgt_nam1, tgt_nam2], digits=4) 105 | f1 = float(filter(None, conf_mat[-50:].strip().split(" "))[-2]) # avarage f1 of tgt_nam1 and tgt_nam2 106 | acc = accuracy_score(y_test, y_pred) 107 | f1, acc = map(lambda x: decimal_precision(x, 4), [f1, acc]) 108 | return conf_mat, f1, acc 109 | 110 | 111 | def get_sample_weights(samples): 112 | sampleWeights = list() 113 | for e in samples: 114 | sampleWeights.append(np.ones(len(e))) 115 | return sampleWeights 116 | 117 | def seq_padding(sample_sequence, max_length, padding_type): 118 | return pad_sequences(sample_sequence, maxlen=max_length, dtype='float', padding=padding_type) 119 | 120 | def get_GAN(g_in, d_in, gan_in): 121 | G_in = Input(shape=g_in) 122 | G, G_out = get_generator(G_in, d_in[0]) 123 | # discriminator (x -> y) 124 | D_in = Input(shape=d_in) 125 | D, D_out = get_discriminator(D_in) 126 | GAN_in = Input(shape=gan_in) 127 | GAN, GAN_out = make_gan(GAN_in, G, D) 128 | return GAN, D, G 129 | 130 | def matlab_engine_setup(matlab_script_path): 131 | eng = matlab.engine.start_matlab() 132 | eng.addpath(matlab_script_path, nargout=0) 133 | eng.addpath(matlab_script_path + "netlab3_2\\", nargout=0) 134 | eng.addpath(matlab_script_path + "NDtoolv0.12\\", nargout=0) 135 | eng.addpath(matlab_script_path + "NDtoolv0.12\\Netlab\\", nargout=0) 136 | return eng 137 | 138 | def run_OCC(x_train, x_test, y_train_OCC, y_test_OCC, eng, i, en_ae): 139 | # nd_type = ['gpoc', 'svmsch', 'nn', 'kpca'] 140 | nd_type = ['gpoc', 'nn'] 141 | mat_store_path = os.getcwd() + "\\..\\..\\hidden_representation\\mat_OCC\\" 142 | prec_container, reca_container, f1_container, acc_container = list(), list(), list(), list() 143 | X = np.concatenate((x_train, x_test)) 144 | y = np.concatenate((y_train_OCC, y_test_OCC)) 145 | sio.savemat(mat_store_path + "X_hid_emd_4_50_8_200_r%s.mat"%i, dict(x=X, y=y)) 146 | for tp in nd_type: 147 | prec, reca, f1, acc = eng.run_baseline(mat_store_path + "X_hid_emd_4_50_8_200_r%s.mat"%i, tp, en_ae, nargout=4) 148 | prec_container.append(prec) 149 | reca_container.append(reca) 150 | f1_container.append(f1) 151 | acc_container.append(acc) 152 | 153 | return prec_container, reca_container, f1_container, acc_container 154 | 155 | def sampling_data_for_dynamic(x_ben, x_van, sampling_ratio, neg_label): 156 | n_samples_train = int(len(x_ben) * sampling_ratio) 157 | n_samples_test = len(x_ben) - n_samples_train 158 | assert n_samples_test <= x_van.shape[0] 159 | x_ben, x_van = sample_shuffle(x_ben), sample_shuffle(x_van) 160 | x_train = x_ben[0:n_samples_train] 161 | x_test = x_ben[-n_samples_test:].tolist() + x_van[-n_samples_test:].tolist() 162 | x_test = np.array(x_test) 163 | y_test_GAN = np.ones(2 * n_samples_test) 164 | y_test_GAN[n_samples_test:] = neg_label 165 | return x_train, x_test, y_test_GAN 166 | 167 | def getDataCCFD(f_name): 168 | data = pd.read_csv(f_name) 169 | X = data.loc[: ,data.columns!='Class'] 170 | X.loc[:,'Time'] = (X.loc[:,'Time'].values/3600)%24 171 | y = data.loc[:,'Class'] 172 | min_max_scaler = preprocessing.MinMaxScaler() 173 | X = min_max_scaler.fit_transform(X.values) 174 | y = y.values 175 | return X[y==0], X[y==1] 176 | -------------------------------------------------------------------------------- /Baselines/baseline_VEWS_utils.py: -------------------------------------------------------------------------------- 1 | ''' 2 | 
Author: Panpan Zheng 3 | Date created: 1/15/2018 4 | Python Version: 2.7 5 | ''' 6 | 7 | import os 8 | import sys 9 | sys.path.append(os.getcwd() + "\\..\\OCC\\") 10 | sys.path.append(os.getcwd() + "\\..\\..\\") 11 | import numpy as np 12 | from utils import sample_shuffle 13 | from baseline_OCC_utils import get_sample_weights, seq_padding, bw_one_and_minus_one 14 | 15 | def sample_shuffle_with_label(X,y): 16 | n_samples = X.shape[0] 17 | s = np.arange(n_samples) 18 | np.random.shuffle(s) 19 | return X[s], y[s] 20 | 21 | def sampling_preprocessing_LSTM_AE(x_ben, x_van, train_ratio, max_len): 22 | 23 | assert train_ratio < 1. 24 | n_samples_train = int(x_ben.shape[0] * train_ratio) 25 | 26 | assert n_samples_train <= x_van.shape[0] 27 | x_train = sample_shuffle(x_ben)[0:n_samples_train].tolist() + \ 28 | sample_shuffle(x_van)[0:n_samples_train].tolist() 29 | x_train = sample_shuffle(np.array(x_train)) 30 | weights = get_sample_weights(x_train) # contruct the sample weights for LSTM-AE output. 31 | return seq_padding(x_train, max_len, 'pre'), \ 32 | seq_padding(x_ben, max_len, 'pre'), \ 33 | seq_padding(x_van, max_len, 'pre'), \ 34 | seq_padding(weights, max_len, 'post') # 'post' for weights sequence 35 | 36 | def sampling_data_for_VEWS(x_ben, x_van): 37 | 38 | y_ben, y_van = np.ones(x_ben.shape[0]), np.zeros(x_van.shape[0]) 39 | x_ben, y_ben = sample_shuffle_with_label(x_ben, y_ben) 40 | x_van, y_van = sample_shuffle_with_label(x_van, y_van) 41 | return x_ben, x_van, y_ben, y_van 42 | 43 | def k_fold_indices(n_samples, i, step): 44 | indices = np.arange(n_samples) 45 | test_indices = xrange(i * step, (i + 1) * step) 46 | train_indices = np.setdiff1d(indices, test_indices) 47 | return test_indices, train_indices 48 | -------------------------------------------------------------------------------- /Baselines/data_generation.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Author: Panpan Zheng 3 | Date created: 1/15/2018 4 | Python Version: 2.7 5 | ''' 6 | 7 | import os 8 | import sys 9 | import numpy as np 10 | import glob 11 | from libs import Dataset 12 | from utils import getPageDict, MetaPageList,sample_shuffle 13 | from shutil import rmtree 14 | 15 | from keras.preprocessing.sequence import pad_sequences 16 | from keras.models import Sequential 17 | from libs import Autoencoder 18 | 19 | 20 | def gen_samples(): 21 | """ 22 | extract features(f1~f7) from Wikipedia data repository released by "VEWS" (http://www.cs.umd.edu/~vs/vews), and 23 | construct samples used in our experiment. 24 | :return: data samples with variant editing length(4~50) and fixed-length(10-step). 25 | 26 | f1~f7: 27 | f1: whether or not pi is a meta-page 28 | f2: if f1 is yes, whether or not pi's category is empty set. if not, f2=0 29 | f3: whether or not time difference is less than 3 minutes between pi-1 and pi 30 | f4: whether or not pi has already been edited. (re-edit) 31 | f5: if f4 is yes, whether or not pi equals to pi-1. (consective re-edit). if no, f5=0 32 | f6: if f4 is no, whether or not pi has the common category with pi-1. if yes, f6 = 1 33 | f7(optional): whether or not edits are reverted. This information is 34 | from Wikipedia auto-bots, such as cluebot, for bad editing revert. 
35 | """ 36 | 37 | base = os.getcwd() 38 | dataRepo = base + "\\Dataset\\" 39 | if not os.path.exists(dataRepo): 40 | raise OSError("data repository is not available.") 41 | else: 42 | f_pages = dataRepo + "pages.tsv" 43 | f_users = dataRepo + "users.tsv" 44 | files = glob.glob(dataRepo + "*.csv") 45 | data = Dataset() 46 | 47 | rawData = base + "\\rawData\\" 48 | if not os.path.exists(rawData): 49 | os.makedirs(rawData) 50 | data.getRawData(files, f_users, f_pages, .7, rawData) 51 | 52 | sampleData = base + "\\sampleData\\" 53 | # if os.path.exists(sampleData): 54 | # rmtree(sampleData) 55 | # os.makedirs(sampleData) 56 | 57 | X_tim = np.load(rawData + "wikiEditSeq_0.7\\X_tim.npy") 58 | X_pages = np.load(rawData + "wikiEditSeq_0.7\\X_pages.npy") 59 | X_rev = np.load(rawData + "wikiEditSeq_0.7\\X_rev.npy") 60 | y = np.load(rawData + "wikiEditSeq_0.7\\y.npy") 61 | page2id, page2Cgr, _ = getPageDict(f_pages) 62 | 63 | np.save(sampleData + "MetaPageList.npy", MetaPageList(files,page2id)) 64 | metaDict = np.load(sampleData + "MetaPageList.npy") 65 | 66 | # excluding 'revert' information. 67 | data.getSamples(X_pages, X_tim, y, metaDict, page2Cgr, 68 | "fix", 20, None, sampleData) 69 | # data.getSamples(X_pages, X_tim, y, metaDict, page2Cgr, 70 | # "var", 4, 50, sampleData) 71 | 72 | # including 'revert' information. 73 | data.getSamples(X_pages, X_tim, X_rev, y, metaDict, page2Cgr, 74 | "fix", 20, None, sampleData) 75 | # data.getSamples(X_pages, X_tim, X_rev, y, metaDict, page2Cgr, 76 | # "var", 4, 50, sampleData) 77 | # data.getSamples(X_pages, X_tim, X_rev, y, metaDict, page2Cgr, 78 | # "var", 1, 20, sampleData) 79 | 80 | 81 | def gen_hid_repre(fea_dim, hid_dim, fix_or_var, step_length): 82 | 83 | """ 84 | :param fea_dim: input dimension of LSTM-AE model 85 | :param hid_dim: output dimension of hidden representation 86 | :param fix_or_var: editing sequence is fixed-length or variant-length. 87 | :return: fixed-length hidden representation of editing sequence. 
88 | """ 89 | base_path = os.getcwd() 90 | samples_path = base_path + "\\sampleData\\" 91 | repre_path = base_path + "\\hidden_representation\\" 92 | if not os.path.exists(repre_path): 93 | os.makedirs(repre_path) 94 | 95 | if fix_or_var == 1: 96 | # Load data 97 | x_ben = np.load(samples_path + "X_%s_1_20_Ben.npy" %fea_dim) 98 | x_van = np.load(samples_path + "X_%s_1_20_Van.npy" %fea_dim) 99 | # print x_ben.shape, x_van.shape 100 | # exit(0) 101 | x_ben = sample_shuffle(x_ben)[0:6000] 102 | x_van = sample_shuffle(x_van)[0:3000] 103 | train_ben = x_ben[0:3000] 104 | 105 | # Fit Model 106 | timesteps = 20 107 | input_dim = fea_dim 108 | 109 | autoencoder = Autoencoder() 110 | autoencoder.model('lstm', [timesteps, input_dim], hid_dim) 111 | autoencoder.compile() 112 | autoencoder.fit(train_ben, "rev") 113 | 114 | hidModel = Sequential() 115 | hidModel.add(autoencoder.model.layers[0]) 116 | hidModel.add(autoencoder.model.layers[1]) 117 | 118 | ben_hid_emd = hidModel.predict(x_ben) 119 | van_hid_emd = hidModel.predict(x_van) 120 | 121 | # store data 122 | np.save(repre_path + "ben_hid_emd_20_%s_%s" % (fea_dim, hid_dim[0]), ben_hid_emd) 123 | np.save(repre_path + "van_hid_emd_20_%s_%s" % (fea_dim, hid_dim[0]), van_hid_emd) 124 | 125 | elif fix_or_var == 0: 126 | if step_length == 20: 127 | x_ben = np.load(samples_path + "X_%s_1_20_Ben.npy" % fea_dim) 128 | x_van = np.load(samples_path + "X_%s_1_20_Van.npy" % fea_dim) 129 | x_ben = sample_shuffle(x_ben) # 16496 130 | x_van = sample_shuffle(x_van) # 17015 131 | # train_ben = np.concatenate((x_ben[0:10000], x_van[0:10000])) # mix samples for baseline 'latent representation.' 132 | train_ben = x_ben[0:10000] 133 | 134 | sampleWeights = list() 135 | for e in train_ben: 136 | sampleWeights.append(np.ones(len(e))) 137 | 138 | train_ben_P = pad_sequences(train_ben, maxlen=20, dtype='float') 139 | x_ben_P = pad_sequences(x_ben, maxlen=20, dtype='float') 140 | x_van_P = pad_sequences(x_van, maxlen=20, dtype='float') 141 | 142 | # decoding sequence is reversed 143 | sampleWeights = pad_sequences(sampleWeights, maxlen=20, dtype='float', padding='post') 144 | 145 | timesteps = 20 146 | input_dim = fea_dim 147 | autoencoder = Autoencoder() 148 | autoencoder.modelMasking('lstm', [timesteps, input_dim], hid_dim) 149 | autoencoder.compile('temporal') 150 | autoencoder.fit(train_ben_P, 'rev', sampleWeights) 151 | 152 | hidModel = Sequential() 153 | hidModel.add(autoencoder.model.layers[0]) 154 | hidModel.add(autoencoder.model.layers[1]) 155 | hidModel.add(autoencoder.model.layers[2]) 156 | 157 | ben_hid_emd = hidModel.predict(x_ben_P) 158 | van_hid_emd = hidModel.predict(x_van_P) 159 | 160 | # store data 161 | # np.save(repre_path + "ben_hid_emd_mix_1_20_%s_%s" % (fea_dim, hid_dim[0]), ben_hid_emd) 162 | # np.save(repre_path + "val_hid_emd_mix_1_20_%s_%s" % (fea_dim, hid_dim[0]), van_hid_emd) 163 | 164 | elif step_length == 50: 165 | 166 | x_ben = np.load(samples_path + "X_v%s_4_50_Ben.npy" %fea_dim) 167 | x_van = np.load(samples_path + "X_v%s_4_50_Van.npy" %fea_dim) 168 | x_ben = sample_shuffle(x_ben) 169 | x_van = sample_shuffle(x_van) 170 | train_ben = x_ben[0:7000] 171 | 172 | sampleWeights = list() 173 | for e in train_ben: 174 | sampleWeights.append(np.ones(len(e))) 175 | 176 | train_ben_P = pad_sequences(train_ben, maxlen=50, dtype='float') 177 | x_ben_P = pad_sequences(x_ben, maxlen=50, dtype='float') 178 | x_van_P = pad_sequences(x_van, maxlen=50, dtype='float') 179 | 180 | # decoding sequence is reversed 181 | sampleWeights = 
pad_sequences(sampleWeights, maxlen=50, dtype='float', padding='post') 182 | 183 | timesteps = 50 184 | input_dim = fea_dim 185 | autoencoder = Autoencoder() 186 | autoencoder.modelMasking('lstm', [timesteps, input_dim], hid_dim) 187 | autoencoder.compile('temporal') 188 | autoencoder.fit(train_ben_P, 'rev', sampleWeights) 189 | 190 | hidModel = Sequential() 191 | hidModel.add(autoencoder.model.layers[0]) 192 | hidModel.add(autoencoder.model.layers[1]) 193 | hidModel.add(autoencoder.model.layers[2]) 194 | 195 | ben_hid_emd = hidModel.predict(x_ben_P) 196 | van_hid_emd = hidModel.predict(x_van_P) 197 | 198 | return ben_hid_emd, van_hid_emd 199 | 200 | def bw_one_and_minus_one(x): 201 | return ((x-min(x))/float((max(x)-min(x))))*2 - 1 202 | 203 | 204 | -------------------------------------------------------------------------------- /Baselines/latent_repre_explore.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Author: Panpan Zheng 3 | Date created: 1/15/2018 4 | Python Version: 2.7 5 | ''' 6 | 7 | import numpy as np 8 | from sklearn.cluster import DBSCAN 9 | from sklearn import metrics 10 | from sklearn.datasets.samples_generator import make_blobs 11 | from sklearn.preprocessing import StandardScaler 12 | from numpy.random import multivariate_normal 13 | from representation_libs import db_span, get_eps, cluster_analyis, DB_statistics 14 | import json 15 | from utils import sample_shuffle 16 | from sklearn.manifold import TSNE 17 | from mpl_toolkits.mplot3d import axes3d 18 | import matplotlib.pyplot as plt 19 | from scipy.spatial import distance 20 | 21 | from matplotlib.axes import Axes 22 | 23 | x_ben = np.load("ben_hid_repre.npy") 24 | x_van = np.load("van_hid_repre.npy") 25 | 26 | x_fake = sample_shuffle(np.load("x_fake.npy"))[0:len(x_van)] 27 | 28 | X = np.concatenate((x_ben, x_van, x_fake)) 29 | y = np.concatenate((np.ones(len(x_ben)), np.zeros(len(x_van)), np.ones(len(x_fake)) + 1)) 30 | eps_X = get_eps(X) 31 | 32 | clusters, outlier = db_span(X, 1.4305, 180) 33 | 34 | # clusters, outlier = db_span(X, eps_X*.48, 180) 35 | # print "eps: ", eps_X*.48 36 | 37 | 38 | 39 | cluster_X = list() 40 | cluster_y = list() 41 | cluster_c = list() 42 | for cluster_id, class_ids in clusters.items(): 43 | cluster_X.extend(X[class_ids]) 44 | cluster_y.extend(y[class_ids]) 45 | cluster_c.extend((np.zeros(np.sum(class_ids))+cluster_id).tolist()) 46 | cluster_X, cluster_y, cluster_c = np.array(cluster_X), np.array(cluster_y), np.array(cluster_c) 47 | np.save("cluster_X", cluster_X) 48 | np.save("cluster_y", cluster_y) 49 | np.save("cluster_c", cluster_c) 50 | 51 | cluster_label = list() 52 | cluster_samples = list() 53 | for cid in set(cluster_c): 54 | cluster_label.append(cluster_y[cluster_c == cid]) 55 | cluster_samples.append(cluster_X[cluster_c == cid]) 56 | 57 | for i, e in enumerate(cluster_label): 58 | tmp = np.array([np.sum(e == 1), np.sum(e == 0), np.sum(e == 2)]) 59 | print "cluster %s: "%i, tmp, tmp/float(np.sum(tmp)), np.sum(tmp) 60 | 61 | for i in np.arange(len(cluster_samples)): 62 | for j in np.arange(len(cluster_samples)): 63 | if i != j: 64 | inter_dist = distance.euclidean(np.mean(cluster_samples[i], axis=0), 65 | np.mean(cluster_samples[j], axis=0)) 66 | print "cluster %s & %s: %s"%(i, j, inter_dist) 67 | 68 | 69 | print "*****************************************************************" 70 | 71 | i += 1 72 | print "Outlier components: " 73 | outlier_y = y[outlier] 74 | outlier_component = np.array([np.sum(outlier_y == 1), 
np.sum(outlier_y == 0), np.sum(outlier_y == 2)]) 75 | print "cluster %s: " % i, outlier_component, outlier_component / float(np.sum(outlier_component)), np.sum(outlier_component) 76 | -------------------------------------------------------------------------------- /Baselines/libs.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Author: Panpan Zheng 3 | Date created: 1/15/2018 4 | Python Version: 2.7 5 | ''' 6 | 7 | import os 8 | import sys 9 | sys.path.append(".") 10 | import numpy as np 11 | from shutil import rmtree 12 | from sklearn import preprocessing as pp 13 | from utils import getUserDict, getPageDict, TleRevTim, TimeDiff, IsMetaPage, encode, train_test_split 14 | 15 | from keras.layers import Input, Dense, LSTM, RepeatVector, Embedding 16 | from keras.models import Model, Sequential 17 | from keras.layers.core import Activation, Dense, Masking 18 | import theano.tensor as T 19 | from keras.callbacks import EarlyStopping 20 | from keras import regularizers 21 | 22 | 23 | 24 | 25 | class Dataset(object): 26 | """docstring for Dataset""" 27 | def __init__(self): 28 | super(Dataset, self).__init__() 29 | 30 | # def getSamples(self, X_pages, X_tim, X_rev, y, metaDict, page2Cgr, seqType, seqLenLow, seqLenUp, storePath): 31 | def getSamples(self, *args): 32 | 33 | if len(args) == 10: 34 | self.X_pages = args[0] 35 | self.X_tim = args[1] 36 | self.X_rev = args[2] 37 | self.y = args[3] 38 | self.metaDict = args[4] 39 | self.page2Cgr = args[5] 40 | self.seqType = args[6] 41 | self.seqLenLow = args[7] 42 | self.seqLenUp = args[8] 43 | self.storePath = args[9] 44 | self.flag = 1 45 | elif len(args) == 9: 46 | self.X_pages = args[0] 47 | self.X_tim = args[1] 48 | self.y = args[2] 49 | self.metaDict = args[3] 50 | self.page2Cgr = args[4] 51 | self.seqType = args[5] 52 | self.seqLenLow = args[6] 53 | self.seqLenUp = args[7] 54 | self.storePath = args[8] 55 | self.flag = 0 56 | 57 | self.X = list() 58 | # print self.flag 59 | # exit(0) 60 | for i, pages in enumerate(self.X_pages): 61 | 62 | tims = self.X_tim[i] 63 | if self.flag: 64 | revs = self.X_rev[i] 65 | isMetaTem = list() 66 | timDiffTem1 = list() 67 | timDiffTem2 = list() 68 | timDiffTem3 = list() 69 | reEditTem = list() 70 | consEditTem = list() 71 | comCgrTem = list() 72 | metEmptyTem = list() 73 | for j, page in enumerate(pages): 74 | # meta-page ? 75 | isMetaTem.append(IsMetaPage(page, self.metaDict)) 76 | # meta-page is empty ? 77 | if IsMetaPage(page, self.metaDict): 78 | if not eval(self.page2Cgr[page]): 79 | metEmptyTem.append(1) 80 | else: 81 | metEmptyTem.append(0) 82 | else: 83 | metEmptyTem.append(0) 84 | 85 | if j == 0: 86 | timDiffTem1.append(0) 87 | timDiffTem2.append(0) 88 | timDiffTem3.append(0) 89 | reEditTem.append(0) 90 | consEditTem.append(0) 91 | comCgrTem.append(0) 92 | else: 93 | # time difference < 1, 3, 15 mins ? 94 | timDiffTem1.append(TimeDiff(tims[j-1], tims[j], 1)) 95 | timDiffTem2.append(TimeDiff(tims[j-1], tims[j], 3)) 96 | timDiffTem3.append(TimeDiff(tims[j-1], tims[j], 15)) 97 | # has it been edited before? 98 | if page in pages[0:j]: 99 | reEditTem.append(1) 100 | # Is it a consecutive edit ? 101 | if page == pages[j-1]: 102 | consEditTem.append(1) 103 | else: 104 | consEditTem.append(0) 105 | else: 106 | reEditTem.append(0) 107 | consEditTem.append(0) 108 | # share common category ? 
109 | if eval(self.page2Cgr[page]).intersection( 110 | eval(self.page2Cgr[pages[j-1]])): 111 | comCgrTem.append(1) 112 | else: 113 | comCgrTem.append(0) 114 | 115 | X_idvl = list() 116 | 117 | if self.flag: 118 | for j, e in enumerate(isMetaTem): 119 | X_code = list() 120 | attrs = [e, 121 | timDiffTem1[j], 122 | timDiffTem2[j], 123 | timDiffTem3[j], 124 | reEditTem[j], 125 | consEditTem[j], 126 | comCgrTem[j], 127 | metEmptyTem[j], 128 | revs[j] 129 | ] 130 | for attr in attrs: 131 | X_code.extend(encode(attr, 2)) 132 | X_idvl.append(X_code) 133 | else: 134 | for j, e in enumerate(isMetaTem): 135 | X_code = list() 136 | attrs = [e, 137 | timDiffTem1[j], 138 | timDiffTem2[j], 139 | timDiffTem3[j], 140 | reEditTem[j], 141 | consEditTem[j], 142 | comCgrTem[j], 143 | metEmptyTem[j] 144 | ] 145 | for attr in attrs: 146 | X_code.extend(encode(attr, 2)) 147 | X_idvl.append(X_code) 148 | 149 | self.X.append(pp.normalize(X_idvl, axis=1)) 150 | 151 | self.X = np.array(self.X) 152 | X_value = list() 153 | y_value = list() 154 | 155 | if self.seqType == 'fix': 156 | for i, e in enumerate(self.X): 157 | if self.seqLenLow >= len(e): 158 | X_value.append(e) 159 | else: 160 | X_value.append(e[0:self.seqLenLow]) 161 | y_value.append(self.y[i]) 162 | elif self.seqType == 'var': 163 | for i, e in enumerate(self.X): 164 | if self.seqLenUp >= len(e) >= self.seqLenLow: 165 | X_value.append(e) 166 | y_value.append(self.y[i]) 167 | # X_val_P = pad_sequences(X_val, maxlen=self.seqLenUp, dtype='float') 168 | 169 | X_value = np.array(X_value) 170 | y_value = np.array(y_value) 171 | 172 | # print X_val.shape 173 | # print y_val.shape 174 | 175 | x_benign = [X_value[i] for i, e in enumerate(y_value) if e == 0] 176 | x_vandal = [X_value[i] for i, e in enumerate(y_value) if e == 1] 177 | 178 | x_benign, x_vandal = np.array(x_benign), np.array(x_vandal) 179 | 180 | if self.seqType == 'fix': 181 | if self.flag: 182 | np.save(self.storePath + "X_18_1_20_Ben.npy", x_benign) 183 | np.save(self.storePath + "X_18_1_20_Van.npy", x_vandal) 184 | else: 185 | np.save(self.storePath + "X_16_1_20_Ben.npy", x_benign) 186 | np.save(self.storePath + "X_16_1_20_Van.npy", x_vandal) 187 | elif self.seqType == 'var': 188 | if self.flag: 189 | np.save(self.storePath + "X_v8_%s_%s_Ben.npy"%(self.seqLenLow,self.seqLenUp), x_benign) 190 | np.save(self.storePath + "X_v8_%s_%s_Van.npy"%(self.seqLenLow,self.seqLenUp), x_vandal) 191 | else: 192 | np.save(self.storePath + "X_v6_%s_%s_Ben.npy"%(self.seqLenLow,self.seqLenUp), x_benign) 193 | np.save(self.storePath + "X_v6_%s_%s_Van.npy"%(self.seqLenLow,self.seqLenUp), x_vandal) 194 | 195 | 196 | 197 | def getRawData(self, files, f_users, f_pages, splRatio, basePath): 198 | 199 | self.files = files 200 | self.splRatio = splRatio 201 | self.basePath = basePath 202 | 203 | directory = self.basePath + "wikiEditSeq" + "_" + str(splRatio) 204 | if os.path.exists(directory): 205 | rmtree(directory) 206 | os.makedirs(directory) 207 | directory += "\\" 208 | 209 | # load user-page information from dictionary file and raw data. 210 | user2id, user2Label, id2user = getUserDict(f_users) 211 | page2id, __, __ = getPageDict(f_pages) 212 | titleSet, revSet, timSet = TleRevTim(self.files,user2id,page2id) 213 | 214 | # user2Label = getLabel(filesB,filesV,user2id,0,1) 215 | 216 | X_usrs = list() 217 | y = list() 218 | X_pages = list() 219 | X_tim = list() 220 | X_rev = list() 221 | 222 | 223 | for usrid in titleSet: # to keep userid, label, pageid, revert, editing-time consistent. 
224 | X_usrs.append(usrid) 225 | y.append(user2Label[usrid]) 226 | X_pages.append(titleSet[usrid]) 227 | X_rev.append(revSet[usrid]) 228 | X_tim.append(timSet[usrid]) 229 | 230 | X_len = [len(x) for x in X_pages] 231 | 232 | X_usrs, y, X_pages, X_rev, X_tim, X_len = np.array(X_usrs), np.array(y), \ 233 | np.array(X_pages), np.array(X_rev), np.array(X_tim), np.array(X_len) 234 | 235 | 236 | X_usrs_train, y_train, X_pages_train, X_tim_train, X_rev_train, X_len_train, \ 237 | X_usrs_test, y_test, X_pages_test, X_tim_test, X_rev_test, X_len_test = \ 238 | train_test_split(X_usrs, y, X_pages, X_tim, X_rev, X_len, splRatio) 239 | 240 | np.save(directory + "X_usrs.npy", X_usrs) 241 | np.save(directory + "y.npy", y) 242 | np.save(directory + "X_pages.npy", X_pages) 243 | np.save(directory + "X_rev.npy", X_rev) 244 | np.save(directory + "X_tim.npy", X_tim) 245 | np.save(directory + "X_len.npy", X_len) 246 | 247 | np.save(directory + "X_usrs_train.npy", X_usrs_train) 248 | np.save(directory + "y_train.npy", y_train) 249 | np.save(directory + "X_pages_train.npy", X_pages_train) 250 | np.save(directory + "X_tim_train.npy", X_tim_train) 251 | np.save(directory + "X_rev_train.npy", X_rev_train) 252 | np.save(directory + "X_len_train.npy", X_len_train) 253 | 254 | np.save(directory + "X_usrs_test.npy", X_usrs_test) 255 | np.save(directory + "y_test.npy", y_test) 256 | np.save(directory + "X_pages_test.npy", X_pages_test) 257 | np.save(directory + "X_tim_test.npy", X_tim_test) 258 | np.save(directory + "X_rev_test.npy", X_rev_test) 259 | np.save(directory + "X_len_test.npy", X_len_test) 260 | 261 | 262 | 263 | class Autoencoder(object): 264 | """docstring for Autoencoder""" 265 | # def __init__(self, sampleWeights, sample_weight_mode): 266 | def __init__(self): 267 | # super(Autoencoder, self).__init__() 268 | # self.codeLayerType = 'dense' 269 | self.nb_epoch = 20 270 | self.batch_size = 256 271 | self.shuffle = True 272 | self.validation_split = 0.05 273 | self.optimizer = 'adadelta' 274 | self.loss = 'mse' 275 | # self.sampleWeights = sampleWeights 276 | # self.sample_weight_mode = sample_weight_mode 277 | 278 | 279 | def model(self, codeLayerType, inputDim, codeDim): 280 | 281 | self.codeLayerType = codeLayerType 282 | assert len(codeDim) > 0 283 | 284 | if self.codeLayerType == 'lstm': 285 | assert len(inputDim) == 2 286 | inputData = Input(shape=(inputDim[0],inputDim[1])) 287 | 288 | if len(codeDim) == 1: 289 | encoded = LSTM(codeDim[0])(inputData) 290 | decoded = RepeatVector(inputDim[0])(encoded) 291 | elif len(codeDim) > 1: 292 | encoded = inputData 293 | for i, units in enumerate(codeDim): 294 | if i == len(codeDim) - 1: 295 | encoded = LSTM(units)(encoded) 296 | continue 297 | encoded = LSTM(units, return_sequences=True)(encoded) 298 | 299 | for i, units in enumerate(reversed(codeDim)): 300 | if i == 1: 301 | decoded = LSTM(units, return_sequences=True)(RepeatVector(inputDim[0])(encoded)) 302 | elif i > 1: 303 | decoded = LSTM(units, return_sequences=True)(decoded) 304 | else: 305 | raise ValueError("The codDim must be over 0.") 306 | 307 | decoded = LSTM(inputDim[-1], return_sequences=True)(decoded) 308 | self.model = Model(inputData, decoded) 309 | 310 | elif self.codeLayerType == 'dense': 311 | assert len(inputDim) == 1 312 | inputData = Input(shape=(inputDim[0],)) 313 | encoded = inputData 314 | for i, units in enumerate(codeDim): 315 | encoded = Dense(units, activation='relu')(encoded) 316 | decoded = Dense(inputDim[-1], activation='sigmoid')(encoded) 317 | self.model = Model(inputData, 
decoded) 318 | 319 | elif self.codeLayerType == 'cov': 320 | pass 321 | 322 | 323 | def modelMasking(self, codeLayerType, inputDim, codeDim): 324 | 325 | self.codeLayerType = codeLayerType 326 | assert len(codeDim) > 0 327 | 328 | if self.codeLayerType == 'lstm': 329 | assert len(inputDim) == 2 330 | inputData = Input(shape=(inputDim[0],inputDim[1])) 331 | mask = Masking(mask_value=0.)(inputData) 332 | if len(codeDim) == 1: 333 | encoded = LSTM(codeDim[0])(mask) 334 | decoded = RepeatVector(inputDim[0])(encoded) 335 | elif len(codeDim) > 1: 336 | encoded = mask 337 | for i, units in enumerate(codeDim): 338 | if i == len(codeDim) - 1: 339 | encoded = LSTM(units)(encoded) 340 | continue 341 | encoded = LSTM(units, return_sequences=True)(encoded) 342 | 343 | for i, units in enumerate(reversed(codeDim)): 344 | if i == 1: 345 | decoded = LSTM(units, return_sequences=True)(RepeatVector(inputDim[0])(encoded)) 346 | elif i > 1: 347 | decoded = LSTM(units, return_sequences=True)(decoded) 348 | else: 349 | raise ValueError("The codDim must be over 0.") 350 | 351 | decoded = LSTM(inputDim[-1], return_sequences=True)(decoded) 352 | self.model = Model(inputData, decoded) 353 | 354 | elif self.codeLayerType == 'cov': 355 | pass 356 | elif self.codeLayerType == 'dense': 357 | assert len(inputDim) == 1 358 | inputData = Input(shape=(inputDim[0],)) 359 | # encoded = inputData 360 | # for i, units in enumerate(codeDim): 361 | # encoded = Dense(units, activation='relu')(encoded) 362 | # decoded = Dense(inputDim[-1], activation='sigmoid')(encoded) 363 | # self.model = Model(inputData, decoded) 364 | encoder = Dense(codeDim[0], activation="tanh", 365 | activity_regularizer=regularizers.l1(10e-5))(inputData) 366 | encoder = Dense(int(codeDim[0]/2), activation="relu")(encoder) 367 | decoder = Dense(int(codeDim[0]/2), activation='tanh')(encoder) 368 | decoder = Dense(inputDim[0], activation='relu')(decoder) 369 | self.model = Model(inputData, decoder) 370 | 371 | def compile(self, *args): 372 | 373 | if len(args) == 0: 374 | self.model.compile(optimizer=self.optimizer, loss=self.loss) 375 | elif len(args) == 1: 376 | if args[0] == 'temporal': 377 | self.sample_weight_mode = args[0] 378 | self.model.compile(optimizer=self.optimizer, loss=self.loss, sample_weight_mode=self.sample_weight_mode) 379 | elif args[0] == 'customFunction': 380 | self.model.compile(optimizer=self.optimizer, loss= self.weighted_vector_mse) 381 | else: 382 | raise ValueError("Invalid maskType, please input 'sampleWeights' or 'customFunction'") 383 | else: 384 | raise ValueError("argument # must be 0 or 1.") 385 | 386 | 387 | def fit(self, *args): 388 | 389 | # early_stopping = EarlyStopping(monitor='val_loss', min_delta=0.01, patience=3, verbose=1, mode='auto') 390 | if len(args) == 2: 391 | if args[1] == 'nor': 392 | self.model.fit(args[0], 393 | args[0], 394 | nb_epoch=self.nb_epoch, 395 | batch_size=self.batch_size, 396 | shuffle=self.shuffle, 397 | validation_split=self.validation_split) 398 | # callbacks = [early_stopping]) 399 | elif args[1] == 'rev': 400 | self.model.fit(args[0], 401 | np.flip(args[0], 1), 402 | nb_epoch=self.nb_epoch, 403 | batch_size=self.batch_size, 404 | shuffle=self.shuffle, 405 | validation_split=self.validation_split) 406 | # callbacks=[early_stopping]) 407 | else: 408 | raise ValueError("decoding sequence type: 'normal' or 'reverse'.") 409 | 410 | elif len(args) == 3: 411 | self.sampleWeights = args[2] 412 | if args[1] == 'nor': 413 | self.model.fit(args[0], 414 | args[0], 415 | nb_epoch=self.nb_epoch, 416 | 
batch_size=self.batch_size, 417 | shuffle=self.shuffle, 418 | validation_split=self.validation_split, 419 | sample_weight=self.sampleWeights) 420 | # callbacks=[early_stopping]) 421 | elif args[1] == 'rev': 422 | self.model.fit(args[0], 423 | np.flip(args[0], 1), 424 | nb_epoch=self.nb_epoch, 425 | batch_size=self.batch_size, 426 | shuffle=self.shuffle, 427 | validation_split=self.validation_split, 428 | sample_weight=self.sampleWeights) 429 | # callbacks=[early_stopping]) 430 | else: 431 | raise ValueError("Please input, 'data', 'nor' or 'rev', 'sample_weights'") 432 | 433 | def predict(self, data): 434 | return self.model.predict(data) 435 | 436 | def weighted_vector_mse(self, y_true, y_pred): 437 | 438 | self.y_true = y_true 439 | self.y_pred = y_pred 440 | 441 | weight = T.ceil(self.y_true) 442 | loss = T.square(weight * (self.y_true - self.y_pred)) 443 | # use appropriate relations for other objectives. E.g, for binary_crossentropy: 444 | #loss = weights * (y_true * T.log(y_pred) + (1.0 - y_true) * T.log(1.0 - y_pred)) 445 | return T.mean(T.sum(loss, axis=-1)) 446 | 447 | 448 | 449 | 450 | -------------------------------------------------------------------------------- /Baselines/model_components.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Author: Panpan Zheng 3 | Date created: 1/15/2018 4 | Python Version: 2.7 5 | ''' 6 | 7 | import os 8 | import sys 9 | import numpy as np 10 | from keras.callbacks import EarlyStopping 11 | from keras.layers import Input 12 | from keras.models import Model, Sequential 13 | from keras.layers.core import Dense 14 | from keras.layers import Reshape, Flatten, LeakyReLU, Activation 15 | from keras_adversarial.legacy import l1l2 16 | from sklearn.metrics import classification_report 17 | 18 | 19 | 20 | def get_generator(G_in, output_dim, hidden_dim=100, reg=lambda: l1l2(1e-5, 1e-5)): 21 | 22 | x = Dense(int(hidden_dim), name="generator_h1", W_regularizer=reg())(G_in) 23 | x = LeakyReLU(0.2)(x) 24 | x = Dense(output_dim, name="generator_x_flat", W_regularizer=reg())(x) 25 | G_out = Activation('tanh')(x) 26 | # G_out = Activation('sigmoid')(x) 27 | G = Model(G_in, G_out) 28 | G.compile(loss='binary_crossentropy', optimizer='adam') 29 | return G, G_out 30 | 31 | def get_discriminator(D_in, hidden_dim=50, reg=lambda: l1l2(1e-5, 1e-5)): 32 | 33 | x = Dense(hidden_dim * 2, name="discriminator_h1",W_regularizer=reg())(D_in) 34 | x = LeakyReLU(0.2)(x) 35 | x = Dense(hidden_dim, name="discriminator_h2",W_regularizer=reg())(x) 36 | x = LeakyReLU(0.2)(x) 37 | x = Dense(1, name="discriminator_y", W_regularizer=reg())(x) 38 | D_out = Activation("sigmoid")(x) 39 | D = Model(D_in, D_out) 40 | D.compile(loss='binary_crossentropy', optimizer='sgd') 41 | return D, D_out 42 | 43 | # Freeze weights in the discriminator for stacked training 44 | def set_trainable(model, trainable): 45 | model.trainable = trainable 46 | for l in model.layers: 47 | l.trainable = trainable 48 | 49 | # Build stacked GAN model 50 | def make_gan(GAN_in, G, D): 51 | set_trainable(D, False) 52 | x = G(GAN_in) 53 | GAN_out = D(x) 54 | GAN = Model(GAN_in, GAN_out) 55 | GAN.compile(loss='binary_crossentropy', optimizer='adam') 56 | return GAN, GAN_out 57 | 58 | # Training Procedure Definition 59 | def sample_data_and_gen(XT, G, noise_dim=50): 60 | n_samples = XT.shape[0] 61 | s = np.arange(2*n_samples) 62 | np.random.shuffle(s) 63 | XN_noise = np.random.normal(0, 1, size=[n_samples, noise_dim]) 64 | XN = G.predict(XN_noise) 65 | X = np.concatenate((XT, 
XN)) 66 | y = np.ones(2*n_samples) 67 | y[n_samples:] = 0 68 | X = X[s] 69 | y = y[s] 70 | return X, y 71 | 72 | def sample_noise(n_samples, noise_dim=50): 73 | X = np.random.normal(0, 1, size=[n_samples, noise_dim]) 74 | y = np.ones(n_samples) 75 | return X, y 76 | 77 | def pretrain(G, D, XT, batch_size=50): 78 | X, y = sample_data_and_gen(XT, G, noise_dim=50) 79 | set_trainable(D, True) 80 | D.fit(X, y, nb_epoch=1, batch_size=batch_size) 81 | 82 | def batch_divide(X, batch_size): 83 | q = np.divide(X, batch_size) 84 | r = np.remainder(X, batch_size) 85 | return q, r 86 | 87 | def train_and_test(GAN, G, D, XT, x_test, y_test, en_ae, epochs, verbose=True, v_freq=10): 88 | if en_ae == 1: 89 | XT = XT[0:7000] 90 | x_test, y_test = random_sampling_test_data(x_test, y_test, 3000) 91 | batch_size = 700 92 | elif en_ae == 2: 93 | XT = XT[0:700] 94 | x_test, y_test = random_sampling_test_data(x_test, y_test, 490) 95 | batch_size = 70 96 | D_fake_prob = list() 97 | D_real_prob = list() 98 | D_val_prob = list() 99 | fake_real_mse = list() 100 | f1_score_coll = list() 101 | 102 | # D_loss = list() 103 | # G_loss = list() 104 | # accuracy = list() 105 | 106 | for epoch in range(epochs): 107 | X, y = sample_data_and_gen(XT, G, noise_dim=50) 108 | X_real = X[y == 1] 109 | X_fake = X[y == 0] 110 | d_loss = list() 111 | g_loss = list() 112 | q, r = batch_divide(X_real.shape[0], batch_size) 113 | for i in range(q): 114 | set_trainable(D, True) 115 | d_loss.append(D.train_on_batch(np.array( 116 | X_real[i * batch_size:(i + 1) * batch_size].tolist() + 117 | X_fake[i * batch_size:(i + 1) * batch_size].tolist() 118 | ), 119 | np.array( 120 | np.ones(batch_size).tolist() + np.zeros(batch_size).tolist() 121 | ))) 122 | 123 | set_trainable(D, False) 124 | X_gan, y_gan = sample_noise(batch_size, 50) 125 | g_loss.append(GAN.train_on_batch(X_gan,y_gan)) 126 | 127 | if r != 0: 128 | set_trainable(D, True) 129 | d_loss.append(D.train_on_batch( 130 | np.array( 131 | X_real[-r:].tolist() + X_fake[-r:].tolist() 132 | ), 133 | np.array( 134 | np.ones(r).tolist() + np.zeros(r).tolist() 135 | ))) 136 | set_trainable(D, False) 137 | X_r, y_r = sample_noise(r, 50) 138 | g_loss.append(GAN.train_on_batch(X_r,y_r)) 139 | 140 | fake_real_mse.append(np.mean(np.sqrt((X_real-X_fake)**2))) 141 | D_fake_prob.append(np.mean(D.predict(X_fake))) 142 | D_real_prob.append(np.mean(D.predict(X_real))) 143 | D_val_prob.append(np.mean(D.predict(x_test[y_test==0]))) 144 | # D_loss.append(np.mean(d_loss)) 145 | # G_loss.append(np.mean(g_loss)) 146 | 147 | y_pred = (D.predict(x_test) > .5).astype(int).flatten() 148 | conf_mat = classification_report(y_test, y_pred, target_names=['vandal', 'benign'], digits=4) 149 | f1_score = float(filter(None, conf_mat.strip().split(" "))[7]) 150 | f1_score_coll.append(f1_score) 151 | # print "epoch: ", epoch, " ", filter(None, conf_mat[-50:].strip().split(" "))[-2] 152 | # print "epoch:%s"%epoch 153 | # print conf_mat 154 | # f1_score.append(float(filter(None, conf_mat.strip().split(" "))[7])) 155 | # acc = np.sum(y_pred == y_test)/float(y_pred.shape[0]) 156 | # accuracy.append(acc) 157 | return D, X_fake, D_real_prob, D_fake_prob, D_val_prob, fake_real_mse, f1_score_coll 158 | 159 | 160 | def random_sampling_test_data(x_test, y_test,n_samples=3000): 161 | x_test_ben = x_test[y_test == 1] 162 | x_test_van = x_test[y_test != 1] 163 | assert x_test_ben.shape[0] == x_test_van.shape[0] 164 | assert x_test_ben.shape[0] >= n_samples 165 | # s = np.arange(x_test_ben.shape[0]) 166 | # np.random.shuffle(s) 167 | # s = 
s[:n_samples] 168 | x_test = np.concatenate((x_test_ben[0:n_samples], x_test_van[0:n_samples])) 169 | y_test = np.ones(2 * n_samples) 170 | y_test[n_samples:] = 0 171 | return x_test, y_test 172 | 173 | 174 | def train_gan(GAN, G, D, XT, epochs, en_ae, verbose=True, v_freq=10): 175 | if en_ae == 1: 176 | XT = XT[0:7000] 177 | batch_size = 700 178 | else: 179 | XT = XT[0:700] 180 | batch_size = 70 181 | for epoch in range(epochs): 182 | X, y = sample_data_and_gen(XT, G, noise_dim=50) 183 | X_real = X[y == 1] 184 | X_fake = X[y == 0] 185 | d_loss = list() 186 | g_loss = list() 187 | q, r = batch_divide(X_real.shape[0], batch_size) 188 | for i in range(q): 189 | set_trainable(D, True) 190 | d_loss.append(D.train_on_batch(np.array( 191 | X_real[i * batch_size:(i + 1) * batch_size].tolist() + 192 | X_fake[i * batch_size:(i + 1) * batch_size].tolist() 193 | ), 194 | np.array( 195 | np.ones(batch_size).tolist() + np.zeros(batch_size).tolist() 196 | ))) 197 | 198 | set_trainable(D, False) 199 | X_gan, y_gan = sample_noise(batch_size, 50) 200 | g_loss.append(GAN.train_on_batch(X_gan,y_gan)) 201 | 202 | if r != 0: 203 | set_trainable(D, True) 204 | d_loss.append(D.train_on_batch( 205 | np.array( 206 | X_real[-r:].tolist() + X_fake[-r:].tolist() 207 | ), 208 | np.array( 209 | np.ones(r).tolist() + np.zeros(r).tolist() 210 | ))) 211 | set_trainable(D, False) 212 | X_r, y_r = sample_noise(r, 50) 213 | g_loss.append(GAN.train_on_batch(X_r,y_r)) 214 | return D 215 | 216 | 217 | def run_Gan(x_test, y_test, D, en_ae): 218 | if en_ae == 1: 219 | x_test, y_test = random_sampling_test_data(x_test, y_test, 3000) 220 | else: 221 | x_test, y_test = random_sampling_test_data(x_test, y_test, 490) 222 | 223 | y_pred = (D.predict(x_test) > .5).astype(int).flatten() 224 | conf_mat = classification_report(y_test, y_pred, target_names=['vandal', 'benign'], digits=4) 225 | acc = np.sum(y_pred == y_test) / float(y_pred.shape[0]) 226 | print conf_mat 227 | return np.array(filter(None, conf_mat.strip().split(" "))[5]).astype(float),\ 228 | np.array(filter(None, conf_mat.strip().split(" "))[6]).astype(float), \ 229 | np.array(filter(None, conf_mat.strip().split(" "))[7]).astype(float), acc 230 | 231 | def run_one_svm(x_test, y_test, clf, en_ae): 232 | if en_ae == 1: 233 | N = 3000 234 | else: 235 | N = 490 236 | x_test_svm = np.concatenate((x_test[y_test == 1][0:N], x_test[y_test == 2][0:N])) 237 | y_test_svm = np.concatenate((np.ones(N), np.zeros(N)-1)) 238 | y_pred = clf.predict(x_test_svm) 239 | conf_mat = classification_report(y_test_svm, y_pred, target_names=['vandal', 'benign'], digits=4) 240 | acc = np.sum(y_pred == y_test_svm) / float(y_pred.shape[0]) 241 | 242 | return np.array(filter(None, conf_mat.strip().split(" "))[5]).astype(float),\ 243 | np.array(filter(None, conf_mat.strip().split(" "))[6]).astype(float), \ 244 | np.array(filter(None, conf_mat.strip().split(" "))[7]).astype(float), acc 245 | -------------------------------------------------------------------------------- /Baselines/representation_libs.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Author: Panpan Zheng 3 | Date created: 1/15/2018 4 | Python Version: 2.7 5 | ''' 6 | 7 | import numpy as np 8 | from sklearn.cluster import DBSCAN 9 | from collections import defaultdict 10 | 11 | def db_span(X, eps, min_samples): 12 | db = DBSCAN(eps=eps, min_samples=min_samples).fit(X) 13 | outlier = None 14 | cluster_label = dict() 15 | for label_id in set(db.labels_): 16 | if label_id == -1: 17 | outlier = 
(db.labels_ == label_id) 18 | continue 19 | cluster_label[label_id] = (db.labels_ == label_id) 20 | return cluster_label, outlier 21 | 22 | def cluster_analyis(cluster_label): 23 | dict_of_cluster = dict() 24 | for label_id in cluster_label: 25 | dict_of_cluster[label_id] = np.sum(cluster_label[label_id]) 26 | return dict_of_cluster 27 | 28 | def get_eps(X): 29 | X_contre = np.mean(X, axis=0) 30 | diff_to_contre = X - X_contre 31 | dist_to_contre = list(map(lambda x: np.sqrt(np.sum(x ** 2)), diff_to_contre)) 32 | return np.mean(dist_to_contre) 33 | 34 | def DB_statistics(X, eps, min_num_samples): 35 | cluster_index, outlier_index = db_span(X, eps, min_num_samples) 36 | dict_of_cluster = cluster_analyis(cluster_index) 37 | print("num_of_cluster: ", len(dict_of_cluster.keys())) 38 | print("cluster_list ", dict_of_cluster) 39 | print("outlier_rate: ", np.sum(outlier_index), np.sum(outlier_index)/ float(len(outlier_index))) 40 | -------------------------------------------------------------------------------- /Baselines/run_baseline.m: -------------------------------------------------------------------------------- 1 | % Author: Panpan Zheng 2 | % Date created: 1/15/2018 3 | 4 | function [precision_neg, recall_neg, f1_neg, accuracy] = run_baseline(file_url,NDtype,en_ae) 5 | 6 | % type_list = ['dist', 'nn', 'kmeans', 'parzen', 'gmm', 'svmTax', 'gpoc', 'kde', 'som', 'pca', 'kpca']; 7 | 8 | data = load(file_url); 9 | X = data.x; 10 | y = data.y; 11 | 12 | %% Sampling training, validating, and testing set. 13 | 14 | isnor = y == 1; % regard class 1 as normal. 15 | isab = ~isnor; 16 | [traindataNorOri, validdataNorOri, testdataNorOri, validdataAbOri, testdataAbOri] = splitData(X,isab,en_ae); 17 | 18 | size(traindataNorOri, 1); 19 | size(testdataNorOri, 1); 20 | size(testdataAbOri, 1); 21 | size(validdataNorOri, 1); 22 | size(validdataAbOri, 1); 23 | 24 | %% Training, validating, and testing. 
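% The one-class detectors below are fitted on traindataNorOri only, i.e. on
% normal (benign) samples; no abnormal data is used during model fitting.
% The switch simply dispatches to the matching train_* routine for NDtype.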
25 | 26 | switch lower(NDtype) 27 | case 'gpoc' 28 | trained_model = train_gpoc(traindataNorOri); 29 | case 'svmsch' 30 | trained_model = train_svmsch(traindataNorOri); 31 | case 'nn' 32 | trained_model = train_nn(traindataNorOri); 33 | case 'kpca' 34 | trained_model = train_kpca(traindataNorOri); 35 | case 'svmtax' 36 | trained_model = train_svmtax(traindataNorOri); 37 | case 'pca' 38 | trained_model = train_pca(traindataNorOri); 39 | case 'kde' 40 | trained_model = train_kde(traindataNorOri); 41 | end 42 | 43 | 44 | %% Testing 45 | 46 | switch lower(NDtype) 47 | case 'gpoc' 48 | output_nor = out_gpoc(testdataNorOri, trained_model); 49 | output_ab = out_gpoc(testdataAbOri, trained_model); 50 | case 'svmsch' 51 | output_nor = out_svmsch(testdataNorOri, trained_model); 52 | output_ab = out_svmsch(testdataAbOri, trained_model); 53 | case 'nn' 54 | output_nor = out_nn(testdataNorOri, trained_model); 55 | output_ab = out_nn(testdataAbOri, trained_model); 56 | case 'kpca' 57 | output_nor = out_kpca(testdataNorOri, trained_model); 58 | output_ab = out_kpca(testdataAbOri, trained_model); 59 | case 'svmtax' 60 | output_nor = out_svmtax(testdataNorOri, trained_model); 61 | output_ab = out_svmtax(testdataAbOri, trained_model); 62 | case 'pca' 63 | output_nor = out_pca(testdataNorOri, trained_model); 64 | output_ab = out_pca(testdataAbOri, trained_model); 65 | case 'kde' 66 | output_nor = out_kde(testdataNorOri, trained_model); 67 | output_ab = out_kde(testdataAbOri, trained_model); 68 | end 69 | 70 | 71 | %% Validation (threshold) 72 | 73 | 74 | switch lower(NDtype) 75 | case 'gpoc' 76 | [~, optthr] = minErr_thr(trained_model, validdataNorOri, validdataAbOri, 'gpoc'); 77 | case 'svmsch' 78 | [~, optthr] = minErr_thr(trained_model, validdataNorOri, validdataAbOri, 'svmsch'); 79 | case 'nn' 80 | [~, optthr] = minErr_thr(trained_model, validdataNorOri, validdataAbOri, 'nn'); 81 | case 'kpca' 82 | [~, optthr] = minErr_thr(trained_model, validdataNorOri, validdataAbOri, 'kpca'); 83 | case 'svmtax' 84 | [~, optthr] = minErr_thr(trained_model, validdataNorOri, validdataAbOri, 'svmtax'); 85 | case 'pca' 86 | [~, optthr] = minErr_thr(trained_model, validdataNorOri, validdataAbOri, 'pca'); 87 | case 'kde' 88 | [~, optthr] = minErr_thr(trained_model, validdataNorOri, validdataAbOri, 'kde'); 89 | end 90 | 91 | 92 | %% Get label 93 | switch lower(NDtype) 94 | case 'gpoc' 95 | pred_nor = assignCls('gpoc', output_nor, optthr); 96 | pred_ab = assignCls('gpoc', output_ab, optthr); 97 | case 'svmsch' 98 | pred_nor = assignCls('svmsch', output_nor, optthr); 99 | pred_ab = assignCls('svmsch', output_ab, optthr); 100 | case 'nn' 101 | pred_nor = assignCls('nn', output_nor, optthr); 102 | pred_ab = assignCls('nn', output_ab, optthr); 103 | case 'kpca' 104 | pred_nor = assignCls('kpca', output_nor, optthr); 105 | pred_ab = assignCls('kpca', output_ab, optthr); 106 | case 'svmtax' 107 | pred_nor = assignCls('svmtax', output_nor, optthr); 108 | pred_ab = assignCls('svmtax', output_ab, optthr); 109 | case 'pca' 110 | pred_nor = assignCls('pca', output_nor, optthr); 111 | pred_ab = assignCls('pca', output_ab, optthr); 112 | case 'kde' 113 | pred_nor = assignCls('kde', output_nor, optthr); 114 | pred_ab = assignCls('kde', output_ab, optthr); 115 | end 116 | 117 | pred_labels = [pred_nor; pred_ab]; 118 | 119 | 120 | 121 | %% Compute confusion matrix of test data. 
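% Target coding for the test set: 0 for normal (benign) points, 1 for
% abnormal ones (tar_nor / tar_ab below). Per-class precision, recall and F1
% are then derived from the confusion matrix, and the *_neg metrics together
% with the overall accuracy are the values returned by run_baseline.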
122 | 123 | tar_nor = zeros(size(output_nor, 1), 1); 124 | tar_ab = ones(size(output_ab, 1), 1); 125 | tar_labels = [tar_nor; tar_ab]; 126 | 127 | 128 | [conf, ~] = confmat(pred_labels, tar_labels); % predTest and tarTest are 0-1 coding. 129 | fprintf('\n'); 130 | disp(conf); 131 | fprintf('\n'); 132 | % fprintf('Confusion matrix using test data is:\n'); 133 | % disp(conf); 134 | accuracy = (conf(1,1) + conf(2,2)) / sum(conf(:)); % accuracy = rate(1) 135 | 136 | precision_pos = conf(1,1)/(conf(1,1) + conf(2,1)); 137 | precision_neg = conf(2,2)/(conf(1,2) + conf(2,2)); 138 | precision = (precision_pos + precision_neg)/2; 139 | 140 | recall_pos = conf(1,1)/(conf(1,1) + conf(1,2)); 141 | recall_neg = conf(2,2)/(conf(2,1) + conf(2,2)); 142 | recall = (recall_pos + recall_neg)/2; 143 | 144 | f1_pos = 2 * ((precision_pos*recall_pos)/(precision_pos + recall_pos)); 145 | f1_neg = 2 * ((precision_neg*recall_neg)/(precision_neg + recall_neg)); 146 | 147 | f1 = (f1_pos + f1_neg)/2; 148 | -------------------------------------------------------------------------------- /Baselines/splitData.m: -------------------------------------------------------------------------------- 1 | % Author: Panpan Zheng 2 | % Date created: 1/15/2018 3 | 4 | function [traindataNor, validdataNor, testdataNor, validdataAb, testdataAb] = splitData(alldata, isab, en_ae) 5 | %SPLITDATA Split data into three groups: training, validation, and test set. 6 | % [traindataNor, validdataNor, testdataNor, validdataAb, testdataAb] = splitData(alldata, isab) 7 | % 8 | % Inputs: 9 | % allData: a matrix, ntrain by nftrs 10 | % isab: flag abnormal cases, ndata by 1 11 | % 12 | % Ouputs: 13 | % traindataNor data to be used for training 14 | % validdataNor, validdataAb data to be used for validation 15 | % testdataNor, testdataAb data to be used for testing 16 | % 17 | % See also demoND 18 | 19 | normaldata = alldata(~isab, :); 20 | abnormaldata = alldata(isab, :); 21 | 22 | numdata = size(alldata, 1); 23 | numAb = sum(isab); 24 | numNor = numdata - numAb; 25 | 26 | 27 | % fprintf('%d \n', numdata); 28 | % fprintf('%d \n', numAb); 29 | % fprintf('%d \n', numNor); 30 | 31 | if en_ae == 1 32 | traindataNor = normaldata(1:7000, :); 33 | testdataNor = normaldata(7001:10000, :); 34 | testdataAb = abnormaldata(1:3000, :); 35 | validdataAb = abnormaldata(3000:end, :); 36 | validdataNor = normaldata(10001:10000+size(validdataAb,1), :); 37 | else 38 | traindataNor = normaldata(1:700, :); 39 | testdataNor = normaldata(701:1190, :); 40 | testdataAb = abnormaldata(1:490, :); 41 | validdataAb = abnormaldata(491:492, :); 42 | validdataNor = normaldata(1191:1190+size(validdataAb,1), :); 43 | end 44 | 45 | % permNor = randperm(numNor); 46 | % indTrainNor = permNor(1:7000); 47 | % traindataNor = normaldata(indTrainNor, :); 48 | % 49 | % 50 | % indTestNor = permNor(7001:10000); 51 | % % indValidNor = permNor(10001:10159); 52 | % testdataNor = normaldata(indTestNor, :); 53 | % permAb = randperm(numAb); 54 | % indTestAb = permAb(1:3000); 55 | % testdataAb = abnormaldata(indTestAb, :); 56 | % 57 | % 58 | % indValidAb = permAb(3001:end); 59 | % validdataAb = abnormaldata(indValidAb, :); 60 | % indValidNor = permNor(10001: 10000+size(validdataAb,1)); 61 | % validdataNor = normaldata(indValidNor, :); 62 | 63 | %% 64 | % if numNor > numAb * 2 65 | % howtosplit = 'balance'; % this is not that correct, used in my AUC paper in Poland conference. 66 | % % howtosplit = 'balanceByPts'; % this is the proper way; see BSP log 1.133. Implemented in RunND_byPts.m. 
67 | % else 68 | % howtosplit = 'percentage'; 69 | % end 70 | % 71 | % switch lower(howtosplit) 72 | % case 'percentage' 73 | % percentTrainNor = 0.6; % use 60% normal data for training. 74 | % percentValidNor = 0.2; % use 20% normal data for validation => use 20% normal data for testing. 75 | % numTrainNor = floor(percentTrainNor * numNor); 76 | % numValidNor = floor(percentValidNor * numNor); 77 | % numTestNor = numNor - numTrainNor - numValidNor; 78 | % 79 | % indValidAb = (1:numValidAb); 80 | % indTestAb = (numValidAb+1 : numAb); 81 | % 82 | % indTrainNor = (1 : numTrainNor); 83 | % indValidNor = (numTrainNor + 1 : numTrainNor + numValidNor); 84 | % indTestNor = (numTrainNor + numValidNor + 1 : numNor); 85 | % 86 | % case 'balance' % Use the same number of normal data for validation and test, in order to balance the data set. 87 | % numValidNor = numValidAb; 88 | % numTestNor = numTestAb; 89 | % numTrainNor = numNor - numValidNor - numTestNor; 90 | % 91 | % permAb = randperm(numAb); 92 | % indValidAb = permAb(1:numValidAb); 93 | % indTestAb = permAb(numValidAb+1 : end); 94 | % 95 | % permNor = randperm(numNor); 96 | % indTrainNor = permNor(1 : numTrainNor); 97 | % indValidNor = permNor(numTrainNor + 1 : numTrainNor + numValidNor); 98 | % indTestNor = permNor(numTrainNor + numValidNor + 1 : end); 99 | % end 100 | 101 | %% 102 | % normaldata = alldata(~isab, :); 103 | % abnormaldata = alldata(isab, :); 104 | % % Find training, validation and test data 105 | % traindataNor = normaldata(indTrainNor, :); % only use normal data for training. 106 | % validdataAb = abnormaldata(indValidAb, :); 107 | % testdataAb = abnormaldata(indTestAb, :); 108 | % validdataNor = normaldata(indValidNor, :); 109 | % testdataNor = normaldata(indTestNor, :); 110 | 111 | end 112 | -------------------------------------------------------------------------------- /Baselines/utils.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Author: Panpan Zheng 3 | Date created: 1/15/2018 4 | Python Version: 2.7 5 | ''' 6 | 7 | import numpy as np 8 | from numpy import random 9 | import pandas as pd 10 | from collections import defaultdict 11 | import datetime 12 | from datetime import * 13 | from numpy import linalg as LA 14 | import matplotlib.pyplot as plt 15 | from sklearn.manifold import TSNE 16 | 17 | 18 | 19 | # functions for data processing. 
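# The helpers below turn the raw user/page/edit CSVs into per-user sequences:
# getUserDict and getPageDict build id lookups, TleRevTim collects the edited
# page ids, revert flags and edit timestamps for every user, getLabel assigns
# benign/vandal labels, and train_test_split randomly partitions those
# per-user arrays into training and test sets.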
20 | 21 | def getUserDict(f_users): 22 | 23 | users_frame = pd.read_csv(f_users, sep='\t', header=None) 24 | users_frame = users_frame.applymap(str) 25 | user2id = users_frame.set_index(0)[1].to_dict() 26 | 27 | uid2type = users_frame.set_index(1)[4].to_dict() 28 | 29 | for uid in uid2type: 30 | if uid2type[uid] == "benign": 31 | uid2type[uid] = 0 32 | else: 33 | uid2type[uid] = 1 34 | 35 | id2user = users_frame.set_index(1)[0].to_dict() 36 | return user2id, uid2type, id2user 37 | 38 | 39 | def getPageDict(f_pages): 40 | 41 | pages_frame = pd.read_csv(f_pages, sep=',') 42 | pages_frame = pages_frame.applymap(str) 43 | page2id = pages_frame.set_index('pagetitle')['pageid'].to_dict() 44 | id2Cgr = pages_frame.set_index('pageid')['pagecategories'].to_dict() 45 | id2page = pages_frame.set_index('pageid')['pagetitle'].to_dict() 46 | return page2id, id2Cgr, id2page 47 | 48 | 49 | def TleRevTim(files,user2id,page2id): 50 | titleSet = defaultdict(list) 51 | revSet = defaultdict(list) 52 | timSet = defaultdict(list) 53 | for f in files: 54 | df = pd.read_csv(f,sep=',') 55 | for index, row in df.iterrows(): 56 | try: 57 | usrid = user2id[row['username']] 58 | except Exception as e: 59 | print row['username'] 60 | try: 61 | pageid = page2id[row['pagetitle']] 62 | except Exception as e: 63 | print row['pagetitle'] 64 | isRev = row['isReverted'] 65 | revTime = row['revtime'] 66 | titleSet[usrid].append(pageid) 67 | if isRev: 68 | revSet[usrid].append(1) 69 | else: 70 | revSet[usrid].append(0) 71 | timSet[usrid].append(revTime) 72 | return titleSet, revSet, timSet 73 | 74 | 75 | 76 | def getLabel(filesB,filesV,user2id,posLabel,negLabel): 77 | usrListB = list() 78 | usrListV = list() 79 | userid2Label = dict() 80 | for f in filesB: 81 | df = pd.read_csv(f,sep=',') 82 | for index, row in df.iterrows(): 83 | usrid = user2id[row['username']] 84 | usrListB.append(usrid) 85 | usrListB = list(set(usrListB)) 86 | for val in usrListB: 87 | userid2Label[val] = posLabel 88 | for f in filesV: 89 | df = pd.read_csv(f,sep=',') 90 | for index, row in df.iterrows(): 91 | usrid = user2id[row['username']] 92 | usrListV.append(usrid) 93 | usrListV = list(set(usrListV)) 94 | for val in usrListV: 95 | userid2Label[val] = negLabel 96 | return userid2Label 97 | 98 | 99 | def train_test_split(*agrs): 100 | 101 | trainRatio = agrs[-1] 102 | assert trainRatio <= 1.0 103 | size = agrs[0].shape[0] 104 | thre = int(trainRatio * 10) 105 | indSet = np.random.randint(0,10, size) + 1 106 | trainInd = [indSet <= thre] 107 | testInd = [indSet > thre] 108 | X_usrs, y, X_pages, X_tim, X_rev, X_len = \ 109 | agrs[0], agrs[1], agrs[2], agrs[3], agrs[4], agrs[5] 110 | return X_usrs[trainInd], y[trainInd], X_pages[trainInd], \ 111 | X_tim[trainInd], X_rev[trainInd], X_len[trainInd],\ 112 | X_usrs[testInd], y[testInd], X_pages[testInd], \ 113 | X_tim[testInd], X_rev[testInd], X_len[testInd] 114 | 115 | 116 | def MetaPageList(files,page2id): 117 | MPL = list() 118 | for f in files: 119 | df = pd.read_csv(f,sep=',') 120 | for index, row in df.iterrows(): 121 | try: 122 | pageid = page2id[row['pagetitle']] 123 | except Exception as e: 124 | print row['pagetitle'] 125 | title = row['pagetitle'].lower() 126 | if "user:" in title or "talk:" in title or "user talk:" in title or "wikipedia" in title: 127 | MPL.append(pageid) 128 | return np.array(list(set(MPL))) 129 | 130 | def encode(x, n): 131 | x = int(x) 132 | result = np.zeros(n).tolist() 133 | result[x] = 1. 
134 | return result 135 | 136 | def TimeDiff(p1, p2, timDiff): 137 | p1 = datetime.strptime(p1, '%Y-%m-%dT%XZ') 138 | p2 = datetime.strptime(p2, '%Y-%m-%dT%XZ') 139 | td = p2 - p1 140 | if td.days*24*60 + td.seconds/60 < timDiff: 141 | return 1 142 | else: 143 | return 0 144 | 145 | 146 | def IsMetaPage(p, metaDict): 147 | if p in metaDict: 148 | return 1 149 | else: 150 | return 0 151 | 152 | 153 | 154 | # functions for model training and data analysis 155 | 156 | def recMSE(a,b): 157 | return np.mean((a-b)**2) 158 | 159 | def recErr(X1, X2): 160 | seqRecErr = list() 161 | for s1, s2 in zip(X1, X2): 162 | seqRecErr.append(np.sum((s1 - s2)**2)/float(np.prod(s1.shape))) 163 | # seqRecErr.append(np.mean([LA.norm(x1 - x2)**2/8. for x1, x2 in zip(s1, s2)])) 164 | # seqRecErr.append(np.mean([LA.norm(x1 - x2) for x1, x2 in zip(s1, s2)])) 165 | return np.array(seqRecErr) 166 | 167 | def recErrMeaVar(X): 168 | return np.mean(X), np.var(X) 169 | 170 | def recErrHist(*args): 171 | 172 | plt.figure() 173 | # # axes = plt.gca() 174 | # # axes.set_xlim([0.,1.0]) 175 | # # axes.set_ylim([ymin,ymax]) 176 | # plt.subplot(3,1,1) 177 | # plt.title('Trainning') 178 | # weights = np.ones_like(args[0])/len(args[0]) 179 | # plt.hist(args[0], weights=weights, bins=100) 180 | # plt.ylabel('Probability') 181 | # axes = plt.gca() 182 | # axes.set_xlim([0.,.1]) 183 | 184 | plt.subplot(2,1,1) 185 | plt.title('Benign') 186 | weights = np.ones_like(args[0])/len(args[0]) 187 | plt.hist(args[0], weights=weights, bins=100) 188 | plt.ylabel('Probability') 189 | axes = plt.gca() 190 | axes.set_xlim([0.,.1]) 191 | 192 | plt.subplot(2,1,2) 193 | plt.title('Vandal') 194 | weights = np.ones_like(args[1])/len(args[1]) 195 | plt.hist(args[1], weights=weights, bins=100) 196 | plt.ylabel('Probability') 197 | axes = plt.gca() 198 | axes.set_xlim([0.,.1]) 199 | 200 | plt.xlabel('Recontruction Error') 201 | plt.show() 202 | 203 | 204 | def vanDet(X,thrd): 205 | return np.sum(X>=thrd)/float(len(X)) 206 | 207 | def vanDet2(X,thrd): 208 | return (np.array(X)>=thrd).astype(int) 209 | 210 | def TSNE_2D_show_bi(X,y,i): 211 | model = TSNE(n_components=2, random_state=0) 212 | X_2D = model.fit_transform(X) 213 | X_2D_beg = X_2D[y == 1] 214 | X_2D_val = X_2D[y == 0] 215 | 216 | fig = plt.figure() 217 | fig.patch.set_facecolor('w') 218 | ax = fig.add_subplot(111) 219 | ax.set_axis_off() 220 | 221 | blue_dot, = plt.plot(X_2D_beg[:,0],X_2D_beg[:,1], "ro", mec='none') 222 | red_dot, = plt.plot(X_2D_val[:,0],X_2D_val[:,1], "bo", mec='none') 223 | plt.legend([blue_dot, red_dot], ["Benign", "Vandal"], numpoints=1) 224 | plt.savefig("representation_%s"%i) 225 | plt.clf() 226 | plt.close() 227 | 228 | def TSNE_2D_show_tri(X,y): 229 | model = TSNE(n_components=2, random_state=0) 230 | X_2D = model.fit_transform(X) 231 | X_2D_train_beg = X_2D[y == 1] 232 | X_2D_fake = X_2D[y == 0] 233 | X_2D_val = X_2D[y == 2] 234 | 235 | 236 | fig = plt.figure() 237 | fig.patch.set_facecolor('w') 238 | ax = fig.add_subplot(111) 239 | ax.set_axis_off() 240 | 241 | red_dot, = plt.plot(X_2D_train_beg[:,0],X_2D_train_beg[:,1], "ro", mec='none') 242 | green_dot, = plt.plot(X_2D_fake[:, 0], X_2D_fake[:, 1], "go", mec='none') 243 | blue_dot, = plt.plot(X_2D_val[:, 0], X_2D_val[:, 1], "bo", mec='none') 244 | # yellow_dot, = plt.plot(X_2D_train[:, 0], X_2D_train[:, 1], "yo", mec='none') 245 | plt.legend([red_dot, green_dot, blue_dot], ["Benign", "Fake", "Vandal"], numpoints=1) 246 | plt.show() 247 | plt.clf() 248 | plt.close() 249 | return X_2D_train_beg, X_2D_fake, X_2D_val 250 
| 251 | 252 | 253 | 254 | def draw_trend(D_real_prob, D_fake_prob, D_val_prob, fm_loss, f1): 255 | 256 | fig = plt.figure() 257 | fig.patch.set_facecolor('w') 258 | # plt.subplot(311) 259 | p1, = plt.plot(D_real_prob, "-g") 260 | p2, = plt.plot(D_fake_prob, "--r") 261 | p3, = plt.plot(D_val_prob, ":c") 262 | plt.xlabel("# of epoch") 263 | plt.ylabel("probability") 264 | leg = plt.legend([p1, p2, p3], [r'$p(y|V_B)$', r'$p(y|\~{V})$', r'$p(y|V_M)$'], loc=1, bbox_to_anchor=(1, 1), borderaxespad=0.) 265 | leg.draw_frame(False) 266 | # plt.legend(frameon=False) 267 | 268 | fig = plt.figure() 269 | fig.patch.set_facecolor('w') 270 | # plt.subplot(312) 271 | p4, = plt.plot(fm_loss, "-b") 272 | plt.xlabel("# of epoch") 273 | plt.ylabel("feature matching loss") 274 | # plt.legend([p4], ["d_real_prob", "d_fake_prob", "d_val_prob"], loc=1, bbox_to_anchor=(1, 1), borderaxespad=0.) 275 | 276 | fig = plt.figure() 277 | fig.patch.set_facecolor('w') 278 | # plt.subplot(313) 279 | p5, = plt.plot(f1, "-y") 280 | plt.xlabel("# of epoch") 281 | plt.ylabel("F1") 282 | # plt.legend([p1, p2, p3, p4, p5], ["d_real_prob", "d_fake_prob", "d_val_prob", "fm_loss","f1"], loc=1, bbox_to_anchor=(1, 3.5), borderaxespad=0.) 283 | plt.show() 284 | 285 | 286 | def sample_shuffle(X): 287 | # n_samples = X.shape[0] 288 | n_samples = len(X) 289 | s = np.arange(n_samples) 290 | np.random.shuffle(s) 291 | return np.array(X[s]) 292 | 293 | # Helper function to plot a decision boundary. 294 | # If you don't fully understand this function don't worry, it just generates the contour plot below. 295 | def plot_decision_boundary(pred_func, X, y): 296 | # Set min and max values and give it some padding 297 | x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5 298 | y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5 299 | h = 0.01 300 | # Generate a grid of points with distance h between them 301 | xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h)) 302 | # Predict the function value for the whole gid 303 | Z = pred_func(np.c_[xx.ravel(), yy.ravel()]) 304 | Z = Z.reshape(xx.shape) 305 | # Plot the contour and training examples 306 | plt.contourf(xx, yy, Z, cmap=plt.cm.Spectral) 307 | plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.Spectral) 308 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | # OCAN: One-Class Adversarial Nets for Fraud Detection 3 | 4 | In this paper, we develop one-class adversarial nets (OCAN) for fraud detection with only benign users as training data. 5 | 6 | ## Running Environment 7 | 8 | The main packages you need to install are listed as follow 9 | 10 | ``` 11 | 1. python 2.7 12 | 2. tensorflow 1.3.0 13 | ``` 14 | 15 | ## DateSet 16 | 17 | For experiments, we evaluate **OCAN** on two real-world datasets: wiki and credit-card which have been attached in folder [data/](https://github.com/PanpanZheng/OCAN/tree/master/data). 18 | 19 | ## Model Evaluation 20 | 21 | The command line for OCAN goes as follow 22 | 23 | ``` 24 | python oc_gan.py $1 $2 25 | ``` 26 | **where** $1 refers to different datasets with wiki 1, credit-card(encoding) 2 and credit-card(raw) 3; $2 denotes whether some metrics, such as fm_loss and f1 in training process, are provided, with non-display 0 and display 1. 27 | 28 | 29 | ``` 30 | e.g. python oc_gan.py 1 0 31 | ``` 32 | The above command line shows the performance of OCAN on wiki without displaying metrics in the training process. 
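The other configurations are invoked analogously, for example

```
python oc_gan.py 2 0
python oc_gan.py 3 1
```

which run OCAN on the credit-card dataset with encoded features (metrics not displayed) and on the raw credit-card features (fm_loss and f1 displayed during training), respectively.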
33 | 34 | -------------------------------------------------------------------------------- /bg_dataset.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Author: Panpan Zheng 3 | Date created: 2/15/2018 4 | Python Version: 2.7 5 | ''' 6 | import numpy as np 7 | from bg_utils import one_hot 8 | 9 | def load_data(x_benign, x_vandal, n_b_lab, n_v_lab, n_b_test, n_v_test, oh=True): 10 | 11 | # labeled data (supervised) 12 | x_lab_ben = x_benign[0:n_b_lab] 13 | x_lab_van = x_vandal[0:n_v_lab] 14 | x_lab = x_lab_ben.tolist() + x_lab_van.tolist() 15 | x_lab = np.array(x_lab) 16 | y_lab = np.ones(len(x_lab), dtype=np.int32) 17 | y_lab[len(x_lab_ben):] = 0 18 | if oh: 19 | y_lab = one_hot(y_lab, 3) 20 | 21 | 22 | # unlabeled data (unsupervised) 23 | # x_unl_ben = x_benign[len(x_lab_ben):-n_b_test] 24 | # x_unl_van = x_vandal[len(x_lab_van):-n_v_test] 25 | x_unl_ben = x_benign[len(x_lab_ben):-3*n_b_test] 26 | x_unl_van = x_vandal[len(x_lab_van):-3*n_v_test] 27 | x_unl = x_unl_ben.tolist() + x_unl_van.tolist() 28 | x_unl = np.array(x_unl) 29 | 30 | 31 | # test data. 32 | x_benign_test = x_benign[len(x_lab_ben) + len(x_unl_ben):] 33 | x_vandal_test = x_vandal[len(x_lab_van) + len(x_unl_van):] 34 | x_test = x_benign_test.tolist() + x_vandal_test.tolist() 35 | x_test = np.array(x_test) 36 | y_test = np.ones(len(x_test), dtype=np.int32) 37 | y_test[len(x_benign_test):] = 0 38 | 39 | return x_lab, y_lab, x_unl, x_test, y_test 40 | 41 | 42 | 43 | 44 | def load_data_unbal(x_benign, x_vandal, n_b_lab, n_v_lab, n_b_test, n_v_test, oh=True): 45 | 46 | # labeled data (supervised) 47 | x_lab_ben = x_benign[0:n_b_lab] 48 | x_lab_van = x_vandal[0:n_v_lab] 49 | x_lab = x_lab_ben.tolist() + x_lab_van.tolist() 50 | x_lab = np.array(x_lab) 51 | y_lab = np.ones(len(x_lab), dtype=np.int32) 52 | y_lab[len(x_lab_ben):] = 0 53 | if oh: 54 | y_lab = one_hot(y_lab, 3) 55 | 56 | print x_lab_ben.shape, x_lab_van.shape 57 | 58 | 59 | # unlabeled data (unsupervised) 60 | x_unl_ben = x_benign[len(x_lab_ben):-3*n_b_test] 61 | x_unl_van = x_vandal[len(x_lab_van):-3*n_v_test] 62 | x_unl = x_unl_ben.tolist() + x_unl_van.tolist() 63 | x_unl = np.array(x_unl) 64 | print x_unl_ben.shape, x_unl_van.shape 65 | 66 | 67 | # test data. 68 | x_benign_test = x_benign[len(x_lab_ben) + len(x_unl_ben):] 69 | x_vandal_test = x_vandal[len(x_lab_van) + len(x_unl_van):] 70 | x_test = x_benign_test.tolist() + x_vandal_test.tolist() 71 | x_test = np.array(x_test) 72 | y_test = np.ones(len(x_test), dtype=np.int32) 73 | y_test[len(x_benign_test):] = 0 74 | print x_benign_test.shape, x_vandal_test.shape 75 | 76 | return x_lab, y_lab, x_unl, x_test, y_test 77 | -------------------------------------------------------------------------------- /bg_utils.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Author: Panpan Zheng 3 | Date created: 2/15/2018 4 | Python Version: 2.7 5 | ''' 6 | import numpy as np 7 | import tensorflow as tf 8 | from sklearn.neighbors.kde import KernelDensity 9 | import matplotlib.pyplot as plt 10 | 11 | 12 | def one_hot(x, depth): 13 | x_one_hot = np.zeros((len(x), depth), dtype=np.int32) 14 | x = x.astype(int) 15 | for i in range(x_one_hot.shape[0]): 16 | x_one_hot[i, x[i]] = 1 17 | return x_one_hot 18 | 19 | 20 | def xavier_init(size): # initialize the weight-matrix W. 21 | in_dim = size[0] 22 | xavier_stddev = 1. / tf.sqrt(in_dim / 2.) 
23 | return tf.random_normal(shape=size, stddev=xavier_stddev) 24 | 25 | 26 | def sample_Z(m, n): # generating the input for G. 27 | return np.random.uniform(-1., 1., size=[m, n]) 28 | 29 | 30 | def sample_shuffle_spv(X, labels): 31 | n_samples = len(X) 32 | s = np.arange(n_samples) 33 | np.random.shuffle(s) 34 | return np.array(X[s]), labels[s] 35 | 36 | 37 | def sample_shuffle_uspv(X): 38 | n_samples = len(X) 39 | s = np.arange(n_samples) 40 | np.random.shuffle(s) 41 | return np.array(X[s]) 42 | 43 | 44 | def kde_density_estimator(X,kernel='gaussian',bandwidth=0.2): 45 | return KernelDensity(kernel=kernel, bandwidth=bandwidth).fit(X) 46 | 47 | def complement_density(kde, X, sf=0.5): 48 | # probs = map(lambda x: np.exp(kde.score([x])), X) 49 | probs = np.exp(kde.score_samples(X)) 50 | thrld = np.median(probs) 51 | return np.array( 52 | map(lambda x: low_density(x, thrld, sf), probs) 53 | ) 54 | 55 | def low_density(prob, thrld, sf): 56 | 57 | if prob > thrld: 58 | return sf * np.reciprocal(prob) 59 | # return sf * (1-prob) 60 | else: 61 | return thrld 62 | 63 | 64 | 65 | def pull_away_loss(g): 66 | 67 | Nor = tf.norm(g, axis=1) 68 | Nor_mat = tf.tile(tf.expand_dims(Nor, axis=1), 69 | [1, tf.shape(g)[1]]) 70 | X = tf.divide(g, Nor_mat) 71 | X_X = tf.square(tf.matmul(X, tf.transpose(X))) 72 | mask = tf.subtract(tf.ones_like(X_X), 73 | tf.diag( 74 | tf.ones([tf.shape(X_X)[0]])) 75 | ) 76 | pt_loss = tf.divide(tf.reduce_sum(tf.multiply(X_X, mask)), 77 | tf.multiply( 78 | tf.cast(tf.shape(X_X)[0], tf.float32), 79 | tf.cast(tf.shape(X_X)[0]-1, tf.float32))) 80 | 81 | return pt_loss 82 | 83 | 84 | def draw_trend(D_real_prob, D_fake_prob, D_val_prob, fm_loss, f1): 85 | 86 | fig = plt.figure() 87 | fig.patch.set_facecolor('w') 88 | # plt.subplot(311) 89 | p1, = plt.plot(D_real_prob, "-g") 90 | p2, = plt.plot(D_fake_prob, "--r") 91 | p3, = plt.plot(D_val_prob, ":c") 92 | plt.xlabel("# of epoch") 93 | plt.ylabel("probability") 94 | leg = plt.legend([p1, p2, p3], [r'$p(y|V_B)$', r'$p(y|\~{V})$', r'$p(y|V_M)$'], loc=1, bbox_to_anchor=(1, 1), borderaxespad=0.) 95 | leg.draw_frame(False) 96 | # plt.legend(frameon=False) 97 | 98 | fig = plt.figure() 99 | fig.patch.set_facecolor('w') 100 | # plt.subplot(312) 101 | p4, = plt.plot(fm_loss, "-b") 102 | plt.xlabel("# of epoch") 103 | plt.ylabel("feature matching loss") 104 | # plt.legend([p4], ["d_real_prob", "d_fake_prob", "d_val_prob"], loc=1, bbox_to_anchor=(1, 1), borderaxespad=0.) 105 | 106 | fig = plt.figure() 107 | fig.patch.set_facecolor('w') 108 | # plt.subplot(313) 109 | p5, = plt.plot(f1, "-y") 110 | plt.xlabel("# of epoch") 111 | plt.ylabel("F1") 112 | # plt.legend([p1, p2, p3, p4, p5], ["d_real_prob", "d_fake_prob", "d_val_prob", "fm_loss","f1"], loc=1, bbox_to_anchor=(1, 3.5), borderaxespad=0.) 
113 | plt.show() 114 | 115 | 116 | def plot_decision_boundary(pred_func, X, y): 117 | # Set min and max values and give it some padding 118 | x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5 119 | y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5 120 | h = 0.01 121 | # Generate a grid of points with distance h between them 122 | xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h)) 123 | # Predict the function value for the whole gid 124 | Z = pred_func(np.c_[xx.ravel(), yy.ravel()]) 125 | Z = Z.reshape(xx.shape) 126 | # Plot the contour and training examples 127 | plt.contourf(xx, yy, Z, cmap=plt.cm.Spectral) 128 | plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.Spectral) 129 | -------------------------------------------------------------------------------- /data/credit_card/ben_hid_repre_r2.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ILoveAI2019/OCAN/d91d1ea5de813791c2ea773402a4abbf6dc3073c/data/credit_card/ben_hid_repre_r2.npy -------------------------------------------------------------------------------- /data/credit_card/van_hid_repre_r2.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ILoveAI2019/OCAN/d91d1ea5de813791c2ea773402a4abbf6dc3073c/data/credit_card/van_hid_repre_r2.npy -------------------------------------------------------------------------------- /data/raw_credit_card/ben_raw_r0.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ILoveAI2019/OCAN/d91d1ea5de813791c2ea773402a4abbf6dc3073c/data/raw_credit_card/ben_raw_r0.npy -------------------------------------------------------------------------------- /data/raw_credit_card/van_raw_r0.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ILoveAI2019/OCAN/d91d1ea5de813791c2ea773402a4abbf6dc3073c/data/raw_credit_card/van_raw_r0.npy -------------------------------------------------------------------------------- /data/wiki/X_v8_4_50_Ben.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ILoveAI2019/OCAN/d91d1ea5de813791c2ea773402a4abbf6dc3073c/data/wiki/X_v8_4_50_Ben.npy -------------------------------------------------------------------------------- /data/wiki/X_v8_4_50_Van.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ILoveAI2019/OCAN/d91d1ea5de813791c2ea773402a4abbf6dc3073c/data/wiki/X_v8_4_50_Van.npy -------------------------------------------------------------------------------- /data/wiki/ben_hid_emd_4_50_8_200_r0.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ILoveAI2019/OCAN/d91d1ea5de813791c2ea773402a4abbf6dc3073c/data/wiki/ben_hid_emd_4_50_8_200_r0.npy -------------------------------------------------------------------------------- /data/wiki/val_hid_emd_4_50_8_200_r0.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ILoveAI2019/OCAN/d91d1ea5de813791c2ea773402a4abbf6dc3073c/data/wiki/val_hid_emd_4_50_8_200_r0.npy -------------------------------------------------------------------------------- /oc_gan.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Author: Panpan Zheng 3 | Date 
created: 2/15/2018 4 | Python Version: 2.7 5 | ''' 6 | 7 | import tensorflow as tf 8 | import numpy as np 9 | import matplotlib.pyplot as plt 10 | from sklearn.metrics import classification_report, accuracy_score 11 | from sklearn.preprocessing import MinMaxScaler 12 | import os 13 | 14 | from bg_utils import pull_away_loss, one_hot, xavier_init, sample_shuffle_spv, sample_shuffle_uspv, sample_Z, draw_trend 15 | from bg_dataset import load_data, load_data_unbal 16 | import sys 17 | 18 | 19 | 20 | en_ae = int(sys.argv[1]) # en_ae == 1 for wiki dataset with autoencoding; 21 | # en_ae == 2 for credit card dataset with autoencoding; 22 | # en_ae == 3 for credit card dataset without autoencoding. 23 | 24 | dra_tra_pro = int(sys.argv[2]) # dra_tra_pro == 1 for printing training trend, discr_probabiltiy, f1 and fm_loss; 25 | # dra_tra_pro == 1 for printing training trend, discr_probabiltiy, f1 and fm_loss; 26 | 27 | 28 | # print en_ae, dra_tra_pro 29 | # 30 | # exit(0) 31 | # en_ae = 3 # 1 for wiki dataset with autoencoding; 2 for credit card dataset with autoencoding; 3 for credit card dataset without autoencoding. 32 | # dra_tra_pro = False 33 | 34 | if en_ae == 1: 35 | mb_size = 100 36 | dim_input = 200 37 | elif en_ae == 2: 38 | mb_size = 70 39 | dim_input = 50 40 | else: 41 | mb_size = 70 42 | dim_input = 30 43 | 44 | 45 | D_dim = [dim_input, 100, 50, 2] 46 | G_dim = [50, 100, dim_input] 47 | Z_dim = G_dim[0] 48 | 49 | 50 | # define placeholders for labeled-data, unlabeled-data, noise-data and target-data. 51 | 52 | X_oc = tf.placeholder(tf.float32, shape=[None, dim_input]) 53 | Z = tf.placeholder(tf.float32, shape=[None, Z_dim]) 54 | X_tar = tf.placeholder(tf.float32, shape=[None, dim_input]) 55 | # X_val = tf.placeholder(tf.float32, shape=[None, dim_input]) 56 | 57 | 58 | # declare weights and biases of discriminator. 59 | 60 | D_W1 = tf.Variable(xavier_init([D_dim[0], D_dim[1]])) 61 | D_b1 = tf.Variable(tf.zeros(shape=[D_dim[1]])) 62 | 63 | D_W2 = tf.Variable(xavier_init([D_dim[1], D_dim[2]])) 64 | D_b2 = tf.Variable(tf.zeros(shape=[D_dim[2]])) 65 | 66 | D_W3 = tf.Variable(xavier_init([D_dim[2], D_dim[3]])) 67 | D_b3 = tf.Variable(tf.zeros(shape=[D_dim[3]])) 68 | 69 | theta_D = [D_W1, D_W2, D_W3, D_b1, D_b2, D_b3] 70 | 71 | 72 | 73 | # declare weights and biases of generator. 74 | 75 | G_W1 = tf.Variable(xavier_init([G_dim[0], G_dim[1]])) 76 | G_b1 = tf.Variable(tf.zeros(shape=[G_dim[1]])) 77 | 78 | G_W2 = tf.Variable(xavier_init([G_dim[1], G_dim[2]])) 79 | G_b2 = tf.Variable(tf.zeros(shape=[G_dim[2]])) 80 | 81 | theta_G = [G_W1, G_W2, G_b1, G_b2] 82 | 83 | 84 | # declare weights and biases of pre-train net for density estimation. 
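# The "target" network below (discriminator_tar) mirrors the discriminator
# architecture. It is fitted before the adversarial loop via T_solver on the
# benign pre-training data (fed through X_tar / y_tar) and is not updated
# afterwards; its hidden layer and softmax output on generated samples feed
# the pull-away term (pt_loss) and the entropy term (G_ent_loss) of G_loss.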
85 | 86 | T_W1 = tf.Variable(xavier_init([D_dim[0], D_dim[1]])) 87 | T_b1 = tf.Variable(tf.zeros(shape=[D_dim[1]])) 88 | 89 | T_W2 = tf.Variable(xavier_init([D_dim[1], D_dim[2]])) 90 | T_b2 = tf.Variable(tf.zeros(shape=[D_dim[2]])) 91 | 92 | T_W3 = tf.Variable(xavier_init([D_dim[2], D_dim[3]])) 93 | T_b3 = tf.Variable(tf.zeros(shape=[D_dim[3]])) 94 | 95 | theta_T = [T_W1, T_W2, T_W3, T_b1, T_b2, T_b3] 96 | 97 | 98 | def generator(z): 99 | G_h1 = tf.nn.relu(tf.matmul(z, G_W1) + G_b1) 100 | G_logit = tf.nn.tanh(tf.matmul(G_h1, G_W2) + G_b2) 101 | return G_logit 102 | 103 | 104 | def discriminator(x): 105 | D_h1 = tf.nn.relu(tf.matmul(x, D_W1) + D_b1) 106 | D_h2 = tf.nn.relu(tf.matmul(D_h1, D_W2) + D_b2) 107 | D_logit = tf.matmul(D_h2, D_W3) + D_b3 108 | D_prob = tf.nn.softmax(D_logit) 109 | return D_prob, D_logit, D_h2 110 | 111 | 112 | # pre-train net for density estimation. 113 | 114 | def discriminator_tar(x): 115 | T_h1 = tf.nn.relu(tf.matmul(x, T_W1) + T_b1) 116 | T_h2 = tf.nn.relu(tf.matmul(T_h1, T_W2) + T_b2) 117 | T_logit = tf.matmul(T_h2, T_W3) + T_b3 118 | T_prob = tf.nn.softmax(T_logit) 119 | return T_prob, T_logit, T_h2 120 | 121 | 122 | D_prob_real, D_logit_real, D_h2_real = discriminator(X_oc) 123 | 124 | G_sample = generator(Z) 125 | D_prob_gen, D_logit_gen, D_h2_gen = discriminator(G_sample) 126 | 127 | D_prob_tar, D_logit_tar, D_h2_tar = discriminator_tar(X_tar) 128 | D_prob_tar_gen, D_logit_tar_gen, D_h2_tar_gen = discriminator_tar(G_sample) 129 | # D_prob_val, _, D_h1_val = discriminator(X_val) 130 | 131 | 132 | 133 | 134 | # disc. loss 135 | y_real= tf.placeholder(tf.int32, shape=[None, D_dim[3]]) 136 | y_gen = tf.placeholder(tf.int32, shape=[None, D_dim[3]]) 137 | 138 | D_loss_real = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=D_logit_real,labels=y_real)) 139 | D_loss_gen = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=D_logit_gen, labels=y_gen)) 140 | 141 | ent_real_loss = -tf.reduce_mean( 142 | tf.reduce_sum( 143 | tf.multiply(D_prob_real, tf.log(D_prob_real)), 1 144 | ) 145 | ) 146 | 147 | ent_gen_loss = -tf.reduce_mean( 148 | tf.reduce_sum( 149 | tf.multiply(D_prob_gen, tf.log(D_prob_gen)), 1 150 | ) 151 | ) 152 | 153 | D_loss = D_loss_real + D_loss_gen + 1.85 * ent_real_loss 154 | 155 | 156 | # gene. 
loss 157 | pt_loss = pull_away_loss(D_h2_tar_gen) 158 | 159 | y_tar= tf.placeholder(tf.int32, shape=[None, D_dim[3]]) 160 | T_loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=D_logit_tar, labels=y_tar)) 161 | tar_thrld = tf.divide(tf.reduce_max(D_prob_tar_gen[:,-1]) + 162 | tf.reduce_min(D_prob_tar_gen[:,-1]), 2) 163 | 164 | # tar_thrld = tf.reduce_mean(D_prob_tar_gen[:,-1]) 165 | 166 | 167 | indicator = tf.sign( 168 | tf.subtract(D_prob_tar_gen[:,-1], 169 | tar_thrld)) 170 | condition = tf.greater(tf.zeros_like(indicator), indicator) 171 | mask_tar = tf.where(condition, tf.zeros_like(indicator), indicator) 172 | G_ent_loss = tf.reduce_mean(tf.multiply(tf.log(D_prob_tar_gen[:,-1]), mask_tar)) 173 | # G_ent_loss = tf.reduce_mean(tf.log(D_prob_tar_gen[:,-1])) 174 | 175 | fm_loss = tf.reduce_mean( 176 | tf.sqrt( 177 | tf.reduce_sum( 178 | tf.square(D_logit_real - D_logit_gen), 1 179 | ) 180 | ) 181 | ) 182 | 183 | G_loss = pt_loss + G_ent_loss + fm_loss 184 | 185 | D_solver = tf.train.GradientDescentOptimizer(learning_rate=1e-3).minimize(D_loss, var_list=theta_D) 186 | G_solver = tf.train.AdamOptimizer().minimize(G_loss, var_list=theta_G) 187 | T_solver = tf.train.GradientDescentOptimizer(learning_rate=1e-3).minimize(T_loss, var_list=theta_T) 188 | 189 | 190 | # Load data.... 191 | 192 | 193 | min_max_scaler = MinMaxScaler() 194 | 195 | if en_ae == 1: 196 | x_benign = min_max_scaler.fit_transform(np.load("./data/wiki/ben_hid_emd_4_50_8_200_r0.npy")) 197 | x_vandal = min_max_scaler.transform(np.load("./data/wiki/val_hid_emd_4_50_8_200_r0.npy")) 198 | elif en_ae == 2: 199 | x_benign = min_max_scaler.fit_transform(np.load("./data/credit_card/ben_hid_repre_r2.npy")) 200 | x_vandal = min_max_scaler.transform(np.load("./data/credit_card/van_hid_repre_r2.npy")) 201 | else: 202 | x_benign = min_max_scaler.fit_transform(np.load("./data/raw_credit_card/ben_raw_r0.npy")) 203 | x_vandal = min_max_scaler.transform(np.load("./data/raw_credit_card/van_raw_r0.npy")) 204 | 205 | 206 | #x_benign = min_max_scaler.fit_transform(np.load("./hidden_output/ben_hid_emd_4_50_8_200.npy")) 207 | #x_vandal = min_max_scaler.transform(np.load("./hidden_output/val_hid_emd_4_50_8_200.npy")) 208 | 209 | 210 | x_benign = sample_shuffle_uspv(x_benign) 211 | x_vandal = sample_shuffle_uspv(x_vandal) 212 | 213 | if en_ae == 1: 214 | x_benign = x_benign[0:10000] 215 | x_vandal = x_vandal[0:10000] 216 | x_pre = x_benign[0:7000] 217 | else: 218 | x_pre = x_benign[0:700] 219 | 220 | y_pre = np.zeros(len(x_pre)) 221 | y_pre = one_hot(y_pre, 2) 222 | 223 | x_train = x_pre 224 | 225 | y_real_mb = one_hot(np.zeros(mb_size), 2) 226 | y_fake_mb = one_hot(np.ones(mb_size), 2) 227 | 228 | if en_ae == 1: 229 | x_test = x_benign[-3000:].tolist() + x_vandal[-3000:].tolist() 230 | else: 231 | x_test = x_benign[-490:].tolist() + x_vandal[-490:].tolist() 232 | x_test = np.array(x_test) 233 | 234 | 235 | y_test = np.zeros(len(x_test)) 236 | if en_ae == 1: 237 | y_test[3000:] = 1 238 | else: 239 | y_test[490:] = 1 240 | 241 | 242 | sess = tf.Session() 243 | sess.run(tf.global_variables_initializer()) 244 | 245 | # pre-training for target distribution 246 | 247 | _ = sess.run(T_solver, 248 | feed_dict={ 249 | X_tar:x_pre, 250 | y_tar:y_pre 251 | }) 252 | 253 | q = np.divide(len(x_train), mb_size) 254 | 255 | # n_epoch = 1 256 | # 257 | # while n_epoch: 258 | 259 | d_ben_pro, d_fake_pro, fm_loss_coll = list(), list(), list() 260 | f1_score = list() 261 | d_val_pro = list() 262 | 263 | if en_ae == 1: 264 | n_round = 50 265 | else: 
266 | n_round = 200 267 | 268 | for n_epoch in range(n_round): 269 | 270 | X_mb_oc = sample_shuffle_uspv(x_train) 271 | 272 | for n_batch in range(q): 273 | 274 | _, D_loss_curr, ent_real_curr = sess.run([D_solver, D_loss, ent_real_loss], 275 | feed_dict={ 276 | X_oc: X_mb_oc[n_batch*mb_size:(n_batch+1)*mb_size], 277 | Z: sample_Z(mb_size, Z_dim), 278 | y_real: y_real_mb, 279 | y_gen: y_fake_mb 280 | }) 281 | 282 | _, G_loss_curr, fm_loss_curr = sess.run([G_solver, G_loss, fm_loss], 283 | # _, G_loss_curr, fm_loss_, kld_ = sess.run([G_solver, G_loss, fm_loss, pt_loss + G_ent_loss], 284 | feed_dict={Z: sample_Z(mb_size, Z_dim), 285 | X_oc: X_mb_oc[n_batch*mb_size:(n_batch+1)*mb_size], 286 | }) 287 | 288 | D_prob_real_, D_prob_gen_ = sess.run([D_prob_real, D_prob_gen], 289 | feed_dict={X_oc: x_train, 290 | Z: sample_Z(len(x_train), Z_dim)}) 291 | 292 | if en_ae == 1: 293 | D_prob_vandal_ = sess.run(D_prob_real, 294 | feed_dict={X_oc: x_vandal[0:7000]}) 295 | # feed_dict={X_oc:x_vandal[-490:]}) 296 | else: 297 | D_prob_vandal_ = sess.run(D_prob_real, 298 | #feed_dict={X_oc: x_vandal[0:7000]}) 299 | feed_dict={X_oc:x_vandal[-490:]}) 300 | 301 | d_ben_pro.append(np.mean(D_prob_real_[:, 0])) 302 | d_fake_pro.append(np.mean(D_prob_gen_[:, 0])) 303 | d_val_pro.append(np.mean(D_prob_vandal_[:, 0])) 304 | fm_loss_coll.append(fm_loss_curr) 305 | 306 | prob, _ = sess.run([D_prob_real, D_logit_real], feed_dict={X_oc: x_test}) 307 | y_pred = np.argmax(prob, axis=1) 308 | conf_mat = classification_report(y_test, y_pred, target_names=['benign', 'vandal'], digits=4) 309 | f1_score.append(float(filter(None, conf_mat.strip().split(" "))[12])) 310 | # print conf_mat 311 | 312 | if not dra_tra_pro: 313 | acc = np.sum(y_pred == y_test)/float(len(y_pred)) 314 | print conf_mat 315 | print "acc:%s"%acc 316 | 317 | if dra_tra_pro: 318 | draw_trend(d_ben_pro, d_fake_pro, d_val_pro, fm_loss_coll, f1_score) 319 | 320 | exit(0) 321 | --------------------------------------------------------------------------------