├── Baselines ├── 2D_representation.py ├── Readme ├── auroc_draw.py ├── base_classifiers.py ├── baseline_OCC.py ├── baseline_OCC_utils.py ├── baseline_VEWS_utils.py ├── data_generation.py ├── latent_repre_explore.py ├── libs.py ├── model_components.py ├── representation_libs.py ├── run_baseline.m ├── splitData.m └── utils.py ├── README.md ├── bg_dataset.py ├── bg_utils.py ├── data ├── credit_card │ ├── ben_hid_repre_r2.npy │ └── van_hid_repre_r2.npy ├── raw_credit_card │ ├── ben_raw_r0.npy │ └── van_raw_r0.npy └── wiki │ ├── X_v8_4_50_Ben.npy │ ├── X_v8_4_50_Van.npy │ ├── ben_hid_emd_4_50_8_200_r0.npy │ └── val_hid_emd_4_50_8_200_r0.npy └── oc_gan.py /Baselines/2D_representation.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Author: Panpan Zheng 3 | Date created: 1/15/2018 4 | Python Version: 2.7 5 | ''' 6 | 7 | import os 8 | import sys 9 | sys.path.append(os.getcwd() + "\\..\\..\\") 10 | from baseline_OCC_utils import * 11 | from base_classifiers import LSTM_Autoencoder 12 | from model_components import train_gan 13 | 14 | # Load data and preprocess. 15 | samples_path = os.getcwd() + "\\..\\..\\sampleData\\" 16 | f_ben, f_van = "X_v8_4_50_Ben", "X_v8_4_50_Van" 17 | x_ben, x_van = load_data(samples_path, f_ben, f_van) 18 | 19 | train_ratio = .7 20 | max_len = 50 21 | 22 | # Construct the LSTM-AE 23 | input_dim = 8 24 | time_step = max_len 25 | hid_dim = [200] 26 | 27 | sampling_ratio = train_ratio 28 | 29 | x_train_P, x_ben_P, x_van_P, weights_P, seq_len_ben, seq_len_van = sampling_preprocessing_LSTM_AE(x_ben, x_van, train_ratio, \ 30 | max_len) 31 | 32 | lstm_ae = LSTM_Autoencoder(input_dim, time_step, hid_dim) 33 | lstm_ae.compile() 34 | lstm_ae.fit(x_train_P, weights_P) 35 | 36 | test_ben_P = x_ben_P[len(x_train_P):] 37 | test_van_P = x_van_P[0:len(test_ben_P)] 38 | 39 | test_seq_len_ben = np.array(seq_len_ben[len(x_train_P):]) 40 | test_seq_len_van = np.array(seq_len_van[0:len(test_ben_P)]) 41 | 42 | lstm_ae.get_hidden_layer_sequence() 43 | 44 | ben_hid_repre_P = lstm_ae.get_hidden_representation(test_ben_P) 45 | van_hid_repre_P = lstm_ae.get_hidden_representation(test_van_P) 46 | 47 | ben_hid_last_4 = ben_hid_repre_P[:,-4:] 48 | van_hid_last_4 = van_hid_repre_P[:,-4:] 49 | 50 | a = ben_hid_last_4.shape 51 | b = van_hid_last_4.shape 52 | 53 | print a 54 | -------------------------------------------------------------------------------- /Baselines/Readme: -------------------------------------------------------------------------------- 1 | OCNN and OCGP are implemented with the NDtool package, which can be downloaded from http://www.robots.ox.ac.uk/~davidc/publications_NDtool.php. 2 | 3 | 1. Create a folder named "MATLAB", then download the package "NDtoolv0.12" into "MATLAB". 4 | 2. Move "run_baseline.m" into the "MATLAB" folder, then replace "splitData.m" in "MATLAB/NDtoolv0.12" with the "splitData.m" provided in "Baselines/". 5 | 6 | OCSVM is implemented with sklearn. 7 | 8 | The credit_card dataset needs to be downloaded from https://www.kaggle.com/dalpozz/creditcardfraud and placed under 'OCAN/data/credit_card'.
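For reference, once the CSV is in place the baseline scripts read it through getDataCCFD() in baseline_OCC_utils.py (baseline_OCC.py passes the file name "creditcard.csv.zip"); a minimal sketch of that entry point, assuming the file sits where the script is run:

    from baseline_OCC_utils import getDataCCFD
    # returns min-max-scaled benign (Class == 0) and fraudulent (Class == 1) transactions,
    # each row a 30-dim vector (hour-of-day Time, V1..V28, Amount)
    x_ben, x_van = getDataCCFD("creditcard.csv.zip")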
9 | -------------------------------------------------------------------------------- /Baselines/auroc_draw.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | from mpl_toolkits.mplot3d import axes3d 4 | from sklearn.preprocessing import MinMaxScaler 5 | from bg_utils import sample_shuffle_uspv 6 | from sklearn.manifold import TSNE, Isomap, LocallyLinearEmbedding 7 | from sklearn import metrics 8 | from sklearn.metrics import classification_report 9 | 10 | 11 | # Create data 12 | # N = 60 13 | # g1 = (0.6 + 0.6 * np.random.rand(N), np.random.rand(N), 0.4 + 0.1 * np.random.rand(N)) 14 | # g2 = (0.4 + 0.3 * np.random.rand(N), 0.5 * np.random.rand(N), 0.1 * np.random.rand(N)) 15 | # g3 = (0.3 * np.random.rand(N), 0.3 * np.random.rand(N), 0.3 * np.random.rand(N)) 16 | 17 | 18 | def draw_3D(X,y): 19 | 20 | colors = ("blue", "magenta", "cyan") 21 | groups = ("Benign", "Fake", "Vandal") 22 | markers = ("*", "o", "v") 23 | # Create plot 24 | 25 | 26 | # fig = plt.figure() 27 | fig = plt.figure(facecolor='white') 28 | ax = fig.add_subplot(1, 1, 1, axisbg="1.0", projection='3d') 29 | 30 | for i in range(3): 31 | ax.scatter(X[y == i][:, 0], X[y == i][:, 1], X[y == i][:, 2], marker=markers[i], alpha=0.8, c=colors[i], edgecolors='face', s=5, 32 | label=groups[i]) 33 | 34 | # plt.axis('off') 35 | plt.title('Matplot 3d scatter plot') 36 | plt.legend(loc=2) 37 | plt.xlim(0, 1) 38 | plt.ylim(0, 1) 39 | # ax.set_zticklabels([]) 40 | # ax.set_yticklabels([]) 41 | # ax.set_xticklabels([]) 42 | plt.show() 43 | 44 | def draw_2D(X, y): 45 | 46 | colors = ("blue", "c") 47 | groups = ("Benign", "Vandal") 48 | markers = ("*", "v") 49 | 50 | # Create plot 51 | 52 | # fig = plt.figure() 53 | fig = plt.figure(facecolor='white') 54 | ax = fig.add_subplot(1, 1, 1, axisbg="1.0") 55 | 56 | for i in range(2): 57 | ax.scatter(X[y == i][:, 0], X[y == i][:, 1], marker=markers[i], alpha=0.8, c=colors[i], edgecolors='face', s=5, label=groups[i]) 58 | 59 | plt.axis('off') 60 | # plt.title('Matplot 3d scatter plot') 61 | # plt.legend(loc=2) 62 | # ax.set_zticklabels([]) 63 | # ax.set_yticklabels([]) 64 | # ax.set_xticklabels([]) 65 | plt.show() 66 | 67 | def roc_curve(y, pred, title): 68 | 69 | fpr, tpr, thresholds = metrics.roc_curve(y, pred, pos_label=1) 70 | auc_val = metrics.auc(fpr, tpr) 71 | 72 | plt.figure() 73 | lw = 2 74 | plt.plot(fpr, tpr, color='darkorange', 75 | lw=lw, label='ROC curve (area = %0.4f)'%auc_val) 76 | plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--') 77 | plt.xlim([0.0, 1.0]) 78 | plt.ylim([0.0, 1.05]) 79 | plt.xlabel('False Positive Rate') 80 | plt.ylabel('True Positive Rate') 81 | plt.title('%s'%title) 82 | plt.legend(loc="lower right") 83 | plt.tight_layout() 84 | plt.show() 85 | 86 | 87 | def roc_curve_two(y, pred, y2, pred2, title): 88 | 89 | fpr, tpr, thresholds = metrics.roc_curve(y, pred, pos_label=1) 90 | auc_val = metrics.auc(fpr, tpr) 91 | 92 | fpr2, tpr2, thresholds2 = metrics.roc_curve(y2, pred2, pos_label=1) 93 | auc_val2 = metrics.auc(fpr2, tpr2) 94 | 95 | 96 | plt.figure() 97 | lw = 2 98 | plt.plot(fpr, tpr, color='c', 99 | lw=lw, label='representation (area = %0.4f)'%auc_val) 100 | 101 | plt.plot(fpr2, tpr2, color='darkorange', linestyle=":", 102 | lw=lw, label='raw feature (area = %0.4f)'%auc_val2) 103 | 104 | plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--') 105 | plt.xlim([0.0, 1.0]) 106 | plt.ylim([0.0, 1.05]) 107 | plt.xlabel('False Positive Rate') 108 | 
plt.ylabel('True Positive Rate') 109 | plt.title('%s'%title) 110 | plt.legend(loc="lower right") 111 | plt.tight_layout() 112 | plt.show() 113 | 114 | 115 | 116 | #y_test_wiki = np.load("y_test_wiki.npy")[0:3300] 117 | #y_pred_wiki = np.load("y_prob_wiki.npy")[0:3300,1] 118 | 119 | 120 | y_test_credit = np.load("y_test_credit.npy")[0:1100] 121 | y_pred_credit = np.load("y_prob_credit.npy")[0:1100,1] 122 | 123 | y_test_credit_noencoding = np.load("y_test_credit_noencoding.npy")[0:1100] 124 | y_pred_credit_noencoding = np.load("y_prob_credit_noencoding.npy")[0:1100,1] 125 | 126 | roc_curve_two(y_test_credit, y_pred_credit, y_test_credit_noencoding, y_pred_credit_noencoding, "") 127 | 128 | #roc_curve(y_test_wiki, y_pred_wiki, "") 129 | #roc_curve(y_test_credit, y_pred_credit, "") 130 | 131 | # y_test_wiki = np.load("y_test_wiki.npy")[0:3300] 132 | # y_pred_wiki = (np.load("y_prob_wiki.npy")[0:3300,1] > 0.5).astype(int) 133 | 134 | # y_test_credit = np.load("y_test_credit.npy")[0:1300] 135 | # y_pred_credit = (np.load("y_prob_credit.npy")[0:1300,1] > 0.5).astype(int) 136 | 137 | 138 | 139 | 140 | # conf_mat_wiki = classification_report(y_test_wiki, y_pred_wiki, target_names=['benign', 'vandal'], digits=4) 141 | # conf_mat_cred = classification_report(y_test_credit, y_pred_credit, target_names=['benign', 'vandal'], digits=4) 142 | 143 | # print conf_mat_wiki 144 | #print conf_mat_cred 145 | 146 | 147 | exit(0) 148 | 149 | 150 | min_max_scaler = MinMaxScaler() 151 | x_benign = min_max_scaler.fit_transform(np.load("./hidden_repre/ben_hid_emd_4_50_8_200_r0.npy")) 152 | x_vandal = min_max_scaler.fit_transform(np.load("./hidden_repre/val_hid_emd_4_50_8_200_r0.npy")) 153 | 154 | x_benign = sample_shuffle_uspv(x_benign) 155 | x_vandal = sample_shuffle_uspv(x_vandal) 156 | 157 | X = x_benign[0:3000].tolist() + x_vandal[0:3000].tolist() 158 | y = np.zeros(3000).tolist() + np.ones(3000).tolist() 159 | X, y = np.array(X), np.array(y) 160 | 161 | model_2D = Isomap(n_components=2) 162 | X_2D = model_2D.fit_transform(X) 163 | 164 | 165 | draw_2D(X_2D, y) 166 | 167 | 168 | 169 | exit(0) 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | min_max_scaler = MinMaxScaler() 194 | 195 | # if en_ae == 1: 196 | # x_benign = min_max_scaler.fit_transform(np.load("./hidden_repre/ben_hid_emd_4_50_8_200_r0.npy")) 197 | # x_vandal = min_max_scaler.transform(np.load("./hidden_repre/val_hid_emd_4_50_8_200_r0.npy")) 198 | # elif en_ae == 2: 199 | # x_benign = min_max_scaler.fit_transform(np.load("./hidden_repre/credit_card/ben_hid_repre_r2.npy")) 200 | # x_vandal = min_max_scaler.transform(np.load("./hidden_repre/credit_card/van_hid_repre_r2.npy")) 201 | # else: 202 | # x_benign = min_max_scaler.fit_transform(np.load("./raw_credit_card/ben_raw_r0.npy")) 203 | # x_vandal = min_max_scaler.transform(np.load("./raw_credit_card/van_raw_r0.npy")) 204 | 205 | 206 | #x_benign = min_max_scaler.fit_transform(np.load("./hidden_output/ben_hid_emd_4_50_8_200.npy")) 207 | #x_vandal = min_max_scaler.transform(np.load("./hidden_output/val_hid_emd_4_50_8_200.npy")) 208 | 209 | 210 | def gen_circle_data(num_samples=11000): 211 | 212 | # make a simple unit circle 213 | theta = np.linspace(0, 2*np.pi, num_samples) 214 | a, b = 1 * np.cos(theta), 1 * np.sin(theta) 215 | r = np.random.rand((num_samples)) 216 | x, y = r * np.cos(theta), r * np.sin(theta) 217 | 218 | real_data = list() 219 | for i, e in enumerate(y): 220 | real_data.append([x[i], e]) 221 | 
return np.array(real_data) 222 | 223 | 224 | x_benign = gen_circle_data() 225 | x_benign = sample_shuffle_uspv(x_benign) 226 | # x_vandal = sample_shuffle_uspv(x_vandal) 227 | 228 | x_benign = x_benign[0:10000] 229 | x_pre = x_benign[0:7000] 230 | 231 | # exit(0) 232 | # print x_benign.shape, x_pre.shape 233 | # exit(0) 234 | 235 | 236 | # if en_ae == 1: 237 | # x_benign = x_benign[0:10000] 238 | # # x_vandal = x_vandal[0:10000] 239 | # x_pre = x_benign[0:7000] 240 | # else: 241 | # x_pre = x_benign[0:700] 242 | 243 | y_pre = np.zeros(len(x_pre)) 244 | y_pre = one_hot(y_pre, 2) 245 | 246 | x_train = x_pre 247 | 248 | y_real_mb = one_hot(np.zeros(mb_size), 2) 249 | y_fake_mb = one_hot(np.ones(mb_size), 2) 250 | 251 | # if en_ae == 1: 252 | # x_test = x_benign[-3000:].tolist() + x_vandal[-3000:].tolist() 253 | # else: 254 | # x_test = x_benign[-490:].tolist() + x_vandal[-490:].tolist() 255 | # x_test = np.array(x_test) 256 | 257 | 258 | # y_test = np.zeros(len(x_test)) 259 | # if en_ae == 1: 260 | # y_test[3000:] = 1 261 | # else: 262 | # y_test[490:] = 1 263 | 264 | 265 | sess = tf.Session() 266 | sess.run(tf.global_variables_initializer()) 267 | 268 | # pre-training for target distribution 269 | 270 | _ = sess.run(T_solver, 271 | feed_dict={ 272 | X_tar:x_pre, 273 | y_tar:y_pre 274 | }) 275 | 276 | q = np.divide(len(x_train), mb_size) 277 | 278 | # n_epoch = 1 279 | # 280 | # while n_epoch: 281 | 282 | d_ben_pro, d_fake_pro, fm_loss_coll = list(), list(), list() 283 | f1_score = list() 284 | d_val_pro = list() 285 | 286 | n_round = 200 287 | 288 | # if en_ae == 1: 289 | # n_round = 50 290 | # else: 291 | # n_round = 200 292 | 293 | 294 | # plt.scatter(x_train[0:2000,0], x_train[0:2000,1], c="r") 295 | # 296 | # plt.ylim([-1.5,1.5]) 297 | # plt.xlim([-1.5,1.5]) 298 | # plt.show() 299 | # exit(0) 300 | 301 | for n_epoch in range(n_round): 302 | 303 | X_mb_oc = sample_shuffle_uspv(x_train) 304 | 305 | for n_batch in range(q): 306 | 307 | _, D_loss_curr, ent_real_curr = sess.run([D_solver, D_loss, ent_real_loss], 308 | feed_dict={ 309 | X_oc: X_mb_oc[n_batch*mb_size:(n_batch+1)*mb_size], 310 | Z: sample_Z(mb_size, Z_dim), 311 | y_real: y_real_mb, 312 | y_gen: y_fake_mb 313 | }) 314 | 315 | _, G_loss_curr, fm_loss_curr = sess.run([G_solver, G_loss, fm_loss], 316 | # _, G_loss_curr, fm_loss_, kld_ = sess.run([G_solver, G_loss, fm_loss, pt_loss + G_ent_loss], 317 | feed_dict={Z: sample_Z(mb_size, Z_dim), 318 | X_oc: X_mb_oc[n_batch*mb_size:(n_batch+1)*mb_size], 319 | }) 320 | 321 | D_prob_real_, D_prob_gen_ = sess.run([D_prob_real, D_prob_gen], 322 | feed_dict={X_oc: x_train, 323 | Z: sample_Z(len(x_train), Z_dim)}) 324 | 325 | # if en_ae == 1: 326 | # D_prob_vandal_ = sess.run(D_prob_real, 327 | # feed_dict={X_oc: x_vandal[0:7000]}) 328 | # # feed_dict={X_oc:x_vandal[-490:]}) 329 | # else: 330 | # D_prob_vandal_ = sess.run(D_prob_real, 331 | # #feed_dict={X_oc: x_vandal[0:7000]}) 332 | # feed_dict={X_oc:x_vandal[-490:]}) 333 | 334 | d_ben_pro.append(np.mean(D_prob_real_[:, 0])) 335 | d_fake_pro.append(np.mean(D_prob_gen_[:, 0])) 336 | # d_val_pro.append(np.mean(D_prob_vandal_[:, 0])) 337 | fm_loss_coll.append(fm_loss_curr) 338 | print "epoch %s"%n_epoch, np.mean(fm_loss_coll) 339 | 340 | 341 | 342 | 343 | 344 | bg_gen = sess.run([G_sample], 345 | feed_dict={Z:sample_Z(2000, Z_dim)}) 346 | 347 | 348 | plt.scatter(bg_gen[:,0], bg_gen[:,1], c="r") 349 | plt.ylim([-1.5,1.5]) 350 | plt.xlim([-1.5,1.5]) 351 | plt.show() 352 | 353 | # prob, _ = sess.run([D_prob_real, D_logit_real], feed_dict={X_oc: 
x_test}) 354 | # y_pred = np.argmax(prob, axis=1) 355 | # conf_mat = classification_report(y_test, y_pred, target_names=['benign', 'vandal'], digits=4) 356 | # f1_score.append(float(filter(None, conf_mat.strip().split(" "))[12])) 357 | # print conf_mat 358 | 359 | # if not dra_tra_pro: 360 | # acc = np.sum(y_pred == y_test)/float(len(y_pred)) 361 | # print conf_mat 362 | # print "acc:%s"%acc 363 | # 364 | # if dra_tra_pro: 365 | # draw_trend(d_ben_pro, d_fake_pro, d_val_pro, fm_loss_coll, f1_score) 366 | 367 | exit(0) 368 | 369 | 370 | 371 | 372 | 373 | 374 | 375 | 376 | 377 | 378 | 379 | 380 | 381 | 382 | 383 | 384 | 385 | 386 | 387 | 388 | 389 | 390 | 391 | 392 | 393 | 394 | 395 | 396 | 397 | 398 | 399 | 400 | 401 | 402 | 403 | 404 | 405 | 406 | 407 | 408 | -------------------------------------------------------------------------------- /Baselines/base_classifiers.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Author: Panpan Zheng 3 | Date created: 1/15/2018 4 | Python Version: 2.7 5 | ''' 6 | 7 | import sys 8 | sys.path.append("..\\..\\") 9 | import os 10 | import numpy as np 11 | from libs import Autoencoder 12 | from keras.models import Sequential, Model 13 | from keras.layers import Input, LSTM 14 | from keras.layers.core import Masking 15 | 16 | from sklearn import tree, ensemble, neighbors, svm, covariance 17 | 18 | def k_NN(X,y): 19 | clf = neighbors.KNeighborsClassifier(n_neighbors=3) 20 | return clf.fit(X,y) 21 | 22 | def decision_tree(X,y): 23 | clf = tree.DecisionTreeClassifier() 24 | return clf.fit(X, y) 25 | 26 | def random_forest(X,y): 27 | clf = ensemble.RandomForestClassifier(n_estimators=10) 28 | return clf.fit(X,y) 29 | 30 | def svm_svc(X,y): 31 | clf = svm.SVC() 32 | return clf.fit(X,y) 33 | 34 | def svm_nusvc(X,y): 35 | clf = svm.NuSVC() 36 | return clf.fit(X,y) 37 | 38 | def svm_linearsvc(X,y): 39 | clf = svm.LinearSVC() 40 | return clf.fit(X,y) 41 | 42 | def svm_oneclass(X): 43 | clf = svm.OneClassSVM() 44 | return clf.fit(X) 45 | 46 | def elliptic_envelope(X): 47 | clf = covariance.EllipticEnvelope() 48 | return clf.fit(X) 49 | 50 | def iso_forest(X): 51 | clf = ensemble.IsolationForest(max_samples=X.shape[0], random_state=None) 52 | return clf.fit(X) 53 | 54 | class LSTM_Autoencoder(object): 55 | """docstring for LSTM_Autoencoder""" 56 | def __init__(self, input_dim, time_step, hidden_dim): 57 | self.input_dim = input_dim 58 | self.time_step = time_step 59 | self.hidden_dim = hidden_dim 60 | self.autoencoder = Autoencoder() 61 | self.autoencoder.modelMasking('lstm', [self.time_step, self.input_dim], self.hidden_dim) 62 | 63 | def compile(self): 64 | self.autoencoder.compile('temporal') 65 | 66 | def fit(self, data, weights): 67 | self.autoencoder.fit(data, 'rev', weights) 68 | 69 | def get_hidden_layer_last_step(self): 70 | # print "net summary: ", self.autoencoder.model.summary() 71 | self.hidden_representation = Sequential() 72 | self.hidden_representation.add(self.autoencoder.model.layers[0]) 73 | self.hidden_representation.add(self.autoencoder.model.layers[1]) 74 | self.hidden_representation.add(self.autoencoder.model.layers[2]) 75 | 76 | def get_hidden_layer_sequence(self): 77 | inputData = Input(shape=(self.time_step, self.input_dim)) 78 | mask = Masking(mask_value=0.)(inputData) 79 | encoded = LSTM(self.hidden_dim[0], return_sequences=True, weights=self.autoencoder.model.layers[2].get_weights())(mask) 80 | self.hidden_representation = Model(inputData, encoded) 81 | 82 | def get_hidden_representation(self, 
data): 83 | return self.hidden_representation.predict(data) 84 | 85 | class Dense_Autoencoder(object): 86 | """docstring for LSTM_Autoencoder""" 87 | def __init__(self, input_dim, hidden_dim): 88 | self.input_dim = input_dim 89 | self.hidden_dim = hidden_dim 90 | self.autoencoder = Autoencoder() 91 | self.autoencoder.modelMasking('dense', [self.input_dim], self.hidden_dim) 92 | 93 | def compile(self): 94 | self.autoencoder.compile() 95 | 96 | def fit(self, data): 97 | self.autoencoder.fit(data, 'nor') 98 | 99 | def get_hidden_layer(self): 100 | # print "net summary: ", self.autoencoder.model.summary() 101 | self.hidden_representation = Sequential() 102 | self.hidden_representation.add(self.autoencoder.model.layers[0]) 103 | self.hidden_representation.add(self.autoencoder.model.layers[1]) 104 | self.hidden_representation.add(self.autoencoder.model.layers[2]) 105 | 106 | def get_hidden_representation(self, data): 107 | return self.hidden_representation.predict(data) 108 | 109 | 110 | 111 | 112 | -------------------------------------------------------------------------------- /Baselines/baseline_OCC.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Author: Panpan Zheng 3 | Date created: 1/15/2018 4 | Python Version: 2.7 5 | ''' 6 | 7 | import os 8 | import sys 9 | sys.path.append(os.getcwd() + "\\..\\..\\") 10 | import numpy as np 11 | from utils import sample_shuffle 12 | from base_classifiers import svm_oneclass, elliptic_envelope, iso_forest 13 | from sklearn.metrics import classification_report 14 | from sklearn.metrics import f1_score, accuracy_score 15 | import matplotlib.pyplot as plt 16 | 17 | from baseline_OCC_utils import * 18 | from base_classifiers import LSTM_Autoencoder, svm_oneclass, Dense_Autoencoder 19 | from model_components import train_gan, run_Gan,run_one_svm 20 | from utils import sample_shuffle, draw_trend, plot_decision_boundary, TSNE_2D_show_tri 21 | 22 | 23 | matlab_script_path = "C:\\Users\\Panpan_user\\Documents\\MATLAB\\" 24 | matlab_eng = matlab_engine_setup(matlab_script_path) 25 | 26 | # try_num = sys.argv[1:] 27 | # Load data and preprocess. 28 | en_ae = 1 # 1 for wiki; 2 for credit card with encoding; 3 for credit card without encoding. 29 | dra_tra_pro = True # Observe the training process along epochs, or run training then test it. 30 | 31 | if en_ae == 1: 32 | samples_path = os.getcwd() + "\\..\\..\\sampleData\\" 33 | f_ben, f_van = "X_v8_4_50_Ben", "X_v8_4_50_Van" 34 | x_ben, x_van = load_data(samples_path, f_ben, f_van) 35 | input_dim = 8 36 | hid_dim = [200] 37 | d_in = [200] 38 | epochs = 150 39 | elif en_ae == 2: 40 | x_ben, x_van = getDataCCFD("creditcard.csv.zip") 41 | x_ben = sample_shuffle(x_ben)[0:2000] 42 | input_dim = 30 43 | hid_dim = [100] 44 | d_in = [50] #autoencoding. 45 | epochs = 200 46 | else: 47 | x_ben, x_van = getDataCCFD("creditcard.csv.zip") 48 | x_ben = sample_shuffle(x_ben)[0:2000] 49 | input_dim = 30 50 | d_in = [input_dim] # without autoencoding. 51 | epochs = 200 52 | 53 | train_ratio = .7 54 | max_len = 50 55 | time_step = max_len 56 | g_in = [50] 57 | gan_in = [50] 58 | sampling_ratio = train_ratio 59 | neg_label_OCC = 2 60 | neg_label_GAN = 0 61 | iter_num = 10 62 | 63 | prec_coll = list() 64 | reca_coll = list() 65 | f1_score_coll = list() 66 | accuracy_coll = list() 67 | 68 | for i in range(iter_num): 69 | if en_ae == 1: # LSTM-autoencoder for wiki data. 
70 | x_train_P, x_ben_P, x_van_P, weights_P, __, __ = sampling_preprocessing_LSTM_AE(x_ben, x_van, train_ratio, max_len) 71 | lstm_ae = LSTM_Autoencoder(input_dim, time_step, hid_dim) 72 | lstm_ae.compile() 73 | lstm_ae.fit(x_train_P, weights_P) 74 | lstm_ae.get_hidden_layer_last_step() 75 | ben_hid_repre, van_hid_repre = map(lambda x: lstm_ae.get_hidden_representation(x), [x_ben_P, x_van_P]) 76 | ben_hid_repre, van_hid_repre = map(lambda x: preprocess_minus_1_and_pos_1(x), [ben_hid_repre, van_hid_repre]) 77 | elif en_ae == 2: # Dense encoder for Credit Card data. 78 | dense_ae = Dense_Autoencoder(input_dim, hid_dim) 79 | dense_ae.compile() 80 | dense_ae.fit(x_ben[0:700]) 81 | dense_ae.get_hidden_layer() 82 | ben_hid_repre, van_hid_repre = map(lambda x: dense_ae.get_hidden_representation(x), [x_ben, x_van]) 83 | ben_hid_repre, van_hid_repre = map(lambda x: preprocess_minus_1_and_pos_1(x), [ben_hid_repre, van_hid_repre]) 84 | # np.save("ben_hid_repre_r%s"%i, ben_hid_repre) 85 | # np.save("van_hid_repre_r%s"%i, van_hid_repre) 86 | else: 87 | ben_hid_repre, van_hid_repre = map(lambda x: preprocess_minus_1_and_pos_1(x), [x_ben, x_van]) 88 | np.save("ben_raw_r%s"%i, ben_hid_repre) 89 | np.save("van_raw_r%s"%i, van_hid_repre) 90 | 91 | x_train, x_test, y_train_OCC, y_test_OCC, y_test_GAN = \ 92 | sampling_data_for_OCC(ben_hid_repre, van_hid_repre, sampling_ratio, neg_label_OCC, neg_label_GAN, en_ae) 93 | 94 | GAN, D, G = get_GAN(g_in, d_in, gan_in) 95 | if dra_tra_pro: 96 | D, X_fake, D_real_prob, D_fake_prob, D_val_prob, fake_real_mse, f1_score = \ 97 | train_and_test(GAN, G, D, x_train, x_test, y_test_GAN, en_ae, epochs) 98 | x_test_ben = x_test[y_test_GAN == 1] 99 | x_test_van = x_test[y_test_GAN != 1] 100 | x_test_ben = sample_shuffle(x_test_ben) 101 | x_test_van = sample_shuffle(x_test_van) 102 | X_fake = sample_shuffle(X_fake) 103 | X = x_test_ben[0:1000].tolist() + X_fake[0:1000].tolist() + x_test_van[0:1000].tolist() 104 | y = np.ones(1000).tolist() + np.zeros(1000).tolist() + (np.ones(1000)+1).tolist() 105 | X, y = np.array(X), np.array(y) 106 | TSNE_2D_show_tri(X, y) 107 | draw_trend(D_real_prob, D_fake_prob, D_val_prob, fake_real_mse, f1_score) 108 | exit(0) 109 | else: 110 | discriminator = train_gan(GAN, G, D, x_train, epochs, en_ae) 111 | prec_gan, reca_gan, f1_gan, acc_gan = run_Gan(x_test, y_test_GAN, discriminator, en_ae) 112 | 113 | prec_OCC, reca_OCC, f1_OCC, acc_OCC = run_OCC(x_train, x_test, y_train_OCC, y_test_OCC, matlab_eng, i, en_ae) 114 | if en_ae == 1: 115 | clf = svm_oneclass(x_train[0:7000]) 116 | else: 117 | clf = svm_oneclass(x_train[0:700]) 118 | prec_svm, reca_svm, f1_svm, acc_svm = run_one_svm(x_test, y_test_OCC, clf, en_ae) 119 | 120 | prec_coll.append([prec_gan] + prec_OCC + [prec_svm]) 121 | reca_coll.append([reca_gan] + reca_OCC + [reca_svm]) 122 | f1_score_coll.append([f1_gan] + f1_OCC + [f1_svm]) 123 | accuracy_coll.append([acc_gan] + acc_OCC + [acc_svm]) 124 | 125 | prec_coll, reca_coll, f1_score_coll, accuracy_coll = \ 126 | np.array(prec_coll), np.array(reca_coll), np.array(f1_score_coll), np.array(accuracy_coll) 127 | 128 | print "====================== precision =================================" 129 | 130 | print "prec_gan: ", map(lambda x: decimal_precision(x, 4), [np.mean(prec_coll[:,0]), 131 | np.std(prec_coll[:,0])]) 132 | print "prec_gpoc: ", map(lambda x: decimal_precision(x, 4), [np.mean(prec_coll[:,1]), 133 | np.std(prec_coll[:,1])]) 134 | 135 | print "prec_nn: ", map(lambda x: decimal_precision(x, 4), [np.mean(prec_coll[:,2]), 136 | 
np.std(prec_coll[:,2])]) 137 | 138 | print "prec_scikit_svm: ", map(lambda x: decimal_precision(x, 4), [np.mean(prec_coll[:,3]), 139 | np.std(prec_coll[:,3])]) 140 | 141 | print "====================== recall =================================" 142 | 143 | print "reca_gan: ", map(lambda x: decimal_precision(x, 4), [np.mean(reca_coll[:,0]), 144 | np.std(reca_coll[:,0])]) 145 | print "reca_gpoc: ", map(lambda x: decimal_precision(x, 4), [np.mean(reca_coll[:,1]), 146 | np.std(reca_coll[:,1])]) 147 | print "reca_nn: ", map(lambda x: decimal_precision(x, 4), [np.mean(reca_coll[:,2]), 148 | np.std(reca_coll[:,2])]) 149 | print "reca_scikit_svm: ", map(lambda x: decimal_precision(x, 4), [np.mean(reca_coll[:,3]), 150 | np.std(reca_coll[:,3])]) 151 | 152 | print "===================== f1 score ================================" 153 | print "f1_score_gan: ", map(lambda x: decimal_precision(x, 4), [np.mean(f1_score_coll[:,0]), 154 | np.std(f1_score_coll[:,0])]) 155 | print "f1_score_gpoc: ", map(lambda x: decimal_precision(x, 4), [np.mean(f1_score_coll[:,1]), 156 | np.std(f1_score_coll[:,1])]) 157 | print "f1_score_nn: ", map(lambda x: decimal_precision(x, 4), [np.mean(f1_score_coll[:,2]), 158 | np.std(f1_score_coll[:,2])]) 159 | print "f1_scikit_svm: ", map(lambda x: decimal_precision(x, 4), [np.mean(f1_score_coll[:,3]), 160 | np.std(f1_score_coll[:,3])]) 161 | 162 | print "====================== accuracy =================================" 163 | 164 | print "acc_gan: ", map(lambda x: decimal_precision(x, 4), [np.mean(accuracy_coll[:,0]), 165 | np.std(accuracy_coll[:,0])]) 166 | print "acc_gpoc: ", map(lambda x: decimal_precision(x, 4), [np.mean(accuracy_coll[:,1]), 167 | np.std(accuracy_coll[:,1])]) 168 | print "acc_nn: ", map(lambda x: decimal_precision(x, 4), [np.mean(accuracy_coll[:,2]), 169 | np.std(accuracy_coll[:,2])]) 170 | print "acc_scikit_svm: ", map(lambda x: decimal_precision(x, 4), [np.mean(accuracy_coll[:,3]), 171 | np.std(accuracy_coll[:,3])]) 172 | exit(0) 173 | 174 | 175 | 176 | 177 | -------------------------------------------------------------------------------- /Baselines/baseline_OCC_utils.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Author: Panpan Zheng 3 | Date created: 1/15/2018 4 | Python Version: 2.7 5 | ''' 6 | 7 | import os 8 | import sys 9 | sys.path.append("..\\..\\") 10 | sys.path.append(os.getcwd() + "\\..\\..\\") 11 | 12 | 13 | import numpy as np 14 | from data_generation import bw_one_and_minus_one 15 | from model_components import get_generator, get_discriminator, make_gan, train_and_test 16 | from utils import sample_shuffle 17 | from sklearn.metrics import classification_report 18 | from sklearn.metrics import accuracy_score 19 | import matplotlib.pyplot as plt 20 | 21 | from keras.preprocessing.sequence import pad_sequences 22 | from base_classifiers import LSTM_Autoencoder 23 | from keras.layers import Input 24 | import scipy.io as sio 25 | import matlab.engine 26 | import pandas as pd 27 | from sklearn import preprocessing 28 | 29 | 30 | def draw_f1_accuracy(f1_score, accuracy, ind): 31 | fig = plt.figure() 32 | axes = plt.gca() 33 | plt.subplot(2, 1, 1) 34 | plt.plot(ind, f1_score, "ro-") 35 | plt.ylabel('f1_score') 36 | axes.set_xlim([1., 20.]) 37 | plt.subplot(2, 1, 2) 38 | plt.plot(ind, accuracy, "bo-") 39 | plt.ylabel('accuracy') 40 | plt.xlabel('Round #') 41 | axes.set_xlim([1., 20.]) 42 | plt.show() 43 | 44 | def load_data(data_path, f_ben, f_van): 45 | data_ben = np.load(data_path + "%s.npy"%f_ben) 46 | 
data_van = np.load(data_path + "%s.npy"%f_van) 47 | return data_ben, data_van 48 | 49 | def preprocess_minus_1_and_pos_1(X): 50 | return np.array(map(lambda x: bw_one_and_minus_one(x), X)) 51 | 52 | def sampling_preprocessing_LSTM_AE(x_ben, x_van, train_ratio, max_len): 53 | n_samples_train = int(x_ben.shape[0] * train_ratio) 54 | # x_train = sample_shuffle(x_ben)[0:n_samples_train] # shuffle and sampling data 55 | x_ben = sample_shuffle(x_ben) 56 | x_van = sample_shuffle(x_van) 57 | x_train = x_ben[0:n_samples_train] 58 | weights = get_sample_weights(x_train) # contruct the sample weights for LSTM-AE output 59 | 60 | return seq_padding(x_train, max_len, 'pre'), \ 61 | seq_padding(x_ben, max_len, 'pre'), \ 62 | seq_padding(x_van, max_len, 'pre'), \ 63 | seq_padding(weights, max_len, 'post'), \ 64 | map(lambda x: len(x), x_ben),\ 65 | map(lambda x: len(x), x_van)# padding sequence, 66 | # 'pre' for editting sequence 67 | # 'post' for weights sequence 68 | 69 | 70 | def sampling_data_for_OCC(x_ben, x_van, sampling_ratio, neg_label1, neg_label2, en_ae): 71 | n_samples_train = int(len(x_ben) * sampling_ratio) 72 | if en_ae == 1: 73 | n_samples_test = len(x_ben) - n_samples_train 74 | else: 75 | n_samples_test = len(x_van) 76 | # n_samples_train = int(x_ben.shape[0] * sampling_ratio) 77 | # n_samples_test = x_ben.shape[0] - n_samples_train 78 | # assert n_samples_test <= x_van.shape[0] 79 | # assert n_samples_test <= len(x_van) 80 | x_ben, x_van = sample_shuffle(x_ben), sample_shuffle(x_van) 81 | x_train = x_ben[0:n_samples_train] 82 | x_test = x_ben[-n_samples_test:].tolist() + x_van[-n_samples_test:].tolist() 83 | x_test = np.array(x_test) 84 | y_train_OCC = np.ones(n_samples_train) 85 | y_test_OCC = np.ones(2 * n_samples_test) 86 | y_test_OCC[n_samples_test:] = neg_label1 87 | y_test_GAN = np.ones(2 * n_samples_test) 88 | y_test_GAN[n_samples_test:] = neg_label2 89 | return x_train, x_test, \ 90 | y_train_OCC, y_test_OCC, y_test_GAN 91 | 92 | def decimal_precision(x, digit_num): 93 | if "e" in str(x): 94 | x_decimal = x 95 | else: 96 | itgr_part, frac_part = str(x).split(".") 97 | if len(frac_part) > digit_num: 98 | x_decimal = itgr_part + "." + frac_part[0:digit_num] 99 | else: 100 | x_decimal = itgr_part + "." 
+ frac_part 101 | return float(x_decimal) 102 | 103 | def conf_mat_f1_accuracy(y_test, y_pred, tgt_nam1, tgt_nam2): 104 | conf_mat = classification_report(y_test, y_pred, target_names=[tgt_nam1, tgt_nam2], digits=4) 105 | f1 = float(filter(None, conf_mat[-50:].strip().split(" "))[-2]) # avarage f1 of tgt_nam1 and tgt_nam2 106 | acc = accuracy_score(y_test, y_pred) 107 | f1, acc = map(lambda x: decimal_precision(x, 4), [f1, acc]) 108 | return conf_mat, f1, acc 109 | 110 | 111 | def get_sample_weights(samples): 112 | sampleWeights = list() 113 | for e in samples: 114 | sampleWeights.append(np.ones(len(e))) 115 | return sampleWeights 116 | 117 | def seq_padding(sample_sequence, max_length, padding_type): 118 | return pad_sequences(sample_sequence, maxlen=max_length, dtype='float', padding=padding_type) 119 | 120 | def get_GAN(g_in, d_in, gan_in): 121 | G_in = Input(shape=g_in) 122 | G, G_out = get_generator(G_in, d_in[0]) 123 | # discriminator (x -> y) 124 | D_in = Input(shape=d_in) 125 | D, D_out = get_discriminator(D_in) 126 | GAN_in = Input(shape=gan_in) 127 | GAN, GAN_out = make_gan(GAN_in, G, D) 128 | return GAN, D, G 129 | 130 | def matlab_engine_setup(matlab_script_path): 131 | eng = matlab.engine.start_matlab() 132 | eng.addpath(matlab_script_path, nargout=0) 133 | eng.addpath(matlab_script_path + "netlab3_2\\", nargout=0) 134 | eng.addpath(matlab_script_path + "NDtoolv0.12\\", nargout=0) 135 | eng.addpath(matlab_script_path + "NDtoolv0.12\\Netlab\\", nargout=0) 136 | return eng 137 | 138 | def run_OCC(x_train, x_test, y_train_OCC, y_test_OCC, eng, i, en_ae): 139 | # nd_type = ['gpoc', 'svmsch', 'nn', 'kpca'] 140 | nd_type = ['gpoc', 'nn'] 141 | mat_store_path = os.getcwd() + "\\..\\..\\hidden_representation\\mat_OCC\\" 142 | prec_container, reca_container, f1_container, acc_container = list(), list(), list(), list() 143 | X = np.concatenate((x_train, x_test)) 144 | y = np.concatenate((y_train_OCC, y_test_OCC)) 145 | sio.savemat(mat_store_path + "X_hid_emd_4_50_8_200_r%s.mat"%i, dict(x=X, y=y)) 146 | for tp in nd_type: 147 | prec, reca, f1, acc = eng.run_baseline(mat_store_path + "X_hid_emd_4_50_8_200_r%s.mat"%i, tp, en_ae, nargout=4) 148 | prec_container.append(prec) 149 | reca_container.append(reca) 150 | f1_container.append(f1) 151 | acc_container.append(acc) 152 | 153 | return prec_container, reca_container, f1_container, acc_container 154 | 155 | def sampling_data_for_dynamic(x_ben, x_van, sampling_ratio, neg_label): 156 | n_samples_train = int(len(x_ben) * sampling_ratio) 157 | n_samples_test = len(x_ben) - n_samples_train 158 | assert n_samples_test <= x_van.shape[0] 159 | x_ben, x_van = sample_shuffle(x_ben), sample_shuffle(x_van) 160 | x_train = x_ben[0:n_samples_train] 161 | x_test = x_ben[-n_samples_test:].tolist() + x_van[-n_samples_test:].tolist() 162 | x_test = np.array(x_test) 163 | y_test_GAN = np.ones(2 * n_samples_test) 164 | y_test_GAN[n_samples_test:] = neg_label 165 | return x_train, x_test, y_test_GAN 166 | 167 | def getDataCCFD(f_name): 168 | data = pd.read_csv(f_name) 169 | X = data.loc[: ,data.columns!='Class'] 170 | X.loc[:,'Time'] = (X.loc[:,'Time'].values/3600)%24 171 | y = data.loc[:,'Class'] 172 | min_max_scaler = preprocessing.MinMaxScaler() 173 | X = min_max_scaler.fit_transform(X.values) 174 | y = y.values 175 | return X[y==0], X[y==1] 176 | -------------------------------------------------------------------------------- /Baselines/baseline_VEWS_utils.py: -------------------------------------------------------------------------------- 1 | ''' 2 | 
Author: Panpan Zheng 3 | Date created: 1/15/2018 4 | Python Version: 2.7 5 | ''' 6 | 7 | import os 8 | import sys 9 | sys.path.append(os.getcwd() + "\\..\\OCC\\") 10 | sys.path.append(os.getcwd() + "\\..\\..\\") 11 | import numpy as np 12 | from utils import sample_shuffle 13 | from baseline_OCC_utils import get_sample_weights, seq_padding, bw_one_and_minus_one 14 | 15 | def sample_shuffle_with_label(X,y): 16 | n_samples = X.shape[0] 17 | s = np.arange(n_samples) 18 | np.random.shuffle(s) 19 | return X[s], y[s] 20 | 21 | def sampling_preprocessing_LSTM_AE(x_ben, x_van, train_ratio, max_len): 22 | 23 | assert train_ratio < 1. 24 | n_samples_train = int(x_ben.shape[0] * train_ratio) 25 | 26 | assert n_samples_train <= x_van.shape[0] 27 | x_train = sample_shuffle(x_ben)[0:n_samples_train].tolist() + \ 28 | sample_shuffle(x_van)[0:n_samples_train].tolist() 29 | x_train = sample_shuffle(np.array(x_train)) 30 | weights = get_sample_weights(x_train) # contruct the sample weights for LSTM-AE output. 31 | return seq_padding(x_train, max_len, 'pre'), \ 32 | seq_padding(x_ben, max_len, 'pre'), \ 33 | seq_padding(x_van, max_len, 'pre'), \ 34 | seq_padding(weights, max_len, 'post') # 'post' for weights sequence 35 | 36 | def sampling_data_for_VEWS(x_ben, x_van): 37 | 38 | y_ben, y_van = np.ones(x_ben.shape[0]), np.zeros(x_van.shape[0]) 39 | x_ben, y_ben = sample_shuffle_with_label(x_ben, y_ben) 40 | x_van, y_van = sample_shuffle_with_label(x_van, y_van) 41 | return x_ben, x_van, y_ben, y_van 42 | 43 | def k_fold_indices(n_samples, i, step): 44 | indices = np.arange(n_samples) 45 | test_indices = xrange(i * step, (i + 1) * step) 46 | train_indices = np.setdiff1d(indices, test_indices) 47 | return test_indices, train_indices 48 | -------------------------------------------------------------------------------- /Baselines/data_generation.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Author: Panpan Zheng 3 | Date created: 1/15/2018 4 | Python Version: 2.7 5 | ''' 6 | 7 | import os 8 | import sys 9 | import numpy as np 10 | import glob 11 | from libs import Dataset 12 | from utils import getPageDict, MetaPageList,sample_shuffle 13 | from shutil import rmtree 14 | 15 | from keras.preprocessing.sequence import pad_sequences 16 | from keras.models import Sequential 17 | from libs import Autoencoder 18 | 19 | 20 | def gen_samples(): 21 | """ 22 | extract features(f1~f7) from Wikipedia data repository released by "VEWS" (http://www.cs.umd.edu/~vs/vews), and 23 | construct samples used in our experiment. 24 | :return: data samples with variant editing length(4~50) and fixed-length(10-step). 25 | 26 | f1~f7: 27 | f1: whether or not pi is a meta-page 28 | f2: if f1 is yes, whether or not pi's category is empty set. if not, f2=0 29 | f3: whether or not time difference is less than 3 minutes between pi-1 and pi 30 | f4: whether or not pi has already been edited. (re-edit) 31 | f5: if f4 is yes, whether or not pi equals to pi-1. (consective re-edit). if no, f5=0 32 | f6: if f4 is no, whether or not pi has the common category with pi-1. if yes, f6 = 1 33 | f7(optional): whether or not edits are reverted. This information is 34 | from Wikipedia auto-bots, such as cluebot, for bad editing revert. 
35 | """ 36 | 37 | base = os.getcwd() 38 | dataRepo = base + "\\Dataset\\" 39 | if not os.path.exists(dataRepo): 40 | raise OSError("data repository is not available.") 41 | else: 42 | f_pages = dataRepo + "pages.tsv" 43 | f_users = dataRepo + "users.tsv" 44 | files = glob.glob(dataRepo + "*.csv") 45 | data = Dataset() 46 | 47 | rawData = base + "\\rawData\\" 48 | if not os.path.exists(rawData): 49 | os.makedirs(rawData) 50 | data.getRawData(files, f_users, f_pages, .7, rawData) 51 | 52 | sampleData = base + "\\sampleData\\" 53 | # if os.path.exists(sampleData): 54 | # rmtree(sampleData) 55 | # os.makedirs(sampleData) 56 | 57 | X_tim = np.load(rawData + "wikiEditSeq_0.7\\X_tim.npy") 58 | X_pages = np.load(rawData + "wikiEditSeq_0.7\\X_pages.npy") 59 | X_rev = np.load(rawData + "wikiEditSeq_0.7\\X_rev.npy") 60 | y = np.load(rawData + "wikiEditSeq_0.7\\y.npy") 61 | page2id, page2Cgr, _ = getPageDict(f_pages) 62 | 63 | np.save(sampleData + "MetaPageList.npy", MetaPageList(files,page2id)) 64 | metaDict = np.load(sampleData + "MetaPageList.npy") 65 | 66 | # excluding 'revert' information. 67 | data.getSamples(X_pages, X_tim, y, metaDict, page2Cgr, 68 | "fix", 20, None, sampleData) 69 | # data.getSamples(X_pages, X_tim, y, metaDict, page2Cgr, 70 | # "var", 4, 50, sampleData) 71 | 72 | # including 'revert' information. 73 | data.getSamples(X_pages, X_tim, X_rev, y, metaDict, page2Cgr, 74 | "fix", 20, None, sampleData) 75 | # data.getSamples(X_pages, X_tim, X_rev, y, metaDict, page2Cgr, 76 | # "var", 4, 50, sampleData) 77 | # data.getSamples(X_pages, X_tim, X_rev, y, metaDict, page2Cgr, 78 | # "var", 1, 20, sampleData) 79 | 80 | 81 | def gen_hid_repre(fea_dim, hid_dim, fix_or_var, step_length): 82 | 83 | """ 84 | :param fea_dim: input dimension of LSTM-AE model 85 | :param hid_dim: output dimension of hidden representation 86 | :param fix_or_var: editing sequence is fixed-length or variant-length. 87 | :return: fixed-length hidden representation of editing sequence. 
88 | """ 89 | base_path = os.getcwd() 90 | samples_path = base_path + "\\sampleData\\" 91 | repre_path = base_path + "\\hidden_representation\\" 92 | if not os.path.exists(repre_path): 93 | os.makedirs(repre_path) 94 | 95 | if fix_or_var == 1: 96 | # Load data 97 | x_ben = np.load(samples_path + "X_%s_1_20_Ben.npy" %fea_dim) 98 | x_van = np.load(samples_path + "X_%s_1_20_Van.npy" %fea_dim) 99 | # print x_ben.shape, x_van.shape 100 | # exit(0) 101 | x_ben = sample_shuffle(x_ben)[0:6000] 102 | x_van = sample_shuffle(x_van)[0:3000] 103 | train_ben = x_ben[0:3000] 104 | 105 | # Fit Model 106 | timesteps = 20 107 | input_dim = fea_dim 108 | 109 | autoencoder = Autoencoder() 110 | autoencoder.model('lstm', [timesteps, input_dim], hid_dim) 111 | autoencoder.compile() 112 | autoencoder.fit(train_ben, "rev") 113 | 114 | hidModel = Sequential() 115 | hidModel.add(autoencoder.model.layers[0]) 116 | hidModel.add(autoencoder.model.layers[1]) 117 | 118 | ben_hid_emd = hidModel.predict(x_ben) 119 | van_hid_emd = hidModel.predict(x_van) 120 | 121 | # store data 122 | np.save(repre_path + "ben_hid_emd_20_%s_%s" % (fea_dim, hid_dim[0]), ben_hid_emd) 123 | np.save(repre_path + "van_hid_emd_20_%s_%s" % (fea_dim, hid_dim[0]), van_hid_emd) 124 | 125 | elif fix_or_var == 0: 126 | if step_length == 20: 127 | x_ben = np.load(samples_path + "X_%s_1_20_Ben.npy" % fea_dim) 128 | x_van = np.load(samples_path + "X_%s_1_20_Van.npy" % fea_dim) 129 | x_ben = sample_shuffle(x_ben) # 16496 130 | x_van = sample_shuffle(x_van) # 17015 131 | # train_ben = np.concatenate((x_ben[0:10000], x_van[0:10000])) # mix samples for baseline 'latent representation.' 132 | train_ben = x_ben[0:10000] 133 | 134 | sampleWeights = list() 135 | for e in train_ben: 136 | sampleWeights.append(np.ones(len(e))) 137 | 138 | train_ben_P = pad_sequences(train_ben, maxlen=20, dtype='float') 139 | x_ben_P = pad_sequences(x_ben, maxlen=20, dtype='float') 140 | x_van_P = pad_sequences(x_van, maxlen=20, dtype='float') 141 | 142 | # decoding sequence is reversed 143 | sampleWeights = pad_sequences(sampleWeights, maxlen=20, dtype='float', padding='post') 144 | 145 | timesteps = 20 146 | input_dim = fea_dim 147 | autoencoder = Autoencoder() 148 | autoencoder.modelMasking('lstm', [timesteps, input_dim], hid_dim) 149 | autoencoder.compile('temporal') 150 | autoencoder.fit(train_ben_P, 'rev', sampleWeights) 151 | 152 | hidModel = Sequential() 153 | hidModel.add(autoencoder.model.layers[0]) 154 | hidModel.add(autoencoder.model.layers[1]) 155 | hidModel.add(autoencoder.model.layers[2]) 156 | 157 | ben_hid_emd = hidModel.predict(x_ben_P) 158 | van_hid_emd = hidModel.predict(x_van_P) 159 | 160 | # store data 161 | # np.save(repre_path + "ben_hid_emd_mix_1_20_%s_%s" % (fea_dim, hid_dim[0]), ben_hid_emd) 162 | # np.save(repre_path + "val_hid_emd_mix_1_20_%s_%s" % (fea_dim, hid_dim[0]), van_hid_emd) 163 | 164 | elif step_length == 50: 165 | 166 | x_ben = np.load(samples_path + "X_v%s_4_50_Ben.npy" %fea_dim) 167 | x_van = np.load(samples_path + "X_v%s_4_50_Van.npy" %fea_dim) 168 | x_ben = sample_shuffle(x_ben) 169 | x_van = sample_shuffle(x_van) 170 | train_ben = x_ben[0:7000] 171 | 172 | sampleWeights = list() 173 | for e in train_ben: 174 | sampleWeights.append(np.ones(len(e))) 175 | 176 | train_ben_P = pad_sequences(train_ben, maxlen=50, dtype='float') 177 | x_ben_P = pad_sequences(x_ben, maxlen=50, dtype='float') 178 | x_van_P = pad_sequences(x_van, maxlen=50, dtype='float') 179 | 180 | # decoding sequence is reversed 181 | sampleWeights = 
pad_sequences(sampleWeights, maxlen=50, dtype='float', padding='post') 182 | 183 | timesteps = 50 184 | input_dim = fea_dim 185 | autoencoder = Autoencoder() 186 | autoencoder.modelMasking('lstm', [timesteps, input_dim], hid_dim) 187 | autoencoder.compile('temporal') 188 | autoencoder.fit(train_ben_P, 'rev', sampleWeights) 189 | 190 | hidModel = Sequential() 191 | hidModel.add(autoencoder.model.layers[0]) 192 | hidModel.add(autoencoder.model.layers[1]) 193 | hidModel.add(autoencoder.model.layers[2]) 194 | 195 | ben_hid_emd = hidModel.predict(x_ben_P) 196 | van_hid_emd = hidModel.predict(x_van_P) 197 | 198 | return ben_hid_emd, van_hid_emd 199 | 200 | def bw_one_and_minus_one(x): 201 | return ((x-min(x))/float((max(x)-min(x))))*2 - 1 202 | 203 | 204 | -------------------------------------------------------------------------------- /Baselines/latent_repre_explore.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Author: Panpan Zheng 3 | Date created: 1/15/2018 4 | Python Version: 2.7 5 | ''' 6 | 7 | import numpy as np 8 | from sklearn.cluster import DBSCAN 9 | from sklearn import metrics 10 | from sklearn.datasets.samples_generator import make_blobs 11 | from sklearn.preprocessing import StandardScaler 12 | from numpy.random import multivariate_normal 13 | from representation_libs import db_span, get_eps, cluster_analyis, DB_statistics 14 | import json 15 | from utils import sample_shuffle 16 | from sklearn.manifold import TSNE 17 | from mpl_toolkits.mplot3d import axes3d 18 | import matplotlib.pyplot as plt 19 | from scipy.spatial import distance 20 | 21 | from matplotlib.axes import Axes 22 | 23 | x_ben = np.load("ben_hid_repre.npy") 24 | x_van = np.load("van_hid_repre.npy") 25 | 26 | x_fake = sample_shuffle(np.load("x_fake.npy"))[0:len(x_van)] 27 | 28 | X = np.concatenate((x_ben, x_van, x_fake)) 29 | y = np.concatenate((np.ones(len(x_ben)), np.zeros(len(x_van)), np.ones(len(x_fake)) + 1)) 30 | eps_X = get_eps(X) 31 | 32 | clusters, outlier = db_span(X, 1.4305, 180) 33 | 34 | # clusters, outlier = db_span(X, eps_X*.48, 180) 35 | # print "eps: ", eps_X*.48 36 | 37 | 38 | 39 | cluster_X = list() 40 | cluster_y = list() 41 | cluster_c = list() 42 | for cluster_id, class_ids in clusters.items(): 43 | cluster_X.extend(X[class_ids]) 44 | cluster_y.extend(y[class_ids]) 45 | cluster_c.extend((np.zeros(np.sum(class_ids))+cluster_id).tolist()) 46 | cluster_X, cluster_y, cluster_c = np.array(cluster_X), np.array(cluster_y), np.array(cluster_c) 47 | np.save("cluster_X", cluster_X) 48 | np.save("cluster_y", cluster_y) 49 | np.save("cluster_c", cluster_c) 50 | 51 | cluster_label = list() 52 | cluster_samples = list() 53 | for cid in set(cluster_c): 54 | cluster_label.append(cluster_y[cluster_c == cid]) 55 | cluster_samples.append(cluster_X[cluster_c == cid]) 56 | 57 | for i, e in enumerate(cluster_label): 58 | tmp = np.array([np.sum(e == 1), np.sum(e == 0), np.sum(e == 2)]) 59 | print "cluster %s: "%i, tmp, tmp/float(np.sum(tmp)), np.sum(tmp) 60 | 61 | for i in np.arange(len(cluster_samples)): 62 | for j in np.arange(len(cluster_samples)): 63 | if i != j: 64 | inter_dist = distance.euclidean(np.mean(cluster_samples[i], axis=0), 65 | np.mean(cluster_samples[j], axis=0)) 66 | print "cluster %s & %s: %s"%(i, j, inter_dist) 67 | 68 | 69 | print "*****************************************************************" 70 | 71 | i += 1 72 | print "Outlier components: " 73 | outlier_y = y[outlier] 74 | outlier_component = np.array([np.sum(outlier_y == 1), 
np.sum(outlier_y == 0), np.sum(outlier_y == 2)]) 75 | print "cluster %s: " % i, outlier_component, outlier_component / float(np.sum(outlier_component)), np.sum(outlier_component) 76 | -------------------------------------------------------------------------------- /Baselines/libs.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Author: Panpan Zheng 3 | Date created: 1/15/2018 4 | Python Version: 2.7 5 | ''' 6 | 7 | import os 8 | import sys 9 | sys.path.append(".") 10 | import numpy as np 11 | from shutil import rmtree 12 | from sklearn import preprocessing as pp 13 | from utils import getUserDict, getPageDict, TleRevTim, TimeDiff, IsMetaPage, encode, train_test_split 14 | 15 | from keras.layers import Input, Dense, LSTM, RepeatVector, Embedding 16 | from keras.models import Model, Sequential 17 | from keras.layers.core import Activation, Dense, Masking 18 | import theano.tensor as T 19 | from keras.callbacks import EarlyStopping 20 | from keras import regularizers 21 | 22 | 23 | 24 | 25 | class Dataset(object): 26 | """docstring for Dataset""" 27 | def __init__(self): 28 | super(Dataset, self).__init__() 29 | 30 | # def getSamples(self, X_pages, X_tim, X_rev, y, metaDict, page2Cgr, seqType, seqLenLow, seqLenUp, storePath): 31 | def getSamples(self, *args): 32 | 33 | if len(args) == 10: 34 | self.X_pages = args[0] 35 | self.X_tim = args[1] 36 | self.X_rev = args[2] 37 | self.y = args[3] 38 | self.metaDict = args[4] 39 | self.page2Cgr = args[5] 40 | self.seqType = args[6] 41 | self.seqLenLow = args[7] 42 | self.seqLenUp = args[8] 43 | self.storePath = args[9] 44 | self.flag = 1 45 | elif len(args) == 9: 46 | self.X_pages = args[0] 47 | self.X_tim = args[1] 48 | self.y = args[2] 49 | self.metaDict = args[3] 50 | self.page2Cgr = args[4] 51 | self.seqType = args[5] 52 | self.seqLenLow = args[6] 53 | self.seqLenUp = args[7] 54 | self.storePath = args[8] 55 | self.flag = 0 56 | 57 | self.X = list() 58 | # print self.flag 59 | # exit(0) 60 | for i, pages in enumerate(self.X_pages): 61 | 62 | tims = self.X_tim[i] 63 | if self.flag: 64 | revs = self.X_rev[i] 65 | isMetaTem = list() 66 | timDiffTem1 = list() 67 | timDiffTem2 = list() 68 | timDiffTem3 = list() 69 | reEditTem = list() 70 | consEditTem = list() 71 | comCgrTem = list() 72 | metEmptyTem = list() 73 | for j, page in enumerate(pages): 74 | # meta-page ? 75 | isMetaTem.append(IsMetaPage(page, self.metaDict)) 76 | # meta-page is empty ? 77 | if IsMetaPage(page, self.metaDict): 78 | if not eval(self.page2Cgr[page]): 79 | metEmptyTem.append(1) 80 | else: 81 | metEmptyTem.append(0) 82 | else: 83 | metEmptyTem.append(0) 84 | 85 | if j == 0: 86 | timDiffTem1.append(0) 87 | timDiffTem2.append(0) 88 | timDiffTem3.append(0) 89 | reEditTem.append(0) 90 | consEditTem.append(0) 91 | comCgrTem.append(0) 92 | else: 93 | # time difference < 1, 3, 15 mins ? 94 | timDiffTem1.append(TimeDiff(tims[j-1], tims[j], 1)) 95 | timDiffTem2.append(TimeDiff(tims[j-1], tims[j], 3)) 96 | timDiffTem3.append(TimeDiff(tims[j-1], tims[j], 15)) 97 | # has it been edited before? 98 | if page in pages[0:j]: 99 | reEditTem.append(1) 100 | # Is it a consecutive edit ? 101 | if page == pages[j-1]: 102 | consEditTem.append(1) 103 | else: 104 | consEditTem.append(0) 105 | else: 106 | reEditTem.append(0) 107 | consEditTem.append(0) 108 | # share common category ? 
109 | if eval(self.page2Cgr[page]).intersection( 110 | eval(self.page2Cgr[pages[j-1]])): 111 | comCgrTem.append(1) 112 | else: 113 | comCgrTem.append(0) 114 | 115 | X_idvl = list() 116 | 117 | if self.flag: 118 | for j, e in enumerate(isMetaTem): 119 | X_code = list() 120 | attrs = [e, 121 | timDiffTem1[j], 122 | timDiffTem2[j], 123 | timDiffTem3[j], 124 | reEditTem[j], 125 | consEditTem[j], 126 | comCgrTem[j], 127 | metEmptyTem[j], 128 | revs[j] 129 | ] 130 | for attr in attrs: 131 | X_code.extend(encode(attr, 2)) 132 | X_idvl.append(X_code) 133 | else: 134 | for j, e in enumerate(isMetaTem): 135 | X_code = list() 136 | attrs = [e, 137 | timDiffTem1[j], 138 | timDiffTem2[j], 139 | timDiffTem3[j], 140 | reEditTem[j], 141 | consEditTem[j], 142 | comCgrTem[j], 143 | metEmptyTem[j] 144 | ] 145 | for attr in attrs: 146 | X_code.extend(encode(attr, 2)) 147 | X_idvl.append(X_code) 148 | 149 | self.X.append(pp.normalize(X_idvl, axis=1)) 150 | 151 | self.X = np.array(self.X) 152 | X_value = list() 153 | y_value = list() 154 | 155 | if self.seqType == 'fix': 156 | for i, e in enumerate(self.X): 157 | if self.seqLenLow >= len(e): 158 | X_value.append(e) 159 | else: 160 | X_value.append(e[0:self.seqLenLow]) 161 | y_value.append(self.y[i]) 162 | elif self.seqType == 'var': 163 | for i, e in enumerate(self.X): 164 | if self.seqLenUp >= len(e) >= self.seqLenLow: 165 | X_value.append(e) 166 | y_value.append(self.y[i]) 167 | # X_val_P = pad_sequences(X_val, maxlen=self.seqLenUp, dtype='float') 168 | 169 | X_value = np.array(X_value) 170 | y_value = np.array(y_value) 171 | 172 | # print X_val.shape 173 | # print y_val.shape 174 | 175 | x_benign = [X_value[i] for i, e in enumerate(y_value) if e == 0] 176 | x_vandal = [X_value[i] for i, e in enumerate(y_value) if e == 1] 177 | 178 | x_benign, x_vandal = np.array(x_benign), np.array(x_vandal) 179 | 180 | if self.seqType == 'fix': 181 | if self.flag: 182 | np.save(self.storePath + "X_18_1_20_Ben.npy", x_benign) 183 | np.save(self.storePath + "X_18_1_20_Van.npy", x_vandal) 184 | else: 185 | np.save(self.storePath + "X_16_1_20_Ben.npy", x_benign) 186 | np.save(self.storePath + "X_16_1_20_Van.npy", x_vandal) 187 | elif self.seqType == 'var': 188 | if self.flag: 189 | np.save(self.storePath + "X_v8_%s_%s_Ben.npy"%(self.seqLenLow,self.seqLenUp), x_benign) 190 | np.save(self.storePath + "X_v8_%s_%s_Van.npy"%(self.seqLenLow,self.seqLenUp), x_vandal) 191 | else: 192 | np.save(self.storePath + "X_v6_%s_%s_Ben.npy"%(self.seqLenLow,self.seqLenUp), x_benign) 193 | np.save(self.storePath + "X_v6_%s_%s_Van.npy"%(self.seqLenLow,self.seqLenUp), x_vandal) 194 | 195 | 196 | 197 | def getRawData(self, files, f_users, f_pages, splRatio, basePath): 198 | 199 | self.files = files 200 | self.splRatio = splRatio 201 | self.basePath = basePath 202 | 203 | directory = self.basePath + "wikiEditSeq" + "_" + str(splRatio) 204 | if os.path.exists(directory): 205 | rmtree(directory) 206 | os.makedirs(directory) 207 | directory += "\\" 208 | 209 | # load user-page information from dictionary file and raw data. 210 | user2id, user2Label, id2user = getUserDict(f_users) 211 | page2id, __, __ = getPageDict(f_pages) 212 | titleSet, revSet, timSet = TleRevTim(self.files,user2id,page2id) 213 | 214 | # user2Label = getLabel(filesB,filesV,user2id,0,1) 215 | 216 | X_usrs = list() 217 | y = list() 218 | X_pages = list() 219 | X_tim = list() 220 | X_rev = list() 221 | 222 | 223 | for usrid in titleSet: # to keep userid, label, pageid, revert, editing-time consistent. 
224 | X_usrs.append(usrid) 225 | y.append(user2Label[usrid]) 226 | X_pages.append(titleSet[usrid]) 227 | X_rev.append(revSet[usrid]) 228 | X_tim.append(timSet[usrid]) 229 | 230 | X_len = [len(x) for x in X_pages] 231 | 232 | X_usrs, y, X_pages, X_rev, X_tim, X_len = np.array(X_usrs), np.array(y), \ 233 | np.array(X_pages), np.array(X_rev), np.array(X_tim), np.array(X_len) 234 | 235 | 236 | X_usrs_train, y_train, X_pages_train, X_tim_train, X_rev_train, X_len_train, \ 237 | X_usrs_test, y_test, X_pages_test, X_tim_test, X_rev_test, X_len_test = \ 238 | train_test_split(X_usrs, y, X_pages, X_tim, X_rev, X_len, splRatio) 239 | 240 | np.save(directory + "X_usrs.npy", X_usrs) 241 | np.save(directory + "y.npy", y) 242 | np.save(directory + "X_pages.npy", X_pages) 243 | np.save(directory + "X_rev.npy", X_rev) 244 | np.save(directory + "X_tim.npy", X_tim) 245 | np.save(directory + "X_len.npy", X_len) 246 | 247 | np.save(directory + "X_usrs_train.npy", X_usrs_train) 248 | np.save(directory + "y_train.npy", y_train) 249 | np.save(directory + "X_pages_train.npy", X_pages_train) 250 | np.save(directory + "X_tim_train.npy", X_tim_train) 251 | np.save(directory + "X_rev_train.npy", X_rev_train) 252 | np.save(directory + "X_len_train.npy", X_len_train) 253 | 254 | np.save(directory + "X_usrs_test.npy", X_usrs_test) 255 | np.save(directory + "y_test.npy", y_test) 256 | np.save(directory + "X_pages_test.npy", X_pages_test) 257 | np.save(directory + "X_tim_test.npy", X_tim_test) 258 | np.save(directory + "X_rev_test.npy", X_rev_test) 259 | np.save(directory + "X_len_test.npy", X_len_test) 260 | 261 | 262 | 263 | class Autoencoder(object): 264 | """docstring for Autoencoder""" 265 | # def __init__(self, sampleWeights, sample_weight_mode): 266 | def __init__(self): 267 | # super(Autoencoder, self).__init__() 268 | # self.codeLayerType = 'dense' 269 | self.nb_epoch = 20 270 | self.batch_size = 256 271 | self.shuffle = True 272 | self.validation_split = 0.05 273 | self.optimizer = 'adadelta' 274 | self.loss = 'mse' 275 | # self.sampleWeights = sampleWeights 276 | # self.sample_weight_mode = sample_weight_mode 277 | 278 | 279 | def model(self, codeLayerType, inputDim, codeDim): 280 | 281 | self.codeLayerType = codeLayerType 282 | assert len(codeDim) > 0 283 | 284 | if self.codeLayerType == 'lstm': 285 | assert len(inputDim) == 2 286 | inputData = Input(shape=(inputDim[0],inputDim[1])) 287 | 288 | if len(codeDim) == 1: 289 | encoded = LSTM(codeDim[0])(inputData) 290 | decoded = RepeatVector(inputDim[0])(encoded) 291 | elif len(codeDim) > 1: 292 | encoded = inputData 293 | for i, units in enumerate(codeDim): 294 | if i == len(codeDim) - 1: 295 | encoded = LSTM(units)(encoded) 296 | continue 297 | encoded = LSTM(units, return_sequences=True)(encoded) 298 | 299 | for i, units in enumerate(reversed(codeDim)): 300 | if i == 1: 301 | decoded = LSTM(units, return_sequences=True)(RepeatVector(inputDim[0])(encoded)) 302 | elif i > 1: 303 | decoded = LSTM(units, return_sequences=True)(decoded) 304 | else: 305 | raise ValueError("The codDim must be over 0.") 306 | 307 | decoded = LSTM(inputDim[-1], return_sequences=True)(decoded) 308 | self.model = Model(inputData, decoded) 309 | 310 | elif self.codeLayerType == 'dense': 311 | assert len(inputDim) == 1 312 | inputData = Input(shape=(inputDim[0],)) 313 | encoded = inputData 314 | for i, units in enumerate(codeDim): 315 | encoded = Dense(units, activation='relu')(encoded) 316 | decoded = Dense(inputDim[-1], activation='sigmoid')(encoded) 317 | self.model = Model(inputData, 
decoded) 318 | 319 | elif self.codeLayerType == 'cov': 320 | pass 321 | 322 | 323 | def modelMasking(self, codeLayerType, inputDim, codeDim): 324 | 325 | self.codeLayerType = codeLayerType 326 | assert len(codeDim) > 0 327 | 328 | if self.codeLayerType == 'lstm': 329 | assert len(inputDim) == 2 330 | inputData = Input(shape=(inputDim[0],inputDim[1])) 331 | mask = Masking(mask_value=0.)(inputData) 332 | if len(codeDim) == 1: 333 | encoded = LSTM(codeDim[0])(mask) 334 | decoded = RepeatVector(inputDim[0])(encoded) 335 | elif len(codeDim) > 1: 336 | encoded = mask 337 | for i, units in enumerate(codeDim): 338 | if i == len(codeDim) - 1: 339 | encoded = LSTM(units)(encoded) 340 | continue 341 | encoded = LSTM(units, return_sequences=True)(encoded) 342 | 343 | for i, units in enumerate(reversed(codeDim)): 344 | if i == 1: 345 | decoded = LSTM(units, return_sequences=True)(RepeatVector(inputDim[0])(encoded)) 346 | elif i > 1: 347 | decoded = LSTM(units, return_sequences=True)(decoded) 348 | else: 349 | raise ValueError("The codDim must be over 0.") 350 | 351 | decoded = LSTM(inputDim[-1], return_sequences=True)(decoded) 352 | self.model = Model(inputData, decoded) 353 | 354 | elif self.codeLayerType == 'cov': 355 | pass 356 | elif self.codeLayerType == 'dense': 357 | assert len(inputDim) == 1 358 | inputData = Input(shape=(inputDim[0],)) 359 | # encoded = inputData 360 | # for i, units in enumerate(codeDim): 361 | # encoded = Dense(units, activation='relu')(encoded) 362 | # decoded = Dense(inputDim[-1], activation='sigmoid')(encoded) 363 | # self.model = Model(inputData, decoded) 364 | encoder = Dense(codeDim[0], activation="tanh", 365 | activity_regularizer=regularizers.l1(10e-5))(inputData) 366 | encoder = Dense(int(codeDim[0]/2), activation="relu")(encoder) 367 | decoder = Dense(int(codeDim[0]/2), activation='tanh')(encoder) 368 | decoder = Dense(inputDim[0], activation='relu')(decoder) 369 | self.model = Model(inputData, decoder) 370 | 371 | def compile(self, *args): 372 | 373 | if len(args) == 0: 374 | self.model.compile(optimizer=self.optimizer, loss=self.loss) 375 | elif len(args) == 1: 376 | if args[0] == 'temporal': 377 | self.sample_weight_mode = args[0] 378 | self.model.compile(optimizer=self.optimizer, loss=self.loss, sample_weight_mode=self.sample_weight_mode) 379 | elif args[0] == 'customFunction': 380 | self.model.compile(optimizer=self.optimizer, loss= self.weighted_vector_mse) 381 | else: 382 | raise ValueError("Invalid maskType, please input 'sampleWeights' or 'customFunction'") 383 | else: 384 | raise ValueError("argument # must be 0 or 1.") 385 | 386 | 387 | def fit(self, *args): 388 | 389 | # early_stopping = EarlyStopping(monitor='val_loss', min_delta=0.01, patience=3, verbose=1, mode='auto') 390 | if len(args) == 2: 391 | if args[1] == 'nor': 392 | self.model.fit(args[0], 393 | args[0], 394 | nb_epoch=self.nb_epoch, 395 | batch_size=self.batch_size, 396 | shuffle=self.shuffle, 397 | validation_split=self.validation_split) 398 | # callbacks = [early_stopping]) 399 | elif args[1] == 'rev': 400 | self.model.fit(args[0], 401 | np.flip(args[0], 1), 402 | nb_epoch=self.nb_epoch, 403 | batch_size=self.batch_size, 404 | shuffle=self.shuffle, 405 | validation_split=self.validation_split) 406 | # callbacks=[early_stopping]) 407 | else: 408 | raise ValueError("decoding sequence type: 'normal' or 'reverse'.") 409 | 410 | elif len(args) == 3: 411 | self.sampleWeights = args[2] 412 | if args[1] == 'nor': 413 | self.model.fit(args[0], 414 | args[0], 415 | nb_epoch=self.nb_epoch, 416 | 
batch_size=self.batch_size, 417 | shuffle=self.shuffle, 418 | validation_split=self.validation_split, 419 | sample_weight=self.sampleWeights) 420 | # callbacks=[early_stopping]) 421 | elif args[1] == 'rev': 422 | self.model.fit(args[0], 423 | np.flip(args[0], 1), 424 | nb_epoch=self.nb_epoch, 425 | batch_size=self.batch_size, 426 | shuffle=self.shuffle, 427 | validation_split=self.validation_split, 428 | sample_weight=self.sampleWeights) 429 | # callbacks=[early_stopping]) 430 | else: 431 | raise ValueError("Please input, 'data', 'nor' or 'rev', 'sample_weights'") 432 | 433 | def predict(self, data): 434 | return self.model.predict(data) 435 | 436 | def weighted_vector_mse(self, y_true, y_pred): 437 | 438 | self.y_true = y_true 439 | self.y_pred = y_pred 440 | 441 | weight = T.ceil(self.y_true) 442 | loss = T.square(weight * (self.y_true - self.y_pred)) 443 | # use appropriate relations for other objectives. E.g, for binary_crossentropy: 444 | #loss = weights * (y_true * T.log(y_pred) + (1.0 - y_true) * T.log(1.0 - y_pred)) 445 | return T.mean(T.sum(loss, axis=-1)) 446 | 447 | 448 | 449 | 450 | -------------------------------------------------------------------------------- /Baselines/model_components.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Author: Panpan Zheng 3 | Date created: 1/15/2018 4 | Python Version: 2.7 5 | ''' 6 | 7 | import os 8 | import sys 9 | import numpy as np 10 | from keras.callbacks import EarlyStopping 11 | from keras.layers import Input 12 | from keras.models import Model, Sequential 13 | from keras.layers.core import Dense 14 | from keras.layers import Reshape, Flatten, LeakyReLU, Activation 15 | from keras_adversarial.legacy import l1l2 16 | from sklearn.metrics import classification_report 17 | 18 | 19 | 20 | def get_generator(G_in, output_dim, hidden_dim=100, reg=lambda: l1l2(1e-5, 1e-5)): 21 | 22 | x = Dense(int(hidden_dim), name="generator_h1", W_regularizer=reg())(G_in) 23 | x = LeakyReLU(0.2)(x) 24 | x = Dense(output_dim, name="generator_x_flat", W_regularizer=reg())(x) 25 | G_out = Activation('tanh')(x) 26 | # G_out = Activation('sigmoid')(x) 27 | G = Model(G_in, G_out) 28 | G.compile(loss='binary_crossentropy', optimizer='adam') 29 | return G, G_out 30 | 31 | def get_discriminator(D_in, hidden_dim=50, reg=lambda: l1l2(1e-5, 1e-5)): 32 | 33 | x = Dense(hidden_dim * 2, name="discriminator_h1",W_regularizer=reg())(D_in) 34 | x = LeakyReLU(0.2)(x) 35 | x = Dense(hidden_dim, name="discriminator_h2",W_regularizer=reg())(x) 36 | x = LeakyReLU(0.2)(x) 37 | x = Dense(1, name="discriminator_y", W_regularizer=reg())(x) 38 | D_out = Activation("sigmoid")(x) 39 | D = Model(D_in, D_out) 40 | D.compile(loss='binary_crossentropy', optimizer='sgd') 41 | return D, D_out 42 | 43 | # Freeze weights in the discriminator for stacked training 44 | def set_trainable(model, trainable): 45 | model.trainable = trainable 46 | for l in model.layers: 47 | l.trainable = trainable 48 | 49 | # Build stacked GAN model 50 | def make_gan(GAN_in, G, D): 51 | set_trainable(D, False) 52 | x = G(GAN_in) 53 | GAN_out = D(x) 54 | GAN = Model(GAN_in, GAN_out) 55 | GAN.compile(loss='binary_crossentropy', optimizer='adam') 56 | return GAN, GAN_out 57 | 58 | # Training Procedure Definition 59 | def sample_data_and_gen(XT, G, noise_dim=50): 60 | n_samples = XT.shape[0] 61 | s = np.arange(2*n_samples) 62 | np.random.shuffle(s) 63 | XN_noise = np.random.normal(0, 1, size=[n_samples, noise_dim]) 64 | XN = G.predict(XN_noise) 65 | X = np.concatenate((XT, 
XN)) 66 | y = np.ones(2*n_samples) 67 | y[n_samples:] = 0 68 | X = X[s] 69 | y = y[s] 70 | return X, y 71 | 72 | def sample_noise(n_samples, noise_dim=50): 73 | X = np.random.normal(0, 1, size=[n_samples, noise_dim]) 74 | y = np.ones(n_samples) 75 | return X, y 76 | 77 | def pretrain(G, D, XT, batch_size=50): 78 | X, y = sample_data_and_gen(XT, G, noise_dim=50) 79 | set_trainable(D, True) 80 | D.fit(X, y, nb_epoch=1, batch_size=batch_size) 81 | 82 | def batch_divide(X, batch_size): 83 | q = np.divide(X, batch_size) 84 | r = np.remainder(X, batch_size) 85 | return q, r 86 | 87 | def train_and_test(GAN, G, D, XT, x_test, y_test, en_ae, epochs, verbose=True, v_freq=10): 88 | if en_ae == 1: 89 | XT = XT[0:7000] 90 | x_test, y_test = random_sampling_test_data(x_test, y_test, 3000) 91 | batch_size = 700 92 | elif en_ae == 2: 93 | XT = XT[0:700] 94 | x_test, y_test = random_sampling_test_data(x_test, y_test, 490) 95 | batch_size = 70 96 | D_fake_prob = list() 97 | D_real_prob = list() 98 | D_val_prob = list() 99 | fake_real_mse = list() 100 | f1_score_coll = list() 101 | 102 | # D_loss = list() 103 | # G_loss = list() 104 | # accuracy = list() 105 | 106 | for epoch in range(epochs): 107 | X, y = sample_data_and_gen(XT, G, noise_dim=50) 108 | X_real = X[y == 1] 109 | X_fake = X[y == 0] 110 | d_loss = list() 111 | g_loss = list() 112 | q, r = batch_divide(X_real.shape[0], batch_size) 113 | for i in range(q): 114 | set_trainable(D, True) 115 | d_loss.append(D.train_on_batch(np.array( 116 | X_real[i * batch_size:(i + 1) * batch_size].tolist() + 117 | X_fake[i * batch_size:(i + 1) * batch_size].tolist() 118 | ), 119 | np.array( 120 | np.ones(batch_size).tolist() + np.zeros(batch_size).tolist() 121 | ))) 122 | 123 | set_trainable(D, False) 124 | X_gan, y_gan = sample_noise(batch_size, 50) 125 | g_loss.append(GAN.train_on_batch(X_gan,y_gan)) 126 | 127 | if r != 0: 128 | set_trainable(D, True) 129 | d_loss.append(D.train_on_batch( 130 | np.array( 131 | X_real[-r:].tolist() + X_fake[-r:].tolist() 132 | ), 133 | np.array( 134 | np.ones(r).tolist() + np.zeros(r).tolist() 135 | ))) 136 | set_trainable(D, False) 137 | X_r, y_r = sample_noise(r, 50) 138 | g_loss.append(GAN.train_on_batch(X_r,y_r)) 139 | 140 | fake_real_mse.append(np.mean(np.sqrt((X_real-X_fake)**2))) 141 | D_fake_prob.append(np.mean(D.predict(X_fake))) 142 | D_real_prob.append(np.mean(D.predict(X_real))) 143 | D_val_prob.append(np.mean(D.predict(x_test[y_test==0]))) 144 | # D_loss.append(np.mean(d_loss)) 145 | # G_loss.append(np.mean(g_loss)) 146 | 147 | y_pred = (D.predict(x_test) > .5).astype(int).flatten() 148 | conf_mat = classification_report(y_test, y_pred, target_names=['vandal', 'benign'], digits=4) 149 | f1_score = float(filter(None, conf_mat.strip().split(" "))[7]) 150 | f1_score_coll.append(f1_score) 151 | # print "epoch: ", epoch, " ", filter(None, conf_mat[-50:].strip().split(" "))[-2] 152 | # print "epoch:%s"%epoch 153 | # print conf_mat 154 | # f1_score.append(float(filter(None, conf_mat.strip().split(" "))[7])) 155 | # acc = np.sum(y_pred == y_test)/float(y_pred.shape[0]) 156 | # accuracy.append(acc) 157 | return D, X_fake, D_real_prob, D_fake_prob, D_val_prob, fake_real_mse, f1_score_coll 158 | 159 | 160 | def random_sampling_test_data(x_test, y_test,n_samples=3000): 161 | x_test_ben = x_test[y_test == 1] 162 | x_test_van = x_test[y_test != 1] 163 | assert x_test_ben.shape[0] == x_test_van.shape[0] 164 | assert x_test_ben.shape[0] >= n_samples 165 | # s = np.arange(x_test_ben.shape[0]) 166 | # np.random.shuffle(s) 167 | # s = 
s[:n_samples] 168 | x_test = np.concatenate((x_test_ben[0:n_samples], x_test_van[0:n_samples])) 169 | y_test = np.ones(2 * n_samples) 170 | y_test[n_samples:] = 0 171 | return x_test, y_test 172 | 173 | 174 | def train_gan(GAN, G, D, XT, epochs, en_ae, verbose=True, v_freq=10): 175 | if en_ae == 1: 176 | XT = XT[0:7000] 177 | batch_size = 700 178 | else: 179 | XT = XT[0:700] 180 | batch_size = 70 181 | for epoch in range(epochs): 182 | X, y = sample_data_and_gen(XT, G, noise_dim=50) 183 | X_real = X[y == 1] 184 | X_fake = X[y == 0] 185 | d_loss = list() 186 | g_loss = list() 187 | q, r = batch_divide(X_real.shape[0], batch_size) 188 | for i in range(q): 189 | set_trainable(D, True) 190 | d_loss.append(D.train_on_batch(np.array( 191 | X_real[i * batch_size:(i + 1) * batch_size].tolist() + 192 | X_fake[i * batch_size:(i + 1) * batch_size].tolist() 193 | ), 194 | np.array( 195 | np.ones(batch_size).tolist() + np.zeros(batch_size).tolist() 196 | ))) 197 | 198 | set_trainable(D, False) 199 | X_gan, y_gan = sample_noise(batch_size, 50) 200 | g_loss.append(GAN.train_on_batch(X_gan,y_gan)) 201 | 202 | if r != 0: 203 | set_trainable(D, True) 204 | d_loss.append(D.train_on_batch( 205 | np.array( 206 | X_real[-r:].tolist() + X_fake[-r:].tolist() 207 | ), 208 | np.array( 209 | np.ones(r).tolist() + np.zeros(r).tolist() 210 | ))) 211 | set_trainable(D, False) 212 | X_r, y_r = sample_noise(r, 50) 213 | g_loss.append(GAN.train_on_batch(X_r,y_r)) 214 | return D 215 | 216 | 217 | def run_Gan(x_test, y_test, D, en_ae): 218 | if en_ae == 1: 219 | x_test, y_test = random_sampling_test_data(x_test, y_test, 3000) 220 | else: 221 | x_test, y_test = random_sampling_test_data(x_test, y_test, 490) 222 | 223 | y_pred = (D.predict(x_test) > .5).astype(int).flatten() 224 | conf_mat = classification_report(y_test, y_pred, target_names=['vandal', 'benign'], digits=4) 225 | acc = np.sum(y_pred == y_test) / float(y_pred.shape[0]) 226 | print conf_mat 227 | return np.array(filter(None, conf_mat.strip().split(" "))[5]).astype(float),\ 228 | np.array(filter(None, conf_mat.strip().split(" "))[6]).astype(float), \ 229 | np.array(filter(None, conf_mat.strip().split(" "))[7]).astype(float), acc 230 | 231 | def run_one_svm(x_test, y_test, clf, en_ae): 232 | if en_ae == 1: 233 | N = 3000 234 | else: 235 | N = 490 236 | x_test_svm = np.concatenate((x_test[y_test == 1][0:N], x_test[y_test == 2][0:N])) 237 | y_test_svm = np.concatenate((np.ones(N), np.zeros(N)-1)) 238 | y_pred = clf.predict(x_test_svm) 239 | conf_mat = classification_report(y_test_svm, y_pred, target_names=['vandal', 'benign'], digits=4) 240 | acc = np.sum(y_pred == y_test_svm) / float(y_pred.shape[0]) 241 | 242 | return np.array(filter(None, conf_mat.strip().split(" "))[5]).astype(float),\ 243 | np.array(filter(None, conf_mat.strip().split(" "))[6]).astype(float), \ 244 | np.array(filter(None, conf_mat.strip().split(" "))[7]).astype(float), acc 245 | -------------------------------------------------------------------------------- /Baselines/representation_libs.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Author: Panpan Zheng 3 | Date created: 1/15/2018 4 | Python Version: 2.7 5 | ''' 6 | 7 | import numpy as np 8 | from sklearn.cluster import DBSCAN 9 | from collections import defaultdict 10 | 11 | def db_span(X, eps, min_samples): 12 | db = DBSCAN(eps=eps, min_samples=min_samples).fit(X) 13 | outlier = None 14 | cluster_label = dict() 15 | for label_id in set(db.labels_): 16 | if label_id == -1: 17 | outlier = 
(db.labels_ == label_id) 18 | continue 19 | cluster_label[label_id] = (db.labels_ == label_id) 20 | return cluster_label, outlier 21 | 22 | def cluster_analyis(cluster_label): 23 | dict_of_cluster = dict() 24 | for label_id in cluster_label: 25 | dict_of_cluster[label_id] = np.sum(cluster_label[label_id]) 26 | return dict_of_cluster 27 | 28 | def get_eps(X): 29 | X_contre = np.mean(X, axis=0) 30 | diff_to_contre = X - X_contre 31 | dist_to_contre = list(map(lambda x: np.sqrt(np.sum(x ** 2)), diff_to_contre)) 32 | return np.mean(dist_to_contre) 33 | 34 | def DB_statistics(X, eps, min_num_samples): 35 | cluster_index, outlier_index = db_span(X, eps, min_num_samples) 36 | dict_of_cluster = cluster_analyis(cluster_index) 37 | print("num_of_cluster: ", len(dict_of_cluster.keys())) 38 | print("cluster_list ", dict_of_cluster) 39 | print("outlier_rate: ", np.sum(outlier_index), np.sum(outlier_index)/ float(len(outlier_index))) 40 | -------------------------------------------------------------------------------- /Baselines/run_baseline.m: -------------------------------------------------------------------------------- 1 | % Author: Panpan Zheng 2 | % Date created: 1/15/2018 3 | 4 | function [precision_neg, recall_neg, f1_neg, accuracy] = run_baseline(file_url,NDtype,en_ae) 5 | 6 | % type_list = ['dist', 'nn', 'kmeans', 'parzen', 'gmm', 'svmTax', 'gpoc', 'kde', 'som', 'pca', 'kpca']; 7 | 8 | data = load(file_url); 9 | X = data.x; 10 | y = data.y; 11 | 12 | %% Sampling training, validating, and testing set. 13 | 14 | isnor = y == 1; % regard class 1 as normal. 15 | isab = ~isnor; 16 | [traindataNorOri, validdataNorOri, testdataNorOri, validdataAbOri, testdataAbOri] = splitData(X,isab,en_ae); 17 | 18 | size(traindataNorOri, 1); 19 | size(testdataNorOri, 1); 20 | size(testdataAbOri, 1); 21 | size(validdataNorOri, 1); 22 | size(validdataAbOri, 1); 23 | 24 | %% Training, validating, and testing. 
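% The one-class detectors below are fitted on traindataNorOri only, i.e. on
% normal (benign) samples; no abnormal data is used during model fitting.
% The switch simply dispatches to the matching train_* routine for NDtype.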
25 | 26 | switch lower(NDtype) 27 | case 'gpoc' 28 | trained_model = train_gpoc(traindataNorOri); 29 | case 'svmsch' 30 | trained_model = train_svmsch(traindataNorOri); 31 | case 'nn' 32 | trained_model = train_nn(traindataNorOri); 33 | case 'kpca' 34 | trained_model = train_kpca(traindataNorOri); 35 | case 'svmtax' 36 | trained_model = train_svmtax(traindataNorOri); 37 | case 'pca' 38 | trained_model = train_pca(traindataNorOri); 39 | case 'kde' 40 | trained_model = train_kde(traindataNorOri); 41 | end 42 | 43 | 44 | %% Testing 45 | 46 | switch lower(NDtype) 47 | case 'gpoc' 48 | output_nor = out_gpoc(testdataNorOri, trained_model); 49 | output_ab = out_gpoc(testdataAbOri, trained_model); 50 | case 'svmsch' 51 | output_nor = out_svmsch(testdataNorOri, trained_model); 52 | output_ab = out_svmsch(testdataAbOri, trained_model); 53 | case 'nn' 54 | output_nor = out_nn(testdataNorOri, trained_model); 55 | output_ab = out_nn(testdataAbOri, trained_model); 56 | case 'kpca' 57 | output_nor = out_kpca(testdataNorOri, trained_model); 58 | output_ab = out_kpca(testdataAbOri, trained_model); 59 | case 'svmtax' 60 | output_nor = out_svmtax(testdataNorOri, trained_model); 61 | output_ab = out_svmtax(testdataAbOri, trained_model); 62 | case 'pca' 63 | output_nor = out_pca(testdataNorOri, trained_model); 64 | output_ab = out_pca(testdataAbOri, trained_model); 65 | case 'kde' 66 | output_nor = out_kde(testdataNorOri, trained_model); 67 | output_ab = out_kde(testdataAbOri, trained_model); 68 | end 69 | 70 | 71 | %% Validation (threshold) 72 | 73 | 74 | switch lower(NDtype) 75 | case 'gpoc' 76 | [~, optthr] = minErr_thr(trained_model, validdataNorOri, validdataAbOri, 'gpoc'); 77 | case 'svmsch' 78 | [~, optthr] = minErr_thr(trained_model, validdataNorOri, validdataAbOri, 'svmsch'); 79 | case 'nn' 80 | [~, optthr] = minErr_thr(trained_model, validdataNorOri, validdataAbOri, 'nn'); 81 | case 'kpca' 82 | [~, optthr] = minErr_thr(trained_model, validdataNorOri, validdataAbOri, 'kpca'); 83 | case 'svmtax' 84 | [~, optthr] = minErr_thr(trained_model, validdataNorOri, validdataAbOri, 'svmtax'); 85 | case 'pca' 86 | [~, optthr] = minErr_thr(trained_model, validdataNorOri, validdataAbOri, 'pca'); 87 | case 'kde' 88 | [~, optthr] = minErr_thr(trained_model, validdataNorOri, validdataAbOri, 'kde'); 89 | end 90 | 91 | 92 | %% Get label 93 | switch lower(NDtype) 94 | case 'gpoc' 95 | pred_nor = assignCls('gpoc', output_nor, optthr); 96 | pred_ab = assignCls('gpoc', output_ab, optthr); 97 | case 'svmsch' 98 | pred_nor = assignCls('svmsch', output_nor, optthr); 99 | pred_ab = assignCls('svmsch', output_ab, optthr); 100 | case 'nn' 101 | pred_nor = assignCls('nn', output_nor, optthr); 102 | pred_ab = assignCls('nn', output_ab, optthr); 103 | case 'kpca' 104 | pred_nor = assignCls('kpca', output_nor, optthr); 105 | pred_ab = assignCls('kpca', output_ab, optthr); 106 | case 'svmtax' 107 | pred_nor = assignCls('svmtax', output_nor, optthr); 108 | pred_ab = assignCls('svmtax', output_ab, optthr); 109 | case 'pca' 110 | pred_nor = assignCls('pca', output_nor, optthr); 111 | pred_ab = assignCls('pca', output_ab, optthr); 112 | case 'kde' 113 | pred_nor = assignCls('kde', output_nor, optthr); 114 | pred_ab = assignCls('kde', output_ab, optthr); 115 | end 116 | 117 | pred_labels = [pred_nor; pred_ab]; 118 | 119 | 120 | 121 | %% Compute confusion matrix of test data. 
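% Target coding for the test set: 0 for normal (benign) points, 1 for
% abnormal ones (tar_nor / tar_ab below). Per-class precision, recall and F1
% are then derived from the confusion matrix, and the *_neg metrics together
% with the overall accuracy are the values returned by run_baseline.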
122 | 123 | tar_nor = zeros(size(output_nor, 1), 1); 124 | tar_ab = ones(size(output_ab, 1), 1); 125 | tar_labels = [tar_nor; tar_ab]; 126 | 127 | 128 | [conf, ~] = confmat(pred_labels, tar_labels); % predTest and tarTest are 0-1 coding. 129 | fprintf('\n'); 130 | disp(conf); 131 | fprintf('\n'); 132 | % fprintf('Confusion matrix using test data is:\n'); 133 | % disp(conf); 134 | accuracy = (conf(1,1) + conf(2,2)) / sum(conf(:)); % accuracy = rate(1) 135 | 136 | precision_pos = conf(1,1)/(conf(1,1) + conf(2,1)); 137 | precision_neg = conf(2,2)/(conf(1,2) + conf(2,2)); 138 | precision = (precision_pos + precision_neg)/2; 139 | 140 | recall_pos = conf(1,1)/(conf(1,1) + conf(1,2)); 141 | recall_neg = conf(2,2)/(conf(2,1) + conf(2,2)); 142 | recall = (recall_pos + recall_neg)/2; 143 | 144 | f1_pos = 2 * ((precision_pos*recall_pos)/(precision_pos + recall_pos)); 145 | f1_neg = 2 * ((precision_neg*recall_neg)/(precision_neg + recall_neg)); 146 | 147 | f1 = (f1_pos + f1_neg)/2; 148 | -------------------------------------------------------------------------------- /Baselines/splitData.m: -------------------------------------------------------------------------------- 1 | % Author: Panpan Zheng 2 | % Date created: 1/15/2018 3 | 4 | function [traindataNor, validdataNor, testdataNor, validdataAb, testdataAb] = splitData(alldata, isab, en_ae) 5 | %SPLITDATA Split data into three groups: training, validation, and test set. 6 | % [traindataNor, validdataNor, testdataNor, validdataAb, testdataAb] = splitData(alldata, isab) 7 | % 8 | % Inputs: 9 | % allData: a matrix, ntrain by nftrs 10 | % isab: flag abnormal cases, ndata by 1 11 | % 12 | % Ouputs: 13 | % traindataNor data to be used for training 14 | % validdataNor, validdataAb data to be used for validation 15 | % testdataNor, testdataAb data to be used for testing 16 | % 17 | % See also demoND 18 | 19 | normaldata = alldata(~isab, :); 20 | abnormaldata = alldata(isab, :); 21 | 22 | numdata = size(alldata, 1); 23 | numAb = sum(isab); 24 | numNor = numdata - numAb; 25 | 26 | 27 | % fprintf('%d \n', numdata); 28 | % fprintf('%d \n', numAb); 29 | % fprintf('%d \n', numNor); 30 | 31 | if en_ae == 1 32 | traindataNor = normaldata(1:7000, :); 33 | testdataNor = normaldata(7001:10000, :); 34 | testdataAb = abnormaldata(1:3000, :); 35 | validdataAb = abnormaldata(3000:end, :); 36 | validdataNor = normaldata(10001:10000+size(validdataAb,1), :); 37 | else 38 | traindataNor = normaldata(1:700, :); 39 | testdataNor = normaldata(701:1190, :); 40 | testdataAb = abnormaldata(1:490, :); 41 | validdataAb = abnormaldata(491:492, :); 42 | validdataNor = normaldata(1191:1190+size(validdataAb,1), :); 43 | end 44 | 45 | % permNor = randperm(numNor); 46 | % indTrainNor = permNor(1:7000); 47 | % traindataNor = normaldata(indTrainNor, :); 48 | % 49 | % 50 | % indTestNor = permNor(7001:10000); 51 | % % indValidNor = permNor(10001:10159); 52 | % testdataNor = normaldata(indTestNor, :); 53 | % permAb = randperm(numAb); 54 | % indTestAb = permAb(1:3000); 55 | % testdataAb = abnormaldata(indTestAb, :); 56 | % 57 | % 58 | % indValidAb = permAb(3001:end); 59 | % validdataAb = abnormaldata(indValidAb, :); 60 | % indValidNor = permNor(10001: 10000+size(validdataAb,1)); 61 | % validdataNor = normaldata(indValidNor, :); 62 | 63 | %% 64 | % if numNor > numAb * 2 65 | % howtosplit = 'balance'; % this is not that correct, used in my AUC paper in Poland conference. 66 | % % howtosplit = 'balanceByPts'; % this is the proper way; see BSP log 1.133. Implemented in RunND_byPts.m. 
67 | % else 68 | % howtosplit = 'percentage'; 69 | % end 70 | % 71 | % switch lower(howtosplit) 72 | % case 'percentage' 73 | % percentTrainNor = 0.6; % use 60% normal data for training. 74 | % percentValidNor = 0.2; % use 20% normal data for validation => use 20% normal data for testing. 75 | % numTrainNor = floor(percentTrainNor * numNor); 76 | % numValidNor = floor(percentValidNor * numNor); 77 | % numTestNor = numNor - numTrainNor - numValidNor; 78 | % 79 | % indValidAb = (1:numValidAb); 80 | % indTestAb = (numValidAb+1 : numAb); 81 | % 82 | % indTrainNor = (1 : numTrainNor); 83 | % indValidNor = (numTrainNor + 1 : numTrainNor + numValidNor); 84 | % indTestNor = (numTrainNor + numValidNor + 1 : numNor); 85 | % 86 | % case 'balance' % Use the same number of normal data for validation and test, in order to balance the data set. 87 | % numValidNor = numValidAb; 88 | % numTestNor = numTestAb; 89 | % numTrainNor = numNor - numValidNor - numTestNor; 90 | % 91 | % permAb = randperm(numAb); 92 | % indValidAb = permAb(1:numValidAb); 93 | % indTestAb = permAb(numValidAb+1 : end); 94 | % 95 | % permNor = randperm(numNor); 96 | % indTrainNor = permNor(1 : numTrainNor); 97 | % indValidNor = permNor(numTrainNor + 1 : numTrainNor + numValidNor); 98 | % indTestNor = permNor(numTrainNor + numValidNor + 1 : end); 99 | % end 100 | 101 | %% 102 | % normaldata = alldata(~isab, :); 103 | % abnormaldata = alldata(isab, :); 104 | % % Find training, validation and test data 105 | % traindataNor = normaldata(indTrainNor, :); % only use normal data for training. 106 | % validdataAb = abnormaldata(indValidAb, :); 107 | % testdataAb = abnormaldata(indTestAb, :); 108 | % validdataNor = normaldata(indValidNor, :); 109 | % testdataNor = normaldata(indTestNor, :); 110 | 111 | end 112 | -------------------------------------------------------------------------------- /Baselines/utils.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Author: Panpan Zheng 3 | Date created: 1/15/2018 4 | Python Version: 2.7 5 | ''' 6 | 7 | import numpy as np 8 | from numpy import random 9 | import pandas as pd 10 | from collections import defaultdict 11 | import datetime 12 | from datetime import * 13 | from numpy import linalg as LA 14 | import matplotlib.pyplot as plt 15 | from sklearn.manifold import TSNE 16 | 17 | 18 | 19 | # functions for data processing. 
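# The helpers below turn the raw user/page/edit CSVs into per-user sequences:
# getUserDict and getPageDict build id lookups, TleRevTim collects the edited
# page ids, revert flags and edit timestamps for every user, getLabel assigns
# benign/vandal labels, and train_test_split randomly partitions those
# per-user arrays into training and test sets.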
20 | 21 | def getUserDict(f_users): 22 | 23 | users_frame = pd.read_csv(f_users, sep='\t', header=None) 24 | users_frame = users_frame.applymap(str) 25 | user2id = users_frame.set_index(0)[1].to_dict() 26 | 27 | uid2type = users_frame.set_index(1)[4].to_dict() 28 | 29 | for uid in uid2type: 30 | if uid2type[uid] == "benign": 31 | uid2type[uid] = 0 32 | else: 33 | uid2type[uid] = 1 34 | 35 | id2user = users_frame.set_index(1)[0].to_dict() 36 | return user2id, uid2type, id2user 37 | 38 | 39 | def getPageDict(f_pages): 40 | 41 | pages_frame = pd.read_csv(f_pages, sep=',') 42 | pages_frame = pages_frame.applymap(str) 43 | page2id = pages_frame.set_index('pagetitle')['pageid'].to_dict() 44 | id2Cgr = pages_frame.set_index('pageid')['pagecategories'].to_dict() 45 | id2page = pages_frame.set_index('pageid')['pagetitle'].to_dict() 46 | return page2id, id2Cgr, id2page 47 | 48 | 49 | def TleRevTim(files,user2id,page2id): 50 | titleSet = defaultdict(list) 51 | revSet = defaultdict(list) 52 | timSet = defaultdict(list) 53 | for f in files: 54 | df = pd.read_csv(f,sep=',') 55 | for index, row in df.iterrows(): 56 | try: 57 | usrid = user2id[row['username']] 58 | except Exception as e: 59 | print row['username'] 60 | try: 61 | pageid = page2id[row['pagetitle']] 62 | except Exception as e: 63 | print row['pagetitle'] 64 | isRev = row['isReverted'] 65 | revTime = row['revtime'] 66 | titleSet[usrid].append(pageid) 67 | if isRev: 68 | revSet[usrid].append(1) 69 | else: 70 | revSet[usrid].append(0) 71 | timSet[usrid].append(revTime) 72 | return titleSet, revSet, timSet 73 | 74 | 75 | 76 | def getLabel(filesB,filesV,user2id,posLabel,negLabel): 77 | usrListB = list() 78 | usrListV = list() 79 | userid2Label = dict() 80 | for f in filesB: 81 | df = pd.read_csv(f,sep=',') 82 | for index, row in df.iterrows(): 83 | usrid = user2id[row['username']] 84 | usrListB.append(usrid) 85 | usrListB = list(set(usrListB)) 86 | for val in usrListB: 87 | userid2Label[val] = posLabel 88 | for f in filesV: 89 | df = pd.read_csv(f,sep=',') 90 | for index, row in df.iterrows(): 91 | usrid = user2id[row['username']] 92 | usrListV.append(usrid) 93 | usrListV = list(set(usrListV)) 94 | for val in usrListV: 95 | userid2Label[val] = negLabel 96 | return userid2Label 97 | 98 | 99 | def train_test_split(*agrs): 100 | 101 | trainRatio = agrs[-1] 102 | assert trainRatio <= 1.0 103 | size = agrs[0].shape[0] 104 | thre = int(trainRatio * 10) 105 | indSet = np.random.randint(0,10, size) + 1 106 | trainInd = [indSet <= thre] 107 | testInd = [indSet > thre] 108 | X_usrs, y, X_pages, X_tim, X_rev, X_len = \ 109 | agrs[0], agrs[1], agrs[2], agrs[3], agrs[4], agrs[5] 110 | return X_usrs[trainInd], y[trainInd], X_pages[trainInd], \ 111 | X_tim[trainInd], X_rev[trainInd], X_len[trainInd],\ 112 | X_usrs[testInd], y[testInd], X_pages[testInd], \ 113 | X_tim[testInd], X_rev[testInd], X_len[testInd] 114 | 115 | 116 | def MetaPageList(files,page2id): 117 | MPL = list() 118 | for f in files: 119 | df = pd.read_csv(f,sep=',') 120 | for index, row in df.iterrows(): 121 | try: 122 | pageid = page2id[row['pagetitle']] 123 | except Exception as e: 124 | print row['pagetitle'] 125 | title = row['pagetitle'].lower() 126 | if "user:" in title or "talk:" in title or "user talk:" in title or "wikipedia" in title: 127 | MPL.append(pageid) 128 | return np.array(list(set(MPL))) 129 | 130 | def encode(x, n): 131 | x = int(x) 132 | result = np.zeros(n).tolist() 133 | result[x] = 1. 
134 | return result 135 | 136 | def TimeDiff(p1, p2, timDiff): 137 | p1 = datetime.strptime(p1, '%Y-%m-%dT%XZ') 138 | p2 = datetime.strptime(p2, '%Y-%m-%dT%XZ') 139 | td = p2 - p1 140 | if td.days*24*60 + td.seconds/60 < timDiff: 141 | return 1 142 | else: 143 | return 0 144 | 145 | 146 | def IsMetaPage(p, metaDict): 147 | if p in metaDict: 148 | return 1 149 | else: 150 | return 0 151 | 152 | 153 | 154 | # functions for model training and data analysis 155 | 156 | def recMSE(a,b): 157 | return np.mean((a-b)**2) 158 | 159 | def recErr(X1, X2): 160 | seqRecErr = list() 161 | for s1, s2 in zip(X1, X2): 162 | seqRecErr.append(np.sum((s1 - s2)**2)/float(np.prod(s1.shape))) 163 | # seqRecErr.append(np.mean([LA.norm(x1 - x2)**2/8. for x1, x2 in zip(s1, s2)])) 164 | # seqRecErr.append(np.mean([LA.norm(x1 - x2) for x1, x2 in zip(s1, s2)])) 165 | return np.array(seqRecErr) 166 | 167 | def recErrMeaVar(X): 168 | return np.mean(X), np.var(X) 169 | 170 | def recErrHist(*args): 171 | 172 | plt.figure() 173 | # # axes = plt.gca() 174 | # # axes.set_xlim([0.,1.0]) 175 | # # axes.set_ylim([ymin,ymax]) 176 | # plt.subplot(3,1,1) 177 | # plt.title('Trainning') 178 | # weights = np.ones_like(args[0])/len(args[0]) 179 | # plt.hist(args[0], weights=weights, bins=100) 180 | # plt.ylabel('Probability') 181 | # axes = plt.gca() 182 | # axes.set_xlim([0.,.1]) 183 | 184 | plt.subplot(2,1,1) 185 | plt.title('Benign') 186 | weights = np.ones_like(args[0])/len(args[0]) 187 | plt.hist(args[0], weights=weights, bins=100) 188 | plt.ylabel('Probability') 189 | axes = plt.gca() 190 | axes.set_xlim([0.,.1]) 191 | 192 | plt.subplot(2,1,2) 193 | plt.title('Vandal') 194 | weights = np.ones_like(args[1])/len(args[1]) 195 | plt.hist(args[1], weights=weights, bins=100) 196 | plt.ylabel('Probability') 197 | axes = plt.gca() 198 | axes.set_xlim([0.,.1]) 199 | 200 | plt.xlabel('Recontruction Error') 201 | plt.show() 202 | 203 | 204 | def vanDet(X,thrd): 205 | return np.sum(X>=thrd)/float(len(X)) 206 | 207 | def vanDet2(X,thrd): 208 | return (np.array(X)>=thrd).astype(int) 209 | 210 | def TSNE_2D_show_bi(X,y,i): 211 | model = TSNE(n_components=2, random_state=0) 212 | X_2D = model.fit_transform(X) 213 | X_2D_beg = X_2D[y == 1] 214 | X_2D_val = X_2D[y == 0] 215 | 216 | fig = plt.figure() 217 | fig.patch.set_facecolor('w') 218 | ax = fig.add_subplot(111) 219 | ax.set_axis_off() 220 | 221 | blue_dot, = plt.plot(X_2D_beg[:,0],X_2D_beg[:,1], "ro", mec='none') 222 | red_dot, = plt.plot(X_2D_val[:,0],X_2D_val[:,1], "bo", mec='none') 223 | plt.legend([blue_dot, red_dot], ["Benign", "Vandal"], numpoints=1) 224 | plt.savefig("representation_%s"%i) 225 | plt.clf() 226 | plt.close() 227 | 228 | def TSNE_2D_show_tri(X,y): 229 | model = TSNE(n_components=2, random_state=0) 230 | X_2D = model.fit_transform(X) 231 | X_2D_train_beg = X_2D[y == 1] 232 | X_2D_fake = X_2D[y == 0] 233 | X_2D_val = X_2D[y == 2] 234 | 235 | 236 | fig = plt.figure() 237 | fig.patch.set_facecolor('w') 238 | ax = fig.add_subplot(111) 239 | ax.set_axis_off() 240 | 241 | red_dot, = plt.plot(X_2D_train_beg[:,0],X_2D_train_beg[:,1], "ro", mec='none') 242 | green_dot, = plt.plot(X_2D_fake[:, 0], X_2D_fake[:, 1], "go", mec='none') 243 | blue_dot, = plt.plot(X_2D_val[:, 0], X_2D_val[:, 1], "bo", mec='none') 244 | # yellow_dot, = plt.plot(X_2D_train[:, 0], X_2D_train[:, 1], "yo", mec='none') 245 | plt.legend([red_dot, green_dot, blue_dot], ["Benign", "Fake", "Vandal"], numpoints=1) 246 | plt.show() 247 | plt.clf() 248 | plt.close() 249 | return X_2D_train_beg, X_2D_fake, X_2D_val 250 
| 251 | 252 | 253 | 254 | def draw_trend(D_real_prob, D_fake_prob, D_val_prob, fm_loss, f1): 255 | 256 | fig = plt.figure() 257 | fig.patch.set_facecolor('w') 258 | # plt.subplot(311) 259 | p1, = plt.plot(D_real_prob, "-g") 260 | p2, = plt.plot(D_fake_prob, "--r") 261 | p3, = plt.plot(D_val_prob, ":c") 262 | plt.xlabel("# of epoch") 263 | plt.ylabel("probability") 264 | leg = plt.legend([p1, p2, p3], [r'$p(y|V_B)$', r'$p(y|\~{V})$', r'$p(y|V_M)$'], loc=1, bbox_to_anchor=(1, 1), borderaxespad=0.) 265 | leg.draw_frame(False) 266 | # plt.legend(frameon=False) 267 | 268 | fig = plt.figure() 269 | fig.patch.set_facecolor('w') 270 | # plt.subplot(312) 271 | p4, = plt.plot(fm_loss, "-b") 272 | plt.xlabel("# of epoch") 273 | plt.ylabel("feature matching loss") 274 | # plt.legend([p4], ["d_real_prob", "d_fake_prob", "d_val_prob"], loc=1, bbox_to_anchor=(1, 1), borderaxespad=0.) 275 | 276 | fig = plt.figure() 277 | fig.patch.set_facecolor('w') 278 | # plt.subplot(313) 279 | p5, = plt.plot(f1, "-y") 280 | plt.xlabel("# of epoch") 281 | plt.ylabel("F1") 282 | # plt.legend([p1, p2, p3, p4, p5], ["d_real_prob", "d_fake_prob", "d_val_prob", "fm_loss","f1"], loc=1, bbox_to_anchor=(1, 3.5), borderaxespad=0.) 283 | plt.show() 284 | 285 | 286 | def sample_shuffle(X): 287 | # n_samples = X.shape[0] 288 | n_samples = len(X) 289 | s = np.arange(n_samples) 290 | np.random.shuffle(s) 291 | return np.array(X[s]) 292 | 293 | # Helper function to plot a decision boundary. 294 | # If you don't fully understand this function don't worry, it just generates the contour plot below. 295 | def plot_decision_boundary(pred_func, X, y): 296 | # Set min and max values and give it some padding 297 | x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5 298 | y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5 299 | h = 0.01 300 | # Generate a grid of points with distance h between them 301 | xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h)) 302 | # Predict the function value for the whole gid 303 | Z = pred_func(np.c_[xx.ravel(), yy.ravel()]) 304 | Z = Z.reshape(xx.shape) 305 | # Plot the contour and training examples 306 | plt.contourf(xx, yy, Z, cmap=plt.cm.Spectral) 307 | plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.Spectral) 308 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | # OCAN: One-Class Adversarial Nets for Fraud Detection 3 | 4 | In this paper, we develop one-class adversarial nets (OCAN) for fraud detection with only benign users as training data. 5 | 6 | ## Running Environment 7 | 8 | The main packages you need to install are listed as follow 9 | 10 | ``` 11 | 1. python 2.7 12 | 2. tensorflow 1.3.0 13 | ``` 14 | 15 | ## DateSet 16 | 17 | For experiments, we evaluate **OCAN** on two real-world datasets: wiki and credit-card which have been attached in folder [data/](https://github.com/PanpanZheng/OCAN/tree/master/data). 18 | 19 | ## Model Evaluation 20 | 21 | The command line for OCAN goes as follow 22 | 23 | ``` 24 | python oc_gan.py $1 $2 25 | ``` 26 | **where** $1 refers to different datasets with wiki 1, credit-card(encoding) 2 and credit-card(raw) 3; $2 denotes whether some metrics, such as fm_loss and f1 in training process, are provided, with non-display 0 and display 1. 27 | 28 | 29 | ``` 30 | e.g. python oc_gan.py 1 0 31 | ``` 32 | The above command line shows the performance of OCAN on wiki without displaying metrics in the training process. 
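The other configurations are invoked analogously, for example

```
python oc_gan.py 2 0
python oc_gan.py 3 1
```

which run OCAN on the credit-card dataset with encoded features (metrics not displayed) and on the raw credit-card features (fm_loss and f1 displayed during training), respectively.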
33 | 34 | -------------------------------------------------------------------------------- /bg_dataset.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Author: Panpan Zheng 3 | Date created: 2/15/2018 4 | Python Version: 2.7 5 | ''' 6 | import numpy as np 7 | from bg_utils import one_hot 8 | 9 | def load_data(x_benign, x_vandal, n_b_lab, n_v_lab, n_b_test, n_v_test, oh=True): 10 | 11 | # labeled data (supervised) 12 | x_lab_ben = x_benign[0:n_b_lab] 13 | x_lab_van = x_vandal[0:n_v_lab] 14 | x_lab = x_lab_ben.tolist() + x_lab_van.tolist() 15 | x_lab = np.array(x_lab) 16 | y_lab = np.ones(len(x_lab), dtype=np.int32) 17 | y_lab[len(x_lab_ben):] = 0 18 | if oh: 19 | y_lab = one_hot(y_lab, 3) 20 | 21 | 22 | # unlabeled data (unsupervised) 23 | # x_unl_ben = x_benign[len(x_lab_ben):-n_b_test] 24 | # x_unl_van = x_vandal[len(x_lab_van):-n_v_test] 25 | x_unl_ben = x_benign[len(x_lab_ben):-3*n_b_test] 26 | x_unl_van = x_vandal[len(x_lab_van):-3*n_v_test] 27 | x_unl = x_unl_ben.tolist() + x_unl_van.tolist() 28 | x_unl = np.array(x_unl) 29 | 30 | 31 | # test data. 32 | x_benign_test = x_benign[len(x_lab_ben) + len(x_unl_ben):] 33 | x_vandal_test = x_vandal[len(x_lab_van) + len(x_unl_van):] 34 | x_test = x_benign_test.tolist() + x_vandal_test.tolist() 35 | x_test = np.array(x_test) 36 | y_test = np.ones(len(x_test), dtype=np.int32) 37 | y_test[len(x_benign_test):] = 0 38 | 39 | return x_lab, y_lab, x_unl, x_test, y_test 40 | 41 | 42 | 43 | 44 | def load_data_unbal(x_benign, x_vandal, n_b_lab, n_v_lab, n_b_test, n_v_test, oh=True): 45 | 46 | # labeled data (supervised) 47 | x_lab_ben = x_benign[0:n_b_lab] 48 | x_lab_van = x_vandal[0:n_v_lab] 49 | x_lab = x_lab_ben.tolist() + x_lab_van.tolist() 50 | x_lab = np.array(x_lab) 51 | y_lab = np.ones(len(x_lab), dtype=np.int32) 52 | y_lab[len(x_lab_ben):] = 0 53 | if oh: 54 | y_lab = one_hot(y_lab, 3) 55 | 56 | print x_lab_ben.shape, x_lab_van.shape 57 | 58 | 59 | # unlabeled data (unsupervised) 60 | x_unl_ben = x_benign[len(x_lab_ben):-3*n_b_test] 61 | x_unl_van = x_vandal[len(x_lab_van):-3*n_v_test] 62 | x_unl = x_unl_ben.tolist() + x_unl_van.tolist() 63 | x_unl = np.array(x_unl) 64 | print x_unl_ben.shape, x_unl_van.shape 65 | 66 | 67 | # test data. 68 | x_benign_test = x_benign[len(x_lab_ben) + len(x_unl_ben):] 69 | x_vandal_test = x_vandal[len(x_lab_van) + len(x_unl_van):] 70 | x_test = x_benign_test.tolist() + x_vandal_test.tolist() 71 | x_test = np.array(x_test) 72 | y_test = np.ones(len(x_test), dtype=np.int32) 73 | y_test[len(x_benign_test):] = 0 74 | print x_benign_test.shape, x_vandal_test.shape 75 | 76 | return x_lab, y_lab, x_unl, x_test, y_test 77 | -------------------------------------------------------------------------------- /bg_utils.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Author: Panpan Zheng 3 | Date created: 2/15/2018 4 | Python Version: 2.7 5 | ''' 6 | import numpy as np 7 | import tensorflow as tf 8 | from sklearn.neighbors.kde import KernelDensity 9 | import matplotlib.pyplot as plt 10 | 11 | 12 | def one_hot(x, depth): 13 | x_one_hot = np.zeros((len(x), depth), dtype=np.int32) 14 | x = x.astype(int) 15 | for i in range(x_one_hot.shape[0]): 16 | x_one_hot[i, x[i]] = 1 17 | return x_one_hot 18 | 19 | 20 | def xavier_init(size): # initialize the weight-matrix W. 21 | in_dim = size[0] 22 | xavier_stddev = 1. / tf.sqrt(in_dim / 2.) 
23 | return tf.random_normal(shape=size, stddev=xavier_stddev) 24 | 25 | 26 | def sample_Z(m, n): # generating the input for G. 27 | return np.random.uniform(-1., 1., size=[m, n]) 28 | 29 | 30 | def sample_shuffle_spv(X, labels): 31 | n_samples = len(X) 32 | s = np.arange(n_samples) 33 | np.random.shuffle(s) 34 | return np.array(X[s]), labels[s] 35 | 36 | 37 | def sample_shuffle_uspv(X): 38 | n_samples = len(X) 39 | s = np.arange(n_samples) 40 | np.random.shuffle(s) 41 | return np.array(X[s]) 42 | 43 | 44 | def kde_density_estimator(X,kernel='gaussian',bandwidth=0.2): 45 | return KernelDensity(kernel=kernel, bandwidth=bandwidth).fit(X) 46 | 47 | def complement_density(kde, X, sf=0.5): 48 | # probs = map(lambda x: np.exp(kde.score([x])), X) 49 | probs = np.exp(kde.score_samples(X)) 50 | thrld = np.median(probs) 51 | return np.array( 52 | map(lambda x: low_density(x, thrld, sf), probs) 53 | ) 54 | 55 | def low_density(prob, thrld, sf): 56 | 57 | if prob > thrld: 58 | return sf * np.reciprocal(prob) 59 | # return sf * (1-prob) 60 | else: 61 | return thrld 62 | 63 | 64 | 65 | def pull_away_loss(g): 66 | 67 | Nor = tf.norm(g, axis=1) 68 | Nor_mat = tf.tile(tf.expand_dims(Nor, axis=1), 69 | [1, tf.shape(g)[1]]) 70 | X = tf.divide(g, Nor_mat) 71 | X_X = tf.square(tf.matmul(X, tf.transpose(X))) 72 | mask = tf.subtract(tf.ones_like(X_X), 73 | tf.diag( 74 | tf.ones([tf.shape(X_X)[0]])) 75 | ) 76 | pt_loss = tf.divide(tf.reduce_sum(tf.multiply(X_X, mask)), 77 | tf.multiply( 78 | tf.cast(tf.shape(X_X)[0], tf.float32), 79 | tf.cast(tf.shape(X_X)[0]-1, tf.float32))) 80 | 81 | return pt_loss 82 | 83 | 84 | def draw_trend(D_real_prob, D_fake_prob, D_val_prob, fm_loss, f1): 85 | 86 | fig = plt.figure() 87 | fig.patch.set_facecolor('w') 88 | # plt.subplot(311) 89 | p1, = plt.plot(D_real_prob, "-g") 90 | p2, = plt.plot(D_fake_prob, "--r") 91 | p3, = plt.plot(D_val_prob, ":c") 92 | plt.xlabel("# of epoch") 93 | plt.ylabel("probability") 94 | leg = plt.legend([p1, p2, p3], [r'$p(y|V_B)$', r'$p(y|\~{V})$', r'$p(y|V_M)$'], loc=1, bbox_to_anchor=(1, 1), borderaxespad=0.) 95 | leg.draw_frame(False) 96 | # plt.legend(frameon=False) 97 | 98 | fig = plt.figure() 99 | fig.patch.set_facecolor('w') 100 | # plt.subplot(312) 101 | p4, = plt.plot(fm_loss, "-b") 102 | plt.xlabel("# of epoch") 103 | plt.ylabel("feature matching loss") 104 | # plt.legend([p4], ["d_real_prob", "d_fake_prob", "d_val_prob"], loc=1, bbox_to_anchor=(1, 1), borderaxespad=0.) 105 | 106 | fig = plt.figure() 107 | fig.patch.set_facecolor('w') 108 | # plt.subplot(313) 109 | p5, = plt.plot(f1, "-y") 110 | plt.xlabel("# of epoch") 111 | plt.ylabel("F1") 112 | # plt.legend([p1, p2, p3, p4, p5], ["d_real_prob", "d_fake_prob", "d_val_prob", "fm_loss","f1"], loc=1, bbox_to_anchor=(1, 3.5), borderaxespad=0.) 
113 | plt.show() 114 | 115 | 116 | def plot_decision_boundary(pred_func, X, y): 117 | # Set min and max values and give it some padding 118 | x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5 119 | y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5 120 | h = 0.01 121 | # Generate a grid of points with distance h between them 122 | xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h)) 123 | # Predict the function value for the whole gid 124 | Z = pred_func(np.c_[xx.ravel(), yy.ravel()]) 125 | Z = Z.reshape(xx.shape) 126 | # Plot the contour and training examples 127 | plt.contourf(xx, yy, Z, cmap=plt.cm.Spectral) 128 | plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.Spectral) 129 | -------------------------------------------------------------------------------- /data/credit_card/ben_hid_repre_r2.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ILoveAI2019/OCAN/d91d1ea5de813791c2ea773402a4abbf6dc3073c/data/credit_card/ben_hid_repre_r2.npy -------------------------------------------------------------------------------- /data/credit_card/van_hid_repre_r2.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ILoveAI2019/OCAN/d91d1ea5de813791c2ea773402a4abbf6dc3073c/data/credit_card/van_hid_repre_r2.npy -------------------------------------------------------------------------------- /data/raw_credit_card/ben_raw_r0.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ILoveAI2019/OCAN/d91d1ea5de813791c2ea773402a4abbf6dc3073c/data/raw_credit_card/ben_raw_r0.npy -------------------------------------------------------------------------------- /data/raw_credit_card/van_raw_r0.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ILoveAI2019/OCAN/d91d1ea5de813791c2ea773402a4abbf6dc3073c/data/raw_credit_card/van_raw_r0.npy -------------------------------------------------------------------------------- /data/wiki/X_v8_4_50_Ben.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ILoveAI2019/OCAN/d91d1ea5de813791c2ea773402a4abbf6dc3073c/data/wiki/X_v8_4_50_Ben.npy -------------------------------------------------------------------------------- /data/wiki/X_v8_4_50_Van.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ILoveAI2019/OCAN/d91d1ea5de813791c2ea773402a4abbf6dc3073c/data/wiki/X_v8_4_50_Van.npy -------------------------------------------------------------------------------- /data/wiki/ben_hid_emd_4_50_8_200_r0.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ILoveAI2019/OCAN/d91d1ea5de813791c2ea773402a4abbf6dc3073c/data/wiki/ben_hid_emd_4_50_8_200_r0.npy -------------------------------------------------------------------------------- /data/wiki/val_hid_emd_4_50_8_200_r0.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ILoveAI2019/OCAN/d91d1ea5de813791c2ea773402a4abbf6dc3073c/data/wiki/val_hid_emd_4_50_8_200_r0.npy -------------------------------------------------------------------------------- /oc_gan.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Author: Panpan Zheng 3 | Date 
created: 2/15/2018 4 | Python Version: 2.7 5 | ''' 6 | 7 | import tensorflow as tf 8 | import numpy as np 9 | import matplotlib.pyplot as plt 10 | from sklearn.metrics import classification_report, accuracy_score 11 | from sklearn.preprocessing import MinMaxScaler 12 | import os 13 | 14 | from bg_utils import pull_away_loss, one_hot, xavier_init, sample_shuffle_spv, sample_shuffle_uspv, sample_Z, draw_trend 15 | from bg_dataset import load_data, load_data_unbal 16 | import sys 17 | 18 | 19 | 20 | en_ae = int(sys.argv[1]) # en_ae == 1 for wiki dataset with autoencoding; 21 | # en_ae == 2 for credit card dataset with autoencoding; 22 | # en_ae == 3 for credit card dataset without autoencoding. 23 | 24 | dra_tra_pro = int(sys.argv[2]) # dra_tra_pro == 1 for printing training trend, discr_probabiltiy, f1 and fm_loss; 25 | # dra_tra_pro == 1 for printing training trend, discr_probabiltiy, f1 and fm_loss; 26 | 27 | 28 | # print en_ae, dra_tra_pro 29 | # 30 | # exit(0) 31 | # en_ae = 3 # 1 for wiki dataset with autoencoding; 2 for credit card dataset with autoencoding; 3 for credit card dataset without autoencoding. 32 | # dra_tra_pro = False 33 | 34 | if en_ae == 1: 35 | mb_size = 100 36 | dim_input = 200 37 | elif en_ae == 2: 38 | mb_size = 70 39 | dim_input = 50 40 | else: 41 | mb_size = 70 42 | dim_input = 30 43 | 44 | 45 | D_dim = [dim_input, 100, 50, 2] 46 | G_dim = [50, 100, dim_input] 47 | Z_dim = G_dim[0] 48 | 49 | 50 | # define placeholders for labeled-data, unlabeled-data, noise-data and target-data. 51 | 52 | X_oc = tf.placeholder(tf.float32, shape=[None, dim_input]) 53 | Z = tf.placeholder(tf.float32, shape=[None, Z_dim]) 54 | X_tar = tf.placeholder(tf.float32, shape=[None, dim_input]) 55 | # X_val = tf.placeholder(tf.float32, shape=[None, dim_input]) 56 | 57 | 58 | # declare weights and biases of discriminator. 59 | 60 | D_W1 = tf.Variable(xavier_init([D_dim[0], D_dim[1]])) 61 | D_b1 = tf.Variable(tf.zeros(shape=[D_dim[1]])) 62 | 63 | D_W2 = tf.Variable(xavier_init([D_dim[1], D_dim[2]])) 64 | D_b2 = tf.Variable(tf.zeros(shape=[D_dim[2]])) 65 | 66 | D_W3 = tf.Variable(xavier_init([D_dim[2], D_dim[3]])) 67 | D_b3 = tf.Variable(tf.zeros(shape=[D_dim[3]])) 68 | 69 | theta_D = [D_W1, D_W2, D_W3, D_b1, D_b2, D_b3] 70 | 71 | 72 | 73 | # declare weights and biases of generator. 74 | 75 | G_W1 = tf.Variable(xavier_init([G_dim[0], G_dim[1]])) 76 | G_b1 = tf.Variable(tf.zeros(shape=[G_dim[1]])) 77 | 78 | G_W2 = tf.Variable(xavier_init([G_dim[1], G_dim[2]])) 79 | G_b2 = tf.Variable(tf.zeros(shape=[G_dim[2]])) 80 | 81 | theta_G = [G_W1, G_W2, G_b1, G_b2] 82 | 83 | 84 | # declare weights and biases of pre-train net for density estimation. 
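# The "target" network below (discriminator_tar) mirrors the discriminator
# architecture. It is fitted before the adversarial loop via T_solver on the
# benign pre-training data (fed through X_tar / y_tar) and is not updated
# afterwards; its hidden layer and softmax output on generated samples feed
# the pull-away term (pt_loss) and the entropy term (G_ent_loss) of G_loss.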
85 | 86 | T_W1 = tf.Variable(xavier_init([D_dim[0], D_dim[1]])) 87 | T_b1 = tf.Variable(tf.zeros(shape=[D_dim[1]])) 88 | 89 | T_W2 = tf.Variable(xavier_init([D_dim[1], D_dim[2]])) 90 | T_b2 = tf.Variable(tf.zeros(shape=[D_dim[2]])) 91 | 92 | T_W3 = tf.Variable(xavier_init([D_dim[2], D_dim[3]])) 93 | T_b3 = tf.Variable(tf.zeros(shape=[D_dim[3]])) 94 | 95 | theta_T = [T_W1, T_W2, T_W3, T_b1, T_b2, T_b3] 96 | 97 | 98 | def generator(z): 99 | G_h1 = tf.nn.relu(tf.matmul(z, G_W1) + G_b1) 100 | G_logit = tf.nn.tanh(tf.matmul(G_h1, G_W2) + G_b2) 101 | return G_logit 102 | 103 | 104 | def discriminator(x): 105 | D_h1 = tf.nn.relu(tf.matmul(x, D_W1) + D_b1) 106 | D_h2 = tf.nn.relu(tf.matmul(D_h1, D_W2) + D_b2) 107 | D_logit = tf.matmul(D_h2, D_W3) + D_b3 108 | D_prob = tf.nn.softmax(D_logit) 109 | return D_prob, D_logit, D_h2 110 | 111 | 112 | # pre-train net for density estimation. 113 | 114 | def discriminator_tar(x): 115 | T_h1 = tf.nn.relu(tf.matmul(x, T_W1) + T_b1) 116 | T_h2 = tf.nn.relu(tf.matmul(T_h1, T_W2) + T_b2) 117 | T_logit = tf.matmul(T_h2, T_W3) + T_b3 118 | T_prob = tf.nn.softmax(T_logit) 119 | return T_prob, T_logit, T_h2 120 | 121 | 122 | D_prob_real, D_logit_real, D_h2_real = discriminator(X_oc) 123 | 124 | G_sample = generator(Z) 125 | D_prob_gen, D_logit_gen, D_h2_gen = discriminator(G_sample) 126 | 127 | D_prob_tar, D_logit_tar, D_h2_tar = discriminator_tar(X_tar) 128 | D_prob_tar_gen, D_logit_tar_gen, D_h2_tar_gen = discriminator_tar(G_sample) 129 | # D_prob_val, _, D_h1_val = discriminator(X_val) 130 | 131 | 132 | 133 | 134 | # disc. loss 135 | y_real= tf.placeholder(tf.int32, shape=[None, D_dim[3]]) 136 | y_gen = tf.placeholder(tf.int32, shape=[None, D_dim[3]]) 137 | 138 | D_loss_real = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=D_logit_real,labels=y_real)) 139 | D_loss_gen = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=D_logit_gen, labels=y_gen)) 140 | 141 | ent_real_loss = -tf.reduce_mean( 142 | tf.reduce_sum( 143 | tf.multiply(D_prob_real, tf.log(D_prob_real)), 1 144 | ) 145 | ) 146 | 147 | ent_gen_loss = -tf.reduce_mean( 148 | tf.reduce_sum( 149 | tf.multiply(D_prob_gen, tf.log(D_prob_gen)), 1 150 | ) 151 | ) 152 | 153 | D_loss = D_loss_real + D_loss_gen + 1.85 * ent_real_loss 154 | 155 | 156 | # gene. 
loss 157 | pt_loss = pull_away_loss(D_h2_tar_gen) 158 | 159 | y_tar= tf.placeholder(tf.int32, shape=[None, D_dim[3]]) 160 | T_loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=D_logit_tar, labels=y_tar)) 161 | tar_thrld = tf.divide(tf.reduce_max(D_prob_tar_gen[:,-1]) + 162 | tf.reduce_min(D_prob_tar_gen[:,-1]), 2) 163 | 164 | # tar_thrld = tf.reduce_mean(D_prob_tar_gen[:,-1]) 165 | 166 | 167 | indicator = tf.sign( 168 | tf.subtract(D_prob_tar_gen[:,-1], 169 | tar_thrld)) 170 | condition = tf.greater(tf.zeros_like(indicator), indicator) 171 | mask_tar = tf.where(condition, tf.zeros_like(indicator), indicator) 172 | G_ent_loss = tf.reduce_mean(tf.multiply(tf.log(D_prob_tar_gen[:,-1]), mask_tar)) 173 | # G_ent_loss = tf.reduce_mean(tf.log(D_prob_tar_gen[:,-1])) 174 | 175 | fm_loss = tf.reduce_mean( 176 | tf.sqrt( 177 | tf.reduce_sum( 178 | tf.square(D_logit_real - D_logit_gen), 1 179 | ) 180 | ) 181 | ) 182 | 183 | G_loss = pt_loss + G_ent_loss + fm_loss 184 | 185 | D_solver = tf.train.GradientDescentOptimizer(learning_rate=1e-3).minimize(D_loss, var_list=theta_D) 186 | G_solver = tf.train.AdamOptimizer().minimize(G_loss, var_list=theta_G) 187 | T_solver = tf.train.GradientDescentOptimizer(learning_rate=1e-3).minimize(T_loss, var_list=theta_T) 188 | 189 | 190 | # Load data.... 191 | 192 | 193 | min_max_scaler = MinMaxScaler() 194 | 195 | if en_ae == 1: 196 | x_benign = min_max_scaler.fit_transform(np.load("./data/wiki/ben_hid_emd_4_50_8_200_r0.npy")) 197 | x_vandal = min_max_scaler.transform(np.load("./data/wiki/val_hid_emd_4_50_8_200_r0.npy")) 198 | elif en_ae == 2: 199 | x_benign = min_max_scaler.fit_transform(np.load("./data/credit_card/ben_hid_repre_r2.npy")) 200 | x_vandal = min_max_scaler.transform(np.load("./data/credit_card/van_hid_repre_r2.npy")) 201 | else: 202 | x_benign = min_max_scaler.fit_transform(np.load("./data/raw_credit_card/ben_raw_r0.npy")) 203 | x_vandal = min_max_scaler.transform(np.load("./data/raw_credit_card/van_raw_r0.npy")) 204 | 205 | 206 | #x_benign = min_max_scaler.fit_transform(np.load("./hidden_output/ben_hid_emd_4_50_8_200.npy")) 207 | #x_vandal = min_max_scaler.transform(np.load("./hidden_output/val_hid_emd_4_50_8_200.npy")) 208 | 209 | 210 | x_benign = sample_shuffle_uspv(x_benign) 211 | x_vandal = sample_shuffle_uspv(x_vandal) 212 | 213 | if en_ae == 1: 214 | x_benign = x_benign[0:10000] 215 | x_vandal = x_vandal[0:10000] 216 | x_pre = x_benign[0:7000] 217 | else: 218 | x_pre = x_benign[0:700] 219 | 220 | y_pre = np.zeros(len(x_pre)) 221 | y_pre = one_hot(y_pre, 2) 222 | 223 | x_train = x_pre 224 | 225 | y_real_mb = one_hot(np.zeros(mb_size), 2) 226 | y_fake_mb = one_hot(np.ones(mb_size), 2) 227 | 228 | if en_ae == 1: 229 | x_test = x_benign[-3000:].tolist() + x_vandal[-3000:].tolist() 230 | else: 231 | x_test = x_benign[-490:].tolist() + x_vandal[-490:].tolist() 232 | x_test = np.array(x_test) 233 | 234 | 235 | y_test = np.zeros(len(x_test)) 236 | if en_ae == 1: 237 | y_test[3000:] = 1 238 | else: 239 | y_test[490:] = 1 240 | 241 | 242 | sess = tf.Session() 243 | sess.run(tf.global_variables_initializer()) 244 | 245 | # pre-training for target distribution 246 | 247 | _ = sess.run(T_solver, 248 | feed_dict={ 249 | X_tar:x_pre, 250 | y_tar:y_pre 251 | }) 252 | 253 | q = np.divide(len(x_train), mb_size) 254 | 255 | # n_epoch = 1 256 | # 257 | # while n_epoch: 258 | 259 | d_ben_pro, d_fake_pro, fm_loss_coll = list(), list(), list() 260 | f1_score = list() 261 | d_val_pro = list() 262 | 263 | if en_ae == 1: 264 | n_round = 50 265 | else: 
266 | n_round = 200 267 | 268 | for n_epoch in range(n_round): 269 | 270 | X_mb_oc = sample_shuffle_uspv(x_train) 271 | 272 | for n_batch in range(q): 273 | 274 | _, D_loss_curr, ent_real_curr = sess.run([D_solver, D_loss, ent_real_loss], 275 | feed_dict={ 276 | X_oc: X_mb_oc[n_batch*mb_size:(n_batch+1)*mb_size], 277 | Z: sample_Z(mb_size, Z_dim), 278 | y_real: y_real_mb, 279 | y_gen: y_fake_mb 280 | }) 281 | 282 | _, G_loss_curr, fm_loss_curr = sess.run([G_solver, G_loss, fm_loss], 283 | # _, G_loss_curr, fm_loss_, kld_ = sess.run([G_solver, G_loss, fm_loss, pt_loss + G_ent_loss], 284 | feed_dict={Z: sample_Z(mb_size, Z_dim), 285 | X_oc: X_mb_oc[n_batch*mb_size:(n_batch+1)*mb_size], 286 | }) 287 | 288 | D_prob_real_, D_prob_gen_ = sess.run([D_prob_real, D_prob_gen], 289 | feed_dict={X_oc: x_train, 290 | Z: sample_Z(len(x_train), Z_dim)}) 291 | 292 | if en_ae == 1: 293 | D_prob_vandal_ = sess.run(D_prob_real, 294 | feed_dict={X_oc: x_vandal[0:7000]}) 295 | # feed_dict={X_oc:x_vandal[-490:]}) 296 | else: 297 | D_prob_vandal_ = sess.run(D_prob_real, 298 | #feed_dict={X_oc: x_vandal[0:7000]}) 299 | feed_dict={X_oc:x_vandal[-490:]}) 300 | 301 | d_ben_pro.append(np.mean(D_prob_real_[:, 0])) 302 | d_fake_pro.append(np.mean(D_prob_gen_[:, 0])) 303 | d_val_pro.append(np.mean(D_prob_vandal_[:, 0])) 304 | fm_loss_coll.append(fm_loss_curr) 305 | 306 | prob, _ = sess.run([D_prob_real, D_logit_real], feed_dict={X_oc: x_test}) 307 | y_pred = np.argmax(prob, axis=1) 308 | conf_mat = classification_report(y_test, y_pred, target_names=['benign', 'vandal'], digits=4) 309 | f1_score.append(float(filter(None, conf_mat.strip().split(" "))[12])) 310 | # print conf_mat 311 | 312 | if not dra_tra_pro: 313 | acc = np.sum(y_pred == y_test)/float(len(y_pred)) 314 | print conf_mat 315 | print "acc:%s"%acc 316 | 317 | if dra_tra_pro: 318 | draw_trend(d_ben_pro, d_fake_pro, d_val_pro, fm_loss_coll, f1_score) 319 | 320 | exit(0) 321 | --------------------------------------------------------------------------------