├── LICENSE ├── README.md ├── bigan ├── kdd_utilities.py └── run_kdd.py ├── data ├── kdd.py └── kddcup.data_10_percent_corrected ├── main.py ├── requirements.txt └── utils ├── adapt_data.py └── evaluations.py /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Rajlaxmi Patil, Rajshekhar Biradar, Vinayakumar R, Uttam Ghosh 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Network Traffic Anomaly Detection using PCA and BiGAN 2 | 3 | 4 | ## Prerequisites 5 | To run the code, follow these steps: 6 | 7 | Install Python 3: 8 | 9 | ``` 10 | sudo apt install python3 python3-pip 11 | ``` 12 | Download the project code: 13 | 14 | ``` 15 | git clone https://github.com/Rajlaxmi04/Network-Traffic-Anomaly-Detection-using-PCA-and-BiGAN 16 | ``` 17 | Install the requirements (inside the cloned repository): 18 | 19 | ``` 20 | pip3 install -r requirements.txt 21 | ``` 22 | 23 | ## Running anomaly detection 24 | 25 | Run the code with the following options: 26 | 27 | ``` 28 | python3 main.py bigan kdd run --nb_epochs=<int> --label=<0, 1, 2, 3, 4, 5, 6, 7, 8, 9> --w=<float> --m=<'cross-e','fm'> --d=<int> --rd=<int> --nc=<int> 29 | ``` 30 | To reproduce the results of the paper, use w=0.1 (as in the original AnoGAN paper, which gives a weight of 0.1 to the discriminator loss) and d=1 for the feature matching loss. 31 | 32 | An example command is shown below; here nc is the number of PCA components (n_components). Set it to 28 for the best results: 33 | `python3 main.py bigan kdd run --nb_epochs=10 --w=0.1 --m=cross-e --d=1 --nc=28` 34 | 35 | 36 | Courtesy: https://arxiv.org/pdf/1802.06222.pdf 37 | The original code was modified by applying PCA to the dataset. 38 | -------------------------------------------------------------------------------- /bigan/kdd_utilities.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import data.kdd as data 3 | 4 | """KDD BiGAN architecture. 5 | 6 | Generator (decoder), encoder and discriminator.
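The encoder maps inputs into a latent_dim-dimensional latent space (32 here), the generator maps latent codes back to input space, and the discriminator scores (z, x) pairs, exposing an intermediate layer for feature matching.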
7 | 8 | """ 9 | 10 | 11 | learning_rate = 0.00001 12 | batch_size = 50 13 | layer = 1 14 | latent_dim = 32 15 | dis_inter_layer_dim = 128 16 | init_kernel = tf.contrib.layers.xavier_initializer() 17 | #shape = 21 18 | 19 | def encoder(x_inp, is_training=False, getter=None, reuse=False): 20 | """ Encoder architecture in tensorflow 21 | 22 | Maps the data into the latent space 23 | 24 | Args: 25 | x_inp (tensor): input data for the encoder. 26 | reuse (bool): sharing variables or not 27 | 28 | Returns: 29 | (tensor): last activation layer of the encoder 30 | 31 | """ 32 | 33 | with tf.variable_scope('encoder', reuse=reuse, custom_getter=getter): 34 | 35 | name_net = 'layer_1' 36 | with tf.variable_scope(name_net): 37 | net = tf.layers.dense(x_inp, 38 | units=64, 39 | kernel_initializer=init_kernel, 40 | name='fc') 41 | net = leakyReLu(net) 42 | 43 | name_net = 'layer_2' 44 | with tf.variable_scope(name_net): 45 | net = tf.layers.dense(net, 46 | units=latent_dim, 47 | kernel_initializer=init_kernel, 48 | name='fc') 49 | 50 | return net 51 | 52 | def decoder(z_inp, is_training=False, getter=None, reuse=False): 53 | """ Decoder architecture in tensorflow 54 | 55 | Generates data from the latent space 56 | 57 | Args: 58 | z_inp (tensor): variable in the latent space 59 | reuse (bool): sharing variables or not 60 | 61 | Returns: 62 | (tensor): last activation layer of the generator 63 | 64 | """ 65 | with tf.variable_scope('generator', reuse=reuse, custom_getter=getter): 66 | name_net = 'layer_1' 67 | with tf.variable_scope(name_net): 68 | net = tf.layers.dense(z_inp, 69 | units=64, 70 | kernel_initializer=init_kernel, 71 | name='fc') 72 | net = tf.nn.relu(net) 73 | 74 | name_net = 'layer_2' 75 | with tf.variable_scope(name_net): 76 | net = tf.layers.dense(net, 77 | units=128, 78 | kernel_initializer=init_kernel, 79 | name='fc') 80 | net = tf.nn.relu(net) 81 | 82 | name_net = 'layer_3' 83 | with tf.variable_scope(name_net): 84 | net = tf.layers.dense(net, 85 | units=data.get_shape_input()[1], 86 | kernel_initializer=init_kernel, 87 | name='fc') 88 | 89 | return net 90 | 91 | def discriminator(z_inp, x_inp, is_training=False, getter=None, reuse=False): 92 | """ Discriminator architecture in tensorflow 93 | 94 | Discriminates between pairs (E(x), x) and (z, G(z)) 95 | 96 | Args: 97 | z_inp (tensor): variable in the latent space 98 | x_inp (tensor): input data for the encoder. 
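is_training (bool): toggles dropout in the dense layers (active during training only)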
99 | reuse (bool): sharing variables or not 100 | 101 | Returns: 102 | logits (tensor): last activation layer of the discriminator (shape 1) 103 | intermediate_layer (tensor): intermediate layer for feature matching 104 | 105 | """ 106 | with tf.variable_scope('discriminator', reuse=reuse, custom_getter=getter): 107 | # D(x) 108 | name_x = 'x_layer_1' 109 | with tf.variable_scope(name_x): 110 | x = tf.layers.dense(x_inp, 111 | units=128, 112 | kernel_initializer=init_kernel, 113 | name='fc') 114 | x = leakyReLu(x) 115 | x = tf.layers.dropout(x, rate=0.2, name='dropout', training=is_training) 116 | 117 | # D(z) 118 | name_z = 'z_fc_1' 119 | with tf.variable_scope(name_z): 120 | z = tf.layers.dense(z_inp, 128, kernel_initializer=init_kernel) 121 | z = leakyReLu(z) 122 | z = tf.layers.dropout(z, rate=0.2, name='dropout', training=is_training) 123 | 124 | # D(x,z) 125 | y = tf.concat([x, z], axis=1) 126 | 127 | name_y = 'y_fc_1' 128 | with tf.variable_scope(name_y): 129 | y = tf.layers.dense(y, 130 | dis_inter_layer_dim, 131 | kernel_initializer=init_kernel) 132 | y = leakyReLu(y) 133 | y = tf.layers.dropout(y, rate=0.2, name='dropout', training=is_training) 134 | 135 | intermediate_layer = y 136 | 137 | name_y = 'y_fc_logits' 138 | with tf.variable_scope(name_y): 139 | logits = tf.layers.dense(y, 140 | 1, 141 | kernel_initializer=init_kernel) 142 | 143 | return logits, intermediate_layer 144 | 145 | 146 | def leakyReLu(x, alpha=0.1, name='leaky_relu'): 147 | """ Leaky relu """ 148 | if name: 149 | with tf.variable_scope(name): 150 | return _leakyReLu_impl(x, alpha) 151 | else: 152 | return _leakyReLu_impl(x, alpha) 153 | 154 | def _leakyReLu_impl(x, alpha): 155 | return tf.nn.relu(x) - (alpha * tf.nn.relu(-x)) 156 | -------------------------------------------------------------------------------- /bigan/run_kdd.py: -------------------------------------------------------------------------------- 1 | import time 2 | import numpy as np 3 | import tensorflow as tf 4 | import logging 5 | import importlib 6 | import sys 7 | import bigan.kdd_utilities as network 8 | import data.kdd as data 9 | from sklearn.metrics import precision_recall_fscore_support 10 | 11 | RANDOM_SEED = 13 12 | FREQ_PRINT = 20 # print frequency image tensorboard [20] 13 | 14 | 15 | 16 | def get_getter(ema): # to update neural net with moving avg variables, suitable for ss learning cf Saliman 17 | def ema_getter(getter, name, *args, **kwargs): 18 | var = getter(name, *args, **kwargs) 19 | ema_var = ema.average(var) 20 | return ema_var if ema_var else var 21 | 22 | return ema_getter 23 | 24 | def display_parameters(batch_size, starting_lr, ema_decay, weight, method, degree): 25 | '''See parameters 26 | ''' 27 | print('Batch size: ', batch_size) 28 | print('Starting learning rate: ', starting_lr) 29 | print('EMA Decay: ', ema_decay) 30 | print('Weight: ', weight) 31 | print('Method for discriminator: ', method) 32 | print('Degree for L norms: ', degree) 33 | 34 | def display_progression_epoch(j, id_max): 35 | '''See epoch progression 36 | ''' 37 | batch_progression = int((j / id_max) * 100) 38 | sys.stdout.write(str(batch_progression) + ' % epoch' + chr(13)) 39 | _ = sys.stdout.flush 40 | 41 | def create_logdir(method, weight, rd): 42 | """ Directory to save training logs, weights, biases, etc.""" 43 | return "bigan/train_logs/kdd/{}/{}/{}".format(weight, method, rd) 44 | 45 | def train_and_test(nb_epochs, weight, method, degree, random_seed, nc): 46 | """ Runs the Bigan on the KDD dataset 47 | 48 | Note: 49 | Saves summaries on 
tensorboard. To display them, please use cmd line 50 | tensorboard --logdir=model.training_logdir() --port=number 51 | Args: 52 | nb_epochs (int): number of epochs 53 | weight (float, optional): weight for the anomaly score composition 54 | method (str, optional): 'fm' for ``Feature Matching`` or "cross-e" 55 | for ``cross entropy``, "efm" etc. 56 | anomalous_label (int): int in range 0 to 10, is the class/digit 57 | which is considered outlier 58 | """ 59 | logger = logging.getLogger("BiGAN.train.kdd.{}".format(method)) 60 | 61 | data.set_nc(nc) 62 | print("get_shape", data.get_shape_input()[1]) 63 | # Placeholders 64 | input_pl = tf.placeholder(tf.float32, shape=data.get_shape_input(), name="input") 65 | is_training_pl = tf.placeholder(tf.bool, [], name='is_training_pl') 66 | learning_rate = tf.placeholder(tf.float32, shape=(), name="lr_pl") 67 | 68 | # Data 69 | trainx, trainy = data.get_train() 70 | trainx_copy = trainx.copy() 71 | testx, testy = data.get_test() 72 | 73 | # Parameters 74 | starting_lr = network.learning_rate 75 | batch_size = network.batch_size 76 | latent_dim = network.latent_dim 77 | ema_decay = 0.9999 78 | 79 | rng = np.random.RandomState(RANDOM_SEED) 80 | nr_batches_train = int(trainx.shape[0] / batch_size) 81 | nr_batches_test = int(testx.shape[0] / batch_size) 82 | 83 | logger.info('Building training graph...') 84 | 85 | logger.warn("The BiGAN is training with the following parameters:") 86 | display_parameters(batch_size, starting_lr, ema_decay, weight, method, degree) 87 | 88 | gen = network.decoder 89 | enc = network.encoder 90 | dis = network.discriminator 91 | 92 | with tf.variable_scope('encoder_model'): 93 | z_gen = enc(input_pl, is_training=is_training_pl) 94 | 95 | with tf.variable_scope('generator_model'): 96 | z = tf.random_normal([batch_size, latent_dim]) 97 | x_gen = gen(z, is_training=is_training_pl) 98 | 99 | with tf.variable_scope('discriminator_model'): 100 | l_encoder, inter_layer_inp = dis(z_gen,input_pl, is_training=is_training_pl) 101 | l_generator, inter_layer_rct = dis(z, x_gen, is_training=is_training_pl, reuse=True) 102 | 103 | with tf.name_scope('loss_functions'): 104 | # discriminator 105 | loss_dis_enc = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels=tf.ones_like(l_encoder),logits=l_encoder)) 106 | loss_dis_gen = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels=tf.zeros_like(l_generator),logits=l_generator)) 107 | loss_discriminator = loss_dis_gen + loss_dis_enc 108 | # generator 109 | loss_generator = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels=tf.ones_like(l_generator),logits=l_generator)) 110 | # encoder 111 | loss_encoder = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels=tf.zeros_like(l_encoder),logits=l_encoder)) 112 | 113 | with tf.name_scope('optimizers'): 114 | # control op dependencies for batch norm and trainable variables 115 | tvars = tf.trainable_variables() 116 | dvars = [var for var in tvars if 'discriminator_model' in var.name] 117 | gvars = [var for var in tvars if 'generator_model' in var.name] 118 | evars = [var for var in tvars if 'encoder_model' in var.name] 119 | 120 | update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) 121 | update_ops_gen = [x for x in update_ops if ('generator_model' in x.name)] 122 | update_ops_enc = [x for x in update_ops if ('encoder_model' in x.name)] 123 | update_ops_dis = [x for x in update_ops if ('discriminator_model' in x.name)] 124 | 125 | optimizer_dis = tf.train.AdamOptimizer(learning_rate=learning_rate, beta1=0.5, 
name='dis_optimizer') 126 | optimizer_gen = tf.train.AdamOptimizer(learning_rate=learning_rate, beta1=0.5, name='gen_optimizer') 127 | optimizer_enc = tf.train.AdamOptimizer(learning_rate=learning_rate, beta1=0.5, name='enc_optimizer') 128 | 129 | with tf.control_dependencies(update_ops_gen): 130 | gen_op = optimizer_gen.minimize(loss_generator, var_list=gvars) 131 | with tf.control_dependencies(update_ops_enc): 132 | enc_op = optimizer_enc.minimize(loss_encoder, var_list=evars) 133 | with tf.control_dependencies(update_ops_dis): 134 | dis_op = optimizer_dis.minimize(loss_discriminator, var_list=dvars) 135 | 136 | # Exponential Moving Average for estimation 137 | dis_ema = tf.train.ExponentialMovingAverage(decay=ema_decay) 138 | maintain_averages_op_dis = dis_ema.apply(dvars) 139 | 140 | with tf.control_dependencies([dis_op]): 141 | train_dis_op = tf.group(maintain_averages_op_dis) 142 | 143 | gen_ema = tf.train.ExponentialMovingAverage(decay=ema_decay) 144 | maintain_averages_op_gen = gen_ema.apply(gvars) 145 | 146 | with tf.control_dependencies([gen_op]): 147 | train_gen_op = tf.group(maintain_averages_op_gen) 148 | 149 | enc_ema = tf.train.ExponentialMovingAverage(decay=ema_decay) 150 | maintain_averages_op_enc = enc_ema.apply(evars) 151 | 152 | with tf.control_dependencies([enc_op]): 153 | train_enc_op = tf.group(maintain_averages_op_enc) 154 | 155 | with tf.name_scope('summary'): 156 | with tf.name_scope('dis_summary'): 157 | tf.summary.scalar('loss_discriminator', loss_discriminator, ['dis']) 158 | tf.summary.scalar('loss_dis_encoder', loss_dis_enc, ['dis']) 159 | tf.summary.scalar('loss_dis_gen', loss_dis_gen, ['dis']) 160 | 161 | with tf.name_scope('gen_summary'): 162 | tf.summary.scalar('loss_generator', loss_generator, ['gen']) 163 | tf.summary.scalar('loss_encoder', loss_encoder, ['gen']) 164 | 165 | sum_op_dis = tf.summary.merge_all('dis') 166 | sum_op_gen = tf.summary.merge_all('gen') 167 | 168 | logger.info('Building testing graph...') 169 | 170 | with tf.variable_scope('encoder_model'): 171 | z_gen_ema = enc(input_pl, is_training=is_training_pl, 172 | getter=get_getter(enc_ema), reuse=True) 173 | 174 | with tf.variable_scope('generator_model'): 175 | reconstruct_ema = gen(z_gen_ema, is_training=is_training_pl, 176 | getter=get_getter(gen_ema), reuse=True) 177 | 178 | with tf.variable_scope('discriminator_model'): 179 | l_encoder_ema, inter_layer_inp_ema = dis(z_gen_ema, 180 | input_pl, 181 | is_training=is_training_pl, 182 | getter=get_getter(dis_ema), 183 | reuse=True) 184 | l_generator_ema, inter_layer_rct_ema = dis(z_gen_ema, 185 | reconstruct_ema, 186 | is_training=is_training_pl, 187 | getter=get_getter(dis_ema), 188 | reuse=True) 189 | with tf.name_scope('Testing'): 190 | with tf.variable_scope('Reconstruction_loss'): 191 | delta = input_pl - reconstruct_ema 192 | delta_flat = tf.contrib.layers.flatten(delta) 193 | gen_score = tf.norm(delta_flat, ord=degree, axis=1, 194 | keep_dims=False, name='epsilon') 195 | 196 | with tf.variable_scope('Discriminator_loss'): 197 | if method == "cross-e": 198 | dis_score = tf.nn.sigmoid_cross_entropy_with_logits( 199 | labels=tf.ones_like(l_generator_ema),logits=l_generator_ema) 200 | 201 | elif method == "fm": 202 | fm = inter_layer_inp_ema - inter_layer_rct_ema 203 | fm = tf.contrib.layers.flatten(fm) 204 | dis_score = tf.norm(fm, ord=degree, axis=1, 205 | keep_dims=False, name='d_loss') 206 | 207 | dis_score = tf.squeeze(dis_score) 208 | 209 | with tf.variable_scope('Score'): 210 | list_scores = (1 - weight) * gen_score + weight * 
dis_score 211 | 212 | 213 | logdir = create_logdir(weight, method, random_seed) 214 | 215 | sv = tf.train.Supervisor(logdir=logdir, save_summaries_secs=None, 216 | save_model_secs=120) 217 | 218 | logger.info('Start training...') 219 | with sv.managed_session() as sess: 220 | 221 | logger.info('Initialization done') 222 | writer = tf.summary.FileWriter(logdir, sess.graph) 223 | train_batch = 0 224 | epoch = 0 225 | 226 | while not sv.should_stop() and epoch < nb_epochs: 227 | 228 | lr = starting_lr 229 | begin = time.time() 230 | 231 | # construct randomly permuted minibatches 232 | trainx = trainx[rng.permutation(trainx.shape[0])] # shuffling dataset 233 | trainx_copy = trainx_copy[rng.permutation(trainx.shape[0])] 234 | train_loss_dis, train_loss_gen, train_loss_enc = [0, 0, 0] 235 | 236 | # training 237 | for t in range(nr_batches_train): 238 | 239 | display_progression_epoch(t, nr_batches_train) 240 | ran_from = t * batch_size 241 | ran_to = (t + 1) * batch_size 242 | 243 | # train discriminator 244 | feed_dict = {input_pl: trainx[ran_from:ran_to], 245 | is_training_pl: True, 246 | learning_rate:lr} 247 | 248 | _, ld, sm = sess.run([train_dis_op, 249 | loss_discriminator, 250 | sum_op_dis], 251 | feed_dict=feed_dict) 252 | train_loss_dis += ld 253 | writer.add_summary(sm, train_batch) 254 | 255 | # train generator and encoder 256 | feed_dict = {input_pl: trainx_copy[ran_from:ran_to], 257 | is_training_pl: True, 258 | learning_rate:lr} 259 | _,_, le, lg, sm = sess.run([train_gen_op, 260 | train_enc_op, 261 | loss_encoder, 262 | loss_generator, 263 | sum_op_gen], 264 | feed_dict=feed_dict) 265 | train_loss_gen += lg 266 | train_loss_enc += le 267 | writer.add_summary(sm, train_batch) 268 | 269 | train_batch += 1 270 | 271 | train_loss_gen /= nr_batches_train 272 | train_loss_enc /= nr_batches_train 273 | train_loss_dis /= nr_batches_train 274 | 275 | logger.info('Epoch terminated') 276 | print("Epoch %d | time = %ds | loss gen = %.4f | loss enc = %.4f | loss dis = %.4f " 277 | % (epoch, time.time() - begin, train_loss_gen, train_loss_enc, train_loss_dis)) 278 | 279 | epoch += 1 280 | 281 | logger.warn('Testing evaluation...') 282 | 283 | inds = rng.permutation(testx.shape[0]) 284 | testx = testx[inds] # shuffling dataset 285 | testy = testy[inds] # shuffling dataset 286 | scores = [] 287 | inference_time = [] 288 | 289 | # Create scores 290 | for t in range(nr_batches_test): 291 | 292 | # construct randomly permuted minibatches 293 | ran_from = t * batch_size 294 | ran_to = (t + 1) * batch_size 295 | begin_val_batch = time.time() 296 | 297 | feed_dict = {input_pl: testx[ran_from:ran_to], 298 | is_training_pl:False} 299 | 300 | scores += sess.run(list_scores, 301 | feed_dict=feed_dict).tolist() 302 | inference_time.append(time.time() - begin_val_batch) 303 | 304 | logger.info('Testing : mean inference time is %.4f' % ( 305 | np.mean(inference_time))) 306 | 307 | ran_from = nr_batches_test * batch_size 308 | ran_to = (nr_batches_test + 1) * batch_size 309 | size = testx[ran_from:ran_to].shape[0] 310 | fill = np.ones([batch_size - size, data.get_shape_input()[1]]) 311 | 312 | batch = np.concatenate([testx[ran_from:ran_to], fill], axis=0) 313 | feed_dict = {input_pl: batch, 314 | is_training_pl: False} 315 | 316 | batch_score = sess.run(list_scores, 317 | feed_dict=feed_dict).tolist() 318 | 319 | scores += batch_score[:size] 320 | 321 | # Highest 80% are anomalous 322 | per = np.percentile(scores, 80) 323 | 324 | y_pred = scores.copy() 325 | y_pred = np.array(y_pred) 326 | 327 | inds = 
(y_pred < per) 328 | inds_comp = (y_pred >= per) 329 | 330 | y_pred[inds] = 0 331 | y_pred[inds_comp] = 1 332 | 333 | 334 | precision, recall, f1,_ = precision_recall_fscore_support(testy, 335 | y_pred, 336 | average='binary') 337 | 338 | print( 339 | "Testing(%d) : Prec = %.4f | Rec = %.4f | F1 = %.4f " 340 | % (nc, precision, recall, f1)) 341 | content = "Testing(%d) : Prec = %.4f | Rec = %.4f | F1 = %.4f " % (nc, precision, recall, f1) 342 | f1=open('./output.txt', 'a') 343 | f1.write(content + "\r\n") 344 | 345 | def run(nb_epochs, weight, method, degree, label, nc, random_seed=42): 346 | """ Runs the training process""" 347 | print("Raj nc", nc) 348 | with tf.Graph().as_default(): 349 | # Set the graph level seed 350 | tf.set_random_seed(random_seed) 351 | train_and_test(nb_epochs, weight, method, degree, random_seed, nc) 352 | -------------------------------------------------------------------------------- /data/kdd.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import numpy as np 3 | import pandas as pd 4 | from sklearn.model_selection import train_test_split 5 | from sklearn.preprocessing import MinMaxScaler 6 | from sklearn.preprocessing import minmax_scale 7 | from sklearn.preprocessing import MaxAbsScaler 8 | from sklearn.preprocessing import StandardScaler 9 | from sklearn.preprocessing import RobustScaler 10 | from sklearn.preprocessing import Normalizer 11 | from sklearn.preprocessing import QuantileTransformer 12 | from sklearn.preprocessing import PowerTransformer 13 | 14 | logger = logging.getLogger(__name__) 15 | nc = 0 16 | 17 | def get_train(*args): 18 | """Get training dataset for KDD 10 percent""" 19 | return _get_adapted_dataset("train") 20 | 21 | def get_test(*args): 22 | """Get testing dataset for KDD 10 percent""" 23 | return _get_adapted_dataset("test") 24 | 25 | def set_nc(n): 26 | global nc 27 | nc = n 28 | 29 | def get_shape_input(): 30 | """Get shape of the dataset for KDD 10 percent""" 31 | return (None, nc) 32 | 33 | def get_shape_label(): 34 | """Get shape of the labels in KDD 10 percent""" 35 | return (None,) 36 | 37 | def _get_dataset(): 38 | """ Gets the basic dataset 39 | Returns : 40 | dataset (dict): containing the data 41 | dataset['x_train'] (np.array): training images shape 42 | (?, 120) 43 | dataset['y_train'] (np.array): training labels shape 44 | (?,) 45 | dataset['x_test'] (np.array): testing images shape 46 | (?, 120) 47 | dataset['y_test'] (np.array): testing labels shape 48 | (?,) 49 | """ 50 | col_names = _col_names() 51 | df = pd.read_csv("data/kddcup.data_10_percent_corrected", header=None, names=col_names) 52 | text_l = ['protocol_type', 'service', 'flag', 'land', 'logged_in', 'is_host_login', 'is_guest_login'] 53 | old_df = df 54 | for name in text_l: 55 | _encode_text_dummy(df, name) 56 | labels = df['label'].copy() 57 | labels[labels != 'normal.'] = 0 58 | labels[labels == 'normal.'] = 1 59 | 60 | df['label'] = labels 61 | 62 | df_train = df.sample(frac=0.5, random_state=42) 63 | df_test = df.loc[~df.index.isin(df_train.index)] 64 | 65 | x_train, y_train = _to_xy(df_train, target='label') 66 | y_train = y_train.flatten().astype(int) 67 | x_test, y_test = _to_xy(df_test, target='label') 68 | y_test = y_test.flatten().astype(int) 69 | 70 | x_train = x_train[y_train != 1] 71 | y_train = y_train[y_train != 1] 72 | 73 | scaler = MinMaxScaler() 74 | scaler.fit(x_train) 75 | scaler.transform(x_train) 76 | scaler.transform(x_test) 77 | 78 | from sklearn.decomposition import PCA 79 
| pca = PCA(n_components=get_shape_input()[1], random_state=42) 80 | pca.fit(x_train) 81 | x_train_pca = pca.transform(x_train) 82 | x_test_pca = pca.transform(x_test) 83 | 84 | 85 | dataset = {} 86 | dataset['x_train'] = x_train_pca.astype(np.float32) 87 | dataset['y_train'] = y_train.astype(np.float32) 88 | dataset['x_test'] = x_test_pca.astype(np.float32) 89 | dataset['y_test'] = y_test.astype(np.float32) 90 | return dataset 91 | 92 | def _get_adapted_dataset(split): 93 | """ Gets the adapted dataset for the experiments 94 | 95 | Args : 96 | split (str): train or test 97 | Returns : 98 | (tuple): images and labels 99 | """ 100 | dataset = _get_dataset() 101 | key_img = 'x_' + split 102 | key_lbl = 'y_' + split 103 | 104 | if split != 'train': 105 | dataset[key_img], dataset[key_lbl] = _adapt(dataset[key_img], 106 | dataset[key_lbl]) 107 | 108 | return (dataset[key_img], dataset[key_lbl]) 109 | 110 | def _encode_text_dummy(df, name): 111 | """Encode text values to dummy variables(i.e. [1,0,0],[0,1,0],[0,0,1] 112 | for red,green,blue) 113 | """ 114 | dummies = pd.get_dummies(df.loc[:,name]) 115 | for x in dummies.columns: 116 | dummy_name = "{}-{}".format(name, x) 117 | df.loc[:, dummy_name] = dummies[x] 118 | df.drop(name, axis=1, inplace=True) 119 | 120 | def _to_xy(df, target): 121 | """Converts a Pandas dataframe to the x,y inputs that TensorFlow needs""" 122 | result = [] 123 | for x in df.columns: 124 | if x != target: 125 | result.append(x) 126 | dummies = df[target] 127 | return df.as_matrix(result).astype(np.float32), dummies.as_matrix().astype(np.float32) 128 | 129 | def _col_names(): 130 | """Column names of the dataframe""" 131 | return ["duration","protocol_type","service","flag","src_bytes", 132 | "dst_bytes","land","wrong_fragment","urgent","hot","num_failed_logins", 133 | "logged_in","num_compromised","root_shell","su_attempted","num_root", 134 | "num_file_creations","num_shells","num_access_files","num_outbound_cmds", 135 | "is_host_login","is_guest_login","count","srv_count","serror_rate", 136 | "srv_serror_rate","rerror_rate","srv_rerror_rate","same_srv_rate", 137 | "diff_srv_rate","srv_diff_host_rate","dst_host_count","dst_host_srv_count", 138 | "dst_host_same_srv_rate","dst_host_diff_srv_rate","dst_host_same_src_port_rate", 139 | "dst_host_srv_diff_host_rate","dst_host_serror_rate","dst_host_srv_serror_rate", 140 | "dst_host_rerror_rate","dst_host_srv_rerror_rate","label"] 141 | 142 | def _adapt(x, y, rho=0.2): 143 | """Adapt the ratio of normal/anomalous data""" 144 | 145 | # Normal data: label =0, anomalous data: label =1 146 | 147 | rng = np.random.RandomState(42) # seed shuffling 148 | 149 | inliersx = x[y == 0] 150 | inliersy = y[y == 0] 151 | outliersx = x[y == 1] 152 | outliersy = y[y == 1] 153 | 154 | size_outliers = outliersx.shape[0] 155 | inds = rng.permutation(size_outliers) 156 | outliersx, outliersy = outliersx[inds], outliersy[inds] 157 | 158 | size_test = inliersx.shape[0] 159 | out_size_test = int(size_test*rho/(1-rho)) 160 | 161 | outestx = outliersx[:out_size_test] 162 | outesty = outliersy[:out_size_test] 163 | 164 | testx = np.concatenate((inliersx,outestx), axis=0) 165 | testy = np.concatenate((inliersy,outesty), axis=0) 166 | 167 | size_test = testx.shape[0] 168 | inds = rng.permutation(size_test) 169 | testx, testy = testx[inds], testy[inds] 170 | 171 | return testx, testy 172 | -------------------------------------------------------------------------------- /main.py: 
-------------------------------------------------------------------------------- 1 | #! python3 2 | 3 | import argparse 4 | import importlib 5 | import logging 6 | import os 7 | import shutil 8 | import urllib3 9 | import zipfile 10 | 11 | import data 12 | 13 | # Logging 14 | console = logging.StreamHandler() 15 | console.setLevel(logging.INFO) 16 | console.setFormatter(logging.Formatter('[%(asctime)s %(levelname)-3s @%(name)s] %(message)s', datefmt='%H:%M:%S')) 17 | logging.basicConfig(level=logging.DEBUG, handlers=[console]) 18 | logging.getLogger("tensorflow").setLevel(logging.WARNING) 19 | logger = logging.getLogger("AnomalyDetection") 20 | 21 | 22 | def run(args): 23 | print(""" 24 | ______ _____ _____ ____ 25 | |_ `.|_ _| / ___ `. .' '. 26 | | | `. \ | | |_/___) | | .--. | 27 | | | | | | | _ .'____.' | | | | 28 | _| |_.' /_| |__/ | / /_____ _| `--' | 29 | |______.'|________| |_______|(_)'.____.' 30 | 31 | """) 32 | 33 | has_effect = False 34 | 35 | if args.example and args.dataset and args.split: 36 | try: 37 | mod_name = "{}.{}_{}".format(args.example, args.split, args.dataset) 38 | logger.info("Running script at {}".format(mod_name)) 39 | 40 | mod = importlib.import_module(mod_name) 41 | mod.run(args.nb_epochs, args.w, args.m, args.d, args.label, args.nc, args.rd) 42 | 43 | except Exception as e: 44 | logger.exception(e) 45 | logger.error("Uhoh, the script halted with an error.") 46 | else: 47 | if not has_effect: 48 | logger.error("Script halted without any effect. To run code, use command:\npython3 main.py {train, test, run}") 49 | 50 | def path(d): 51 | try: 52 | assert os.path.isdir(d) 53 | return d 54 | except Exception as e: 55 | raise argparse.ArgumentTypeError("Example {} cannot be located.".format(d)) 56 | 57 | if __name__ == "__main__": 58 | 59 | parser = argparse.ArgumentParser(description='Run examples from the DL 2.0 Anomaly Detector.') 60 | parser.add_argument('example', nargs="?", type=path, help='the folder name of the example you want to run e.g bigan') 61 | parser.add_argument('dataset', nargs="?", choices=['kdd'], help='the name of the dataset you want to run the experiments on') 62 | parser.add_argument('split', nargs="?", choices=['run'], help='train the example or evaluate it') 63 | parser.add_argument('--nb_epochs', nargs="?", type=int, help='number of epochs you want to train the dataset on') 64 | parser.add_argument('--label', nargs="?", type=int, help='anomalous label for the experiment') 65 | parser.add_argument('--w', nargs="?", default=0.1, type=float, help='weight for the sum of the mapping loss function') 66 | parser.add_argument('--m', nargs="?", default='fm', choices=['cross-e', 'fm'], help='mode/method for discriminator loss') 67 | parser.add_argument('--d', nargs="?", default=1, type=int, help='degree for the L norm') 68 | parser.add_argument('--rd', nargs="?", default=42, type=int, help='random_seed') 69 | parser.add_argument('--nc', nargs="?", default=0, type=int, help='Number of columns') 70 | run(parser.parse_args()) 71 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | tensorflow-gpu 2 | keras 3 | numpy 4 | urllib3 5 | gensim 6 | pillow 7 | regex 8 | -------------------------------------------------------------------------------- /utils/adapt_data.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import importlib 3 | 4 | def 
adapt_labels(true_labels, label): 5 | """Adapt labels to anomaly detection context 6 | 7 | Args : 8 | true_labels (list): list of ints 9 | label (int): label which is considered anomalous 10 | Returns : 11 | true_labels (list): list of labels, 1 for anomalous and 0 for normal 12 | """ 13 | if label == 0: 14 | (true_labels[true_labels == label], true_labels[true_labels != label]) = (0, 1) 15 | true_labels = [1] * true_labels.shape[0] - true_labels 16 | else: 17 | (true_labels[true_labels != label], true_labels[true_labels == label]) = (0, 1) 18 | 19 | return true_labels 20 | 21 | 22 | 23 | 24 | -------------------------------------------------------------------------------- /utils/evaluations.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | from sklearn.metrics import roc_curve, auc, average_precision_score, precision_recall_curve, auc 5 | 6 | 7 | def do_prc(scores, true_labels, file_name='', directory='', plot=True): 8 | """ Does the PRC curve 9 | 10 | Args : 11 | scores (list): list of scores from the decision function 12 | true_labels (list): list of labels associated to the scores 13 | file_name (str): name of the PRC curve 14 | directory (str): directory to save the jpg file 15 | plot (bool): plots the PRC curve or not 16 | Returns: 17 | prc_auc (float): area under the under the PRC curve 18 | """ 19 | precision, recall, thresholds = precision_recall_curve(true_labels, scores) 20 | prc_auc = auc(recall, precision) 21 | 22 | if plot: 23 | plt.figure() 24 | plt.step(recall, precision, color='b', alpha=0.2, where='post') 25 | plt.fill_between(recall, precision, step='post', alpha=0.2, color='b') 26 | plt.xlabel('Recall') 27 | plt.ylabel('Precision') 28 | plt.ylim([0.0, 1.05]) 29 | plt.xlim([0.0, 1.0]) 30 | plt.title('Precision-Recall curve: AUC=%0.4f' 31 | %(prc_auc)) 32 | if not os.path.exists(directory): 33 | os.makedirs(directory) 34 | plt.savefig('results/' + file_name + '_prc.jpg') 35 | plt.close() 36 | 37 | return prc_auc 38 | 39 | 40 | --------------------------------------------------------------------------------
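For reference, below is a minimal NumPy sketch of the anomaly-scoring rule used in bigan/run_kdd.py: the final score is a weighted sum of the reconstruction score and the discriminator score, and samples at or above the 80th percentile are flagged as anomalous. The function name anomaly_predictions and the random stand-in scores are illustrative only; in run_kdd.py the two scores come from ||x - G(E(x))|| and from the discriminator (cross-entropy or feature-matching norm).

```
import numpy as np

def anomaly_predictions(gen_score, dis_score, weight=0.1, percentile=80):
    """Weighted score as in run_kdd.py: (1 - w) * reconstruction + w * discriminator,
    then flag everything at or above the given percentile as anomalous."""
    scores = (1 - weight) * np.asarray(gen_score) + weight * np.asarray(dis_score)
    threshold = np.percentile(scores, percentile)
    return (scores >= threshold).astype(int)  # 1 = anomalous, 0 = normal

# Stand-in scores just to exercise the function:
rng = np.random.RandomState(42)
y_pred = anomaly_predictions(rng.rand(1000), rng.rand(1000))
print(y_pred.mean())  # ~0.2, since the top 20% of scores are flagged
```

With weight=0.1 (the value recommended in the README), the reconstruction term dominates the score, mirroring the AnoGAN weighting.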