├── README.md
├── data
│   └── KDDTest+.csv
└── nsl_kdd_binary_calassification_with_transformer.ipynb

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------

# NSL-KDD binary classification with Transformer

I classify the NSL-KDD dataset using a slightly modified version of the code from the [Keras documentation page](https://keras.io/examples/nlp/text_classification_with_transformer/).

## Importing the required libraries


```python
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

import numpy as np
import pandas as pd

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
```

## Implement multi-head self-attention as a Keras layer


```python
class MultiHeadSelfAttention(layers.Layer):
    def __init__(self, embed_dim, num_heads=8):
        super(MultiHeadSelfAttention, self).__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        if embed_dim % num_heads != 0:
            raise ValueError(
                f"embedding dimension = {embed_dim} should be divisible by number of heads = {num_heads}"
            )
        self.projection_dim = embed_dim // num_heads
        self.query_dense = layers.Dense(embed_dim)
        self.key_dense = layers.Dense(embed_dim)
        self.value_dense = layers.Dense(embed_dim)
        self.combine_heads = layers.Dense(embed_dim)

    def attention(self, query, key, value):
        # scaled dot-product attention
        score = tf.matmul(query, key, transpose_b=True)
        dim_key = tf.cast(tf.shape(key)[-1], tf.float32)
        scaled_score = score / tf.math.sqrt(dim_key)
        weights = tf.nn.softmax(scaled_score, axis=-1)
        output = tf.matmul(weights, value)
        return output, weights

    def separate_heads(self, x, batch_size):
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.projection_dim))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, inputs):
        # inputs.shape = [batch_size, seq_len, embedding_dim]
        batch_size = tf.shape(inputs)[0]
        query = self.query_dense(inputs)  # (batch_size, seq_len, embed_dim)
        key = self.key_dense(inputs)  # (batch_size, seq_len, embed_dim)
        value = self.value_dense(inputs)  # (batch_size, seq_len, embed_dim)
        query = self.separate_heads(
            query, batch_size
        )  # (batch_size, num_heads, seq_len, projection_dim)
        key = self.separate_heads(
            key, batch_size
        )  # (batch_size, num_heads, seq_len, projection_dim)
        value = self.separate_heads(
            value, batch_size
        )  # (batch_size, num_heads, seq_len, projection_dim)
        attention, weights = self.attention(query, key, value)
        attention = tf.transpose(
            attention, perm=[0, 2, 1, 3]
        )  # (batch_size, seq_len, num_heads, projection_dim)
        concat_attention = tf.reshape(
            attention, (batch_size, -1, self.embed_dim)
        )  # (batch_size, seq_len, embed_dim)
        output = self.combine_heads(
            concat_attention
        )  # (batch_size, seq_len, embed_dim)
        return output
```
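The layer can be smoke-tested on a random batch. This is a minimal sketch (not part of the original notebook; batch and sequence sizes are arbitrary) that only checks the shape contract:

```python
# Minimal shape check for MultiHeadSelfAttention: the output should keep the
# (batch_size, seq_len, embed_dim) shape of the input.
sample = tf.random.uniform((4, 10, 32))  # batch of 4, seq_len 10, embed_dim 32
mhsa = MultiHeadSelfAttention(embed_dim=32, num_heads=4)
print(mhsa(sample).shape)  # expected: (4, 10, 32)
```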
## Implement a Transformer block as a layer


```python
class TransformerBlock(layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super(TransformerBlock, self).__init__()
        self.att = MultiHeadSelfAttention(embed_dim, num_heads)
        self.ffn = keras.Sequential(
            [layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim)]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training):
        # attention sub-layer with residual connection and layer normalization
        attn_output = self.att(inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        # feed-forward sub-layer with residual connection and layer normalization
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)
```

## Implement the embedding layer

Two separate embedding layers: one for the tokens and one for the token index (positions).


```python
class TokenAndPositionEmbedding(layers.Layer):
    def __init__(self, maxlen, vocab_size, embed_dim):
        super(TokenAndPositionEmbedding, self).__init__()
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        maxlen = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=maxlen, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions
```
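The two layers compose as embedding followed by Transformer block. The sketch below (my addition; random token IDs, sizes chosen only for illustration) confirms that every stage preserves the `(batch, maxlen, embed_dim)` shape:

```python
# Random integer "token" IDs run through the embedding and one Transformer block;
# both stages should keep the (batch, maxlen, embed_dim) shape.
demo_tokens = tf.random.uniform((2, 122), minval=0, maxval=1000, dtype=tf.int32)
demo_emb = TokenAndPositionEmbedding(maxlen=122, vocab_size=1000, embed_dim=32)
demo_block = TransformerBlock(embed_dim=32, num_heads=2, ff_dim=32)
x = demo_emb(demo_tokens)  # (2, 122, 32)
print(demo_block(x, training=False).shape)  # expected: (2, 122, 32)
```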
# Prepare the NSL-KDD dataset


## Reading the CSV files


```python
# c_names ---> column names
c_names = ["duration","protocol_type","service","flag","src_bytes",
           "dst_bytes","land","wrong_fragment","urgent","hot","num_failed_logins",
           "logged_in","num_compromised","root_shell","su_attempted","num_root",
           "num_file_creations","num_shells","num_access_files","num_outbound_cmds",
           "is_host_login","is_guest_login","count","srv_count","serror_rate",
           "srv_serror_rate","rerror_rate","srv_rerror_rate","same_srv_rate",
           "diff_srv_rate","srv_diff_host_rate","dst_host_count","dst_host_srv_count",
           "dst_host_same_srv_rate","dst_host_diff_srv_rate","dst_host_same_src_port_rate",
           "dst_host_srv_diff_host_rate","dst_host_serror_rate","dst_host_srv_serror_rate",
           "dst_host_rerror_rate","dst_host_srv_rerror_rate","labels","difficulty_degree"]

train = pd.read_csv("data/KDDTrain+.csv", names=c_names)  # train file
test = pd.read_csv("data/KDDTest+.csv", names=c_names)  # test file
```

## Deleting the unnecessary feature (difficulty_degree)


```python
del train["difficulty_degree"]
del test["difficulty_degree"]
```

## Converting object features to categories first and then to dummy tables (except "labels")


```python
for i in c_names:
    print(train[i].dtypes)
    if train[i].dtypes == object:
        train[i] = train[i].astype('category')
        test[i] = test[i].astype('category')
        if i == "labels":
            break
        train = pd.get_dummies(train, columns=[i])
        test = pd.get_dummies(test, columns=[i])
```

    int64
    object
    object
    object
    int64
    int64
    int64
    int64
    int64
    int64
    int64
    int64
    int64
    int64
    int64
    int64
    int64
    int64
    int64
    int64
    int64
    int64
    int64
    int64
    float64
    float64
    float64
    float64
    float64
    float64
    float64
    int64
    int64
    float64
    float64
    float64
    float64
    float64
    float64
    float64
    float64
    object


## Converting the labels feature to binary


```python
# TRAIN
attack_or_not = []
for i in train["labels"]:  # the "normal" label becomes 1 and every attack label becomes 0
    if i == "normal":
        attack_or_not.append(1)
    else:
        attack_or_not.append(0)
train["labels"] = attack_or_not
```


```python
# TEST
attack_or_not = []
for i in test["labels"]:  # the "normal" label becomes 1 and every attack label becomes 0
    if i == "normal":
        attack_or_not.append(1)
    else:
        attack_or_not.append(0)
test["labels"] = attack_or_not
```
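For reference, the two loops above have a vectorized pandas equivalent (a hedged alternative, not from the original notebook; it would replace the loops rather than run after them):

```python
# Equivalent binarization without an explicit loop: "normal" -> 1, any attack -> 0.
train["labels"] = (train["labels"] == "normal").astype(int)
test["labels"] = (test["labels"] == "normal").astype(int)
```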
## Synchronizing the test and train datasets
### Add a zero-filled column for every feature that exists in only one of the two datasets.


```python
f = list(train.columns)
e = list(test.columns)

for i in f:
    if i not in e:
        # column exists only in train: add it to test filled with zeros
        zero_data = pd.array(np.zeros(len(test["labels"])))
        print(len(zero_data))
        test[i] = zero_data
        print(i)
for i in e:
    if i not in f:
        # column exists only in test: add it to train filled with zeros
        zero_data = np.zeros(len(train["labels"]))
        train[i] = zero_data
        print(i)
```

    22543
    service_aol
    22543
    service_harvest
    22543
    service_http_2784
    22543
    service_http_8001
    22543
    service_red_i
    22543
    service_tftp_u
    22543
    service_urh_i


## Separating the features (data) and the label (target)


```python
y = train["labels"]  # separate the label and the data into two pieces: label = y, data = X
del train["labels"]
X = train
```


```python
y_test = test["labels"]  # separate the label and the data into two pieces: label = y_test, data = x_test
del test["labels"]
x_test = test
```

## Normalization and standardization


```python
X = preprocessing.scale(X)  # standardize each column to zero mean and unit variance
X = preprocessing.normalize(X)  # scale each row to unit norm
```


```python
x_test = preprocessing.scale(x_test)
x_test = preprocessing.normalize(x_test)
```

## Splitting the train data into train and validation parts


```python
x_train, x_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=0, stratify=y)
print(len(x_train), "Training sequences", x_train.shape)
print(len(x_val), "Validation sequences", x_val.shape)
print(len(x_test), "Test sequences", x_test.shape)
```

    100778 Training sequences (100778, 122)
    25195 Validation sequences (25195, 122)
    22543 Test sequences (22543, 122)


## Create the classifier model using the Transformer layer

The Transformer layer outputs one vector for each time step of the input sequence. Here, we take the mean across all time steps and use a feed-forward network on top of it to classify each record.


```python
maxlen = 122  # number of features after one-hot encoding
vocab_size = 100000  # size of the token vocabulary for the embedding layer

embed_dim = 32  # Embedding size for each token
num_heads = 2  # Number of attention heads
ff_dim = 32  # Hidden layer size in feed-forward network inside the Transformer

inputs = layers.Input(shape=(maxlen,))
embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)
x = embedding_layer(inputs)
transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
x = transformer_block(x)
x = layers.GlobalAveragePooling1D()(x)
x = layers.Dropout(0.1)(x)
x = layers.Dense(20, activation="relu")(x)
x = layers.Dropout(0.1)(x)
outputs = layers.Dense(2, activation="softmax")(x)

model = keras.Model(inputs=inputs, outputs=outputs)
```


```python
x_train = keras.preprocessing.sequence.pad_sequences(x_train, maxlen=maxlen)
x_val = keras.preprocessing.sequence.pad_sequences(x_val, maxlen=maxlen)
x_test = keras.preprocessing.sequence.pad_sequences(x_test, maxlen=maxlen)
```

## Train


```python
model.compile("adam", "sparse_categorical_crossentropy", metrics=["accuracy"])
history = model.fit(
    x_train, y_train, batch_size=32, epochs=2, validation_data=(x_val, y_val)
)
```

    Train on 100778 samples, validate on 25195 samples
    Epoch 1/2
    100778/100778 [==============================] - 240s 2ms/sample - loss: 0.6915 - accuracy: 0.5331 - val_loss: 0.6908 - val_accuracy: 0.5346
    Epoch 2/2
    100778/100778 [==============================] - 221s 2ms/sample - loss: 0.6908 - accuracy: 0.5345 - val_loss: 0.6910 - val_accuracy: 0.5346


## Evaluate


```python
score = model.evaluate(x_test, y_test, verbose=0)
print("Test loss:", score[0])
print("Test accuracy:", score[1])
```

    Test loss: 0.7010403732089466
    Test accuracy: 0.43073237


```python
score = model.evaluate(x_val, y_val, verbose=0)
print("Validation loss:", score[0])
print("Validation accuracy:", score[1])
```

    Validation loss: 0.690967743196618
    Validation accuracy: 0.5345902
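Note that the accuracies above track the share of normal records in each split (about 53% of the training/validation data and 43% of the test data), which suggests the model is effectively predicting a single class. A likely culprit is the `pad_sequences` step: it returns `int32` arrays by default, so the scaled and normalized features, which all lie in (-1, 1), are truncated to zeros before they reach the embedding layer. A small sketch with made-up values illustrates the effect:

```python
# pad_sequences casts to int32 by default, so scaled features in (-1, 1)
# all become 0 and carry no information into the embedding lookup.
import numpy as np
from tensorflow import keras

scaled_row = np.array([[0.12, -0.53, 0.98]])
print(keras.preprocessing.sequence.pad_sequences(scaled_row, maxlen=3))
# -> [[0 0 0]]; passing dtype="float32" would preserve the values instead
```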
"outputs": [], 34 | "source": [ 35 | "import tensorflow as tf\n", 36 | "from tensorflow import keras\n", 37 | "from tensorflow.keras import layers\n", 38 | "\n", 39 | "\n", 40 | "import numpy as np\n", 41 | "import pandas as pd\n", 42 | "\n", 43 | "from sklearn import preprocessing\n", 44 | "from sklearn.model_selection import train_test_split" 45 | ] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "metadata": { 50 | "colab_type": "text", 51 | "id": "RVj4msuOzaYz" 52 | }, 53 | "source": [ 54 | "## Implement multi head self attention as a Keras layer\n" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 2, 60 | "metadata": { 61 | "colab": {}, 62 | "colab_type": "code", 63 | "id": "p09RV47lzaY3" 64 | }, 65 | "outputs": [], 66 | "source": [ 67 | "\n", 68 | "class MultiHeadSelfAttention(layers.Layer):\n", 69 | " def __init__(self, embed_dim, num_heads=8):\n", 70 | " super(MultiHeadSelfAttention, self).__init__()\n", 71 | " self.embed_dim = embed_dim\n", 72 | " self.num_heads = num_heads\n", 73 | " if embed_dim % num_heads != 0:\n", 74 | " raise ValueError(\n", 75 | " f\"embedding dimension = {embed_dim} should be divisible by number of heads = {num_heads}\"\n", 76 | " )\n", 77 | " self.projection_dim = embed_dim // num_heads\n", 78 | " self.query_dense = layers.Dense(embed_dim)\n", 79 | " self.key_dense = layers.Dense(embed_dim)\n", 80 | " self.value_dense = layers.Dense(embed_dim)\n", 81 | " self.combine_heads = layers.Dense(embed_dim)\n", 82 | "\n", 83 | " def attention(self, query, key, value):\n", 84 | " score = tf.matmul(query, key, transpose_b=True)\n", 85 | " dim_key = tf.cast(tf.shape(key)[-1], tf.float32)\n", 86 | " scaled_score = score / tf.math.sqrt(dim_key)\n", 87 | " weights = tf.nn.softmax(scaled_score, axis=-1)\n", 88 | " output = tf.matmul(weights, value)\n", 89 | " return output, weights\n", 90 | "\n", 91 | " def separate_heads(self, x, batch_size):\n", 92 | " x = tf.reshape(x, (batch_size, -1, self.num_heads, self.projection_dim))\n", 93 | " return tf.transpose(x, perm=[0, 2, 1, 3])\n", 94 | "\n", 95 | " def call(self, inputs):\n", 96 | " # x.shape = [batch_size, seq_len, embedding_dim]\n", 97 | " batch_size = tf.shape(inputs)[0]\n", 98 | " query = self.query_dense(inputs) # (batch_size, seq_len, embed_dim)\n", 99 | " key = self.key_dense(inputs) # (batch_size, seq_len, embed_dim)\n", 100 | " value = self.value_dense(inputs) # (batch_size, seq_len, embed_dim)\n", 101 | " query = self.separate_heads(\n", 102 | " query, batch_size\n", 103 | " ) # (batch_size, num_heads, seq_len, projection_dim)\n", 104 | " key = self.separate_heads(\n", 105 | " key, batch_size\n", 106 | " ) # (batch_size, num_heads, seq_len, projection_dim)\n", 107 | " value = self.separate_heads(\n", 108 | " value, batch_size\n", 109 | " ) # (batch_size, num_heads, seq_len, projection_dim)\n", 110 | " attention, weights = self.attention(query, key, value)\n", 111 | " attention = tf.transpose(\n", 112 | " attention, perm=[0, 2, 1, 3]\n", 113 | " ) # (batch_size, seq_len, num_heads, projection_dim)\n", 114 | " concat_attention = tf.reshape(\n", 115 | " attention, (batch_size, -1, self.embed_dim)\n", 116 | " ) # (batch_size, seq_len, embed_dim)\n", 117 | " output = self.combine_heads(\n", 118 | " concat_attention\n", 119 | " ) # (batch_size, seq_len, embed_dim)\n", 120 | " return output\n", 121 | "\n" 122 | ] 123 | }, 124 | { 125 | "cell_type": "markdown", 126 | "metadata": { 127 | "colab_type": "text", 128 | "id": "GANkWrJ7zaY6" 129 | }, 130 | "source": [ 131 | "## Implement a Transformer 
block as a layer\n" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": 3, 137 | "metadata": { 138 | "colab": {}, 139 | "colab_type": "code", 140 | "id": "Cj879AGizaY7" 141 | }, 142 | "outputs": [], 143 | "source": [ 144 | "\n", 145 | "class TransformerBlock(layers.Layer):\n", 146 | " def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):\n", 147 | " super(TransformerBlock, self).__init__()\n", 148 | " self.att = MultiHeadSelfAttention(embed_dim, num_heads)\n", 149 | " self.ffn = keras.Sequential(\n", 150 | " [layers.Dense(ff_dim, activation=\"relu\"), layers.Dense(embed_dim),]\n", 151 | " )\n", 152 | " self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)\n", 153 | " self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)\n", 154 | " self.dropout1 = layers.Dropout(rate)\n", 155 | " self.dropout2 = layers.Dropout(rate)\n", 156 | "\n", 157 | " def call(self, inputs, training):\n", 158 | " attn_output = self.att(inputs)\n", 159 | " attn_output = self.dropout1(attn_output, training=training)\n", 160 | " out1 = self.layernorm1(inputs + attn_output)\n", 161 | " ffn_output = self.ffn(out1)\n", 162 | " ffn_output = self.dropout2(ffn_output, training=training)\n", 163 | " return self.layernorm2(out1 + ffn_output)\n", 164 | "\n" 165 | ] 166 | }, 167 | { 168 | "cell_type": "markdown", 169 | "metadata": { 170 | "colab_type": "text", 171 | "id": "fZ5U2V7RzaZC" 172 | }, 173 | "source": [ 174 | "## Implement embedding layer\n", 175 | "\n", 176 | "Two seperate embedding layers, one for tokens, one for token index (positions).\n" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": 4, 182 | "metadata": { 183 | "colab": {}, 184 | "colab_type": "code", 185 | "id": "SADnKmhRzaZE" 186 | }, 187 | "outputs": [], 188 | "source": [ 189 | "class TokenAndPositionEmbedding(layers.Layer):\n", 190 | " def __init__(self, maxlen, vocab_size, embed_dim):\n", 191 | " super(TokenAndPositionEmbedding, self).__init__()\n", 192 | " self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)\n", 193 | " self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)\n", 194 | "\n", 195 | " def call(self, x):\n", 196 | " maxlen = tf.shape(x)[-1]\n", 197 | " positions = tf.range(start=0, limit=maxlen, delta=1)\n", 198 | " positions = self.pos_emb(positions)\n", 199 | " x = self.token_emb(x)\n", 200 | " return x + positions\n", 201 | "\n" 202 | ] 203 | }, 204 | { 205 | "cell_type": "markdown", 206 | "metadata": { 207 | "colab_type": "text", 208 | "id": "7yVOPPwSzaZO" 209 | }, 210 | "source": [ 211 | "# prepare NSL KDD dataset\n" 212 | ] 213 | }, 214 | { 215 | "cell_type": "markdown", 216 | "metadata": {}, 217 | "source": [ 218 | "## reading CSV files" 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": 5, 224 | "metadata": {}, 225 | "outputs": [], 226 | "source": [ 227 | "# c_names ---> column names\n", 228 | "c_names = [\"duration\",\"protocol_type\",\"service\",\"flag\",\"src_bytes\",\n", 229 | " \"dst_bytes\",\"land\",\"wrong_fragment\",\"urgent\",\"hot\",\"num_failed_logins\",\n", 230 | " \"logged_in\",\"num_compromised\",\"root_shell\",\"su_attempted\",\"num_root\",\n", 231 | " \"num_file_creations\",\"num_shells\",\"num_access_files\",\"num_outbound_cmds\",\n", 232 | " \"is_host_login\",\"is_guest_login\",\"count\",\"srv_count\",\"serror_rate\",\n", 233 | " \"srv_serror_rate\",\"rerror_rate\",\"srv_rerror_rate\",\"same_srv_rate\",\n", 234 | " 
\"diff_srv_rate\",\"srv_diff_host_rate\",\"dst_host_count\",\"dst_host_srv_count\",\n", 235 | " \"dst_host_same_srv_rate\",\"dst_host_diff_srv_rate\",\"dst_host_same_src_port_rate\",\n", 236 | " \"dst_host_srv_diff_host_rate\",\"dst_host_serror_rate\",\"dst_host_srv_serror_rate\",\n", 237 | " \"dst_host_rerror_rate\",\"dst_host_srv_rerror_rate\",\"labels\",\"difficulty_degree\"]\n", 238 | "\n", 239 | "train = pd.read_csv( \"data/KDDTrain+.csv\", names=c_names) # train file\n", 240 | "test = pd.read_csv(\"data/KDDTest+.csv\", names=c_names) # test file" 241 | ] 242 | }, 243 | { 244 | "cell_type": "markdown", 245 | "metadata": {}, 246 | "source": [ 247 | "## deletion of unnecessary feature (difficulty_degree)" 248 | ] 249 | }, 250 | { 251 | "cell_type": "code", 252 | "execution_count": 6, 253 | "metadata": {}, 254 | "outputs": [], 255 | "source": [ 256 | "del train[\"difficulty_degree\"] \n", 257 | "del test[\"difficulty_degree\"] " 258 | ] 259 | }, 260 | { 261 | "cell_type": "markdown", 262 | "metadata": {}, 263 | "source": [ 264 | "## Converting object features to categories first and then to dummy tables (except \"labels\")" 265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": 7, 270 | "metadata": {}, 271 | "outputs": [ 272 | { 273 | "name": "stdout", 274 | "output_type": "stream", 275 | "text": [ 276 | "int64\n", 277 | "object\n", 278 | "object\n", 279 | "object\n", 280 | "int64\n", 281 | "int64\n", 282 | "int64\n", 283 | "int64\n", 284 | "int64\n", 285 | "int64\n", 286 | "int64\n", 287 | "int64\n", 288 | "int64\n", 289 | "int64\n", 290 | "int64\n", 291 | "int64\n", 292 | "int64\n", 293 | "int64\n", 294 | "int64\n", 295 | "int64\n", 296 | "int64\n", 297 | "int64\n", 298 | "int64\n", 299 | "int64\n", 300 | "float64\n", 301 | "float64\n", 302 | "float64\n", 303 | "float64\n", 304 | "float64\n", 305 | "float64\n", 306 | "float64\n", 307 | "int64\n", 308 | "int64\n", 309 | "float64\n", 310 | "float64\n", 311 | "float64\n", 312 | "float64\n", 313 | "float64\n", 314 | "float64\n", 315 | "float64\n", 316 | "float64\n", 317 | "object\n" 318 | ] 319 | } 320 | ], 321 | "source": [ 322 | "for i in c_names:\n", 323 | " print((train[i].dtypes))\n", 324 | " if train[i].dtypes==object:\n", 325 | " train[i] = train[i].astype('category')\n", 326 | " test[i] = test[i].astype('category')\n", 327 | " if i==\"labels\":\n", 328 | " break\n", 329 | " train=pd.get_dummies(train, columns=[i])\n", 330 | " test=pd.get_dummies(test, columns=[i]) " 331 | ] 332 | }, 333 | { 334 | "cell_type": "markdown", 335 | "metadata": {}, 336 | "source": [ 337 | "## labels feature converts to binary" 338 | ] 339 | }, 340 | { 341 | "cell_type": "code", 342 | "execution_count": 8, 343 | "metadata": {}, 344 | "outputs": [], 345 | "source": [ 346 | "# TRAIN\n", 347 | "attack_or_not=[]\n", 348 | "for i in train[\"labels\"]:#it changes the normal label to \"1\" and the attack tag to \"0\" for use in the machine learning algorithm\n", 349 | " if i ==\"normal\":\n", 350 | " attack_or_not.append(1)\n", 351 | " else:\n", 352 | " attack_or_not.append(0) \n", 353 | "train[\"labels\"]=attack_or_not" 354 | ] 355 | }, 356 | { 357 | "cell_type": "code", 358 | "execution_count": 9, 359 | "metadata": {}, 360 | "outputs": [], 361 | "source": [ 362 | "# TEST\n", 363 | "attack_or_not=[]\n", 364 | "for i in test[\"labels\"]:#it changes the normal label to \"1\" and the attack tag to \"0\" for use in the machine learning algorithm\n", 365 | " if i ==\"normal\":\n", 366 | " attack_or_not.append(1)\n", 367 | " else:\n", 368 | " 
attack_or_not.append(0) \n", 369 | "test[\"labels\"]=attack_or_not" 370 | ] 371 | }, 372 | { 373 | "cell_type": "markdown", 374 | "metadata": {}, 375 | "source": [ 376 | "## Synchronizing Test and Train datasets.\n", 377 | "### Add \"0\" for the feature that does not exist in one of these two datasets." 378 | ] 379 | }, 380 | { 381 | "cell_type": "code", 382 | "execution_count": 10, 383 | "metadata": {}, 384 | "outputs": [ 385 | { 386 | "name": "stdout", 387 | "output_type": "stream", 388 | "text": [ 389 | "22543\n", 390 | "service_aol\n", 391 | "22543\n", 392 | "service_harvest\n", 393 | "22543\n", 394 | "service_http_2784\n", 395 | "22543\n", 396 | "service_http_8001\n", 397 | "22543\n", 398 | "service_red_i\n", 399 | "22543\n", 400 | "service_tftp_u\n", 401 | "22543\n", 402 | "service_urh_i\n" 403 | ] 404 | } 405 | ], 406 | "source": [ 407 | "f=list(train.columns)\n", 408 | "e=list(test.columns)\n", 409 | "\n", 410 | "for i in f:\n", 411 | " if i not in e:\n", 412 | " zero_data =pd.array(np.zeros(len(test[\"labels\"]))) \n", 413 | " print(len(zero_data))\n", 414 | " test[i] = zero_data\n", 415 | " print(i)\n", 416 | "for i in e:\n", 417 | " if i not in f:\n", 418 | " zero_data = np.zeros(len(train[\"labels\"]))\n", 419 | " train[i] = zero_data\n", 420 | " print(i)\n" 421 | ] 422 | }, 423 | { 424 | "cell_type": "markdown", 425 | "metadata": {}, 426 | "source": [ 427 | "## separation of features (data) and Label (target)" 428 | ] 429 | }, 430 | { 431 | "cell_type": "code", 432 | "execution_count": 11, 433 | "metadata": {}, 434 | "outputs": [], 435 | "source": [ 436 | "y = train[\"labels\"] #this section separates the label and the data into two separate pieces, as Label=y Data=X \n", 437 | "del train[\"labels\"] \n", 438 | "X = train" 439 | ] 440 | }, 441 | { 442 | "cell_type": "code", 443 | "execution_count": 12, 444 | "metadata": {}, 445 | "outputs": [], 446 | "source": [ 447 | "y_test = test[\"labels\"] #this section separates the label and the data into two separate pieces, as Label=y Data=X \n", 448 | "del test[\"labels\"] \n", 449 | "x_test=test" 450 | ] 451 | }, 452 | { 453 | "cell_type": "markdown", 454 | "metadata": {}, 455 | "source": [ 456 | "## Normalization and Standardization" 457 | ] 458 | }, 459 | { 460 | "cell_type": "code", 461 | "execution_count": 13, 462 | "metadata": {}, 463 | "outputs": [], 464 | "source": [ 465 | "X = preprocessing.scale(X)\n", 466 | "X = preprocessing.normalize(X)" 467 | ] 468 | }, 469 | { 470 | "cell_type": "code", 471 | "execution_count": 14, 472 | "metadata": {}, 473 | "outputs": [], 474 | "source": [ 475 | "x_test = preprocessing.scale(x_test)\n", 476 | "x_test = preprocessing.normalize(x_test)" 477 | ] 478 | }, 479 | { 480 | "cell_type": "markdown", 481 | "metadata": {}, 482 | "source": [ 483 | "## Separating Train data into two parts as train and validation" 484 | ] 485 | }, 486 | { 487 | "cell_type": "code", 488 | "execution_count": 15, 489 | "metadata": {}, 490 | "outputs": [ 491 | { 492 | "name": "stdout", 493 | "output_type": "stream", 494 | "text": [ 495 | "100778 Training sequences (100778, 122)\n", 496 | "25195 Validation sequences (25195, 122)\n", 497 | "22543 Test sequences (22543, 122)\n" 498 | ] 499 | } 500 | ], 501 | "source": [ 502 | "x_train, x_val, y_train, y_val = train_test_split(X, y, test_size = 0.2, random_state = 0, stratify=y)\n", 503 | "print(len(x_train), \"Training sequences\",x_train.shape)\n", 504 | "print(len(x_val), \"Validation sequences\",x_val.shape)\n", 505 | "print(len(x_test), \"Test 
sequences\",x_test.shape)" 506 | ] 507 | }, 508 | { 509 | "cell_type": "markdown", 510 | "metadata": { 511 | "colab_type": "text", 512 | "id": "ZGkJpYWEzaZU" 513 | }, 514 | "source": [ 515 | "## Create classifier model using transformer layer\n", 516 | "\n", 517 | "Transformer layer outputs one vector for each time step of our input sequence.\n", 518 | "Here, we take the mean across all time steps and\n", 519 | "use a feed forward network on top of it to classify text.\n" 520 | ] 521 | }, 522 | { 523 | "cell_type": "code", 524 | "execution_count": 16, 525 | "metadata": { 526 | "colab": {}, 527 | "colab_type": "code", 528 | "id": "bum8sPyKzaZV" 529 | }, 530 | "outputs": [], 531 | "source": [ 532 | "maxlen=122\n", 533 | "vocab_size = 100000 # Only consider the top 20k words\n", 534 | "\n", 535 | "\n", 536 | "\n", 537 | "embed_dim = 32 # Embedding size for each token\n", 538 | "num_heads = 2 # Number of attention heads\n", 539 | "ff_dim = 32 # Hidden layer size in feed forward network inside transformer\n", 540 | "\n", 541 | "inputs = layers.Input(shape=(maxlen,))\n", 542 | "embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)\n", 543 | "x = embedding_layer(inputs)\n", 544 | "transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)\n", 545 | "x = transformer_block(x)\n", 546 | "x = layers.GlobalAveragePooling1D()(x)\n", 547 | "x = layers.Dropout(0.1)(x)\n", 548 | "x = layers.Dense(20, activation=\"relu\")(x)\n", 549 | "x = layers.Dropout(0.1)(x)\n", 550 | "outputs = layers.Dense(2, activation=\"softmax\")(x)\n", 551 | "\n", 552 | "model = keras.Model(inputs=inputs, outputs=outputs)\n", 553 | "\n" 554 | ] 555 | }, 556 | { 557 | "cell_type": "code", 558 | "execution_count": 18, 559 | "metadata": {}, 560 | "outputs": [], 561 | "source": [ 562 | "x_train = keras.preprocessing.sequence.pad_sequences(x_train, maxlen=maxlen)\n", 563 | "x_val = keras.preprocessing.sequence.pad_sequences(x_val, maxlen=maxlen)\n", 564 | "x_test = keras.preprocessing.sequence.pad_sequences(x_test , maxlen=maxlen)\n" 565 | ] 566 | }, 567 | { 568 | "cell_type": "markdown", 569 | "metadata": {}, 570 | "source": [ 571 | "## Train" 572 | ] 573 | }, 574 | { 575 | "cell_type": "code", 576 | "execution_count": 19, 577 | "metadata": { 578 | "colab": { 579 | "base_uri": "https://localhost:8080/", 580 | "height": 84 581 | }, 582 | "colab_type": "code", 583 | "id": "98T8ht1ezaZk", 584 | "outputId": "030b4f93-ccf8-496a-f416-6848021638e1" 585 | }, 586 | "outputs": [ 587 | { 588 | "name": "stdout", 589 | "output_type": "stream", 590 | "text": [ 591 | "Train on 100778 samples, validate on 25195 samples\n", 592 | "Epoch 1/2\n", 593 | "100778/100778 [==============================] - 240s 2ms/sample - loss: 0.6915 - accuracy: 0.5331 - val_loss: 0.6908 - val_accuracy: 0.5346\n", 594 | "Epoch 2/2\n", 595 | "100778/100778 [==============================] - 221s 2ms/sample - loss: 0.6908 - accuracy: 0.5345 - val_loss: 0.6910 - val_accuracy: 0.5346\n" 596 | ] 597 | } 598 | ], 599 | "source": [ 600 | "model.compile(\"adam\", \"sparse_categorical_crossentropy\", metrics=[\"accuracy\"])\n", 601 | "history = model.fit(\n", 602 | " x_train, y_train, batch_size=32, epochs=2, validation_data=(x_val, y_val)\n", 603 | ")\n" 604 | ] 605 | }, 606 | { 607 | "cell_type": "markdown", 608 | "metadata": {}, 609 | "source": [ 610 | "## Evaluate" 611 | ] 612 | }, 613 | { 614 | "cell_type": "code", 615 | "execution_count": 20, 616 | "metadata": {}, 617 | "outputs": [ 618 | { 619 | "name": "stdout", 620 | "output_type": "stream", 
621 | "text": [ 622 | "Test loss: 0.7010403732089466\n", 623 | "Test accuracy: 0.43073237\n" 624 | ] 625 | } 626 | ], 627 | "source": [ 628 | "score = model.evaluate(x_test, y_test, verbose=0)\n", 629 | "print(\"Test loss:\", score[0])\n", 630 | "print(\"Test accuracy:\", score[1])" 631 | ] 632 | }, 633 | { 634 | "cell_type": "code", 635 | "execution_count": 21, 636 | "metadata": { 637 | "colab": { 638 | "base_uri": "https://localhost:8080/", 639 | "height": 50 640 | }, 641 | "colab_type": "code", 642 | "id": "Zm1_60H5z0fp", 643 | "outputId": "09bc4104-5d63-4826-c0a6-535cc9f271f5" 644 | }, 645 | "outputs": [ 646 | { 647 | "name": "stdout", 648 | "output_type": "stream", 649 | "text": [ 650 | "Test loss: 0.690967743196618\n", 651 | "Test accuracy: 0.5345902\n" 652 | ] 653 | } 654 | ], 655 | "source": [ 656 | "score = model.evaluate(x_val, y_val, verbose=0)\n", 657 | "print(\"Test loss:\", score[0])\n", 658 | "print(\"Test accuracy:\", score[1])" 659 | ] 660 | } 661 | ], 662 | "metadata": { 663 | "colab": { 664 | "collapsed_sections": [], 665 | "name": "text_classification_with_transformer", 666 | "provenance": [], 667 | "toc_visible": true 668 | }, 669 | "kernelspec": { 670 | "display_name": "Python 3", 671 | "language": "python", 672 | "name": "python3" 673 | }, 674 | "language_info": { 675 | "codemirror_mode": { 676 | "name": "ipython", 677 | "version": 3 678 | }, 679 | "file_extension": ".py", 680 | "mimetype": "text/x-python", 681 | "name": "python", 682 | "nbconvert_exporter": "python", 683 | "pygments_lexer": "ipython3", 684 | "version": "3.6.5" 685 | } 686 | }, 687 | "nbformat": 4, 688 | "nbformat_minor": 1 689 | } 690 | --------------------------------------------------------------------------------