├── README.md
├── data
│   └── KDDTest+.csv
└── nsl_kdd_binary_calassification_with_transformer.ipynb

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------

# NSL-KDD binary classification with Transformer

I classify the NSL-KDD dataset using a slightly modified version of the code from the [Keras documentation page](https://keras.io/examples/nlp/text_classification_with_transformer/).

## Importing the required libraries


```python
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

import numpy as np
import pandas as pd

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
```

## Implement multi-head self-attention as a Keras layer


```python
class MultiHeadSelfAttention(layers.Layer):
    def __init__(self, embed_dim, num_heads=8):
        super(MultiHeadSelfAttention, self).__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        if embed_dim % num_heads != 0:
            raise ValueError(
                f"embedding dimension = {embed_dim} should be divisible by number of heads = {num_heads}"
            )
        self.projection_dim = embed_dim // num_heads
        self.query_dense = layers.Dense(embed_dim)
        self.key_dense = layers.Dense(embed_dim)
        self.value_dense = layers.Dense(embed_dim)
        self.combine_heads = layers.Dense(embed_dim)

    def attention(self, query, key, value):
        # scaled dot-product attention
        score = tf.matmul(query, key, transpose_b=True)
        dim_key = tf.cast(tf.shape(key)[-1], tf.float32)
        scaled_score = score / tf.math.sqrt(dim_key)
        weights = tf.nn.softmax(scaled_score, axis=-1)
        output = tf.matmul(weights, value)
        return output, weights

    def separate_heads(self, x, batch_size):
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.projection_dim))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, inputs):
        # inputs.shape = [batch_size, seq_len, embedding_dim]
        batch_size = tf.shape(inputs)[0]
        query = self.query_dense(inputs)  # (batch_size, seq_len, embed_dim)
        key = self.key_dense(inputs)  # (batch_size, seq_len, embed_dim)
        value = self.value_dense(inputs)  # (batch_size, seq_len, embed_dim)
        query = self.separate_heads(
            query, batch_size
        )  # (batch_size, num_heads, seq_len, projection_dim)
        key = self.separate_heads(
            key, batch_size
        )  # (batch_size, num_heads, seq_len, projection_dim)
        value = self.separate_heads(
            value, batch_size
        )  # (batch_size, num_heads, seq_len, projection_dim)
        attention, weights = self.attention(query, key, value)
        attention = tf.transpose(
            attention, perm=[0, 2, 1, 3]
        )  # (batch_size, seq_len, num_heads, projection_dim)
        concat_attention = tf.reshape(
            attention, (batch_size, -1, self.embed_dim)
        )  # (batch_size, seq_len, embed_dim)
        output = self.combine_heads(
            concat_attention
        )  # (batch_size, seq_len, embed_dim)
        return output
```
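The layer can be smoke-tested on a random batch. This is a minimal sketch (not part of the original notebook; batch and sequence sizes are arbitrary) that only checks the shape contract:

```python
# Minimal shape check for MultiHeadSelfAttention: the output should keep the
# (batch_size, seq_len, embed_dim) shape of the input.
sample = tf.random.uniform((4, 10, 32))  # batch of 4, seq_len 10, embed_dim 32
mhsa = MultiHeadSelfAttention(embed_dim=32, num_heads=4)
print(mhsa(sample).shape)  # expected: (4, 10, 32)
```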
## Implement a Transformer block as a layer


```python
class TransformerBlock(layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super(TransformerBlock, self).__init__()
        self.att = MultiHeadSelfAttention(embed_dim, num_heads)
        self.ffn = keras.Sequential(
            [layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim)]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training):
        # attention sub-layer with residual connection and layer normalization
        attn_output = self.att(inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        # feed-forward sub-layer with residual connection and layer normalization
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)
```

## Implement the embedding layer

Two separate embedding layers: one for the tokens and one for the token index (positions).


```python
class TokenAndPositionEmbedding(layers.Layer):
    def __init__(self, maxlen, vocab_size, embed_dim):
        super(TokenAndPositionEmbedding, self).__init__()
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        maxlen = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=maxlen, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions
```
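The two layers compose as embedding followed by Transformer block. The sketch below (my addition; random token IDs, sizes chosen only for illustration) confirms that every stage preserves the `(batch, maxlen, embed_dim)` shape:

```python
# Random integer "token" IDs run through the embedding and one Transformer block;
# both stages should keep the (batch, maxlen, embed_dim) shape.
demo_tokens = tf.random.uniform((2, 122), minval=0, maxval=1000, dtype=tf.int32)
demo_emb = TokenAndPositionEmbedding(maxlen=122, vocab_size=1000, embed_dim=32)
demo_block = TransformerBlock(embed_dim=32, num_heads=2, ff_dim=32)
x = demo_emb(demo_tokens)  # (2, 122, 32)
print(demo_block(x, training=False).shape)  # expected: (2, 122, 32)
```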
# Prepare the NSL-KDD dataset


## Reading the CSV files


```python
# c_names ---> column names
c_names = ["duration","protocol_type","service","flag","src_bytes",
           "dst_bytes","land","wrong_fragment","urgent","hot","num_failed_logins",
           "logged_in","num_compromised","root_shell","su_attempted","num_root",
           "num_file_creations","num_shells","num_access_files","num_outbound_cmds",
           "is_host_login","is_guest_login","count","srv_count","serror_rate",
           "srv_serror_rate","rerror_rate","srv_rerror_rate","same_srv_rate",
           "diff_srv_rate","srv_diff_host_rate","dst_host_count","dst_host_srv_count",
           "dst_host_same_srv_rate","dst_host_diff_srv_rate","dst_host_same_src_port_rate",
           "dst_host_srv_diff_host_rate","dst_host_serror_rate","dst_host_srv_serror_rate",
           "dst_host_rerror_rate","dst_host_srv_rerror_rate","labels","difficulty_degree"]

train = pd.read_csv("data/KDDTrain+.csv", names=c_names)  # train file
test = pd.read_csv("data/KDDTest+.csv", names=c_names)  # test file
```

## Deleting the unnecessary feature (difficulty_degree)


```python
del train["difficulty_degree"]
del test["difficulty_degree"]
```

## Converting object features to categories first and then to dummy tables (except "labels")


```python
for i in c_names:
    print(train[i].dtypes)
    if train[i].dtypes == object:
        train[i] = train[i].astype('category')
        test[i] = test[i].astype('category')
        if i == "labels":
            break
        train = pd.get_dummies(train, columns=[i])
        test = pd.get_dummies(test, columns=[i])
```

    int64
    object
    object
    object
    int64
    int64
    int64
    int64
    int64
    int64
    int64
    int64
    int64
    int64
    int64
    int64
    int64
    int64
    int64
    int64
    int64
    int64
    int64
    int64
    float64
    float64
    float64
    float64
    float64
    float64
    float64
    int64
    int64
    float64
    float64
    float64
    float64
    float64
    float64
    float64
    float64
    object


## Converting the labels feature to binary


```python
# TRAIN
attack_or_not = []
for i in train["labels"]:  # the "normal" label becomes 1 and every attack label becomes 0
    if i == "normal":
        attack_or_not.append(1)
    else:
        attack_or_not.append(0)
train["labels"] = attack_or_not
```


```python
# TEST
attack_or_not = []
for i in test["labels"]:  # the "normal" label becomes 1 and every attack label becomes 0
    if i == "normal":
        attack_or_not.append(1)
    else:
        attack_or_not.append(0)
test["labels"] = attack_or_not
```
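For reference, the two loops above have a vectorized pandas equivalent (a hedged alternative, not from the original notebook; it would replace the loops rather than run after them):

```python
# Equivalent binarization without an explicit loop: "normal" -> 1, any attack -> 0.
train["labels"] = (train["labels"] == "normal").astype(int)
test["labels"] = (test["labels"] == "normal").astype(int)
```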
## Synchronizing the test and train datasets
### Add a zero-filled column for every feature that exists in only one of the two datasets.


```python
f = list(train.columns)
e = list(test.columns)

for i in f:
    if i not in e:
        # column exists only in train: add it to test filled with zeros
        zero_data = pd.array(np.zeros(len(test["labels"])))
        print(len(zero_data))
        test[i] = zero_data
        print(i)
for i in e:
    if i not in f:
        # column exists only in test: add it to train filled with zeros
        zero_data = np.zeros(len(train["labels"]))
        train[i] = zero_data
        print(i)
```

    22543
    service_aol
    22543
    service_harvest
    22543
    service_http_2784
    22543
    service_http_8001
    22543
    service_red_i
    22543
    service_tftp_u
    22543
    service_urh_i


## Separating the features (data) and the label (target)


```python
y = train["labels"]  # separate the label and the data into two pieces: label = y, data = X
del train["labels"]
X = train
```


```python
y_test = test["labels"]  # separate the label and the data into two pieces: label = y_test, data = x_test
del test["labels"]
x_test = test
```

## Normalization and standardization


```python
X = preprocessing.scale(X)  # standardize each column to zero mean and unit variance
X = preprocessing.normalize(X)  # scale each row to unit norm
```


```python
x_test = preprocessing.scale(x_test)
x_test = preprocessing.normalize(x_test)
```

## Splitting the train data into train and validation parts


```python
x_train, x_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=0, stratify=y)
print(len(x_train), "Training sequences", x_train.shape)
print(len(x_val), "Validation sequences", x_val.shape)
print(len(x_test), "Test sequences", x_test.shape)
```

    100778 Training sequences (100778, 122)
    25195 Validation sequences (25195, 122)
    22543 Test sequences (22543, 122)


## Create the classifier model using the Transformer layer

The Transformer layer outputs one vector for each time step of the input sequence. Here, we take the mean across all time steps and use a feed-forward network on top of it to classify each record.


```python
maxlen = 122  # number of features after one-hot encoding
vocab_size = 100000  # size of the token vocabulary for the embedding layer

embed_dim = 32  # Embedding size for each token
num_heads = 2  # Number of attention heads
ff_dim = 32  # Hidden layer size in feed-forward network inside the Transformer

inputs = layers.Input(shape=(maxlen,))
embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)
x = embedding_layer(inputs)
transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
x = transformer_block(x)
x = layers.GlobalAveragePooling1D()(x)
x = layers.Dropout(0.1)(x)
x = layers.Dense(20, activation="relu")(x)
x = layers.Dropout(0.1)(x)
outputs = layers.Dense(2, activation="softmax")(x)

model = keras.Model(inputs=inputs, outputs=outputs)
```


```python
x_train = keras.preprocessing.sequence.pad_sequences(x_train, maxlen=maxlen)
x_val = keras.preprocessing.sequence.pad_sequences(x_val, maxlen=maxlen)
x_test = keras.preprocessing.sequence.pad_sequences(x_test, maxlen=maxlen)
```

## Train


```python
model.compile("adam", "sparse_categorical_crossentropy", metrics=["accuracy"])
history = model.fit(
    x_train, y_train, batch_size=32, epochs=2, validation_data=(x_val, y_val)
)
```

    Train on 100778 samples, validate on 25195 samples
    Epoch 1/2
    100778/100778 [==============================] - 240s 2ms/sample - loss: 0.6915 - accuracy: 0.5331 - val_loss: 0.6908 - val_accuracy: 0.5346
    Epoch 2/2
    100778/100778 [==============================] - 221s 2ms/sample - loss: 0.6908 - accuracy: 0.5345 - val_loss: 0.6910 - val_accuracy: 0.5346


## Evaluate


```python
score = model.evaluate(x_test, y_test, verbose=0)
print("Test loss:", score[0])
print("Test accuracy:", score[1])
```

    Test loss: 0.7010403732089466
    Test accuracy: 0.43073237


```python
score = model.evaluate(x_val, y_val, verbose=0)
print("Validation loss:", score[0])
print("Validation accuracy:", score[1])
```

    Validation loss: 0.690967743196618
    Validation accuracy: 0.5345902
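Note that the accuracies above track the share of normal records in each split (about 53% of the training/validation data and 43% of the test data), which suggests the model is effectively predicting a single class. A likely culprit is the `pad_sequences` step: it returns `int32` arrays by default, so the scaled and normalized features, which all lie in (-1, 1), are truncated to zeros before they reach the embedding layer. A small sketch with made-up values illustrates the effect:

```python
# pad_sequences casts to int32 by default, so scaled features in (-1, 1)
# all become 0 and carry no information into the embedding lookup.
import numpy as np
from tensorflow import keras

scaled_row = np.array([[0.12, -0.53, 0.98]])
print(keras.preprocessing.sequence.pad_sequences(scaled_row, maxlen=3))
# -> [[0 0 0]]; passing dtype="float32" would preserve the values instead
```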
"outputs": [], 34 | "source": [ 35 | "import tensorflow as tf\n", 36 | "from tensorflow import keras\n", 37 | "from tensorflow.keras import layers\n", 38 | "\n", 39 | "\n", 40 | "import numpy as np\n", 41 | "import pandas as pd\n", 42 | "\n", 43 | "from sklearn import preprocessing\n", 44 | "from sklearn.model_selection import train_test_split" 45 | ] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "metadata": { 50 | "colab_type": "text", 51 | "id": "RVj4msuOzaYz" 52 | }, 53 | "source": [ 54 | "## Implement multi head self attention as a Keras layer\n" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 2, 60 | "metadata": { 61 | "colab": {}, 62 | "colab_type": "code", 63 | "id": "p09RV47lzaY3" 64 | }, 65 | "outputs": [], 66 | "source": [ 67 | "\n", 68 | "class MultiHeadSelfAttention(layers.Layer):\n", 69 | " def __init__(self, embed_dim, num_heads=8):\n", 70 | " super(MultiHeadSelfAttention, self).__init__()\n", 71 | " self.embed_dim = embed_dim\n", 72 | " self.num_heads = num_heads\n", 73 | " if embed_dim % num_heads != 0:\n", 74 | " raise ValueError(\n", 75 | " f\"embedding dimension = {embed_dim} should be divisible by number of heads = {num_heads}\"\n", 76 | " )\n", 77 | " self.projection_dim = embed_dim // num_heads\n", 78 | " self.query_dense = layers.Dense(embed_dim)\n", 79 | " self.key_dense = layers.Dense(embed_dim)\n", 80 | " self.value_dense = layers.Dense(embed_dim)\n", 81 | " self.combine_heads = layers.Dense(embed_dim)\n", 82 | "\n", 83 | " def attention(self, query, key, value):\n", 84 | " score = tf.matmul(query, key, transpose_b=True)\n", 85 | " dim_key = tf.cast(tf.shape(key)[-1], tf.float32)\n", 86 | " scaled_score = score / tf.math.sqrt(dim_key)\n", 87 | " weights = tf.nn.softmax(scaled_score, axis=-1)\n", 88 | " output = tf.matmul(weights, value)\n", 89 | " return output, weights\n", 90 | "\n", 91 | " def separate_heads(self, x, batch_size):\n", 92 | " x = tf.reshape(x, (batch_size, -1, self.num_heads, self.projection_dim))\n", 93 | " return tf.transpose(x, perm=[0, 2, 1, 3])\n", 94 | "\n", 95 | " def call(self, inputs):\n", 96 | " # x.shape = [batch_size, seq_len, embedding_dim]\n", 97 | " batch_size = tf.shape(inputs)[0]\n", 98 | " query = self.query_dense(inputs) # (batch_size, seq_len, embed_dim)\n", 99 | " key = self.key_dense(inputs) # (batch_size, seq_len, embed_dim)\n", 100 | " value = self.value_dense(inputs) # (batch_size, seq_len, embed_dim)\n", 101 | " query = self.separate_heads(\n", 102 | " query, batch_size\n", 103 | " ) # (batch_size, num_heads, seq_len, projection_dim)\n", 104 | " key = self.separate_heads(\n", 105 | " key, batch_size\n", 106 | " ) # (batch_size, num_heads, seq_len, projection_dim)\n", 107 | " value = self.separate_heads(\n", 108 | " value, batch_size\n", 109 | " ) # (batch_size, num_heads, seq_len, projection_dim)\n", 110 | " attention, weights = self.attention(query, key, value)\n", 111 | " attention = tf.transpose(\n", 112 | " attention, perm=[0, 2, 1, 3]\n", 113 | " ) # (batch_size, seq_len, num_heads, projection_dim)\n", 114 | " concat_attention = tf.reshape(\n", 115 | " attention, (batch_size, -1, self.embed_dim)\n", 116 | " ) # (batch_size, seq_len, embed_dim)\n", 117 | " output = self.combine_heads(\n", 118 | " concat_attention\n", 119 | " ) # (batch_size, seq_len, embed_dim)\n", 120 | " return output\n", 121 | "\n" 122 | ] 123 | }, 124 | { 125 | "cell_type": "markdown", 126 | "metadata": { 127 | "colab_type": "text", 128 | "id": "GANkWrJ7zaY6" 129 | }, 130 | "source": [ 131 | "## Implement a Transformer 
block as a layer\n" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": 3, 137 | "metadata": { 138 | "colab": {}, 139 | "colab_type": "code", 140 | "id": "Cj879AGizaY7" 141 | }, 142 | "outputs": [], 143 | "source": [ 144 | "\n", 145 | "class TransformerBlock(layers.Layer):\n", 146 | " def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):\n", 147 | " super(TransformerBlock, self).__init__()\n", 148 | " self.att = MultiHeadSelfAttention(embed_dim, num_heads)\n", 149 | " self.ffn = keras.Sequential(\n", 150 | " [layers.Dense(ff_dim, activation=\"relu\"), layers.Dense(embed_dim),]\n", 151 | " )\n", 152 | " self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)\n", 153 | " self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)\n", 154 | " self.dropout1 = layers.Dropout(rate)\n", 155 | " self.dropout2 = layers.Dropout(rate)\n", 156 | "\n", 157 | " def call(self, inputs, training):\n", 158 | " attn_output = self.att(inputs)\n", 159 | " attn_output = self.dropout1(attn_output, training=training)\n", 160 | " out1 = self.layernorm1(inputs + attn_output)\n", 161 | " ffn_output = self.ffn(out1)\n", 162 | " ffn_output = self.dropout2(ffn_output, training=training)\n", 163 | " return self.layernorm2(out1 + ffn_output)\n", 164 | "\n" 165 | ] 166 | }, 167 | { 168 | "cell_type": "markdown", 169 | "metadata": { 170 | "colab_type": "text", 171 | "id": "fZ5U2V7RzaZC" 172 | }, 173 | "source": [ 174 | "## Implement embedding layer\n", 175 | "\n", 176 | "Two seperate embedding layers, one for tokens, one for token index (positions).\n" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": 4, 182 | "metadata": { 183 | "colab": {}, 184 | "colab_type": "code", 185 | "id": "SADnKmhRzaZE" 186 | }, 187 | "outputs": [], 188 | "source": [ 189 | "class TokenAndPositionEmbedding(layers.Layer):\n", 190 | " def __init__(self, maxlen, vocab_size, embed_dim):\n", 191 | " super(TokenAndPositionEmbedding, self).__init__()\n", 192 | " self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)\n", 193 | " self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)\n", 194 | "\n", 195 | " def call(self, x):\n", 196 | " maxlen = tf.shape(x)[-1]\n", 197 | " positions = tf.range(start=0, limit=maxlen, delta=1)\n", 198 | " positions = self.pos_emb(positions)\n", 199 | " x = self.token_emb(x)\n", 200 | " return x + positions\n", 201 | "\n" 202 | ] 203 | }, 204 | { 205 | "cell_type": "markdown", 206 | "metadata": { 207 | "colab_type": "text", 208 | "id": "7yVOPPwSzaZO" 209 | }, 210 | "source": [ 211 | "# prepare NSL KDD dataset\n" 212 | ] 213 | }, 214 | { 215 | "cell_type": "markdown", 216 | "metadata": {}, 217 | "source": [ 218 | "## reading CSV files" 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": 5, 224 | "metadata": {}, 225 | "outputs": [], 226 | "source": [ 227 | "# c_names ---> column names\n", 228 | "c_names = [\"duration\",\"protocol_type\",\"service\",\"flag\",\"src_bytes\",\n", 229 | " \"dst_bytes\",\"land\",\"wrong_fragment\",\"urgent\",\"hot\",\"num_failed_logins\",\n", 230 | " \"logged_in\",\"num_compromised\",\"root_shell\",\"su_attempted\",\"num_root\",\n", 231 | " \"num_file_creations\",\"num_shells\",\"num_access_files\",\"num_outbound_cmds\",\n", 232 | " \"is_host_login\",\"is_guest_login\",\"count\",\"srv_count\",\"serror_rate\",\n", 233 | " \"srv_serror_rate\",\"rerror_rate\",\"srv_rerror_rate\",\"same_srv_rate\",\n", 234 | " 
\"diff_srv_rate\",\"srv_diff_host_rate\",\"dst_host_count\",\"dst_host_srv_count\",\n", 235 | " \"dst_host_same_srv_rate\",\"dst_host_diff_srv_rate\",\"dst_host_same_src_port_rate\",\n", 236 | " \"dst_host_srv_diff_host_rate\",\"dst_host_serror_rate\",\"dst_host_srv_serror_rate\",\n", 237 | " \"dst_host_rerror_rate\",\"dst_host_srv_rerror_rate\",\"labels\",\"difficulty_degree\"]\n", 238 | "\n", 239 | "train = pd.read_csv( \"data/KDDTrain+.csv\", names=c_names) # train file\n", 240 | "test = pd.read_csv(\"data/KDDTest+.csv\", names=c_names) # test file" 241 | ] 242 | }, 243 | { 244 | "cell_type": "markdown", 245 | "metadata": {}, 246 | "source": [ 247 | "## deletion of unnecessary feature (difficulty_degree)" 248 | ] 249 | }, 250 | { 251 | "cell_type": "code", 252 | "execution_count": 6, 253 | "metadata": {}, 254 | "outputs": [], 255 | "source": [ 256 | "del train[\"difficulty_degree\"] \n", 257 | "del test[\"difficulty_degree\"] " 258 | ] 259 | }, 260 | { 261 | "cell_type": "markdown", 262 | "metadata": {}, 263 | "source": [ 264 | "## Converting object features to categories first and then to dummy tables (except \"labels\")" 265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": 7, 270 | "metadata": {}, 271 | "outputs": [ 272 | { 273 | "name": "stdout", 274 | "output_type": "stream", 275 | "text": [ 276 | "int64\n", 277 | "object\n", 278 | "object\n", 279 | "object\n", 280 | "int64\n", 281 | "int64\n", 282 | "int64\n", 283 | "int64\n", 284 | "int64\n", 285 | "int64\n", 286 | "int64\n", 287 | "int64\n", 288 | "int64\n", 289 | "int64\n", 290 | "int64\n", 291 | "int64\n", 292 | "int64\n", 293 | "int64\n", 294 | "int64\n", 295 | "int64\n", 296 | "int64\n", 297 | "int64\n", 298 | "int64\n", 299 | "int64\n", 300 | "float64\n", 301 | "float64\n", 302 | "float64\n", 303 | "float64\n", 304 | "float64\n", 305 | "float64\n", 306 | "float64\n", 307 | "int64\n", 308 | "int64\n", 309 | "float64\n", 310 | "float64\n", 311 | "float64\n", 312 | "float64\n", 313 | "float64\n", 314 | "float64\n", 315 | "float64\n", 316 | "float64\n", 317 | "object\n" 318 | ] 319 | } 320 | ], 321 | "source": [ 322 | "for i in c_names:\n", 323 | " print((train[i].dtypes))\n", 324 | " if train[i].dtypes==object:\n", 325 | " train[i] = train[i].astype('category')\n", 326 | " test[i] = test[i].astype('category')\n", 327 | " if i==\"labels\":\n", 328 | " break\n", 329 | " train=pd.get_dummies(train, columns=[i])\n", 330 | " test=pd.get_dummies(test, columns=[i]) " 331 | ] 332 | }, 333 | { 334 | "cell_type": "markdown", 335 | "metadata": {}, 336 | "source": [ 337 | "## labels feature converts to binary" 338 | ] 339 | }, 340 | { 341 | "cell_type": "code", 342 | "execution_count": 8, 343 | "metadata": {}, 344 | "outputs": [], 345 | "source": [ 346 | "# TRAIN\n", 347 | "attack_or_not=[]\n", 348 | "for i in train[\"labels\"]:#it changes the normal label to \"1\" and the attack tag to \"0\" for use in the machine learning algorithm\n", 349 | " if i ==\"normal\":\n", 350 | " attack_or_not.append(1)\n", 351 | " else:\n", 352 | " attack_or_not.append(0) \n", 353 | "train[\"labels\"]=attack_or_not" 354 | ] 355 | }, 356 | { 357 | "cell_type": "code", 358 | "execution_count": 9, 359 | "metadata": {}, 360 | "outputs": [], 361 | "source": [ 362 | "# TEST\n", 363 | "attack_or_not=[]\n", 364 | "for i in test[\"labels\"]:#it changes the normal label to \"1\" and the attack tag to \"0\" for use in the machine learning algorithm\n", 365 | " if i ==\"normal\":\n", 366 | " attack_or_not.append(1)\n", 367 | " else:\n", 368 | " 
attack_or_not.append(0) \n", 369 | "test[\"labels\"]=attack_or_not" 370 | ] 371 | }, 372 | { 373 | "cell_type": "markdown", 374 | "metadata": {}, 375 | "source": [ 376 | "## Synchronizing Test and Train datasets.\n", 377 | "### Add \"0\" for the feature that does not exist in one of these two datasets." 378 | ] 379 | }, 380 | { 381 | "cell_type": "code", 382 | "execution_count": 10, 383 | "metadata": {}, 384 | "outputs": [ 385 | { 386 | "name": "stdout", 387 | "output_type": "stream", 388 | "text": [ 389 | "22543\n", 390 | "service_aol\n", 391 | "22543\n", 392 | "service_harvest\n", 393 | "22543\n", 394 | "service_http_2784\n", 395 | "22543\n", 396 | "service_http_8001\n", 397 | "22543\n", 398 | "service_red_i\n", 399 | "22543\n", 400 | "service_tftp_u\n", 401 | "22543\n", 402 | "service_urh_i\n" 403 | ] 404 | } 405 | ], 406 | "source": [ 407 | "f=list(train.columns)\n", 408 | "e=list(test.columns)\n", 409 | "\n", 410 | "for i in f:\n", 411 | " if i not in e:\n", 412 | " zero_data =pd.array(np.zeros(len(test[\"labels\"]))) \n", 413 | " print(len(zero_data))\n", 414 | " test[i] = zero_data\n", 415 | " print(i)\n", 416 | "for i in e:\n", 417 | " if i not in f:\n", 418 | " zero_data = np.zeros(len(train[\"labels\"]))\n", 419 | " train[i] = zero_data\n", 420 | " print(i)\n" 421 | ] 422 | }, 423 | { 424 | "cell_type": "markdown", 425 | "metadata": {}, 426 | "source": [ 427 | "## separation of features (data) and Label (target)" 428 | ] 429 | }, 430 | { 431 | "cell_type": "code", 432 | "execution_count": 11, 433 | "metadata": {}, 434 | "outputs": [], 435 | "source": [ 436 | "y = train[\"labels\"] #this section separates the label and the data into two separate pieces, as Label=y Data=X \n", 437 | "del train[\"labels\"] \n", 438 | "X = train" 439 | ] 440 | }, 441 | { 442 | "cell_type": "code", 443 | "execution_count": 12, 444 | "metadata": {}, 445 | "outputs": [], 446 | "source": [ 447 | "y_test = test[\"labels\"] #this section separates the label and the data into two separate pieces, as Label=y Data=X \n", 448 | "del test[\"labels\"] \n", 449 | "x_test=test" 450 | ] 451 | }, 452 | { 453 | "cell_type": "markdown", 454 | "metadata": {}, 455 | "source": [ 456 | "## Normalization and Standardization" 457 | ] 458 | }, 459 | { 460 | "cell_type": "code", 461 | "execution_count": 13, 462 | "metadata": {}, 463 | "outputs": [], 464 | "source": [ 465 | "X = preprocessing.scale(X)\n", 466 | "X = preprocessing.normalize(X)" 467 | ] 468 | }, 469 | { 470 | "cell_type": "code", 471 | "execution_count": 14, 472 | "metadata": {}, 473 | "outputs": [], 474 | "source": [ 475 | "x_test = preprocessing.scale(x_test)\n", 476 | "x_test = preprocessing.normalize(x_test)" 477 | ] 478 | }, 479 | { 480 | "cell_type": "markdown", 481 | "metadata": {}, 482 | "source": [ 483 | "## Separating Train data into two parts as train and validation" 484 | ] 485 | }, 486 | { 487 | "cell_type": "code", 488 | "execution_count": 15, 489 | "metadata": {}, 490 | "outputs": [ 491 | { 492 | "name": "stdout", 493 | "output_type": "stream", 494 | "text": [ 495 | "100778 Training sequences (100778, 122)\n", 496 | "25195 Validation sequences (25195, 122)\n", 497 | "22543 Test sequences (22543, 122)\n" 498 | ] 499 | } 500 | ], 501 | "source": [ 502 | "x_train, x_val, y_train, y_val = train_test_split(X, y, test_size = 0.2, random_state = 0, stratify=y)\n", 503 | "print(len(x_train), \"Training sequences\",x_train.shape)\n", 504 | "print(len(x_val), \"Validation sequences\",x_val.shape)\n", 505 | "print(len(x_test), \"Test 
sequences\",x_test.shape)" 506 | ] 507 | }, 508 | { 509 | "cell_type": "markdown", 510 | "metadata": { 511 | "colab_type": "text", 512 | "id": "ZGkJpYWEzaZU" 513 | }, 514 | "source": [ 515 | "## Create classifier model using transformer layer\n", 516 | "\n", 517 | "Transformer layer outputs one vector for each time step of our input sequence.\n", 518 | "Here, we take the mean across all time steps and\n", 519 | "use a feed forward network on top of it to classify text.\n" 520 | ] 521 | }, 522 | { 523 | "cell_type": "code", 524 | "execution_count": 16, 525 | "metadata": { 526 | "colab": {}, 527 | "colab_type": "code", 528 | "id": "bum8sPyKzaZV" 529 | }, 530 | "outputs": [], 531 | "source": [ 532 | "maxlen=122\n", 533 | "vocab_size = 100000 # Only consider the top 20k words\n", 534 | "\n", 535 | "\n", 536 | "\n", 537 | "embed_dim = 32 # Embedding size for each token\n", 538 | "num_heads = 2 # Number of attention heads\n", 539 | "ff_dim = 32 # Hidden layer size in feed forward network inside transformer\n", 540 | "\n", 541 | "inputs = layers.Input(shape=(maxlen,))\n", 542 | "embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)\n", 543 | "x = embedding_layer(inputs)\n", 544 | "transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)\n", 545 | "x = transformer_block(x)\n", 546 | "x = layers.GlobalAveragePooling1D()(x)\n", 547 | "x = layers.Dropout(0.1)(x)\n", 548 | "x = layers.Dense(20, activation=\"relu\")(x)\n", 549 | "x = layers.Dropout(0.1)(x)\n", 550 | "outputs = layers.Dense(2, activation=\"softmax\")(x)\n", 551 | "\n", 552 | "model = keras.Model(inputs=inputs, outputs=outputs)\n", 553 | "\n" 554 | ] 555 | }, 556 | { 557 | "cell_type": "code", 558 | "execution_count": 18, 559 | "metadata": {}, 560 | "outputs": [], 561 | "source": [ 562 | "x_train = keras.preprocessing.sequence.pad_sequences(x_train, maxlen=maxlen)\n", 563 | "x_val = keras.preprocessing.sequence.pad_sequences(x_val, maxlen=maxlen)\n", 564 | "x_test = keras.preprocessing.sequence.pad_sequences(x_test , maxlen=maxlen)\n" 565 | ] 566 | }, 567 | { 568 | "cell_type": "markdown", 569 | "metadata": {}, 570 | "source": [ 571 | "## Train" 572 | ] 573 | }, 574 | { 575 | "cell_type": "code", 576 | "execution_count": 19, 577 | "metadata": { 578 | "colab": { 579 | "base_uri": "https://localhost:8080/", 580 | "height": 84 581 | }, 582 | "colab_type": "code", 583 | "id": "98T8ht1ezaZk", 584 | "outputId": "030b4f93-ccf8-496a-f416-6848021638e1" 585 | }, 586 | "outputs": [ 587 | { 588 | "name": "stdout", 589 | "output_type": "stream", 590 | "text": [ 591 | "Train on 100778 samples, validate on 25195 samples\n", 592 | "Epoch 1/2\n", 593 | "100778/100778 [==============================] - 240s 2ms/sample - loss: 0.6915 - accuracy: 0.5331 - val_loss: 0.6908 - val_accuracy: 0.5346\n", 594 | "Epoch 2/2\n", 595 | "100778/100778 [==============================] - 221s 2ms/sample - loss: 0.6908 - accuracy: 0.5345 - val_loss: 0.6910 - val_accuracy: 0.5346\n" 596 | ] 597 | } 598 | ], 599 | "source": [ 600 | "model.compile(\"adam\", \"sparse_categorical_crossentropy\", metrics=[\"accuracy\"])\n", 601 | "history = model.fit(\n", 602 | " x_train, y_train, batch_size=32, epochs=2, validation_data=(x_val, y_val)\n", 603 | ")\n" 604 | ] 605 | }, 606 | { 607 | "cell_type": "markdown", 608 | "metadata": {}, 609 | "source": [ 610 | "## Evaluate" 611 | ] 612 | }, 613 | { 614 | "cell_type": "code", 615 | "execution_count": 20, 616 | "metadata": {}, 617 | "outputs": [ 618 | { 619 | "name": "stdout", 620 | "output_type": "stream", 
621 | "text": [ 622 | "Test loss: 0.7010403732089466\n", 623 | "Test accuracy: 0.43073237\n" 624 | ] 625 | } 626 | ], 627 | "source": [ 628 | "score = model.evaluate(x_test, y_test, verbose=0)\n", 629 | "print(\"Test loss:\", score[0])\n", 630 | "print(\"Test accuracy:\", score[1])" 631 | ] 632 | }, 633 | { 634 | "cell_type": "code", 635 | "execution_count": 21, 636 | "metadata": { 637 | "colab": { 638 | "base_uri": "https://localhost:8080/", 639 | "height": 50 640 | }, 641 | "colab_type": "code", 642 | "id": "Zm1_60H5z0fp", 643 | "outputId": "09bc4104-5d63-4826-c0a6-535cc9f271f5" 644 | }, 645 | "outputs": [ 646 | { 647 | "name": "stdout", 648 | "output_type": "stream", 649 | "text": [ 650 | "Test loss: 0.690967743196618\n", 651 | "Test accuracy: 0.5345902\n" 652 | ] 653 | } 654 | ], 655 | "source": [ 656 | "score = model.evaluate(x_val, y_val, verbose=0)\n", 657 | "print(\"Test loss:\", score[0])\n", 658 | "print(\"Test accuracy:\", score[1])" 659 | ] 660 | } 661 | ], 662 | "metadata": { 663 | "colab": { 664 | "collapsed_sections": [], 665 | "name": "text_classification_with_transformer", 666 | "provenance": [], 667 | "toc_visible": true 668 | }, 669 | "kernelspec": { 670 | "display_name": "Python 3", 671 | "language": "python", 672 | "name": "python3" 673 | }, 674 | "language_info": { 675 | "codemirror_mode": { 676 | "name": "ipython", 677 | "version": 3 678 | }, 679 | "file_extension": ".py", 680 | "mimetype": "text/x-python", 681 | "name": "python", 682 | "nbconvert_exporter": "python", 683 | "pygments_lexer": "ipython3", 684 | "version": "3.6.5" 685 | } 686 | }, 687 | "nbformat": 4, 688 | "nbformat_minor": 1 689 | } 690 | --------------------------------------------------------------------------------