├── README.md ├── SVM with multiclasses.ipynb └── SVM with multiclasses.py /README.md: -------------------------------------------------------------------------------- 1 | # Building-Intrusion-Detection-System 2 | Building Intrusion Detection System for Network Security Using the KD99 Data Set 3 | This code was written for the using jupyter notebook. The code is to build two different IDS mdels. 4 | The first one is for an IDS that is able to distinguish between different major attacks. 5 | The second IDS is able to distinguish between normal connection and attacks. The data set i used in this code is the KDD99 data set which is available for public use at the UCI Machine Learning Repository. 6 | Three algorithms i used using the Sklearn library; SVM, Decision Tree, and Naive Bayes. 7 | -------------------------------------------------------------------------------- /SVM with multiclasses.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "## Preliminaries\n", 12 | "import numpy as np \n", 13 | "import matplotlib.pyplot as plt\n", 14 | "from sklearn import datasets, preprocessing,cross_validation, feature_extraction\n", 15 | "from sklearn import linear_model, svm, metrics, ensemble, tree, ensemble\n", 16 | "from sklearn.decomposition import PCA\n", 17 | "import pandas as pd\n", 18 | "import urllib\n", 19 | "import csv\n", 20 | "\n", 21 | "# Helper functions\n", 22 | "def folds_to_split(data,targets,train,test):\n", 23 | " data_tr = pd.DataFrame(data).iloc[train]\n", 24 | " data_te = pd.DataFrame(data).iloc[test]\n", 25 | " labels_tr = pd.DataFrame(targets).iloc[train]\n", 26 | " labels_te = pd.DataFrame(targets).iloc[test]\n", 27 | " return [data_tr, data_te, labels_tr, labels_te]\n" 28 | ] 29 | }, 30 | { 31 | "cell_type": "markdown", 32 | "metadata": {}, 33 | "source": [ 34 | "\n" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "metadata": {}, 40 | "source": [ 41 | "## Using SVM To Build The Model 5 class labeling:" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 22, 47 | "metadata": { 48 | "collapsed": false 49 | }, 50 | "outputs": [], 51 | "source": [ 52 | "#let's load the data\n", 53 | "train_data = urllib.urlopen('/home/aziz/Downloads/kddcup.data_10_percent_corrected')\n", 54 | "test_data = urllib.urlopen('/home/aziz/Downloads/corrected')\n", 55 | "\n", 56 | "#Place both dataset into a dataframe\n", 57 | "train_multiclass = pd.read_csv(train_data, quotechar=',', skipinitialspace=True, names=['Duration', 'protocol_type', 'Service', 'Flag', 'src_bytes', 'dst_bytes', 'Land', 'wrong_fragment', 'Urgent', 'Hot', 'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell', 'su_attempted', 'num_root', 'num_file_creations', 'num_shells', 'num_access_files', 'num_outbound_cmds', 'is_host_login', 'is_guest_login', 'Count', 'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate', 'dst_host_srv_rerror_rate','Class'])\n", 58 | "test_multiclass = pd.read_csv(test_data, quotechar=',', skipinitialspace=True, names=['Duration', 'protocol_type', 'Service', 'Flag', 'src_bytes', 'dst_bytes', 'Land', 'wrong_fragment', 'Urgent', 'Hot', 'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell', 'su_attempted', 'num_root', 'num_file_creations', 'num_shells', 'num_access_files', 'num_outbound_cmds', 'is_host_login', 'is_guest_login', 'Count', 'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate', 'dst_host_srv_rerror_rate','Class'])" 59 | ] 60 | }, 61 | { 62 | "cell_type": "markdown", 63 | "metadata": {}, 64 | "source": [ 65 | "\n" 66 | ] 67 | }, 68 | { 69 | "cell_type": "markdown", 70 | "metadata": {}, 71 | "source": [ 72 | "## 1 Pre-Processing The Datasets:" 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "metadata": {}, 78 | "source": [ 79 | "### 1.1 Change Labels to The Right Class: " 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": 23, 85 | "metadata": { 86 | "collapsed": true 87 | }, 88 | "outputs": [], 89 | "source": [ 90 | "## Replacing all the different attack types(24) to their proper general attack class\n", 91 | "train_multiclass.loc[(train_multiclass['Class'] =='smurf.')|(train_multiclass['Class'] =='neptune.') | (train_multiclass['Class'] =='back.') | (train_multiclass['Class'] =='teardrop.') |(train_multiclass['Class'] =='pod.')| (train_multiclass['Class']=='land.'),'Class'] = 'Dos'\n", 92 | "train_multiclass.loc[(train_multiclass['Class'] =='satan.')|(train_multiclass['Class'] =='ipsweep.') | (train_multiclass['Class'] =='portsweep.') | (train_multiclass['Class'] =='nmap.'),'Class'] = 'probe'\n", 93 | "train_multiclass.loc[(train_multiclass['Class'] =='spy.')|(train_multiclass['Class'] =='phf.')|(train_multiclass['Class'] =='multihop.')|(train_multiclass['Class'] =='ftp_write.') | (train_multiclass['Class'] =='imap.') | (train_multiclass['Class'] =='warezmaster.') |(train_multiclass['Class'] =='guess_passwd.')| (train_multiclass['Class']=='warezclient.'),'Class'] = 'r2l'\n", 94 | "train_multiclass.loc[(train_multiclass['Class'] =='buffer_overflow.')|(train_multiclass['Class'] =='rootkit.') | (train_multiclass['Class'] =='loadmodule.') | (train_multiclass['Class'] =='perl.'),'Class']='u2r'\n", 95 | "train_multiclass.loc[(train_multiclass['Class'] =='normal.'),'Class'] = 'normal'" 96 | ] 97 | }, 98 | { 99 | "cell_type": "markdown", 100 | "metadata": {}, 101 | "source": [ 102 | "\n" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": 24, 108 | "metadata": { 109 | "collapsed": false 110 | }, 111 | "outputs": [], 112 | "source": [ 113 | "## Replacing all the different attack types(36) to their proper general attack class\n", 114 | "\n", 115 | "\n", 116 | "test_multiclass.loc[(test_multiclass['Class'] =='smurf.')|(test_multiclass['Class'] =='neptune.') | \n", 117 | " (test_multiclass['Class'] =='back.') | (test_multiclass['Class'] =='teardrop.') |\n", 118 | " (test_multiclass['Class'] =='pod.')| (test_multiclass['Class']=='land.')|\n", 119 | " (test_multiclass['Class']=='apache2.')|(test_multiclass['Class']=='udpstorm.')|\n", 120 | " (test_multiclass['Class']=='processtable.')|(test_multiclass['Class']=='mailbomb.'),'Class'] = 'Dos'\n", 121 | "\n", 122 | "\n", 123 | "test_multiclass.loc[(test_multiclass['Class'] =='guess_passwd.')|(test_multiclass['Class'] =='ftp_write.')|\n", 124 | " (test_multiclass['Class'] =='imap.')|(test_multiclass['Class'] =='phf.') | \n", 125 | " (test_multiclass['Class'] =='multihop.') | \n", 126 | " (test_multiclass['Class'] =='warezmaster.') |(test_multiclass['Class'] =='snmpgetattack.')| \n", 127 | " (test_multiclass['Class']=='named.')|(test_multiclass['Class'] =='xlock.')|\n", 128 | " (test_multiclass['Class'] =='xsnoop.')|(test_multiclass['Class'] =='sendmail.')|\n", 129 | " (test_multiclass['Class'] =='httptunnel.')|(test_multiclass['Class'] =='worm.')|\n", 130 | " (test_multiclass['Class'] =='snmpguess.'),'Class'] = 'r2l'\n", 131 | "\n", 132 | "test_multiclass.loc[(test_multiclass['Class'] =='satan.')|(test_multiclass['Class'] =='ipsweep.') | (test_multiclass['Class'] =='portsweep.') | (test_multiclass['Class'] =='nmap.')|\n", 133 | " (test_multiclass['Class'] =='saint.')|(test_multiclass['Class'] =='mscan.'),'Class'] = 'probe'\n", 134 | "\n", 135 | "test_multiclass.loc[(test_multiclass['Class'] =='buffer_overflow.')|(test_multiclass['Class'] =='rootkit.') | \n", 136 | " (test_multiclass['Class'] =='loadmodule.') | (test_multiclass['Class'] =='xterm.')|\n", 137 | " (test_multiclass['Class'] =='sqlattack.')|(test_multiclass['Class'] =='ps.')|\n", 138 | " (test_multiclass['Class'] =='perl.'),'Class']='u2r'\n", 139 | "\n", 140 | "test_multiclass.loc[(test_multiclass['Class'] =='normal.'),'Class'] = 'normal'" 141 | ] 142 | }, 143 | { 144 | "cell_type": "markdown", 145 | "metadata": {}, 146 | "source": [ 147 | "\n" 148 | ] 149 | }, 150 | { 151 | "cell_type": "markdown", 152 | "metadata": {}, 153 | "source": [ 154 | "### 2.2 Encoding The Dataset:" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": 25, 160 | "metadata": { 161 | "collapsed": false 162 | }, 163 | "outputs": [ 164 | { 165 | "name": "stdout", 166 | "output_type": "stream", 167 | "text": [ 168 | "(494021, 118)\n", 169 | "(311029, 118)\n" 170 | ] 171 | } 172 | ], 173 | "source": [ 174 | "# Decoding The Dataset: \n", 175 | "attr_encoder = feature_extraction.DictVectorizer(sparse=False)\n", 176 | "label_encoder = preprocessing.LabelEncoder()\n", 177 | "\n", 178 | "train_data_df_m = attr_encoder.fit_transform(train_multiclass.iloc[:,:-1].T.to_dict().values())\n", 179 | "train_target_df_m= label_encoder.fit_transform(train_multiclass.iloc[:,-1])\n", 180 | "\n", 181 | "\n", 182 | "train_data_decoded_m = pd.DataFrame(train_data_df_m)\n", 183 | "train_target_decoded_m = pd.DataFrame(train_target_df_m)\n", 184 | "\n", 185 | "test_data_df_m = attr_encoder.transform(test_multiclass.iloc[:,:-1].T.to_dict().values())\n", 186 | "test_target_df_m = label_encoder.transform(test_multiclass.iloc[:,-1])\n", 187 | "\n", 188 | "test_data_decoded_m = pd.DataFrame(test_data_df_m)\n", 189 | "test_target_decoded_m = pd.DataFrame(test_target_df_m)\n", 190 | "\n", 191 | "\n", 192 | "print train_data_decoded_m.shape\n", 193 | "print test_data_decoded_m.shape" 194 | ] 195 | }, 196 | { 197 | "cell_type": "markdown", 198 | "metadata": {}, 199 | "source": [ 200 | "### 1.3 Perfroming Feature Reduction using PCA" 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": 29, 206 | "metadata": { 207 | "collapsed": false 208 | }, 209 | "outputs": [ 210 | { 211 | "name": "stdout", 212 | "output_type": "stream", 213 | "text": [ 214 | "(494021, 29)\n", 215 | "(311029, 29)\n" 216 | ] 217 | } 218 | ], 219 | "source": [ 220 | "#load some modules to help\n", 221 | "from mpl_toolkits.mplot3d import Axes3D\n", 222 | "from sklearn.decomposition import PCA\n", 223 | "\n", 224 | "\n", 225 | "train_data_pca2 = PCA(n_components=29).fit_transform(train_data_decoded_m)\n", 226 | "test_data_pca2 = PCA(n_components=29).fit_transform(test_data_decoded_m)\n", 227 | "\n", 228 | "train_data_pca_df2 = pd.DataFrame(train_data_pca2)\n", 229 | "test_data_pca_df2 = pd.DataFrame(test_data_pca2)\n", 230 | "\n", 231 | "print train_data_pca_df2.shape\n", 232 | "print test_data_pca_df2.shape" 233 | ] 234 | }, 235 | { 236 | "cell_type": "markdown", 237 | "metadata": {}, 238 | "source": [ 239 | "### 1.4 Normalizing the Data Sets" 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": 30, 245 | "metadata": { 246 | "collapsed": true 247 | }, 248 | "outputs": [], 249 | "source": [ 250 | "#Creating our scaler and applyting it to our dataset after feature reduction\n", 251 | "standard_scaler = preprocessing.StandardScaler()\n", 252 | "train_ratio_standard_scaled_values2 = standard_scaler.fit_transform(train_data_pca_df2.values)\n", 253 | "train_data_scaled2=pd.DataFrame(train_ratio_standard_scaled_values2)\n", 254 | "\n", 255 | "test_ratio_standard_scaled_values2 = standard_scaler.fit_transform(test_data_pca_df2.values)\n", 256 | "test_data_scaled2=pd.DataFrame(test_ratio_standard_scaled_values2)" 257 | ] 258 | }, 259 | { 260 | "cell_type": "markdown", 261 | "metadata": {}, 262 | "source": [ 263 | "\n" 264 | ] 265 | }, 266 | { 267 | "cell_type": "markdown", 268 | "metadata": {}, 269 | "source": [ 270 | "## 2 Classification:" 271 | ] 272 | }, 273 | { 274 | "cell_type": "markdown", 275 | "metadata": {}, 276 | "source": [ 277 | "### 2.1 Using SVM Algorithm" 278 | ] 279 | }, 280 | { 281 | "cell_type": "code", 282 | "execution_count": 22, 283 | "metadata": { 284 | "collapsed": false 285 | }, 286 | "outputs": [ 287 | { 288 | "name": "stdout", 289 | "output_type": "stream", 290 | "text": [ 291 | "0.88830623511\n", 292 | " precision recall f1-score support\n", 293 | "\n", 294 | " 0 0.95 0.98 0.96 229853\n", 295 | " 1 0.80 0.85 0.82 60593\n", 296 | " 2 0.00 0.00 0.00 4166\n", 297 | " 3 0.01 0.00 0.00 16347\n", 298 | " 4 0.00 0.00 0.00 70\n", 299 | "\n", 300 | "avg / total 0.86 0.89 0.87 311029\n", 301 | "\n", 302 | "Number of support vectors for each class [ 831 5029 89 183 22]\n", 303 | "[[ 3.06168543e-03 -2.62821770e-02 2.92121218e-02 ..., 5.25927608e+00\n", 304 | " -1.17429443e+00 -6.21666293e+00]\n", 305 | " [ 3.06168545e-03 -2.62823594e-02 3.05702387e-02 ..., 4.37957870e+00\n", 306 | " -3.62719957e+00 -6.25108464e+00]\n", 307 | " [ 2.01738062e-03 -2.62830927e-02 3.67396697e-02 ..., -6.92651351e+00\n", 308 | " -4.77964921e+00 -4.82940529e+00]\n", 309 | " ..., \n", 310 | " [ 3.05763774e-03 -2.61610300e-02 2.85855914e-02 ..., 6.05223733e+00\n", 311 | " -4.15001874e+00 -4.24851284e+00]\n", 312 | " [ 1.53772805e-03 3.08304779e-02 -4.14853393e-01 ..., 1.90579151e+01\n", 313 | " -1.03848798e+01 -4.81560377e+00]\n", 314 | " [ 6.97832472e-04 1.00867055e-01 -3.37710746e-02 ..., 8.80253938e+00\n", 315 | " 8.42640634e-01 -3.81469791e+00]]\n" 316 | ] 317 | } 318 | ], 319 | "source": [ 320 | "#Draft\n", 321 | "clf = svm.SVC(kernel='linear',class_weight=\"balanced\", max_iter=100000000)\n", 322 | "clf.fit(train_data_scaled2, train_target_decoded_m[0])\n", 323 | "clf_predict = clf.predict(test_data_scaled2)\n", 324 | "print clf.score(test_data_scaled2, test_target_decoded_m)\n", 325 | "print metrics.classification_report(test_target_decoded_m, clf_predict)\n" 326 | ] 327 | }, 328 | { 329 | "cell_type": "markdown", 330 | "metadata": {}, 331 | "source": [ 332 | "### 2.2 Using Decision Trees Algorithm:" 333 | ] 334 | }, 335 | { 336 | "cell_type": "markdown", 337 | "metadata": {}, 338 | "source": [ 339 | "#### 2.2.1 Performaing Corss Validation on The Training Set for Testing Different Paramter" 340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": 37, 345 | "metadata": { 346 | "collapsed": false 347 | }, 348 | "outputs": [ 349 | { 350 | "name": "stdout", 351 | "output_type": "stream", 352 | "text": [ 353 | "criterion: gini\n", 354 | "min_samples_leaf: 5\n", 355 | "max_depth: 6\n", 356 | "Accuracy 0.998128\n", 357 | "dtype: float64\n", 358 | "\n", 359 | "\n", 360 | "criterion: gini\n", 361 | "min_samples_leaf: 5\n", 362 | "max_depth: 12\n", 363 | "Accuracy 0.999273\n", 364 | "dtype: float64\n", 365 | "\n", 366 | "\n", 367 | "criterion: gini\n", 368 | "min_samples_leaf: 10\n", 369 | "max_depth: 6\n", 370 | "Accuracy 0.998146\n", 371 | "dtype: float64\n", 372 | "\n", 373 | "\n", 374 | "criterion: gini\n", 375 | "min_samples_leaf: 10\n", 376 | "max_depth: 12\n", 377 | "Accuracy 0.999263\n", 378 | "dtype: float64\n", 379 | "\n", 380 | "\n", 381 | "criterion: entropy\n", 382 | "min_samples_leaf: 5\n", 383 | "max_depth: 6\n", 384 | "Accuracy 0.998889\n", 385 | "dtype: float64\n", 386 | "\n", 387 | "\n", 388 | "criterion: entropy\n", 389 | "min_samples_leaf: 5\n", 390 | "max_depth: 12\n", 391 | "Accuracy 0.99946\n", 392 | "dtype: float64\n", 393 | "\n", 394 | "\n", 395 | "criterion: entropy\n", 396 | "min_samples_leaf: 10\n", 397 | "max_depth: 6\n", 398 | "Accuracy 0.998911\n", 399 | "dtype: float64\n", 400 | "\n", 401 | "\n", 402 | "criterion: entropy\n", 403 | "min_samples_leaf: 10\n", 404 | "max_depth: 12\n", 405 | "Accuracy 0.999375\n", 406 | "dtype: float64\n", 407 | "\n", 408 | "\n" 409 | ] 410 | } 411 | ], 412 | "source": [ 413 | "## Testing SVM using Different Kernals with class weights balanced\n", 414 | "foldnum = 0\n", 415 | "fold_results = pd.DataFrame()\n", 416 | "criterion=[ 'gini','entropy']\n", 417 | "min_samples_leaf = [5,10]\n", 418 | "max_depth = [6,12]\n", 419 | "\n", 420 | "\n", 421 | "for cri in criterion:\n", 422 | " for leaf in min_samples_leaf:\n", 423 | " for depth in max_depth:\n", 424 | " foldnum = 0\n", 425 | " clf3 = tree.DecisionTreeClassifier(criterion=cri,min_samples_leaf=leaf,max_depth=depth,random_state=20160121,class_weight=\"balanced\")\n", 426 | " for train, test in cross_validation.KFold(len(train_data_scaled2), n_folds=5,shuffle=True,random_state=20160202): \n", 427 | " [ids_tr_data, ids_te_data, ids_tr_target, ids_te_target] = folds_to_split(train_data_scaled_1,train_target_decoded,train, test)\n", 428 | " clf3.fit(ids_tr_data, ids_tr_target[0])\n", 429 | " fold_results.loc[foldnum, 'Accuracy'] = clf3.score(ids_te_data, ids_te_target)\n", 430 | " foldnum+=1 \n", 431 | " print \"criterion:\",cri\n", 432 | " print \"min_samples_leaf:\",leaf\n", 433 | " print \"max_depth:\",depth\n", 434 | " print fold_results.mean()\n", 435 | " print \"\\n\"" 436 | ] 437 | }, 438 | { 439 | "cell_type": "markdown", 440 | "metadata": {}, 441 | "source": [ 442 | "#### 2.2.1 Testing the IDS Model on The Test Set:" 443 | ] 444 | }, 445 | { 446 | "cell_type": "code", 447 | "execution_count": 46, 448 | "metadata": { 449 | "collapsed": false 450 | }, 451 | "outputs": [ 452 | { 453 | "name": "stdout", 454 | "output_type": "stream", 455 | "text": [ 456 | "Accuracy : 0.848322825203\n", 457 | " precision recall f1-score support\n", 458 | "\n", 459 | " 0 0.89 0.96 0.92 229853\n", 460 | " 1 0.77 0.69 0.73 60593\n", 461 | " 2 0.11 0.15 0.13 4166\n", 462 | " 3 0.35 0.03 0.06 16347\n", 463 | " 4 0.00 0.00 0.00 70\n", 464 | "\n", 465 | "avg / total 0.82 0.85 0.83 311029\n", 466 | "\n" 467 | ] 468 | } 469 | ], 470 | "source": [ 471 | "from sklearn.datasets import load_iris\n", 472 | "from sklearn import tree\n", 473 | "\n", 474 | "clf3 = tree.DecisionTreeClassifier(criterion = 'gini', min_samples_leaf = 5, max_depth=12,random_state=20160121,class_weight=\"balanced\")\n", 475 | "clf3.fit(train_data_scaled2, train_target_decoded_m[0])\n", 476 | "clf3_predict = clf3.predict(test_data_scaled2)\n", 477 | "print \"Accuracy :\", clf3.score(test_data_scaled2, test_target_decoded_m)\n", 478 | "print metrics.classification_report(test_target_decoded_m, clf3_predict)\n" 479 | ] 480 | }, 481 | { 482 | "cell_type": "markdown", 483 | "metadata": {}, 484 | "source": [ 485 | "### 2.3 Using Naive Bayes Algorithm:" 486 | ] 487 | }, 488 | { 489 | "cell_type": "code", 490 | "execution_count": 35, 491 | "metadata": { 492 | "collapsed": false 493 | }, 494 | "outputs": [ 495 | { 496 | "name": "stdout", 497 | "output_type": "stream", 498 | "text": [ 499 | "0.749843262204\n", 500 | " precision recall f1-score support\n", 501 | "\n", 502 | " 0 0.96 0.78 0.86 229853\n", 503 | " 1 0.44 0.87 0.58 60593\n", 504 | " 2 0.00 0.00 0.00 4166\n", 505 | " 3 0.26 0.08 0.12 16347\n", 506 | " 4 0.75 0.09 0.15 70\n", 507 | "\n", 508 | "avg / total 0.81 0.75 0.76 311029\n", 509 | "\n" 510 | ] 511 | } 512 | ], 513 | "source": [ 514 | "from sklearn.naive_bayes import GaussianNB\n", 515 | "gnb = GaussianNB()\n", 516 | "y_pred = gnb.fit(train_data_scaled2, train_target_decoded_m[0])\n", 517 | "y_pred_predict3 = y_pred.predict(test_data_scaled2)\n", 518 | "print y_pred.score(test_data_scaled2, test_target_decoded_m)\n", 519 | "print metrics.classification_report(test_target_decoded_m, y_pred_predict3)" 520 | ] 521 | }, 522 | { 523 | "cell_type": "markdown", 524 | "metadata": {}, 525 | "source": [ 526 | "\n" 527 | ] 528 | }, 529 | { 530 | "cell_type": "markdown", 531 | "metadata": {}, 532 | "source": [ 533 | "\n" 534 | ] 535 | }, 536 | { 537 | "cell_type": "markdown", 538 | "metadata": {}, 539 | "source": [ 540 | "\n", 541 | "\n" 542 | ] 543 | }, 544 | { 545 | "cell_type": "markdown", 546 | "metadata": {}, 547 | "source": [ 548 | "\n", 549 | "\n", 550 | "\n", 551 | "\n", 552 | "\n", 553 | "\n", 554 | "\n" 555 | ] 556 | }, 557 | { 558 | "cell_type": "markdown", 559 | "metadata": {}, 560 | "source": [ 561 | "# Building 2 Class IDS Model:" 562 | ] 563 | }, 564 | { 565 | "cell_type": "code", 566 | "execution_count": 2, 567 | "metadata": { 568 | "collapsed": true 569 | }, 570 | "outputs": [], 571 | "source": [ 572 | "#let's load the data\n", 573 | "train_data_1 = urllib.urlopen('/home/aziz/Downloads/kddcup.data_10_percent_corrected')\n", 574 | "test_data_1 = urllib.urlopen('/home/aziz/Downloads/corrected')\n", 575 | "\n", 576 | "#Place both dataset into a dataframe\n", 577 | "train_class = pd.read_csv(train_data_1, quotechar=',', skipinitialspace=True, names=['Duration', 'protocol_type', 'Service', 'Flag', 'src_bytes', 'dst_bytes', 'Land', 'wrong_fragment', 'Urgent', 'Hot', 'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell', 'su_attempted', 'num_root', 'num_file_creations', 'num_shells', 'num_access_files', 'num_outbound_cmds', 'is_host_login', 'is_guest_login', 'Count', 'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate', 'dst_host_srv_rerror_rate','Class'])\n", 578 | "test_class = pd.read_csv(test_data_1, quotechar=',', skipinitialspace=True, names=['Duration', 'protocol_type', 'Service', 'Flag', 'src_bytes', 'dst_bytes', 'Land', 'wrong_fragment', 'Urgent', 'Hot', 'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell', 'su_attempted', 'num_root', 'num_file_creations', 'num_shells', 'num_access_files', 'num_outbound_cmds', 'is_host_login', 'is_guest_login', 'Count', 'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate', 'dst_host_srv_rerror_rate','Class'])" 579 | ] 580 | }, 581 | { 582 | "cell_type": "markdown", 583 | "metadata": {}, 584 | "source": [ 585 | "\n" 586 | ] 587 | }, 588 | { 589 | "cell_type": "markdown", 590 | "metadata": {}, 591 | "source": [ 592 | "## 1 Pre-Processing The Datasets:" 593 | ] 594 | }, 595 | { 596 | "cell_type": "markdown", 597 | "metadata": {}, 598 | "source": [ 599 | "### 1.1 Converts Labels to The Right Class" 600 | ] 601 | }, 602 | { 603 | "cell_type": "code", 604 | "execution_count": 3, 605 | "metadata": { 606 | "collapsed": false 607 | }, 608 | "outputs": [], 609 | "source": [ 610 | "train_class.loc[(train_class['Class'] !='normal.'),'Class'] = 'attack'\n", 611 | "train_class.loc[(train_class['Class'] =='normal.'),'Class'] = 'normal'\n", 612 | "\n", 613 | "test_class.loc[(test_class['Class'] !='normal.'),'Class'] = 'attack'\n", 614 | "test_class.loc[(test_class['Class'] =='normal.'),'Class'] = 'normal'" 615 | ] 616 | }, 617 | { 618 | "cell_type": "markdown", 619 | "metadata": {}, 620 | "source": [ 621 | "\n" 622 | ] 623 | }, 624 | { 625 | "cell_type": "markdown", 626 | "metadata": {}, 627 | "source": [ 628 | "### 1.2 Encoding The Dataset" 629 | ] 630 | }, 631 | { 632 | "cell_type": "code", 633 | "execution_count": 4, 634 | "metadata": { 635 | "collapsed": false 636 | }, 637 | "outputs": [ 638 | { 639 | "name": "stdout", 640 | "output_type": "stream", 641 | "text": [ 642 | "(494021, 118)\n", 643 | "(311029, 118)\n" 644 | ] 645 | } 646 | ], 647 | "source": [ 648 | "# Decoding The Dataset: \n", 649 | "attr_encoder = feature_extraction.DictVectorizer(sparse=False)\n", 650 | "label_encoder = preprocessing.LabelEncoder()\n", 651 | "\n", 652 | "train_data_df = attr_encoder.fit_transform(train_class.iloc[:,:-1].T.to_dict().values())\n", 653 | "train_target_df= label_encoder.fit_transform(train_class.iloc[:,-1])\n", 654 | "\n", 655 | "\n", 656 | "train_data_decoded = pd.DataFrame(train_data_df)\n", 657 | "train_target_decoded = pd.DataFrame(train_target_df)\n", 658 | "\n", 659 | "test_data_df= attr_encoder.transform(test_class.iloc[:,:-1].T.to_dict().values())\n", 660 | "test_target_df= label_encoder.transform(test_class.iloc[:,-1])\n", 661 | "\n", 662 | "test_data_decoded = pd.DataFrame(test_data_df)\n", 663 | "test_target_decoded = pd.DataFrame(test_target_df)\n", 664 | "\n", 665 | "\n", 666 | "print train_data_decoded.shape\n", 667 | "print test_data_decoded.shape" 668 | ] 669 | }, 670 | { 671 | "cell_type": "markdown", 672 | "metadata": {}, 673 | "source": [ 674 | "\n" 675 | ] 676 | }, 677 | { 678 | "cell_type": "markdown", 679 | "metadata": {}, 680 | "source": [ 681 | "### 1.3 Feature Reduction Using PCA" 682 | ] 683 | }, 684 | { 685 | "cell_type": "code", 686 | "execution_count": 9, 687 | "metadata": { 688 | "collapsed": false 689 | }, 690 | "outputs": [ 691 | { 692 | "name": "stdout", 693 | "output_type": "stream", 694 | "text": [ 695 | "(494021, 29)\n", 696 | "(311029, 29)\n" 697 | ] 698 | } 699 | ], 700 | "source": [ 701 | "#load some modules to help\n", 702 | "from mpl_toolkits.mplot3d import Axes3D\n", 703 | "from sklearn.decomposition import PCA\n", 704 | "\n", 705 | "\n", 706 | "train_data_pca_1 = PCA(n_components=29).fit_transform(train_data_decoded)\n", 707 | "test_data_pca_1 = PCA(n_components=29).fit_transform(test_data_decoded)\n", 708 | "\n", 709 | "train_data_pca_df_1 = pd.DataFrame(train_data_pca_1)\n", 710 | "test_data_pca_df_1 = pd.DataFrame(test_data_pca_1)\n", 711 | "\n", 712 | "print train_data_pca_df_1.shape\n", 713 | "print test_data_pca_df_1.shape" 714 | ] 715 | }, 716 | { 717 | "cell_type": "markdown", 718 | "metadata": {}, 719 | "source": [ 720 | "\n" 721 | ] 722 | }, 723 | { 724 | "cell_type": "markdown", 725 | "metadata": {}, 726 | "source": [ 727 | "### 1.4 Normalizing The Datasets" 728 | ] 729 | }, 730 | { 731 | "cell_type": "code", 732 | "execution_count": 10, 733 | "metadata": { 734 | "collapsed": true 735 | }, 736 | "outputs": [], 737 | "source": [ 738 | "#Creating our scaler and applyting it to our dataset after feature reduction\n", 739 | "standard_scaler = preprocessing.StandardScaler()\n", 740 | "train_ratio_standard_scaled_values = standard_scaler.fit_transform(train_data_pca_df_1.values)\n", 741 | "train_data_scaled_1=pd.DataFrame(train_ratio_standard_scaled_values)\n", 742 | "\n", 743 | "test_ratio_standard_scaled_values = standard_scaler.fit_transform(test_data_pca_df_1.values)\n", 744 | "test_data_scaled_1=pd.DataFrame(test_ratio_standard_scaled_values)" 745 | ] 746 | }, 747 | { 748 | "cell_type": "markdown", 749 | "metadata": {}, 750 | "source": [ 751 | "\n" 752 | ] 753 | }, 754 | { 755 | "cell_type": "markdown", 756 | "metadata": {}, 757 | "source": [ 758 | "## 2 Classifiying The Data Set" 759 | ] 760 | }, 761 | { 762 | "cell_type": "markdown", 763 | "metadata": {}, 764 | "source": [ 765 | "### 2.1 Using SVM Algorithm:" 766 | ] 767 | }, 768 | { 769 | "cell_type": "code", 770 | "execution_count": 8, 771 | "metadata": { 772 | "collapsed": false 773 | }, 774 | "outputs": [ 775 | { 776 | "name": "stdout", 777 | "output_type": "stream", 778 | "text": [ 779 | "0.922396303882\n", 780 | " precision recall f1-score support\n", 781 | "\n", 782 | " 0 0.95 0.95 0.95 250436\n", 783 | " 1 0.80 0.80 0.80 60593\n", 784 | "\n", 785 | "avg / total 0.92 0.92 0.92 311029\n", 786 | "\n", 787 | "Number of support vectors for each class [1694 1688]\n", 788 | "[[ 2.78947810e-03 4.43292344e-02 -6.15165127e-03 ..., 7.07344321e-01\n", 789 | " -4.92057076e-01 -2.28549088e+00]\n", 790 | " [ 2.01738092e-03 -2.62871757e-02 6.24009662e-02 ..., -5.36280799e+00\n", 791 | " -4.30704463e+00 2.56780715e+00]\n", 792 | " [ 2.01738062e-03 -2.62830927e-02 3.67396697e-02 ..., -6.92651351e+00\n", 793 | " -4.77964921e+00 -4.82940529e+00]\n", 794 | " ..., \n", 795 | " [ 1.54481243e-03 9.93833927e-02 -1.84981710e-02 ..., -3.49703128e-01\n", 796 | " -4.14043476e-02 -1.79917829e+00]\n", 797 | " [ 2.82793121e-03 -1.56889839e-02 3.34062598e-02 ..., 7.36771477e-01\n", 798 | " 4.55919104e+00 9.46427430e-01]\n", 799 | " [ 2.82388363e-03 3.23437103e-02 3.62694976e-02 ..., 4.35657953e-01\n", 800 | " 4.61212702e+00 9.38367066e-01]]\n" 801 | ] 802 | } 803 | ], 804 | "source": [ 805 | "#Draft\n", 806 | "lin = svm.SVC(kernel='linear', max_iter=100000000)\n", 807 | "lin.fit(train_data_scaled_1, train_target_decoded[0])\n", 808 | "lin_predict = lin.predict(test_data_scaled_1)\n", 809 | "print lin.score(test_data_scaled_1, test_target_decoded)\n", 810 | "print metrics.classification_report(test_target_decoded, lin_predict)\n", 811 | "print \"Number of support vectors for each class\", lin.n_support_\n", 812 | "print lin.support_vectors_" 813 | ] 814 | }, 815 | { 816 | "cell_type": "markdown", 817 | "metadata": {}, 818 | "source": [ 819 | "\n" 820 | ] 821 | }, 822 | { 823 | "cell_type": "markdown", 824 | "metadata": {}, 825 | "source": [ 826 | "### 2.2 Using Decision Trees Algorithm" 827 | ] 828 | }, 829 | { 830 | "cell_type": "markdown", 831 | "metadata": {}, 832 | "source": [ 833 | "#### 2.2.1 Performaing Corss Validation on The Training Set for Testing Different Paramter\n" 834 | ] 835 | }, 836 | { 837 | "cell_type": "code", 838 | "execution_count": 10, 839 | "metadata": { 840 | "collapsed": false 841 | }, 842 | "outputs": [ 843 | { 844 | "name": "stdout", 845 | "output_type": "stream", 846 | "text": [ 847 | "criterion: gini\n", 848 | "min_samples_leaf: 2\n", 849 | "max_depth: 1\n", 850 | "Accuracy 0.969376\n", 851 | "dtype: float64\n", 852 | "\n", 853 | "\n", 854 | "criterion: gini\n", 855 | "min_samples_leaf: 2\n", 856 | "max_depth: 6\n", 857 | "Accuracy 0.998154\n", 858 | "dtype: float64\n", 859 | "\n", 860 | "\n", 861 | "criterion: gini\n", 862 | "min_samples_leaf: 2\n", 863 | "max_depth: 12\n", 864 | "Accuracy 0.999326\n", 865 | "dtype: float64\n", 866 | "\n", 867 | "\n", 868 | "criterion: gini\n", 869 | "min_samples_leaf: 5\n", 870 | "max_depth: 1\n", 871 | "Accuracy 0.969376\n", 872 | "dtype: float64\n", 873 | "\n", 874 | "\n", 875 | "criterion: gini\n", 876 | "min_samples_leaf: 5\n", 877 | "max_depth: 6\n", 878 | "Accuracy 0.998128\n", 879 | "dtype: float64\n", 880 | "\n", 881 | "\n", 882 | "criterion: gini\n", 883 | "min_samples_leaf: 5\n", 884 | "max_depth: 12\n", 885 | "Accuracy 0.999273\n", 886 | "dtype: float64\n", 887 | "\n", 888 | "\n", 889 | "criterion: gini\n", 890 | "min_samples_leaf: 50\n", 891 | "max_depth: 1\n", 892 | "Accuracy 0.969376\n", 893 | "dtype: float64\n", 894 | "\n", 895 | "\n", 896 | "criterion: gini\n", 897 | "min_samples_leaf: 50\n", 898 | "max_depth: 6\n", 899 | "Accuracy 0.997601\n", 900 | "dtype: float64\n", 901 | "\n", 902 | "\n", 903 | "criterion: gini\n", 904 | "min_samples_leaf: 50\n", 905 | "max_depth: 12\n", 906 | "Accuracy 0.998296\n", 907 | "dtype: float64\n", 908 | "\n", 909 | "\n", 910 | "criterion: entropy\n", 911 | "min_samples_leaf: 2\n", 912 | "max_depth: 1\n", 913 | "Accuracy 0.969376\n", 914 | "dtype: float64\n", 915 | "\n", 916 | "\n", 917 | "criterion: entropy\n", 918 | "min_samples_leaf: 2\n", 919 | "max_depth: 6\n", 920 | "Accuracy 0.998911\n", 921 | "dtype: float64\n", 922 | "\n", 923 | "\n", 924 | "criterion: entropy\n", 925 | "min_samples_leaf: 2\n", 926 | "max_depth: 12\n", 927 | "Accuracy 0.999547\n", 928 | "dtype: float64\n", 929 | "\n", 930 | "\n", 931 | "criterion: entropy\n", 932 | "min_samples_leaf: 5\n", 933 | "max_depth: 1\n", 934 | "Accuracy 0.969376\n", 935 | "dtype: float64\n", 936 | "\n", 937 | "\n", 938 | "criterion: entropy\n", 939 | "min_samples_leaf: 5\n", 940 | "max_depth: 6\n", 941 | "Accuracy 0.998889\n", 942 | "dtype: float64\n", 943 | "\n", 944 | "\n", 945 | "criterion: entropy\n", 946 | "min_samples_leaf: 5\n", 947 | "max_depth: 12\n", 948 | "Accuracy 0.99946\n", 949 | "dtype: float64\n", 950 | "\n", 951 | "\n", 952 | "criterion: entropy\n", 953 | "min_samples_leaf: 50\n", 954 | "max_depth: 1\n", 955 | "Accuracy 0.969376\n", 956 | "dtype: float64\n", 957 | "\n", 958 | "\n", 959 | "criterion: entropy\n", 960 | "min_samples_leaf: 50\n", 961 | "max_depth: 6\n", 962 | "Accuracy 0.998326\n", 963 | "dtype: float64\n", 964 | "\n", 965 | "\n", 966 | "criterion: entropy\n", 967 | "min_samples_leaf: 50\n", 968 | "max_depth: 12\n", 969 | "Accuracy 0.998646\n", 970 | "dtype: float64\n", 971 | "\n", 972 | "\n" 973 | ] 974 | } 975 | ], 976 | "source": [ 977 | "## Testing SVM using Different Kernals with class weights balanced\n", 978 | "foldnum = 0\n", 979 | "fold_results = pd.DataFrame()\n", 980 | "criterion=[ 'gini','entropy']\n", 981 | "min_samples_leaf = [2, 5, 50]\n", 982 | "max_depth = [1,6,12]\n", 983 | "\n", 984 | "\n", 985 | "for cri in criterion:\n", 986 | " for leaf in min_samples_leaf:\n", 987 | " for depth in max_depth:\n", 988 | " foldnum = 0\n", 989 | " clf = tree.DecisionTreeClassifier(criterion=cri,min_samples_leaf=leaf,max_depth=depth,random_state=20160121,class_weight=\"balanced\")\n", 990 | " for train, test in cross_validation.KFold(len(train_data_scaled_1), n_folds=5,shuffle=True,random_state=20160202): \n", 991 | " [ids_tr_data, ids_te_data, ids_tr_target, ids_te_target] = folds_to_split(train_data_scaled_1,train_target_decoded,train, test)\n", 992 | " clf.fit(ids_tr_data, ids_tr_target[0])\n", 993 | " clf_predict = clf.predict(ids_te_data)\n", 994 | "\n", 995 | " fold_results.loc[foldnum, 'Accuracy'] = clf.score(ids_te_data, ids_te_target)\n", 996 | " foldnum+=1 \n", 997 | " print \"criterion:\",cri\n", 998 | " print \"min_samples_leaf:\",leaf\n", 999 | " print \"max_depth:\",depth\n", 1000 | " print fold_results.mean()\n", 1001 | " print \"\\n\"" 1002 | ] 1003 | }, 1004 | { 1005 | "cell_type": "markdown", 1006 | "metadata": {}, 1007 | "source": [ 1008 | "#### 2.2.1 Testing the IDS Model on The Test Set:" 1009 | ] 1010 | }, 1011 | { 1012 | "cell_type": "code", 1013 | "execution_count": 11, 1014 | "metadata": { 1015 | "collapsed": false 1016 | }, 1017 | "outputs": [ 1018 | { 1019 | "name": "stdout", 1020 | "output_type": "stream", 1021 | "text": [ 1022 | "Accuracy (via score): 0.824206102968\n", 1023 | " precision recall f1-score support\n", 1024 | "\n", 1025 | " 0 0.92 0.86 0.89 250436\n", 1026 | " 1 0.54 0.67 0.60 60593\n", 1027 | "\n", 1028 | "avg / total 0.84 0.82 0.83 311029\n", 1029 | "\n" 1030 | ] 1031 | } 1032 | ], 1033 | "source": [ 1034 | "\n", 1035 | "clf_t = tree.DecisionTreeClassifier(criterion='entropy',min_samples_leaf=2,max_depth=12,random_state=20160121,class_weight=\"balanced\")\n", 1036 | "clf_t.fit(train_data_scaled_1, train_target_decoded[0])\n", 1037 | "clf_predict = clf_t.predict(test_data_scaled_1)\n", 1038 | "\n", 1039 | "print \"Accuracy (via score):\", clf_t.score(test_data_scaled_1, test_target_decoded)\n", 1040 | "print metrics.classification_report(test_target_decoded, clf_predict)\n" 1041 | ] 1042 | }, 1043 | { 1044 | "cell_type": "markdown", 1045 | "metadata": { 1046 | "collapsed": true 1047 | }, 1048 | "source": [ 1049 | "### 2.3 Using Naive Bayes Algorithm:" 1050 | ] 1051 | }, 1052 | { 1053 | "cell_type": "code", 1054 | "execution_count": 17, 1055 | "metadata": { 1056 | "collapsed": false 1057 | }, 1058 | "outputs": [ 1059 | { 1060 | "name": "stdout", 1061 | "output_type": "stream", 1062 | "text": [ 1063 | "0.822302743474\n", 1064 | " precision recall f1-score support\n", 1065 | "\n", 1066 | " 0 0.87 0.91 0.89 250436\n", 1067 | " 1 0.55 0.45 0.50 60593\n", 1068 | "\n", 1069 | "avg / total 0.81 0.82 0.81 311029\n", 1070 | "\n", 1071 | "[ 0.80308934 0.19691066]\n" 1072 | ] 1073 | } 1074 | ], 1075 | "source": [ 1076 | "from sklearn.naive_bayes import GaussianNB\n", 1077 | "gnb = GaussianNB()\n", 1078 | "Naive = gnb.fit(train_data_scaled_1, train_target_decoded[0])\n", 1079 | "Naive_predict = Naive.predict(test_data_scaled_1)\n", 1080 | "print Naive.score(test_data_scaled_1, test_target_decoded)\n", 1081 | "print metrics.classification_report(test_target_decoded, Naive_predict)\n", 1082 | "print Naive.class_prior_" 1083 | ] 1084 | } 1085 | ], 1086 | "metadata": { 1087 | "kernelspec": { 1088 | "display_name": "Python 2", 1089 | "language": "python", 1090 | "name": "python2" 1091 | }, 1092 | "language_info": { 1093 | "codemirror_mode": { 1094 | "name": "ipython", 1095 | "version": 2 1096 | }, 1097 | "file_extension": ".py", 1098 | "mimetype": "text/x-python", 1099 | "name": "python", 1100 | "nbconvert_exporter": "python", 1101 | "pygments_lexer": "ipython2", 1102 | "version": "2.7.11" 1103 | } 1104 | }, 1105 | "nbformat": 4, 1106 | "nbformat_minor": 0 1107 | } 1108 | -------------------------------------------------------------------------------- /SVM with multiclasses.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | ## Preliminaries 7 | import numpy as np 8 | import matplotlib.pyplot as plt 9 | from sklearn import datasets, preprocessing,cross_validation, feature_extraction 10 | from sklearn import linear_model, svm, metrics, ensemble, tree, ensemble 11 | from sklearn.decomposition import PCA 12 | import pandas as pd 13 | import urllib 14 | import csv 15 | 16 | # Helper functions 17 | def folds_to_split(data,targets,train,test): 18 | data_tr = pd.DataFrame(data).iloc[train] 19 | data_te = pd.DataFrame(data).iloc[test] 20 | labels_tr = pd.DataFrame(targets).iloc[train] 21 | labels_te = pd.DataFrame(targets).iloc[test] 22 | return [data_tr, data_te, labels_tr, labels_te] 23 | 24 | 25 | # 26 | # 27 | 28 | # ## Using SVM To Build The Model 5 class labeling: 29 | 30 | # In[22]: 31 | 32 | #let's load the data 33 | train_data = urllib.urlopen('/home/aziz/Downloads/kddcup.data_10_percent_corrected') 34 | test_data = urllib.urlopen('/home/aziz/Downloads/corrected') 35 | 36 | #Place both dataset into a dataframe 37 | train_multiclass = pd.read_csv(train_data, quotechar=',', skipinitialspace=True, names=['Duration', 'protocol_type', 'Service', 'Flag', 'src_bytes', 'dst_bytes', 'Land', 'wrong_fragment', 'Urgent', 'Hot', 'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell', 'su_attempted', 'num_root', 'num_file_creations', 'num_shells', 'num_access_files', 'num_outbound_cmds', 'is_host_login', 'is_guest_login', 'Count', 'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate', 'dst_host_srv_rerror_rate','Class']) 38 | test_multiclass = pd.read_csv(test_data, quotechar=',', skipinitialspace=True, names=['Duration', 'protocol_type', 'Service', 'Flag', 'src_bytes', 'dst_bytes', 'Land', 'wrong_fragment', 'Urgent', 'Hot', 'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell', 'su_attempted', 'num_root', 'num_file_creations', 'num_shells', 'num_access_files', 'num_outbound_cmds', 'is_host_login', 'is_guest_login', 'Count', 'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate', 'dst_host_srv_rerror_rate','Class']) 39 | 40 | 41 | # 42 | # 43 | 44 | # ## 1 Pre-Processing The Datasets: 45 | 46 | # ### 1.1 Change Labels to The Right Class: 47 | 48 | # In[23]: 49 | 50 | ## Replacing all the different attack types(24) to their proper general attack class 51 | train_multiclass.loc[(train_multiclass['Class'] =='smurf.')|(train_multiclass['Class'] =='neptune.') | (train_multiclass['Class'] =='back.') | (train_multiclass['Class'] =='teardrop.') |(train_multiclass['Class'] =='pod.')| (train_multiclass['Class']=='land.'),'Class'] = 'Dos' 52 | train_multiclass.loc[(train_multiclass['Class'] =='satan.')|(train_multiclass['Class'] =='ipsweep.') | (train_multiclass['Class'] =='portsweep.') | (train_multiclass['Class'] =='nmap.'),'Class'] = 'probe' 53 | train_multiclass.loc[(train_multiclass['Class'] =='spy.')|(train_multiclass['Class'] =='phf.')|(train_multiclass['Class'] =='multihop.')|(train_multiclass['Class'] =='ftp_write.') | (train_multiclass['Class'] =='imap.') | (train_multiclass['Class'] =='warezmaster.') |(train_multiclass['Class'] =='guess_passwd.')| (train_multiclass['Class']=='warezclient.'),'Class'] = 'r2l' 54 | train_multiclass.loc[(train_multiclass['Class'] =='buffer_overflow.')|(train_multiclass['Class'] =='rootkit.') | (train_multiclass['Class'] =='loadmodule.') | (train_multiclass['Class'] =='perl.'),'Class']='u2r' 55 | train_multiclass.loc[(train_multiclass['Class'] =='normal.'),'Class'] = 'normal' 56 | 57 | 58 | # 59 | # 60 | 61 | # In[24]: 62 | 63 | ## Replacing all the different attack types(36) to their proper general attack class 64 | 65 | 66 | test_multiclass.loc[(test_multiclass['Class'] =='smurf.')|(test_multiclass['Class'] =='neptune.') | 67 | (test_multiclass['Class'] =='back.') | (test_multiclass['Class'] =='teardrop.') | 68 | (test_multiclass['Class'] =='pod.')| (test_multiclass['Class']=='land.')| 69 | (test_multiclass['Class']=='apache2.')|(test_multiclass['Class']=='udpstorm.')| 70 | (test_multiclass['Class']=='processtable.')|(test_multiclass['Class']=='mailbomb.'),'Class'] = 'Dos' 71 | 72 | 73 | test_multiclass.loc[(test_multiclass['Class'] =='guess_passwd.')|(test_multiclass['Class'] =='ftp_write.')| 74 | (test_multiclass['Class'] =='imap.')|(test_multiclass['Class'] =='phf.') | 75 | (test_multiclass['Class'] =='multihop.') | 76 | (test_multiclass['Class'] =='warezmaster.') |(test_multiclass['Class'] =='snmpgetattack.')| 77 | (test_multiclass['Class']=='named.')|(test_multiclass['Class'] =='xlock.')| 78 | (test_multiclass['Class'] =='xsnoop.')|(test_multiclass['Class'] =='sendmail.')| 79 | (test_multiclass['Class'] =='httptunnel.')|(test_multiclass['Class'] =='worm.')| 80 | (test_multiclass['Class'] =='snmpguess.'),'Class'] = 'r2l' 81 | 82 | test_multiclass.loc[(test_multiclass['Class'] =='satan.')|(test_multiclass['Class'] =='ipsweep.') | (test_multiclass['Class'] =='portsweep.') | (test_multiclass['Class'] =='nmap.')| 83 | (test_multiclass['Class'] =='saint.')|(test_multiclass['Class'] =='mscan.'),'Class'] = 'probe' 84 | 85 | test_multiclass.loc[(test_multiclass['Class'] =='buffer_overflow.')|(test_multiclass['Class'] =='rootkit.') | 86 | (test_multiclass['Class'] =='loadmodule.') | (test_multiclass['Class'] =='xterm.')| 87 | (test_multiclass['Class'] =='sqlattack.')|(test_multiclass['Class'] =='ps.')| 88 | (test_multiclass['Class'] =='perl.'),'Class']='u2r' 89 | 90 | test_multiclass.loc[(test_multiclass['Class'] =='normal.'),'Class'] = 'normal' 91 | 92 | 93 | # 94 | # 95 | 96 | # ### 2.2 Encoding The Dataset: 97 | 98 | # In[25]: 99 | 100 | # Decoding The Dataset: 101 | attr_encoder = feature_extraction.DictVectorizer(sparse=False) 102 | label_encoder = preprocessing.LabelEncoder() 103 | 104 | train_data_df_m = attr_encoder.fit_transform(train_multiclass.iloc[:,:-1].T.to_dict().values()) 105 | train_target_df_m= label_encoder.fit_transform(train_multiclass.iloc[:,-1]) 106 | 107 | 108 | train_data_decoded_m = pd.DataFrame(train_data_df_m) 109 | train_target_decoded_m = pd.DataFrame(train_target_df_m) 110 | 111 | test_data_df_m = attr_encoder.transform(test_multiclass.iloc[:,:-1].T.to_dict().values()) 112 | test_target_df_m = label_encoder.transform(test_multiclass.iloc[:,-1]) 113 | 114 | test_data_decoded_m = pd.DataFrame(test_data_df_m) 115 | test_target_decoded_m = pd.DataFrame(test_target_df_m) 116 | 117 | 118 | print train_data_decoded_m.shape 119 | print test_data_decoded_m.shape 120 | 121 | 122 | # ### 1.3 Perfroming Feature Reduction using PCA 123 | 124 | # In[29]: 125 | 126 | #load some modules to help 127 | from mpl_toolkits.mplot3d import Axes3D 128 | from sklearn.decomposition import PCA 129 | 130 | 131 | train_data_pca2 = PCA(n_components=29).fit_transform(train_data_decoded_m) 132 | test_data_pca2 = PCA(n_components=29).fit_transform(test_data_decoded_m) 133 | 134 | train_data_pca_df2 = pd.DataFrame(train_data_pca2) 135 | test_data_pca_df2 = pd.DataFrame(test_data_pca2) 136 | 137 | print train_data_pca_df2.shape 138 | print test_data_pca_df2.shape 139 | 140 | 141 | # ### 1.4 Normalizing the Data Sets 142 | 143 | # In[30]: 144 | 145 | #Creating our scaler and applyting it to our dataset after feature reduction 146 | standard_scaler = preprocessing.StandardScaler() 147 | train_ratio_standard_scaled_values2 = standard_scaler.fit_transform(train_data_pca_df2.values) 148 | train_data_scaled2=pd.DataFrame(train_ratio_standard_scaled_values2) 149 | 150 | test_ratio_standard_scaled_values2 = standard_scaler.fit_transform(test_data_pca_df2.values) 151 | test_data_scaled2=pd.DataFrame(test_ratio_standard_scaled_values2) 152 | 153 | 154 | # 155 | # 156 | 157 | # ## 2 Classification: 158 | 159 | # ### 2.1 Using SVM Algorithm 160 | 161 | # In[22]: 162 | 163 | #Draft 164 | clf = svm.SVC(kernel='linear',class_weight="balanced", max_iter=100000000) 165 | clf.fit(train_data_scaled2, train_target_decoded_m[0]) 166 | clf_predict = clf.predict(test_data_scaled2) 167 | print clf.score(test_data_scaled2, test_target_decoded_m) 168 | print metrics.classification_report(test_target_decoded_m, clf_predict) 169 | 170 | 171 | # ### 2.2 Using Decision Trees Algorithm: 172 | 173 | # #### 2.2.1 Performaing Corss Validation on The Training Set for Testing Different Paramter 174 | 175 | # In[37]: 176 | 177 | ## Testing SVM using Different Kernals with class weights balanced 178 | foldnum = 0 179 | fold_results = pd.DataFrame() 180 | criterion=[ 'gini','entropy'] 181 | min_samples_leaf = [5,10] 182 | max_depth = [6,12] 183 | 184 | 185 | for cri in criterion: 186 | for leaf in min_samples_leaf: 187 | for depth in max_depth: 188 | foldnum = 0 189 | clf3 = tree.DecisionTreeClassifier(criterion=cri,min_samples_leaf=leaf,max_depth=depth,random_state=20160121,class_weight="balanced") 190 | for train, test in cross_validation.KFold(len(train_data_scaled2), n_folds=5,shuffle=True,random_state=20160202): 191 | [ids_tr_data, ids_te_data, ids_tr_target, ids_te_target] = folds_to_split(train_data_scaled_1,train_target_decoded,train, test) 192 | clf3.fit(ids_tr_data, ids_tr_target[0]) 193 | fold_results.loc[foldnum, 'Accuracy'] = clf3.score(ids_te_data, ids_te_target) 194 | foldnum+=1 195 | print "criterion:",cri 196 | print "min_samples_leaf:",leaf 197 | print "max_depth:",depth 198 | print fold_results.mean() 199 | print "\n" 200 | 201 | 202 | # #### 2.2.1 Testing the IDS Model on The Test Set: 203 | 204 | # In[46]: 205 | 206 | from sklearn.datasets import load_iris 207 | from sklearn import tree 208 | 209 | clf3 = tree.DecisionTreeClassifier(criterion = 'gini', min_samples_leaf = 5, max_depth=12,random_state=20160121,class_weight="balanced") 210 | clf3.fit(train_data_scaled2, train_target_decoded_m[0]) 211 | clf3_predict = clf3.predict(test_data_scaled2) 212 | print "Accuracy :", clf3.score(test_data_scaled2, test_target_decoded_m) 213 | print metrics.classification_report(test_target_decoded_m, clf3_predict) 214 | 215 | 216 | # ### 2.3 Using Naive Bayes Algorithm: 217 | 218 | # In[35]: 219 | 220 | from sklearn.naive_bayes import GaussianNB 221 | gnb = GaussianNB() 222 | y_pred = gnb.fit(train_data_scaled2, train_target_decoded_m[0]) 223 | y_pred_predict3 = y_pred.predict(test_data_scaled2) 224 | print y_pred.score(test_data_scaled2, test_target_decoded_m) 225 | print metrics.classification_report(test_target_decoded_m, y_pred_predict3) 226 | 227 | 228 | # 229 | # 230 | 231 | # 232 | # 233 | 234 | # 235 | # 236 | # 237 | 238 | # 239 | # 240 | # 241 | # 242 | # 243 | # 244 | # 245 | # 246 | 247 | # # Building 2 Class IDS Model: 248 | 249 | # In[2]: 250 | 251 | #let's load the data 252 | train_data_1 = urllib.urlopen('/home/aziz/Downloads/kddcup.data_10_percent_corrected') 253 | test_data_1 = urllib.urlopen('/home/aziz/Downloads/corrected') 254 | 255 | #Place both dataset into a dataframe 256 | train_class = pd.read_csv(train_data_1, quotechar=',', skipinitialspace=True, names=['Duration', 'protocol_type', 'Service', 'Flag', 'src_bytes', 'dst_bytes', 'Land', 'wrong_fragment', 'Urgent', 'Hot', 'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell', 'su_attempted', 'num_root', 'num_file_creations', 'num_shells', 'num_access_files', 'num_outbound_cmds', 'is_host_login', 'is_guest_login', 'Count', 'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate', 'dst_host_srv_rerror_rate','Class']) 257 | test_class = pd.read_csv(test_data_1, quotechar=',', skipinitialspace=True, names=['Duration', 'protocol_type', 'Service', 'Flag', 'src_bytes', 'dst_bytes', 'Land', 'wrong_fragment', 'Urgent', 'Hot', 'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell', 'su_attempted', 'num_root', 'num_file_creations', 'num_shells', 'num_access_files', 'num_outbound_cmds', 'is_host_login', 'is_guest_login', 'Count', 'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate', 'dst_host_srv_rerror_rate','Class']) 258 | 259 | 260 | # 261 | # 262 | 263 | # ## 1 Pre-Processing The Datasets: 264 | 265 | # ### 1.1 Converts Labels to The Right Class 266 | 267 | # In[3]: 268 | 269 | train_class.loc[(train_class['Class'] !='normal.'),'Class'] = 'attack' 270 | train_class.loc[(train_class['Class'] =='normal.'),'Class'] = 'normal' 271 | 272 | test_class.loc[(test_class['Class'] !='normal.'),'Class'] = 'attack' 273 | test_class.loc[(test_class['Class'] =='normal.'),'Class'] = 'normal' 274 | 275 | 276 | # 277 | # 278 | 279 | # ### 1.2 Encoding The Dataset 280 | 281 | # In[4]: 282 | 283 | # Decoding The Dataset: 284 | attr_encoder = feature_extraction.DictVectorizer(sparse=False) 285 | label_encoder = preprocessing.LabelEncoder() 286 | 287 | train_data_df = attr_encoder.fit_transform(train_class.iloc[:,:-1].T.to_dict().values()) 288 | train_target_df= label_encoder.fit_transform(train_class.iloc[:,-1]) 289 | 290 | 291 | train_data_decoded = pd.DataFrame(train_data_df) 292 | train_target_decoded = pd.DataFrame(train_target_df) 293 | 294 | test_data_df= attr_encoder.transform(test_class.iloc[:,:-1].T.to_dict().values()) 295 | test_target_df= label_encoder.transform(test_class.iloc[:,-1]) 296 | 297 | test_data_decoded = pd.DataFrame(test_data_df) 298 | test_target_decoded = pd.DataFrame(test_target_df) 299 | 300 | 301 | print train_data_decoded.shape 302 | print test_data_decoded.shape 303 | 304 | 305 | # 306 | # 307 | 308 | # ### 1.3 Feature Reduction Using PCA 309 | 310 | # In[9]: 311 | 312 | #load some modules to help 313 | from mpl_toolkits.mplot3d import Axes3D 314 | from sklearn.decomposition import PCA 315 | 316 | 317 | train_data_pca_1 = PCA(n_components=29).fit_transform(train_data_decoded) 318 | test_data_pca_1 = PCA(n_components=29).fit_transform(test_data_decoded) 319 | 320 | train_data_pca_df_1 = pd.DataFrame(train_data_pca_1) 321 | test_data_pca_df_1 = pd.DataFrame(test_data_pca_1) 322 | 323 | print train_data_pca_df_1.shape 324 | print test_data_pca_df_1.shape 325 | 326 | 327 | # 328 | # 329 | 330 | # ### 1.4 Normalizing The Datasets 331 | 332 | # In[10]: 333 | 334 | #Creating our scaler and applyting it to our dataset after feature reduction 335 | standard_scaler = preprocessing.StandardScaler() 336 | train_ratio_standard_scaled_values = standard_scaler.fit_transform(train_data_pca_df_1.values) 337 | train_data_scaled_1=pd.DataFrame(train_ratio_standard_scaled_values) 338 | 339 | test_ratio_standard_scaled_values = standard_scaler.fit_transform(test_data_pca_df_1.values) 340 | test_data_scaled_1=pd.DataFrame(test_ratio_standard_scaled_values) 341 | 342 | 343 | # 344 | # 345 | 346 | # ## 2 Classifiying The Data Set 347 | 348 | # ### 2.1 Using SVM Algorithm: 349 | 350 | # In[8]: 351 | 352 | #Draft 353 | lin = svm.SVC(kernel='linear', max_iter=100000000) 354 | lin.fit(train_data_scaled_1, train_target_decoded[0]) 355 | lin_predict = lin.predict(test_data_scaled_1) 356 | print lin.score(test_data_scaled_1, test_target_decoded) 357 | print metrics.classification_report(test_target_decoded, lin_predict) 358 | print "Number of support vectors for each class", lin.n_support_ 359 | print lin.support_vectors_ 360 | 361 | 362 | # 363 | # 364 | 365 | # ### 2.2 Using Decision Trees Algorithm 366 | 367 | # #### 2.2.1 Performaing Corss Validation on The Training Set for Testing Different Paramter 368 | # 369 | 370 | # In[10]: 371 | 372 | ## Testing SVM using Different Kernals with class weights balanced 373 | foldnum = 0 374 | fold_results = pd.DataFrame() 375 | criterion=[ 'gini','entropy'] 376 | min_samples_leaf = [2, 5, 50] 377 | max_depth = [1,6,12] 378 | 379 | 380 | for cri in criterion: 381 | for leaf in min_samples_leaf: 382 | for depth in max_depth: 383 | foldnum = 0 384 | clf = tree.DecisionTreeClassifier(criterion=cri,min_samples_leaf=leaf,max_depth=depth,random_state=20160121,class_weight="balanced") 385 | for train, test in cross_validation.KFold(len(train_data_scaled_1), n_folds=5,shuffle=True,random_state=20160202): 386 | [ids_tr_data, ids_te_data, ids_tr_target, ids_te_target] = folds_to_split(train_data_scaled_1,train_target_decoded,train, test) 387 | clf.fit(ids_tr_data, ids_tr_target[0]) 388 | clf_predict = clf.predict(ids_te_data) 389 | 390 | fold_results.loc[foldnum, 'Accuracy'] = clf.score(ids_te_data, ids_te_target) 391 | foldnum+=1 392 | print "criterion:",cri 393 | print "min_samples_leaf:",leaf 394 | print "max_depth:",depth 395 | print fold_results.mean() 396 | print "\n" 397 | 398 | 399 | # #### 2.2.1 Testing the IDS Model on The Test Set: 400 | 401 | # In[11]: 402 | 403 | 404 | clf_t = tree.DecisionTreeClassifier(criterion='entropy',min_samples_leaf=2,max_depth=12,random_state=20160121,class_weight="balanced") 405 | clf_t.fit(train_data_scaled_1, train_target_decoded[0]) 406 | clf_predict = clf_t.predict(test_data_scaled_1) 407 | 408 | print "Accuracy (via score):", clf_t.score(test_data_scaled_1, test_target_decoded) 409 | print metrics.classification_report(test_target_decoded, clf_predict) 410 | 411 | 412 | # ### 2.3 Using Naive Bayes Algorithm: 413 | 414 | # In[17]: 415 | 416 | from sklearn.naive_bayes import GaussianNB 417 | gnb = GaussianNB() 418 | Naive = gnb.fit(train_data_scaled_1, train_target_decoded[0]) 419 | Naive_predict = Naive.predict(test_data_scaled_1) 420 | print Naive.score(test_data_scaled_1, test_target_decoded) 421 | print metrics.classification_report(test_target_decoded, Naive_predict) 422 | print Naive.class_prior_ 423 | 424 | --------------------------------------------------------------------------------