├── README.md ├── SVM with multiclasses.ipynb └── SVM with multiclasses.py /README.md: -------------------------------------------------------------------------------- 1 | # Building-Intrusion-Detection-System 2 | Building an Intrusion Detection System for Network Security Using the KDD99 Data Set 3 | This code was written as a Jupyter notebook. It builds two different IDS models. 4 | The first is an IDS that distinguishes between the major attack categories. 5 | The second IDS distinguishes between normal connections and attacks. The data set used in this code is the KDD99 data set, which is available for public use at the UCI Machine Learning Repository. 6 | Three algorithms from the Sklearn library were used: SVM, Decision Tree, and Naive Bayes. 7 | -------------------------------------------------------------------------------- /SVM with multiclasses.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "## Preliminaries\n", 12 | "import numpy as np \n", 13 | "import matplotlib.pyplot as plt\n", 14 | "from sklearn import datasets, preprocessing,cross_validation, feature_extraction\n", 15 | "from sklearn import linear_model, svm, metrics, ensemble, tree\n", 16 | "from sklearn.decomposition import PCA\n", 17 | "import pandas as pd\n", 18 | "import urllib\n", 19 | "import csv\n", 20 | "\n", 21 | "# Helper functions\n", 22 | "def folds_to_split(data,targets,train,test):\n", 23 | " data_tr = pd.DataFrame(data).iloc[train]\n", 24 | " data_te = pd.DataFrame(data).iloc[test]\n", 25 | " labels_tr = pd.DataFrame(targets).iloc[train]\n", 26 | " labels_te = pd.DataFrame(targets).iloc[test]\n", 27 | " return [data_tr, data_te, labels_tr, labels_te]\n" 28 | ] 29 | }, 30 | { 31 | "cell_type": "markdown", 32 | "metadata": {}, 33 | "source": [ 34 | "\n" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "metadata": {}, 40 | "source": [ 41 | "## Building The 5-Class IDS Model:" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 22, 47 | "metadata": { 48 | "collapsed": false 49 | }, 50 | "outputs": [], 51 | "source": [ 52 | "#let's load the data\n", 53 | "train_data = urllib.urlopen('/home/aziz/Downloads/kddcup.data_10_percent_corrected')\n", 54 | "test_data = urllib.urlopen('/home/aziz/Downloads/corrected')\n", 55 | "\n", 56 | "#Place both datasets into dataframes\n", 57 | "train_multiclass = pd.read_csv(train_data, quotechar=',', skipinitialspace=True, names=['Duration', 'protocol_type', 'Service', 'Flag', 'src_bytes', 'dst_bytes', 'Land', 'wrong_fragment', 'Urgent', 'Hot', 'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell', 'su_attempted', 'num_root', 'num_file_creations', 'num_shells', 'num_access_files', 'num_outbound_cmds', 'is_host_login', 'is_guest_login', 'Count', 'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate', 'dst_host_srv_rerror_rate','Class'])\n", 58 | "test_multiclass = pd.read_csv(test_data, quotechar=',', skipinitialspace=True, names=['Duration', 'protocol_type', 'Service', 
'Flag', 'src_bytes', 'dst_bytes', 'Land', 'wrong_fragment', 'Urgent', 'Hot', 'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell', 'su_attempted', 'num_root', 'num_file_creations', 'num_shells', 'num_access_files', 'num_outbound_cmds', 'is_host_login', 'is_guest_login', 'Count', 'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate', 'dst_host_srv_rerror_rate','Class'])" 59 | ] 60 | }, 61 | { 62 | "cell_type": "markdown", 63 | "metadata": {}, 64 | "source": [ 65 | "\n" 66 | ] 67 | }, 68 | { 69 | "cell_type": "markdown", 70 | "metadata": {}, 71 | "source": [ 72 | "## 1 Pre-Processing The Datasets:" 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "metadata": {}, 78 | "source": [ 79 | "### 1.1 Change Labels to The Right Class: " 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": 23, 85 | "metadata": { 86 | "collapsed": true 87 | }, 88 | "outputs": [], 89 | "source": [ 90 | "## Map each of the 24 specific attack types to its general attack class\n", 91 | "train_multiclass.loc[(train_multiclass['Class'] =='smurf.')|(train_multiclass['Class'] =='neptune.') | (train_multiclass['Class'] =='back.') | (train_multiclass['Class'] =='teardrop.') |(train_multiclass['Class'] =='pod.')| (train_multiclass['Class']=='land.'),'Class'] = 'Dos'\n", 92 | "train_multiclass.loc[(train_multiclass['Class'] =='satan.')|(train_multiclass['Class'] =='ipsweep.') | (train_multiclass['Class'] =='portsweep.') | (train_multiclass['Class'] =='nmap.'),'Class'] = 'probe'\n", 93 | "train_multiclass.loc[(train_multiclass['Class'] =='spy.')|(train_multiclass['Class'] =='phf.')|(train_multiclass['Class'] =='multihop.')|(train_multiclass['Class'] =='ftp_write.') | (train_multiclass['Class'] =='imap.') | (train_multiclass['Class'] =='warezmaster.') |(train_multiclass['Class'] =='guess_passwd.')| (train_multiclass['Class']=='warezclient.'),'Class'] = 'r2l'\n", 94 | "train_multiclass.loc[(train_multiclass['Class'] =='buffer_overflow.')|(train_multiclass['Class'] =='rootkit.') | (train_multiclass['Class'] =='loadmodule.') | (train_multiclass['Class'] =='perl.'),'Class']='u2r'\n", 95 | "train_multiclass.loc[(train_multiclass['Class'] =='normal.'),'Class'] = 'normal'" 96 | ] 97 | }, 98 | { 99 | "cell_type": "markdown", 100 | "metadata": {}, 101 | "source": [ 102 | "\n" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": 24, 108 | "metadata": { 109 | "collapsed": false 110 | }, 111 | "outputs": [], 112 | "source": [ 113 | "## Map each of the 36 specific attack types in the test set to its general attack class\n", 114 | "\n", 115 | "\n", 116 | "test_multiclass.loc[(test_multiclass['Class'] =='smurf.')|(test_multiclass['Class'] =='neptune.') | \n", 117 | " (test_multiclass['Class'] =='back.') | (test_multiclass['Class'] =='teardrop.') |\n", 118 | " (test_multiclass['Class'] =='pod.')| (test_multiclass['Class']=='land.')|\n", 119 | " (test_multiclass['Class']=='apache2.')|(test_multiclass['Class']=='udpstorm.')|\n", 120 | " (test_multiclass['Class']=='processtable.')|(test_multiclass['Class']=='mailbomb.'),'Class'] = 'Dos'\n", 121 | "\n", 122 | "\n", 123 | "test_multiclass.loc[(test_multiclass['Class'] =='guess_passwd.')|(test_multiclass['Class'] 
=='ftp_write.')|\n", 124 | " (test_multiclass['Class'] =='imap.')|(test_multiclass['Class'] =='phf.') | \n", 125 | " (test_multiclass['Class'] =='multihop.') | \n", 126 | " (test_multiclass['Class'] =='warezmaster.') |(test_multiclass['Class'] =='snmpgetattack.')| \n", 127 | " (test_multiclass['Class']=='named.')|(test_multiclass['Class'] =='xlock.')|\n", 128 | " (test_multiclass['Class'] =='xsnoop.')|(test_multiclass['Class'] =='sendmail.')|\n", 129 | " (test_multiclass['Class'] =='httptunnel.')|(test_multiclass['Class'] =='worm.')|\n", 130 | " (test_multiclass['Class'] =='snmpguess.'),'Class'] = 'r2l'\n", 131 | "\n", 132 | "test_multiclass.loc[(test_multiclass['Class'] =='satan.')|(test_multiclass['Class'] =='ipsweep.') | (test_multiclass['Class'] =='portsweep.') | (test_multiclass['Class'] =='nmap.')|\n", 133 | " (test_multiclass['Class'] =='saint.')|(test_multiclass['Class'] =='mscan.'),'Class'] = 'probe'\n", 134 | "\n", 135 | "test_multiclass.loc[(test_multiclass['Class'] =='buffer_overflow.')|(test_multiclass['Class'] =='rootkit.') | \n", 136 | " (test_multiclass['Class'] =='loadmodule.') | (test_multiclass['Class'] =='xterm.')|\n", 137 | " (test_multiclass['Class'] =='sqlattack.')|(test_multiclass['Class'] =='ps.')|\n", 138 | " (test_multiclass['Class'] =='perl.'),'Class']='u2r'\n", 139 | "\n", 140 | "test_multiclass.loc[(test_multiclass['Class'] =='normal.'),'Class'] = 'normal'" 141 | ] 142 | }, 143 | { 144 | "cell_type": "markdown", 145 | "metadata": {}, 146 | "source": [ 147 | "\n" 148 | ] 149 | }, 150 | { 151 | "cell_type": "markdown", 152 | "metadata": {}, 153 | "source": [ 154 | "### 1.2 Encoding The Dataset:" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": 25, 160 | "metadata": { 161 | "collapsed": false 162 | }, 163 | "outputs": [ 164 | { 165 | "name": "stdout", 166 | "output_type": "stream", 167 | "text": [ 168 | "(494021, 118)\n", 169 | "(311029, 118)\n" 170 | ] 171 | } 172 | ], 173 | "source": [ 174 | "# Encoding The Dataset: \n", 175 | "attr_encoder = feature_extraction.DictVectorizer(sparse=False)\n", 176 | "label_encoder = preprocessing.LabelEncoder()\n", 177 | "\n", 178 | "train_data_df_m = attr_encoder.fit_transform(train_multiclass.iloc[:,:-1].T.to_dict().values())\n", 179 | "train_target_df_m= label_encoder.fit_transform(train_multiclass.iloc[:,-1])\n", 180 | "\n", 181 | "\n", 182 | "train_data_decoded_m = pd.DataFrame(train_data_df_m)\n", 183 | "train_target_decoded_m = pd.DataFrame(train_target_df_m)\n", 184 | "\n", 185 | "test_data_df_m = attr_encoder.transform(test_multiclass.iloc[:,:-1].T.to_dict().values())\n", 186 | "test_target_df_m = label_encoder.transform(test_multiclass.iloc[:,-1])\n", 187 | "\n", 188 | "test_data_decoded_m = pd.DataFrame(test_data_df_m)\n", 189 | "test_target_decoded_m = pd.DataFrame(test_target_df_m)\n", 190 | "\n", 191 | "\n", 192 | "print train_data_decoded_m.shape\n", 193 | "print test_data_decoded_m.shape" 194 | ] 195 | },
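 {
 "cell_type": "markdown",
 "metadata": {},
 "source": [
  "A quick sanity check of the encoding (an added sketch, not part of the original run): DictVectorizer one-hot encodes the three symbolic attributes (protocol_type, Service, Flag), which is why the 41 original columns become 118, and LabelEncoder maps the five class names to the integers 0-4 used in the classification reports below."
 ]
 },
 {
 "cell_type": "code",
 "execution_count": null,
 "metadata": {
  "collapsed": false
 },
 "outputs": [],
 "source": [
  "# Added sketch: inspect the fitted encoders (assumes the encoding cell above has run)\n",
  "print len(attr_encoder.feature_names_)   # number of one-hot encoded feature columns\n",
  "print label_encoder.classes_             # integer i in the reports below = classes_[i]\n"
 ]
 },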
196 | { 197 | "cell_type": "markdown", 198 | "metadata": {}, 199 | "source": [ 200 | "### 1.3 Performing Feature Reduction using PCA" 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": 29, 206 | "metadata": { 207 | "collapsed": false 208 | }, 209 | "outputs": [ 210 | { 211 | "name": "stdout", 212 | "output_type": "stream", 213 | "text": [ 214 | "(494021, 29)\n", 215 | "(311029, 29)\n" 216 | ] 217 | } 218 | ], 219 | "source": [ 220 | "#load some modules to help\n", 221 | "from mpl_toolkits.mplot3d import Axes3D\n", 222 | "from sklearn.decomposition import PCA\n", 223 | "\n", 224 | "pca2 = PCA(n_components=29).fit(train_data_decoded_m)   # fit PCA on the training set only\n", 225 | "train_data_pca2 = pca2.transform(train_data_decoded_m)\n", 226 | "test_data_pca2 = pca2.transform(test_data_decoded_m)   # reuse the training-set components\n", 227 | "\n", 228 | "train_data_pca_df2 = pd.DataFrame(train_data_pca2)\n", 229 | "test_data_pca_df2 = pd.DataFrame(test_data_pca2)\n", 230 | "\n", 231 | "print train_data_pca_df2.shape\n", 232 | "print test_data_pca_df2.shape" 233 | ] 234 | }, 235 | { 236 | "cell_type": "markdown", 237 | "metadata": {}, 238 | "source": [ 239 | "### 1.4 Normalizing the Data Sets" 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": 30, 245 | "metadata": { 246 | "collapsed": true 247 | }, 248 | "outputs": [], 249 | "source": [ 250 | "#Creating our scaler on the training set and applying it to both datasets after feature reduction\n", 251 | "standard_scaler = preprocessing.StandardScaler()\n", 252 | "train_ratio_standard_scaled_values2 = standard_scaler.fit_transform(train_data_pca_df2.values)\n", 253 | "train_data_scaled2=pd.DataFrame(train_ratio_standard_scaled_values2)\n", 254 | "\n", 255 | "test_ratio_standard_scaled_values2 = standard_scaler.transform(test_data_pca_df2.values)\n", 256 | "test_data_scaled2=pd.DataFrame(test_ratio_standard_scaled_values2)" 257 | ] 258 | }, 259 | { 260 | "cell_type": "markdown", 261 | "metadata": {}, 262 | "source": [ 263 | "\n" 264 | ] 265 | }, 266 | { 267 | "cell_type": "markdown", 268 | "metadata": {}, 269 | "source": [ 270 | "## 2 Classification:" 271 | ] 272 | }, 273 | { 274 | "cell_type": "markdown", 275 | "metadata": {}, 276 | "source": [ 277 | "### 2.1 Using SVM Algorithm" 278 | ] 279 | }, 280 | { 281 | "cell_type": "code", 282 | "execution_count": 22, 283 | "metadata": { 284 | "collapsed": false 285 | }, 286 | "outputs": [ 287 | { 288 | "name": "stdout", 289 | "output_type": "stream", 290 | "text": [ 291 | "0.88830623511\n", 292 | " precision recall f1-score support\n", 293 | "\n", 294 | " 0 0.95 0.98 0.96 229853\n", 295 | " 1 0.80 0.85 0.82 60593\n", 296 | " 2 0.00 0.00 0.00 4166\n", 297 | " 3 0.01 0.00 0.00 16347\n", 298 | " 4 0.00 0.00 0.00 70\n", 299 | "\n", 300 | "avg / total 0.86 0.89 0.87 311029\n", 301 | "\n", 302 | "Number of support vectors for each class [ 831 5029 89 183 22]\n", 303 | "[[ 3.06168543e-03 -2.62821770e-02 2.92121218e-02 ..., 5.25927608e+00\n", 304 | " -1.17429443e+00 -6.21666293e+00]\n", 305 | " [ 3.06168545e-03 -2.62823594e-02 3.05702387e-02 ..., 4.37957870e+00\n", 306 | " -3.62719957e+00 -6.25108464e+00]\n", 307 | " [ 2.01738062e-03 -2.62830927e-02 3.67396697e-02 ..., -6.92651351e+00\n", 308 | " -4.77964921e+00 -4.82940529e+00]\n", 309 | " ..., \n", 310 | " [ 3.05763774e-03 -2.61610300e-02 2.85855914e-02 ..., 6.05223733e+00\n", 311 | " -4.15001874e+00 -4.24851284e+00]\n", 312 | " [ 1.53772805e-03 3.08304779e-02 -4.14853393e-01 ..., 1.90579151e+01\n", 313 | " -1.03848798e+01 -4.81560377e+00]\n", 314 | " [ 6.97832472e-04 1.00867055e-01 -3.37710746e-02 ..., 8.80253938e+00\n", 315 | " 8.42640634e-01 -3.81469791e+00]]\n" 316 | ] 317 | } 318 | ], 319 | "source": [ 320 | "#Train a linear SVM with balanced class weights and evaluate it on the test set\n", 321 | "clf = svm.SVC(kernel='linear',class_weight=\"balanced\", max_iter=100000000)\n", 322 | "clf.fit(train_data_scaled2, train_target_decoded_m[0])\n", 323 | "clf_predict = clf.predict(test_data_scaled2)\n", 324 | "print clf.score(test_data_scaled2, test_target_decoded_m)\n", 325 | "print metrics.classification_report(test_target_decoded_m, clf_predict)\n" 326 | ] 327 | },
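 {
 "cell_type": "markdown",
 "metadata": {},
 "source": [
  "The report above shows near-zero precision and recall for the minority classes (probe, r2l, u2r); a confusion matrix makes the per-class errors explicit. This cell is an added sketch, not part of the original run."
 ]
 },
 {
 "cell_type": "code",
 "execution_count": null,
 "metadata": {
  "collapsed": false
 },
 "outputs": [],
 "source": [
  "# Added sketch: per-class error breakdown for the SVM above\n",
  "# (assumes clf_predict and test_target_decoded_m from the previous cells)\n",
  "print metrics.confusion_matrix(test_target_decoded_m, clf_predict)\n"
 ]
 },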
"### 2.2 Using Decision Trees Algorithm:" 333 | ] 334 | }, 335 | { 336 | "cell_type": "markdown", 337 | "metadata": {}, 338 | "source": [ 339 | "#### 2.2.1 Performaing Corss Validation on The Training Set for Testing Different Paramter" 340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": 37, 345 | "metadata": { 346 | "collapsed": false 347 | }, 348 | "outputs": [ 349 | { 350 | "name": "stdout", 351 | "output_type": "stream", 352 | "text": [ 353 | "criterion: gini\n", 354 | "min_samples_leaf: 5\n", 355 | "max_depth: 6\n", 356 | "Accuracy 0.998128\n", 357 | "dtype: float64\n", 358 | "\n", 359 | "\n", 360 | "criterion: gini\n", 361 | "min_samples_leaf: 5\n", 362 | "max_depth: 12\n", 363 | "Accuracy 0.999273\n", 364 | "dtype: float64\n", 365 | "\n", 366 | "\n", 367 | "criterion: gini\n", 368 | "min_samples_leaf: 10\n", 369 | "max_depth: 6\n", 370 | "Accuracy 0.998146\n", 371 | "dtype: float64\n", 372 | "\n", 373 | "\n", 374 | "criterion: gini\n", 375 | "min_samples_leaf: 10\n", 376 | "max_depth: 12\n", 377 | "Accuracy 0.999263\n", 378 | "dtype: float64\n", 379 | "\n", 380 | "\n", 381 | "criterion: entropy\n", 382 | "min_samples_leaf: 5\n", 383 | "max_depth: 6\n", 384 | "Accuracy 0.998889\n", 385 | "dtype: float64\n", 386 | "\n", 387 | "\n", 388 | "criterion: entropy\n", 389 | "min_samples_leaf: 5\n", 390 | "max_depth: 12\n", 391 | "Accuracy 0.99946\n", 392 | "dtype: float64\n", 393 | "\n", 394 | "\n", 395 | "criterion: entropy\n", 396 | "min_samples_leaf: 10\n", 397 | "max_depth: 6\n", 398 | "Accuracy 0.998911\n", 399 | "dtype: float64\n", 400 | "\n", 401 | "\n", 402 | "criterion: entropy\n", 403 | "min_samples_leaf: 10\n", 404 | "max_depth: 12\n", 405 | "Accuracy 0.999375\n", 406 | "dtype: float64\n", 407 | "\n", 408 | "\n" 409 | ] 410 | } 411 | ], 412 | "source": [ 413 | "## Testing SVM using Different Kernals with class weights balanced\n", 414 | "foldnum = 0\n", 415 | "fold_results = pd.DataFrame()\n", 416 | "criterion=[ 'gini','entropy']\n", 417 | "min_samples_leaf = [5,10]\n", 418 | "max_depth = [6,12]\n", 419 | "\n", 420 | "\n", 421 | "for cri in criterion:\n", 422 | " for leaf in min_samples_leaf:\n", 423 | " for depth in max_depth:\n", 424 | " foldnum = 0\n", 425 | " clf3 = tree.DecisionTreeClassifier(criterion=cri,min_samples_leaf=leaf,max_depth=depth,random_state=20160121,class_weight=\"balanced\")\n", 426 | " for train, test in cross_validation.KFold(len(train_data_scaled2), n_folds=5,shuffle=True,random_state=20160202): \n", 427 | " [ids_tr_data, ids_te_data, ids_tr_target, ids_te_target] = folds_to_split(train_data_scaled_1,train_target_decoded,train, test)\n", 428 | " clf3.fit(ids_tr_data, ids_tr_target[0])\n", 429 | " fold_results.loc[foldnum, 'Accuracy'] = clf3.score(ids_te_data, ids_te_target)\n", 430 | " foldnum+=1 \n", 431 | " print \"criterion:\",cri\n", 432 | " print \"min_samples_leaf:\",leaf\n", 433 | " print \"max_depth:\",depth\n", 434 | " print fold_results.mean()\n", 435 | " print \"\\n\"" 436 | ] 437 | }, 438 | { 439 | "cell_type": "markdown", 440 | "metadata": {}, 441 | "source": [ 442 | "#### 2.2.1 Testing the IDS Model on The Test Set:" 443 | ] 444 | }, 445 | { 446 | "cell_type": "code", 447 | "execution_count": 46, 448 | "metadata": { 449 | "collapsed": false 450 | }, 451 | "outputs": [ 452 | { 453 | "name": "stdout", 454 | "output_type": "stream", 455 | "text": [ 456 | "Accuracy : 0.848322825203\n", 457 | " precision recall f1-score support\n", 458 | "\n", 459 | " 0 0.89 0.96 0.92 229853\n", 460 | " 1 0.77 0.69 0.73 60593\n", 461 | " 2 
0.11 0.15 0.13 4166\n", 462 | " 3 0.35 0.03 0.06 16347\n", 463 | " 4 0.00 0.00 0.00 70\n", 464 | "\n", 465 | "avg / total 0.82 0.85 0.83 311029\n", 466 | "\n" 467 | ] 468 | } 469 | ], 470 | "source": [ 471 | "\n", 472 | "from sklearn import tree\n", 473 | "\n", 474 | "clf3 = tree.DecisionTreeClassifier(criterion = 'gini', min_samples_leaf = 5, max_depth=12,random_state=20160121,class_weight=\"balanced\")\n", 475 | "clf3.fit(train_data_scaled2, train_target_decoded_m[0])\n", 476 | "clf3_predict = clf3.predict(test_data_scaled2)\n", 477 | "print \"Accuracy :\", clf3.score(test_data_scaled2, test_target_decoded_m)\n", 478 | "print metrics.classification_report(test_target_decoded_m, clf3_predict)\n" 479 | ] 480 | }, 481 | { 482 | "cell_type": "markdown", 483 | "metadata": {}, 484 | "source": [ 485 | "### 2.3 Using Naive Bayes Algorithm:" 486 | ] 487 | }, 488 | { 489 | "cell_type": "code", 490 | "execution_count": 35, 491 | "metadata": { 492 | "collapsed": false 493 | }, 494 | "outputs": [ 495 | { 496 | "name": "stdout", 497 | "output_type": "stream", 498 | "text": [ 499 | "0.749843262204\n", 500 | " precision recall f1-score support\n", 501 | "\n", 502 | " 0 0.96 0.78 0.86 229853\n", 503 | " 1 0.44 0.87 0.58 60593\n", 504 | " 2 0.00 0.00 0.00 4166\n", 505 | " 3 0.26 0.08 0.12 16347\n", 506 | " 4 0.75 0.09 0.15 70\n", 507 | "\n", 508 | "avg / total 0.81 0.75 0.76 311029\n", 509 | "\n" 510 | ] 511 | } 512 | ], 513 | "source": [ 514 | "from sklearn.naive_bayes import GaussianNB\n", 515 | "gnb = GaussianNB()\n", 516 | "gnb_fit = gnb.fit(train_data_scaled2, train_target_decoded_m[0])\n", 517 | "gnb_predict = gnb_fit.predict(test_data_scaled2)\n", 518 | "print gnb_fit.score(test_data_scaled2, test_target_decoded_m)\n", 519 | "print metrics.classification_report(test_target_decoded_m, gnb_predict)" 520 | ] 521 | }, 522 | { 523 | "cell_type": "markdown", 524 | "metadata": {}, 525 | "source": [ 526 | "\n" 527 | ] 528 | }, 529 | { 530 | "cell_type": "markdown", 531 | "metadata": {}, 532 | "source": [ 533 | "\n" 534 | ] 535 | }, 536 | { 537 | "cell_type": "markdown", 538 | "metadata": {}, 539 | "source": [ 540 | "\n", 541 | "\n" 542 | ] 543 | }, 544 | { 545 | "cell_type": "markdown", 546 | "metadata": {}, 547 | "source": [ 548 | "\n", 549 | "\n", 550 | "\n", 551 | "\n", 552 | "\n", 553 | "\n", 554 | "\n" 555 | ] 556 | }, 557 | { 558 | "cell_type": "markdown", 559 | "metadata": {}, 560 | "source": [ 561 | "# Building The 2-Class IDS Model:" 562 | ] 563 | }, 564 | { 565 | "cell_type": "code", 566 | "execution_count": 2, 567 | "metadata": { 568 | "collapsed": true 569 | }, 570 | "outputs": [], 571 | "source": [ 572 | "#let's load the data\n", 573 | "train_data_1 = urllib.urlopen('/home/aziz/Downloads/kddcup.data_10_percent_corrected')\n", 574 | "test_data_1 = urllib.urlopen('/home/aziz/Downloads/corrected')\n", 575 | "\n", 576 | "#Place both datasets into dataframes\n", 577 | "train_class = pd.read_csv(train_data_1, quotechar=',', skipinitialspace=True, names=['Duration', 'protocol_type', 'Service', 'Flag', 'src_bytes', 'dst_bytes', 'Land', 'wrong_fragment', 'Urgent', 'Hot', 'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell', 'su_attempted', 'num_root', 'num_file_creations', 'num_shells', 'num_access_files', 'num_outbound_cmds', 'is_host_login', 'is_guest_login', 'Count', 'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count', 
'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate', 'dst_host_srv_rerror_rate','Class'])\n", 578 | "test_class = pd.read_csv(test_data_1, quotechar=',', skipinitialspace=True, names=['Duration', 'protocol_type', 'Service', 'Flag', 'src_bytes', 'dst_bytes', 'Land', 'wrong_fragment', 'Urgent', 'Hot', 'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell', 'su_attempted', 'num_root', 'num_file_creations', 'num_shells', 'num_access_files', 'num_outbound_cmds', 'is_host_login', 'is_guest_login', 'Count', 'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate', 'dst_host_srv_rerror_rate','Class'])" 579 | ] 580 | }, 581 | { 582 | "cell_type": "markdown", 583 | "metadata": {}, 584 | "source": [ 585 | "\n" 586 | ] 587 | }, 588 | { 589 | "cell_type": "markdown", 590 | "metadata": {}, 591 | "source": [ 592 | "## 1 Pre-Processing The Datasets:" 593 | ] 594 | }, 595 | { 596 | "cell_type": "markdown", 597 | "metadata": {}, 598 | "source": [ 599 | "### 1.1 Convert Labels to The Right Class" 600 | ] 601 | }, 602 | { 603 | "cell_type": "code", 604 | "execution_count": 3, 605 | "metadata": { 606 | "collapsed": false 607 | }, 608 | "outputs": [], 609 | "source": [ 610 | "train_class.loc[(train_class['Class'] !='normal.'),'Class'] = 'attack'\n", 611 | "train_class.loc[(train_class['Class'] =='normal.'),'Class'] = 'normal'\n", 612 | "\n", 613 | "test_class.loc[(test_class['Class'] !='normal.'),'Class'] = 'attack'\n", 614 | "test_class.loc[(test_class['Class'] =='normal.'),'Class'] = 'normal'" 615 | ] 616 | }, 617 | { 618 | "cell_type": "markdown", 619 | "metadata": {}, 620 | "source": [ 621 | "\n" 622 | ] 623 | }, 624 | { 625 | "cell_type": "markdown", 626 | "metadata": {}, 627 | "source": [ 628 | "### 1.2 Encoding The Dataset" 629 | ] 630 | }, 631 | { 632 | "cell_type": "code", 633 | "execution_count": 4, 634 | "metadata": { 635 | "collapsed": false 636 | }, 637 | "outputs": [ 638 | { 639 | "name": "stdout", 640 | "output_type": "stream", 641 | "text": [ 642 | "(494021, 118)\n", 643 | "(311029, 118)\n" 644 | ] 645 | } 646 | ], 647 | "source": [ 648 | "# Encoding The Dataset: \n", 649 | "attr_encoder = feature_extraction.DictVectorizer(sparse=False)\n", 650 | "label_encoder = preprocessing.LabelEncoder()\n", 651 | "\n", 652 | "train_data_df = attr_encoder.fit_transform(train_class.iloc[:,:-1].T.to_dict().values())\n", 653 | "train_target_df= label_encoder.fit_transform(train_class.iloc[:,-1])\n", 654 | "\n", 655 | "\n", 656 | "train_data_decoded = pd.DataFrame(train_data_df)\n", 657 | "train_target_decoded = pd.DataFrame(train_target_df)\n", 658 | "\n", 659 | "test_data_df= attr_encoder.transform(test_class.iloc[:,:-1].T.to_dict().values())\n", 660 | "test_target_df= label_encoder.transform(test_class.iloc[:,-1])\n", 661 | "\n", 662 | "test_data_decoded = pd.DataFrame(test_data_df)\n", 663 | "test_target_decoded = pd.DataFrame(test_target_df)\n", 664 | "\n", 665 | "\n", 666 | "print train_data_decoded.shape\n", 667 | "print test_data_decoded.shape" 668 | ] 669 | }, 670 | { 671 | "cell_type": "markdown", 672 | "metadata": {}, 673 | "source": [ 674 | "\n" 675 | 
] 676 | }, 677 | { 678 | "cell_type": "markdown", 679 | "metadata": {}, 680 | "source": [ 681 | "### 1.3 Feature Reduction Using PCA" 682 | ] 683 | }, 684 | { 685 | "cell_type": "code", 686 | "execution_count": 9, 687 | "metadata": { 688 | "collapsed": false 689 | }, 690 | "outputs": [ 691 | { 692 | "name": "stdout", 693 | "output_type": "stream", 694 | "text": [ 695 | "(494021, 29)\n", 696 | "(311029, 29)\n" 697 | ] 698 | } 699 | ], 700 | "source": [ 701 | "#load some modules to help\n", 702 | "from mpl_toolkits.mplot3d import Axes3D\n", 703 | "from sklearn.decomposition import PCA\n", 704 | "\n", 705 | "pca_1 = PCA(n_components=29).fit(train_data_decoded)   # fit PCA on the training set only\n", 706 | "train_data_pca_1 = pca_1.transform(train_data_decoded)\n", 707 | "test_data_pca_1 = pca_1.transform(test_data_decoded)   # reuse the training-set components\n", 708 | "\n", 709 | "train_data_pca_df_1 = pd.DataFrame(train_data_pca_1)\n", 710 | "test_data_pca_df_1 = pd.DataFrame(test_data_pca_1)\n", 711 | "\n", 712 | "print train_data_pca_df_1.shape\n", 713 | "print test_data_pca_df_1.shape" 714 | ] 715 | }, 716 | { 717 | "cell_type": "markdown", 718 | "metadata": {}, 719 | "source": [ 720 | "\n" 721 | ] 722 | }, 723 | { 724 | "cell_type": "markdown", 725 | "metadata": {}, 726 | "source": [ 727 | "### 1.4 Normalizing The Datasets" 728 | ] 729 | }, 730 | { 731 | "cell_type": "code", 732 | "execution_count": 10, 733 | "metadata": { 734 | "collapsed": true 735 | }, 736 | "outputs": [], 737 | "source": [ 738 | "#Creating our scaler on the training set and applying it to both datasets after feature reduction\n", 739 | "standard_scaler = preprocessing.StandardScaler()\n", 740 | "train_ratio_standard_scaled_values = standard_scaler.fit_transform(train_data_pca_df_1.values)\n", 741 | "train_data_scaled_1=pd.DataFrame(train_ratio_standard_scaled_values)\n", 742 | "\n", 743 | "test_ratio_standard_scaled_values = standard_scaler.transform(test_data_pca_df_1.values)\n", 744 | "test_data_scaled_1=pd.DataFrame(test_ratio_standard_scaled_values)" 745 | ] 746 | }, 747 | { 748 | "cell_type": "markdown", 749 | "metadata": {}, 750 | "source": [ 751 | "\n" 752 | ] 753 | }, 754 | { 755 | "cell_type": "markdown", 756 | "metadata": {}, 757 | "source": [ 758 | "## 2 Classifying The Data Set" 759 | ] 760 | }, 761 | { 762 | "cell_type": "markdown", 763 | "metadata": {}, 764 | "source": [ 765 | "### 2.1 Using SVM Algorithm:" 766 | ] 767 | }, 768 | { 769 | "cell_type": "code", 770 | "execution_count": 8, 771 | "metadata": { 772 | "collapsed": false 773 | }, 774 | "outputs": [ 775 | { 776 | "name": "stdout", 777 | "output_type": "stream", 778 | "text": [ 779 | "0.922396303882\n", 780 | " precision recall f1-score support\n", 781 | "\n", 782 | " 0 0.95 0.95 0.95 250436\n", 783 | " 1 0.80 0.80 0.80 60593\n", 784 | "\n", 785 | "avg / total 0.92 0.92 0.92 311029\n", 786 | "\n", 787 | "Number of support vectors for each class [1694 1688]\n", 788 | "[[ 2.78947810e-03 4.43292344e-02 -6.15165127e-03 ..., 7.07344321e-01\n", 789 | " -4.92057076e-01 -2.28549088e+00]\n", 790 | " [ 2.01738092e-03 -2.62871757e-02 6.24009662e-02 ..., -5.36280799e+00\n", 791 | " -4.30704463e+00 2.56780715e+00]\n", 792 | " [ 2.01738062e-03 -2.62830927e-02 3.67396697e-02 ..., -6.92651351e+00\n", 793 | " -4.77964921e+00 -4.82940529e+00]\n", 794 | " ..., \n", 795 | " [ 1.54481243e-03 9.93833927e-02 -1.84981710e-02 ..., -3.49703128e-01\n", 796 | " -4.14043476e-02 -1.79917829e+00]\n", 797 | " [ 2.82793121e-03 -1.56889839e-02 3.34062598e-02 ..., 7.36771477e-01\n", 798 | " 4.55919104e+00 9.46427430e-01]\n", 799 | " [ 2.82388363e-03 3.23437103e-02 3.62694976e-02 
..., 4.35657953e-01\n", 800 | " 4.61212702e+00 9.38367066e-01]]\n" 801 | ] 802 | } 803 | ], 804 | "source": [ 805 | "#Train a linear SVM and evaluate it on the test set\n", 806 | "lin = svm.SVC(kernel='linear', max_iter=100000000)\n", 807 | "lin.fit(train_data_scaled_1, train_target_decoded[0])\n", 808 | "lin_predict = lin.predict(test_data_scaled_1)\n", 809 | "print lin.score(test_data_scaled_1, test_target_decoded)\n", 810 | "print metrics.classification_report(test_target_decoded, lin_predict)\n", 811 | "print \"Number of support vectors for each class\", lin.n_support_\n", 812 | "print lin.support_vectors_" 813 | ] 814 | }, 815 | { 816 | "cell_type": "markdown", 817 | "metadata": {}, 818 | "source": [ 819 | "\n" 820 | ] 821 | }, 822 | { 823 | "cell_type": "markdown", 824 | "metadata": {}, 825 | "source": [ 826 | "### 2.2 Using Decision Trees Algorithm" 827 | ] 828 | }, 829 | { 830 | "cell_type": "markdown", 831 | "metadata": {}, 832 | "source": [ 833 | "#### 2.2.1 Performing Cross-Validation on The Training Set to Test Different Parameters\n" 834 | ] 835 | }, 836 | { 837 | "cell_type": "code", 838 | "execution_count": 10, 839 | "metadata": { 840 | "collapsed": false 841 | }, 842 | "outputs": [ 843 | { 844 | "name": "stdout", 845 | "output_type": "stream", 846 | "text": [ 847 | "criterion: gini\n", 848 | "min_samples_leaf: 2\n", 849 | "max_depth: 1\n", 850 | "Accuracy 0.969376\n", 851 | "dtype: float64\n", 852 | "\n", 853 | "\n", 854 | "criterion: gini\n", 855 | "min_samples_leaf: 2\n", 856 | "max_depth: 6\n", 857 | "Accuracy 0.998154\n", 858 | "dtype: float64\n", 859 | "\n", 860 | "\n", 861 | "criterion: gini\n", 862 | "min_samples_leaf: 2\n", 863 | "max_depth: 12\n", 864 | "Accuracy 0.999326\n", 865 | "dtype: float64\n", 866 | "\n", 867 | "\n", 868 | "criterion: gini\n", 869 | "min_samples_leaf: 5\n", 870 | "max_depth: 1\n", 871 | "Accuracy 0.969376\n", 872 | "dtype: float64\n", 873 | "\n", 874 | "\n", 875 | "criterion: gini\n", 876 | "min_samples_leaf: 5\n", 877 | "max_depth: 6\n", 878 | "Accuracy 0.998128\n", 879 | "dtype: float64\n", 880 | "\n", 881 | "\n", 882 | "criterion: gini\n", 883 | "min_samples_leaf: 5\n", 884 | "max_depth: 12\n", 885 | "Accuracy 0.999273\n", 886 | "dtype: float64\n", 887 | "\n", 888 | "\n", 889 | "criterion: gini\n", 890 | "min_samples_leaf: 50\n", 891 | "max_depth: 1\n", 892 | "Accuracy 0.969376\n", 893 | "dtype: float64\n", 894 | "\n", 895 | "\n", 896 | "criterion: gini\n", 897 | "min_samples_leaf: 50\n", 898 | "max_depth: 6\n", 899 | "Accuracy 0.997601\n", 900 | "dtype: float64\n", 901 | "\n", 902 | "\n", 903 | "criterion: gini\n", 904 | "min_samples_leaf: 50\n", 905 | "max_depth: 12\n", 906 | "Accuracy 0.998296\n", 907 | "dtype: float64\n", 908 | "\n", 909 | "\n", 910 | "criterion: entropy\n", 911 | "min_samples_leaf: 2\n", 912 | "max_depth: 1\n", 913 | "Accuracy 0.969376\n", 914 | "dtype: float64\n", 915 | "\n", 916 | "\n", 917 | "criterion: entropy\n", 918 | "min_samples_leaf: 2\n", 919 | "max_depth: 6\n", 920 | "Accuracy 0.998911\n", 921 | "dtype: float64\n", 922 | "\n", 923 | "\n", 924 | "criterion: entropy\n", 925 | "min_samples_leaf: 2\n", 926 | "max_depth: 12\n", 927 | "Accuracy 0.999547\n", 928 | "dtype: float64\n", 929 | "\n", 930 | "\n", 931 | "criterion: entropy\n", 932 | "min_samples_leaf: 5\n", 933 | "max_depth: 1\n", 934 | "Accuracy 0.969376\n", 935 | "dtype: float64\n", 936 | "\n", 937 | "\n", 938 | "criterion: entropy\n", 939 | "min_samples_leaf: 5\n", 940 | "max_depth: 6\n", 941 | "Accuracy 0.998889\n", 942 | "dtype: float64\n", 943 | "\n", 944 | "\n", 945 | "criterion: 
entropy\n", 946 | "min_samples_leaf: 5\n", 947 | "max_depth: 12\n", 948 | "Accuracy 0.99946\n", 949 | "dtype: float64\n", 950 | "\n", 951 | "\n", 952 | "criterion: entropy\n", 953 | "min_samples_leaf: 50\n", 954 | "max_depth: 1\n", 955 | "Accuracy 0.969376\n", 956 | "dtype: float64\n", 957 | "\n", 958 | "\n", 959 | "criterion: entropy\n", 960 | "min_samples_leaf: 50\n", 961 | "max_depth: 6\n", 962 | "Accuracy 0.998326\n", 963 | "dtype: float64\n", 964 | "\n", 965 | "\n", 966 | "criterion: entropy\n", 967 | "min_samples_leaf: 50\n", 968 | "max_depth: 12\n", 969 | "Accuracy 0.998646\n", 970 | "dtype: float64\n", 971 | "\n", 972 | "\n" 973 | ] 974 | } 975 | ], 976 | "source": [ 977 | "## Testing Decision Trees using different parameters with class weights balanced\n", 978 | "foldnum = 0\n", 979 | "fold_results = pd.DataFrame()\n", 980 | "criterion=[ 'gini','entropy']\n", 981 | "min_samples_leaf = [2, 5, 50]\n", 982 | "max_depth = [1,6,12]\n", 983 | "\n", 984 | "\n", 985 | "for cri in criterion:\n", 986 | " for leaf in min_samples_leaf:\n", 987 | " for depth in max_depth:\n", 988 | " foldnum = 0\n", 989 | " clf = tree.DecisionTreeClassifier(criterion=cri,min_samples_leaf=leaf,max_depth=depth,random_state=20160121,class_weight=\"balanced\")\n", 990 | " for train, test in cross_validation.KFold(len(train_data_scaled_1), n_folds=5,shuffle=True,random_state=20160202): \n", 991 | " [ids_tr_data, ids_te_data, ids_tr_target, ids_te_target] = folds_to_split(train_data_scaled_1,train_target_decoded,train, test)\n", 992 | " clf.fit(ids_tr_data, ids_tr_target[0])\n", 993 | " clf_predict = clf.predict(ids_te_data)\n", 994 | "\n", 995 | " fold_results.loc[foldnum, 'Accuracy'] = clf.score(ids_te_data, ids_te_target)\n", 996 | " foldnum+=1 \n", 997 | " print \"criterion:\",cri\n", 998 | " print \"min_samples_leaf:\",leaf\n", 999 | " print \"max_depth:\",depth\n", 1000 | " print fold_results.mean()\n", 1001 | " print \"\\n\"" 1002 | ] 1003 | }, 1004 | { 1005 | "cell_type": "markdown", 1006 | "metadata": {}, 1007 | "source": [ 1008 | "#### 2.2.2 Testing the IDS Model on The Test Set:" 1009 | ] 1010 | }, 1011 | { 1012 | "cell_type": "code", 1013 | "execution_count": 11, 1014 | "metadata": { 1015 | "collapsed": false 1016 | }, 1017 | "outputs": [ 1018 | { 1019 | "name": "stdout", 1020 | "output_type": "stream", 1021 | "text": [ 1022 | "Accuracy (via score): 0.824206102968\n", 1023 | " precision recall f1-score support\n", 1024 | "\n", 1025 | " 0 0.92 0.86 0.89 250436\n", 1026 | " 1 0.54 0.67 0.60 60593\n", 1027 | "\n", 1028 | "avg / total 0.84 0.82 0.83 311029\n", 1029 | "\n" 1030 | ] 1031 | } 1032 | ], 1033 | "source": [ 1034 | "\n", 1035 | "clf_t = tree.DecisionTreeClassifier(criterion='entropy',min_samples_leaf=2,max_depth=12,random_state=20160121,class_weight=\"balanced\")\n", 1036 | "clf_t.fit(train_data_scaled_1, train_target_decoded[0])\n", 1037 | "clf_predict = clf_t.predict(test_data_scaled_1)\n", 1038 | "\n", 1039 | "print \"Accuracy (via score):\", clf_t.score(test_data_scaled_1, test_target_decoded)\n", 1040 | "print metrics.classification_report(test_target_decoded, clf_predict)\n" 1041 | ] 1042 | }, 1043 | { 1044 | "cell_type": "markdown", 1045 | "metadata": { 1046 | "collapsed": true 1047 | }, 1048 | "source": [ 1049 | "### 2.3 Using Naive Bayes Algorithm:" 1050 | ] 1051 | }, 1052 | { 1053 | "cell_type": "code", 1054 | "execution_count": 17, 1055 | "metadata": { 1056 | "collapsed": false 1057 | }, 1058 | "outputs": [ 1059 | { 1060 | "name": "stdout", 1061 | "output_type": "stream", 1062 | "text": [ 1063 | 
"0.822302743474\n", 1064 | " precision recall f1-score support\n", 1065 | "\n", 1066 | " 0 0.87 0.91 0.89 250436\n", 1067 | " 1 0.55 0.45 0.50 60593\n", 1068 | "\n", 1069 | "avg / total 0.81 0.82 0.81 311029\n", 1070 | "\n", 1071 | "[ 0.80308934 0.19691066]\n" 1072 | ] 1073 | } 1074 | ], 1075 | "source": [ 1076 | "from sklearn.naive_bayes import GaussianNB\n", 1077 | "gnb = GaussianNB()\n", 1078 | "Naive = gnb.fit(train_data_scaled_1, train_target_decoded[0])\n", 1079 | "Naive_predict = Naive.predict(test_data_scaled_1)\n", 1080 | "print Naive.score(test_data_scaled_1, test_target_decoded)\n", 1081 | "print metrics.classification_report(test_target_decoded, Naive_predict)\n", 1082 | "print Naive.class_prior_" 1083 | ] 1084 | } 1085 | ], 1086 | "metadata": { 1087 | "kernelspec": { 1088 | "display_name": "Python 2", 1089 | "language": "python", 1090 | "name": "python2" 1091 | }, 1092 | "language_info": { 1093 | "codemirror_mode": { 1094 | "name": "ipython", 1095 | "version": 2 1096 | }, 1097 | "file_extension": ".py", 1098 | "mimetype": "text/x-python", 1099 | "name": "python", 1100 | "nbconvert_exporter": "python", 1101 | "pygments_lexer": "ipython2", 1102 | "version": "2.7.11" 1103 | } 1104 | }, 1105 | "nbformat": 4, 1106 | "nbformat_minor": 0 1107 | } 1108 | -------------------------------------------------------------------------------- /SVM with multiclasses.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | ## Preliminaries 7 | import numpy as np 8 | import matplotlib.pyplot as plt 9 | from sklearn import datasets, preprocessing,cross_validation, feature_extraction 10 | from sklearn import linear_model, svm, metrics, ensemble, tree, ensemble 11 | from sklearn.decomposition import PCA 12 | import pandas as pd 13 | import urllib 14 | import csv 15 | 16 | # Helper functions 17 | def folds_to_split(data,targets,train,test): 18 | data_tr = pd.DataFrame(data).iloc[train] 19 | data_te = pd.DataFrame(data).iloc[test] 20 | labels_tr = pd.DataFrame(targets).iloc[train] 21 | labels_te = pd.DataFrame(targets).iloc[test] 22 | return [data_tr, data_te, labels_tr, labels_te] 23 | 24 | 25 | # 26 | # 27 | 28 | # ## Using SVM To Build The Model 5 class labeling: 29 | 30 | # In[22]: 31 | 32 | #let's load the data 33 | train_data = urllib.urlopen('/home/aziz/Downloads/kddcup.data_10_percent_corrected') 34 | test_data = urllib.urlopen('/home/aziz/Downloads/corrected') 35 | 36 | #Place both dataset into a dataframe 37 | train_multiclass = pd.read_csv(train_data, quotechar=',', skipinitialspace=True, names=['Duration', 'protocol_type', 'Service', 'Flag', 'src_bytes', 'dst_bytes', 'Land', 'wrong_fragment', 'Urgent', 'Hot', 'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell', 'su_attempted', 'num_root', 'num_file_creations', 'num_shells', 'num_access_files', 'num_outbound_cmds', 'is_host_login', 'is_guest_login', 'Count', 'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate', 'dst_host_srv_rerror_rate','Class']) 38 | test_multiclass = pd.read_csv(test_data, quotechar=',', skipinitialspace=True, names=['Duration', 'protocol_type', 'Service', 'Flag', 'src_bytes', 'dst_bytes', 'Land', 'wrong_fragment', 'Urgent', 
'Hot', 'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell', 'su_attempted', 'num_root', 'num_file_creations', 'num_shells', 'num_access_files', 'num_outbound_cmds', 'is_host_login', 'is_guest_login', 'Count', 'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate', 'dst_host_srv_rerror_rate','Class']) 39 | 40 | 41 | # 42 | # 43 | 44 | # ## 1 Pre-Processing The Datasets: 45 | 46 | # ### 1.1 Change Labels to The Right Class: 47 | 48 | # In[23]: 49 | 50 | ## Map each of the 24 specific attack types to its general attack class 51 | train_multiclass.loc[(train_multiclass['Class'] =='smurf.')|(train_multiclass['Class'] =='neptune.') | (train_multiclass['Class'] =='back.') | (train_multiclass['Class'] =='teardrop.') |(train_multiclass['Class'] =='pod.')| (train_multiclass['Class']=='land.'),'Class'] = 'Dos' 52 | train_multiclass.loc[(train_multiclass['Class'] =='satan.')|(train_multiclass['Class'] =='ipsweep.') | (train_multiclass['Class'] =='portsweep.') | (train_multiclass['Class'] =='nmap.'),'Class'] = 'probe' 53 | train_multiclass.loc[(train_multiclass['Class'] =='spy.')|(train_multiclass['Class'] =='phf.')|(train_multiclass['Class'] =='multihop.')|(train_multiclass['Class'] =='ftp_write.') | (train_multiclass['Class'] =='imap.') | (train_multiclass['Class'] =='warezmaster.') |(train_multiclass['Class'] =='guess_passwd.')| (train_multiclass['Class']=='warezclient.'),'Class'] = 'r2l' 54 | train_multiclass.loc[(train_multiclass['Class'] =='buffer_overflow.')|(train_multiclass['Class'] =='rootkit.') | (train_multiclass['Class'] =='loadmodule.') | (train_multiclass['Class'] =='perl.'),'Class']='u2r' 55 | train_multiclass.loc[(train_multiclass['Class'] =='normal.'),'Class'] = 'normal' 56 | 57 | 58 | # 59 | # 60 | 61 | # In[24]: 62 | 63 | ## Map each of the 36 specific attack types in the test set to its general attack class 64 | 65 | 66 | test_multiclass.loc[(test_multiclass['Class'] =='smurf.')|(test_multiclass['Class'] =='neptune.') | 67 | (test_multiclass['Class'] =='back.') | (test_multiclass['Class'] =='teardrop.') | 68 | (test_multiclass['Class'] =='pod.')| (test_multiclass['Class']=='land.')| 69 | (test_multiclass['Class']=='apache2.')|(test_multiclass['Class']=='udpstorm.')| 70 | (test_multiclass['Class']=='processtable.')|(test_multiclass['Class']=='mailbomb.'),'Class'] = 'Dos' 71 | 72 | 73 | test_multiclass.loc[(test_multiclass['Class'] =='guess_passwd.')|(test_multiclass['Class'] =='ftp_write.')| 74 | (test_multiclass['Class'] =='imap.')|(test_multiclass['Class'] =='phf.') | 75 | (test_multiclass['Class'] =='multihop.') | 76 | (test_multiclass['Class'] =='warezmaster.') |(test_multiclass['Class'] =='snmpgetattack.')| 77 | (test_multiclass['Class']=='named.')|(test_multiclass['Class'] =='xlock.')| 78 | (test_multiclass['Class'] =='xsnoop.')|(test_multiclass['Class'] =='sendmail.')| 79 | (test_multiclass['Class'] =='httptunnel.')|(test_multiclass['Class'] =='worm.')| 80 | (test_multiclass['Class'] =='snmpguess.'),'Class'] = 'r2l' 81 | 82 | test_multiclass.loc[(test_multiclass['Class'] =='satan.')|(test_multiclass['Class'] =='ipsweep.') | (test_multiclass['Class'] =='portsweep.') | (test_multiclass['Class'] =='nmap.')| 83 | (test_multiclass['Class'] 
=='saint.')|(test_multiclass['Class'] =='mscan.'),'Class'] = 'probe' 84 | 85 | test_multiclass.loc[(test_multiclass['Class'] =='buffer_overflow.')|(test_multiclass['Class'] =='rootkit.') | 86 | (test_multiclass['Class'] =='loadmodule.') | (test_multiclass['Class'] =='xterm.')| 87 | (test_multiclass['Class'] =='sqlattack.')|(test_multiclass['Class'] =='ps.')| 88 | (test_multiclass['Class'] =='perl.'),'Class']='u2r' 89 | 90 | test_multiclass.loc[(test_multiclass['Class'] =='normal.'),'Class'] = 'normal' 91 | 92 | 93 | # 94 | # 95 | 96 | # ### 1.2 Encoding The Dataset: 97 | 98 | # In[25]: 99 | 100 | # Encoding The Dataset: 101 | attr_encoder = feature_extraction.DictVectorizer(sparse=False) 102 | label_encoder = preprocessing.LabelEncoder() 103 | 104 | train_data_df_m = attr_encoder.fit_transform(train_multiclass.iloc[:,:-1].T.to_dict().values()) 105 | train_target_df_m= label_encoder.fit_transform(train_multiclass.iloc[:,-1]) 106 | 107 | 108 | train_data_decoded_m = pd.DataFrame(train_data_df_m) 109 | train_target_decoded_m = pd.DataFrame(train_target_df_m) 110 | 111 | test_data_df_m = attr_encoder.transform(test_multiclass.iloc[:,:-1].T.to_dict().values()) 112 | test_target_df_m = label_encoder.transform(test_multiclass.iloc[:,-1]) 113 | 114 | test_data_decoded_m = pd.DataFrame(test_data_df_m) 115 | test_target_decoded_m = pd.DataFrame(test_target_df_m) 116 | 117 | 118 | print train_data_decoded_m.shape 119 | print test_data_decoded_m.shape 120 | 121 | 122 | # ### 1.3 Performing Feature Reduction using PCA 123 | 124 | # In[29]: 125 | 126 | #load some modules to help 127 | from mpl_toolkits.mplot3d import Axes3D 128 | from sklearn.decomposition import PCA 129 | 130 | pca2 = PCA(n_components=29).fit(train_data_decoded_m)   # fit PCA on the training set only 131 | train_data_pca2 = pca2.transform(train_data_decoded_m) 132 | test_data_pca2 = pca2.transform(test_data_decoded_m)   # reuse the training-set components 133 | 134 | train_data_pca_df2 = pd.DataFrame(train_data_pca2) 135 | test_data_pca_df2 = pd.DataFrame(test_data_pca2) 136 | 137 | print train_data_pca_df2.shape 138 | print test_data_pca_df2.shape 139 | 140 | 141 | # ### 1.4 Normalizing the Data Sets 142 | 143 | # In[30]: 144 | 145 | #Creating our scaler on the training set and applying it to both datasets after feature reduction 146 | standard_scaler = preprocessing.StandardScaler() 147 | train_ratio_standard_scaled_values2 = standard_scaler.fit_transform(train_data_pca_df2.values) 148 | train_data_scaled2=pd.DataFrame(train_ratio_standard_scaled_values2) 149 | 150 | test_ratio_standard_scaled_values2 = standard_scaler.transform(test_data_pca_df2.values) 151 | test_data_scaled2=pd.DataFrame(test_ratio_standard_scaled_values2) 152 | 153 | 154 | # 155 | # 156 | 157 | # ## 2 Classification: 158 | 159 | # ### 2.1 Using SVM Algorithm 160 | 161 | # In[22]: 162 | 163 | #Train a linear SVM with balanced class weights and evaluate it on the test set 164 | clf = svm.SVC(kernel='linear',class_weight="balanced", max_iter=100000000) 165 | clf.fit(train_data_scaled2, train_target_decoded_m[0]) 166 | clf_predict = clf.predict(test_data_scaled2) 167 | print clf.score(test_data_scaled2, test_target_decoded_m) 168 | print metrics.classification_report(test_target_decoded_m, clf_predict) 169 | 170 | 171 | # ### 2.2 Using Decision Trees Algorithm: 172 | 173 | # #### 2.2.1 Performing Cross-Validation on The Training Set to Test Different Parameters 174 | 175 | # In[37]: 176 | 177 | ## Testing Decision Trees using different parameters with class weights balanced 178 | foldnum = 0 179 | fold_results = pd.DataFrame() 180 | criterion=[ 'gini','entropy'] 181 | min_samples_leaf = [5,10] 182 | max_depth = [6,12] 183 | 184 | 185 | for cri in 
criterion: 186 | for leaf in min_samples_leaf: 187 | for depth in max_depth: 188 | foldnum = 0 189 | clf3 = tree.DecisionTreeClassifier(criterion=cri,min_samples_leaf=leaf,max_depth=depth,random_state=20160121,class_weight="balanced") 190 | for train, test in cross_validation.KFold(len(train_data_scaled2), n_folds=5,shuffle=True,random_state=20160202): 191 | [ids_tr_data, ids_te_data, ids_tr_target, ids_te_target] = folds_to_split(train_data_scaled2,train_target_decoded_m,train, test) 192 | clf3.fit(ids_tr_data, ids_tr_target[0]) 193 | fold_results.loc[foldnum, 'Accuracy'] = clf3.score(ids_te_data, ids_te_target) 194 | foldnum+=1 195 | print "criterion:",cri 196 | print "min_samples_leaf:",leaf 197 | print "max_depth:",depth 198 | print fold_results.mean() 199 | print "\n" 200 | 201 | 202 | # #### 2.2.2 Testing the IDS Model on The Test Set: 203 | 204 | # In[46]: 205 | 206 | 207 | from sklearn import tree 208 | 209 | clf3 = tree.DecisionTreeClassifier(criterion = 'gini', min_samples_leaf = 5, max_depth=12,random_state=20160121,class_weight="balanced") 210 | clf3.fit(train_data_scaled2, train_target_decoded_m[0]) 211 | clf3_predict = clf3.predict(test_data_scaled2) 212 | print "Accuracy :", clf3.score(test_data_scaled2, test_target_decoded_m) 213 | print metrics.classification_report(test_target_decoded_m, clf3_predict) 214 | 215 | 216 | # ### 2.3 Using Naive Bayes Algorithm: 217 | 218 | # In[35]: 219 | 220 | from sklearn.naive_bayes import GaussianNB 221 | gnb = GaussianNB() 222 | gnb_fit = gnb.fit(train_data_scaled2, train_target_decoded_m[0]) 223 | gnb_predict = gnb_fit.predict(test_data_scaled2) 224 | print gnb_fit.score(test_data_scaled2, test_target_decoded_m) 225 | print metrics.classification_report(test_target_decoded_m, gnb_predict) 226 | 227 | 228 | # 229 | # 230 | 231 | # 232 | # 233 | 234 | # 235 | # 236 | # 237 | 238 | # 239 | # 240 | # 241 | # 242 | # 243 | # 244 | # 245 | # 246 | 247 | # # Building The 2-Class IDS Model: 248 | 249 | # In[2]: 250 | 251 | #let's load the data 252 | train_data_1 = urllib.urlopen('/home/aziz/Downloads/kddcup.data_10_percent_corrected') 253 | test_data_1 = urllib.urlopen('/home/aziz/Downloads/corrected') 254 | 255 | #Place both datasets into dataframes 256 | train_class = pd.read_csv(train_data_1, quotechar=',', skipinitialspace=True, names=['Duration', 'protocol_type', 'Service', 'Flag', 'src_bytes', 'dst_bytes', 'Land', 'wrong_fragment', 'Urgent', 'Hot', 'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell', 'su_attempted', 'num_root', 'num_file_creations', 'num_shells', 'num_access_files', 'num_outbound_cmds', 'is_host_login', 'is_guest_login', 'Count', 'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate', 'dst_host_srv_rerror_rate','Class']) 257 | test_class = pd.read_csv(test_data_1, quotechar=',', skipinitialspace=True, names=['Duration', 'protocol_type', 'Service', 'Flag', 'src_bytes', 'dst_bytes', 'Land', 'wrong_fragment', 'Urgent', 'Hot', 'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell', 'su_attempted', 'num_root', 'num_file_creations', 'num_shells', 'num_access_files', 'num_outbound_cmds', 'is_host_login', 'is_guest_login', 'Count', 'srv_count', 'serror_rate', 
'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate', 'dst_host_srv_rerror_rate','Class']) 258 | 259 | 260 | # 261 | # 262 | 263 | # ## 1 Pre-Processing The Datasets: 264 | 265 | # ### 1.1 Convert Labels to The Right Class 266 | 267 | # In[3]: 268 | 269 | train_class.loc[(train_class['Class'] !='normal.'),'Class'] = 'attack' 270 | train_class.loc[(train_class['Class'] =='normal.'),'Class'] = 'normal' 271 | 272 | test_class.loc[(test_class['Class'] !='normal.'),'Class'] = 'attack' 273 | test_class.loc[(test_class['Class'] =='normal.'),'Class'] = 'normal' 274 | 275 | 276 | # 277 | # 278 | 279 | # ### 1.2 Encoding The Dataset 280 | 281 | # In[4]: 282 | 283 | # Encoding The Dataset: 284 | attr_encoder = feature_extraction.DictVectorizer(sparse=False) 285 | label_encoder = preprocessing.LabelEncoder() 286 | 287 | train_data_df = attr_encoder.fit_transform(train_class.iloc[:,:-1].T.to_dict().values()) 288 | train_target_df= label_encoder.fit_transform(train_class.iloc[:,-1]) 289 | 290 | 291 | train_data_decoded = pd.DataFrame(train_data_df) 292 | train_target_decoded = pd.DataFrame(train_target_df) 293 | 294 | test_data_df= attr_encoder.transform(test_class.iloc[:,:-1].T.to_dict().values()) 295 | test_target_df= label_encoder.transform(test_class.iloc[:,-1]) 296 | 297 | test_data_decoded = pd.DataFrame(test_data_df) 298 | test_target_decoded = pd.DataFrame(test_target_df) 299 | 300 | 301 | print train_data_decoded.shape 302 | print test_data_decoded.shape 303 | 304 | 305 | # 306 | # 307 | 308 | # ### 1.3 Feature Reduction Using PCA 309 | 310 | # In[9]: 311 | 312 | #load some modules to help 313 | from mpl_toolkits.mplot3d import Axes3D 314 | from sklearn.decomposition import PCA 315 | 316 | pca_1 = PCA(n_components=29).fit(train_data_decoded)   # fit PCA on the training set only 317 | train_data_pca_1 = pca_1.transform(train_data_decoded) 318 | test_data_pca_1 = pca_1.transform(test_data_decoded)   # reuse the training-set components 319 | 320 | train_data_pca_df_1 = pd.DataFrame(train_data_pca_1) 321 | test_data_pca_df_1 = pd.DataFrame(test_data_pca_1) 322 | 323 | print train_data_pca_df_1.shape 324 | print test_data_pca_df_1.shape 325 | 326 | 327 | # 328 | # 329 | 330 | # ### 1.4 Normalizing The Datasets 331 | 332 | # In[10]: 333 | 334 | #Creating our scaler on the training set and applying it to both datasets after feature reduction 335 | standard_scaler = preprocessing.StandardScaler() 336 | train_ratio_standard_scaled_values = standard_scaler.fit_transform(train_data_pca_df_1.values) 337 | train_data_scaled_1=pd.DataFrame(train_ratio_standard_scaled_values) 338 | 339 | test_ratio_standard_scaled_values = standard_scaler.transform(test_data_pca_df_1.values) 340 | test_data_scaled_1=pd.DataFrame(test_ratio_standard_scaled_values) 341 | 342 | 343 | # 344 | # 345 | 346 | # ## 2 Classifying The Data Set 347 | 348 | # ### 2.1 Using SVM Algorithm: 349 | 350 | # In[8]: 351 | 352 | #Train a linear SVM and evaluate it on the test set 353 | lin = svm.SVC(kernel='linear', max_iter=100000000) 354 | lin.fit(train_data_scaled_1, train_target_decoded[0]) 355 | lin_predict = lin.predict(test_data_scaled_1) 356 | print lin.score(test_data_scaled_1, test_target_decoded) 357 | print metrics.classification_report(test_target_decoded, lin_predict) 358 | print "Number of support vectors for each class", lin.n_support_ 359 | print lin.support_vectors_ 360 | 361 | 362 | # 363 | # 
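# --- Added sketch (not in the original notebook) ---
# A threshold-free summary for the 2-class IDS: the linear SVM's decision_function
# scores can be fed to metrics.roc_auc_score to complement the accuracy above.
# Assumes `lin`, `test_data_scaled_1`, and `test_target_decoded` from the cells
# above; `svm_scores` is a name introduced here for illustration.
svm_scores = lin.decision_function(test_data_scaled_1).ravel()
print "ROC AUC for the linear SVM:", metrics.roc_auc_score(test_target_decoded[0], svm_scores)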
364 | 365 | # ### 2.2 Using Decision Trees Algorithm 366 | 367 | # #### 2.2.1 Performing Cross-Validation on The Training Set to Test Different Parameters 368 | # 369 | 370 | # In[10]: 371 | 372 | ## Testing Decision Trees using different parameters with class weights balanced 373 | foldnum = 0 374 | fold_results = pd.DataFrame() 375 | criterion=[ 'gini','entropy'] 376 | min_samples_leaf = [2, 5, 50] 377 | max_depth = [1,6,12] 378 | 379 | 380 | for cri in criterion: 381 | for leaf in min_samples_leaf: 382 | for depth in max_depth: 383 | foldnum = 0 384 | clf = tree.DecisionTreeClassifier(criterion=cri,min_samples_leaf=leaf,max_depth=depth,random_state=20160121,class_weight="balanced") 385 | for train, test in cross_validation.KFold(len(train_data_scaled_1), n_folds=5,shuffle=True,random_state=20160202): 386 | [ids_tr_data, ids_te_data, ids_tr_target, ids_te_target] = folds_to_split(train_data_scaled_1,train_target_decoded,train, test) 387 | clf.fit(ids_tr_data, ids_tr_target[0]) 388 | clf_predict = clf.predict(ids_te_data) 389 | 390 | fold_results.loc[foldnum, 'Accuracy'] = clf.score(ids_te_data, ids_te_target) 391 | foldnum+=1 392 | print "criterion:",cri 393 | print "min_samples_leaf:",leaf 394 | print "max_depth:",depth 395 | print fold_results.mean() 396 | print "\n" 397 | 398 | 399 | # #### 2.2.2 Testing the IDS Model on The Test Set: 400 | 401 | # In[11]: 402 | 403 | 404 | clf_t = tree.DecisionTreeClassifier(criterion='entropy',min_samples_leaf=2,max_depth=12,random_state=20160121,class_weight="balanced") 405 | clf_t.fit(train_data_scaled_1, train_target_decoded[0]) 406 | clf_predict = clf_t.predict(test_data_scaled_1) 407 | 408 | print "Accuracy (via score):", clf_t.score(test_data_scaled_1, test_target_decoded) 409 | print metrics.classification_report(test_target_decoded, clf_predict) 410 | 411 | 412 | # ### 2.3 Using Naive Bayes Algorithm: 413 | 414 | # In[17]: 415 | 416 | from sklearn.naive_bayes import GaussianNB 417 | gnb = GaussianNB() 418 | Naive = gnb.fit(train_data_scaled_1, train_target_decoded[0]) 419 | Naive_predict = Naive.predict(test_data_scaled_1) 420 | print Naive.score(test_data_scaled_1, test_target_decoded) 421 | print metrics.classification_report(test_target_decoded, Naive_predict) 422 | print Naive.class_prior_ 423 | 424 | --------------------------------------------------------------------------------