├── SparkML ├── project │ ├── build.properties │ ├── plugins.sbt │ └── assembly.sbt ├── readme.md ├── build.sbt ├── src │ └── main │ │ └── scala │ │ ├── Main.scala │ │ ├── utils.scala │ │ ├── RandomForest.scala │ │ ├── RandomForestCluster.scala │ │ ├── RandomForestAllDaysBinary.scala │ │ └── RandomForestIndividualDays.scala └── LICENSE ├── Conv-LSTM-Keras ├── README.md └── Data_Extraction_Revised.py └── README.md /SparkML/project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version = 0.13.8 -------------------------------------------------------------------------------- /SparkML/project/plugins.sbt: -------------------------------------------------------------------------------- 1 | logLevel := Level.Warn -------------------------------------------------------------------------------- /SparkML/project/assembly.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.3") 2 | -------------------------------------------------------------------------------- /SparkML/readme.md: -------------------------------------------------------------------------------- 1 | This repo contains code for a [Spark](https://spark.apache.org) application that defines a pipeline for intrusion detection (supervised machine learning -- multiclass classification with Random Forest) based on IP flow data. 2 | 3 | The dataset can be found [here](http://www.unb.ca/research/iscx/dataset/iscx-IDS-dataset.html); it contains several well-known types of network attacks, such as DDoS and brute-force SSH connections. 4 | -------------------------------------------------------------------------------- /SparkML/build.sbt: -------------------------------------------------------------------------------- 1 | name := "iscx-ids-spark" 2 | 3 | version := "1.0" 4 | 5 | scalaVersion := "2.10.5" 6 | 7 | libraryDependencies ++= Seq( 8 | ("org.apache.spark" %% "spark-core" % "1.6.2" % "provided"). 9 | exclude("org.mortbay.jetty", "servlet-api"). 10 | exclude("commons-beanutils", "commons-beanutils-core"). 11 | exclude("commons-collections", "commons-collections"). 12 | exclude("commons-logging", "commons-logging"). 
13 | exclude("com.esotericsoftware.minlog", "minlog"), 14 | "org.apache.spark" %% "spark-mllib" % "1.6.2" % "provided", 15 | "com.databricks" %% "spark-csv" % "1.4.0", 16 | "com.databricks" %% "spark-xml" % "0.3.3" 17 | ) 18 | -------------------------------------------------------------------------------- /SparkML/src/main/scala/Main.scala: -------------------------------------------------------------------------------- 1 | package iscx 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.sql._ 5 | import org.apache.spark.sql.functions._ 6 | import utils.{loadISCX, initSpark} 7 | 8 | 9 | object Stats { 10 | def main(args: Array[String]) { 11 | val datasetPath = args match { 12 | case Array(p,_*) => p 13 | case _ => "/var/spark/datasets/iscxids/labeled/" 14 | } 15 | val (sc,sqlContext) = initSpark() 16 | val dataframes = loadISCX(sqlContext,datasetPath) 17 | 18 | dataframes.foreach { d => 19 | println("Dia: " + d._1) 20 | println("Número de fluxos: " + d._2.count.toString) 21 | val groupedByTag = d._2 22 | .groupBy("Tag") 23 | .agg(count("Tag").as("count")) 24 | // val normal = groupedByTag.filter(""Tag".equals("Normal")) 25 | println("Proporção normal/ataque: ") 26 | groupedByTag.show 27 | } 28 | sc.stop() 29 | } 30 | 31 | 32 | } 33 | -------------------------------------------------------------------------------- /SparkML/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright Luis Fernando Milano Oliveira (c) 2016 2 | 3 | All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | * Redistributions of source code must retain the above copyright 9 | notice, this list of conditions and the following disclaimer. 10 | 11 | * Redistributions in binary form must reproduce the above 12 | copyright notice, this list of conditions and the following 13 | disclaimer in the documentation and/or other materials provided 14 | with the distribution. 15 | 16 | * Neither the name of Luis Fernando Milano Oliveira nor the names of other 17 | contributors may be used to endorse or promote products derived 18 | from this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 21 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 22 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 23 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 24 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 25 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 26 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 27 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 28 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 30 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
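A note on Main.scala above: the commented-out line `// val normal = groupedByTag.filter(""Tag".equals("Normal"))` is malformed (unbalanced quotes) and never used. Below is a minimal sketch, assuming the same `d` loop variable and the `org.apache.spark.sql.functions._` import already present in Main.scala, of how the Normal-vs-Attack proportion printed by the Stats job could be computed explicitly:

    // Inside the dataframes.foreach loop of Stats.main: count Normal flows
    // against the total for this day's DataFrame (d._2) and print the ratio.
    val total  = d._2.count().toDouble
    val normal = d._2.filter(col("Tag") === "Normal").count().toDouble
    println(f"Normal: ${normal / total}%.3f  Attack: ${(total - normal) / total}%.3f")
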
-------------------------------------------------------------------------------- /Conv-LSTM-Keras/README.md: -------------------------------------------------------------------------------- 1 | # Intrusion Detection System using Deep Learning 2 | 3 | VGG-19 deep learning model trained using the ISCX 2012 IDS dataset 4 | 5 | # Frameworks & APIs 6 | 7 | * TensorFlow-GPU 8 | * Keras 9 | * NVIDIA CUDA Toolkit 9.0 10 | * NVIDIA cuDNN 7.0 11 | 12 | # Tools 13 | 14 | * Anaconda (Python 3.6) 15 | * PyCharm 16 | 17 | 18 | # How to use 19 | Download the ISCX 2012 dataset from: 20 | 21 | http://www.unb.ca/cic/datasets/ids.html 22 | 23 | Then run the Java flow-exporter tool (ISCX FlowMeter), available on GitHub; any Java IDE will do: 24 | 25 | https://github.com/ISCX/CICFlowMeter (if this does not convert .pcap to .xml, try the one below) 26 | 27 | https://github.com/ISCX/ISCXFlowMeter 28 | 29 | Next, make sure that your system is capable of running deep learning software. You can follow this guide: 30 | 31 | https://towardsdatascience.com/python-environment-setup-for-deep-learning-on-windows-10-c373786e36d1 32 | 33 | #### Note: if your system is inadequate, stop here; training will be very slow and a great deal of time will be wasted. 34 | 35 | Next, run the following Python script on the labelled XML files (change the input and output paths in the code). It extracts the relevant fields from each XML file and converts them into NumPy arrays: 36 | 37 | Data_Extraction_Revised.py 38 | 39 | When it completes, you can run the notebook below (assuming you have Jupyter Notebook). 40 | Change the save-file location in the code to the file produced by the revised data extraction script: 41 | 42 | FYP-Revised.ipynb 43 | 44 | Then you can begin training. 45 | 46 | ## GOOD LUCK :) 47 | -------------------------------------------------------------------------------- /Conv-LSTM-Keras/Data_Extraction_Revised.py: -------------------------------------------------------------------------------- 1 | import xml.etree.ElementTree as ET 2 | import numpy as np 3 | import os 4 | import time 5 | 6 | import_directory = 'C:\\Users\Tamim Mirza\Documents\ISCX\labeled_flows_xml\\' 7 | 8 | files = os.listdir(import_directory) 9 | 10 | errors = []  # XML files that could not be parsed 11 | 12 | start_time = time.time() 13 | i = -1 14 | data_array = np.empty((0, 2)) 15 | counter = 0 16 | actual = (50**2) * 3  # 50*50*3 = 7500 payload characters kept per flow 17 | for file in files: 18 | print(file) 19 | try: 20 | tree = ET.parse(import_directory + file) 21 | print('Reading File ', file) 22 | root = tree.getroot() 23 | except Exception: 24 | errors.append(file)  # record the file that failed to parse 25 | continue 26 | for child in root: 27 | for next_child in child: 28 | if next_child.tag == 'destinationPayloadAsUTF': 29 | if next_child.text is not None: 30 | x = next_child.text 31 | if len(x) > actual: 32 | x = x[: actual] 33 | else: 34 | while len(x) < actual: 35 | x += x 36 | x = x[:actual] 37 | if child.find('Tag').text == 'Normal': 38 | data_array = np.vstack((data_array, np.array([np.fromstring(x, dtype=np.uint8), 0]))) 39 | else: 40 | data_array = np.vstack((data_array, np.array([np.fromstring(x, dtype=np.uint8), 1]))) 41 | counter += 1 42 | print('Time taken: {}'.format(time.time() - start_time)) 43 | start_time = time.time() 44 | np.save('Database2\\destinationPayload_' + file, np.array(data_array)) 45 | data_array = np.empty((0, 2)) 46 | 47 | print('Error in Opening Files = ', errors) 48 | print('Counter = ', 
counter) 49 | print('DONE!') -------------------------------------------------------------------------------- /SparkML/src/main/scala/utils.scala: -------------------------------------------------------------------------------- 1 | package iscx 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.SparkContext._ 5 | import org.apache.spark.SparkConf 6 | import org.apache.spark.sql._ 7 | 8 | object utils { 9 | def initSpark() : (SparkContext,SQLContext) = { 10 | val conf = new SparkConf().setAppName("Simple Application") 11 | .setMaster("spark://10.90.67.77:7077") 12 | // .setMaster("local[4]") 13 | val sc = new SparkContext(conf) 14 | sc.setLogLevel("WARN") 15 | val sqlContext = new org.apache.spark.sql.SQLContext(sc) 16 | (sc,sqlContext) 17 | } 18 | 19 | def loadISCX(sqlContext : SQLContext, path : String) : Array[(String, DataFrame)] = { 20 | val days : Array[String] = Array( 21 | "TestbedSatJun12" 22 | , "TestbedSunJun13" 23 | , "TestbedMonJun14" 24 | , "TestbedTueJun15" 25 | , "TestbedWedJun16" 26 | , "TestbedThuJun17" 27 | ) 28 | 29 | val xmlFiles = days.map(d => path + d + ".xml") 30 | val zipped = days.zip(xmlFiles) 31 | 32 | zipped.map { d => 33 | (d._1.drop(10), sqlContext 34 | .read 35 | .format("com.databricks.spark.xml") 36 | .option("rowTag",d._1 + "Flows") 37 | .load(d._2) 38 | ) 39 | } 40 | // TestbedJun12 41 | // val jun12 = sqlContext.read 42 | // .format("com.databricks.spark.xml") 43 | // .option("rowTag",days(0)) 44 | // .load(xmlFiles(0)) 45 | // // TestbedJun13 46 | // val jun13 = sqlContext.read 47 | // .format("com.databricks.spark.xml") 48 | // .option("rowTag",days(1) + "Flows") 49 | // .load(xmlFiles(1)) 50 | // // TestbedJun14 51 | // val jun14 = sqlContext.read 52 | // .format("com.databricks.spark.xml") 53 | // .option("rowTag",days(2) + "Flows") 54 | // .load(xmlFiles(2)) 55 | // // TestbedJun15 56 | // val jun15 = sqlContext.read 57 | // .format("com.databricks.spark.xml") 58 | // .option("rowTag",days(3) + "Flows") 59 | // .load(xmlFiles(3)) 60 | // // TestbedJun16 61 | // val jun16 = sqlContext.read 62 | // .format("com.databricks.spark.xml") 63 | // .option("rowTag",days(4) + "Flows") 64 | // .load(xmlFiles(4)) 65 | // // TestbedJun17 66 | // val jun13 = sqlContext.read 67 | // .format("com.databricks.spark.xml") 68 | // .option("rowTag",days(1) + "Flows") 69 | // .load(xmlFiles(2)) 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # A Scalable Deep Learning-based Intrusion Detection System using Conv-LSTM Network 2 | Intrusion detection system with Apache Spark and deep learning 3 | 4 | # Why and how to use this repository? 5 | This repository contains the implementation details and the code for our paper titled "A Scalable and Hybrid Deep Learning-based Intrusion Detection System using Convolutional-LSTM Network". This papers has been submitted to "Symmetry — Open Access Journal" (see http://www.mdpi.com/journal/symmetry). 6 | 7 | In network intrusion detection (IDS), anomaly-based approaches in particular suffer from accurate evaluation, comparison, and deployment which originates from the scarcity of adequate datasets. Many such datasets are internal and cannot be shared due to privacy issues, others are heavily anonymized and do not reflect current trends, or they lack certain statistical characteristics. These deficiencies are primarily the reasons why a perfect dataset is yet to exist. 
Thus, researchers must resort to whatever datasets they can obtain, which are often suboptimal. 8 | 9 | As network behaviours and patterns change and intrusions evolve, it has become necessary to move away from static, one-time datasets toward dynamically generated datasets that not only reflect the traffic compositions and intrusions of their time, but are also modifiable, extensible, and reproducible. 10 | 11 | As a proof of concept, we use the Intrusion Detection Evaluation Dataset (ISCXIDS2012) to solve a classification problem that accurately identifies anomalies. 12 | 13 | To show the effectiveness of our proposed approach, we implemented the first stage in Scala using Spark MLlib as the ML platform. The Conv-LSTM network, on the other hand, was implemented in Python using Keras. 14 | 15 | Experiments were performed on a computing cluster with 32 cores running 64-bit Ubuntu 14.04 OS. The software stack consisted of Apache Spark v2.3.0, Java (JDK) 1.8, Scala 2.11.8, and Keras. The Conv-LSTM network was trained on an Nvidia TitanX GPU with CUDA and cuDNN enabled to improve overall pipeline speed. 16 | 17 | ## Spark MLlib-based classifiers: 18 | The following classifiers have been implemented to solve both classification problems in a two-stage cascading style: 19 | - Logistic Regression 20 | - Decision Trees 21 | - Random Forest 22 | - Multilayer Perceptron (MLP). 23 | 24 | We also implemented Spark + H2O (a.k.a. Sparkling Water) versions. Take a look at the ArrhythmiaPredictionH2O.scala and URLReputationH2O.scala classes for classifying cardiac arrhythmia and identifying suspicious URLs, respectively. 25 | 26 | Make sure that Spark is properly configured. You also need to have Maven installed on Linux. If you prefer Eclipse or IntelliJ IDEA, make sure that the Maven and Scala plugins are installed. 27 | 28 | If everything is properly configured, you can create an uber JAR containing all the dependencies and execute it. Alternatively, you can execute each implementation as a stand-alone Scala project from your favourite IDE. 29 | 30 | ## DeepLearning4j-based LSTM networks: 31 | A Long Short-Term Memory (LSTM) network has been implemented to solve the classification problem. The following are prerequisites when working with DL4J: 32 | - Java 1.8+ (64-bit only) 33 | - Apache Maven as the automated build and dependency manager 34 | - IntelliJ IDEA or Eclipse IDE. 35 | 36 | For more information on how to configure DeepLearning4j, please refer to https://deeplearning4j.org/. If everything is properly configured, you can create an uber JAR containing all the dependencies and execute it. Alternatively, you can execute each implementation as a stand-alone Java project from your favourite IDE. 37 | 38 | ## Citation request 39 | If you reuse this implementation, please cite our paper: 40 | 41 | @inproceedings{khan2018bigdata, 42 | title={A Scalable and Hybrid Deep Learning-based Intrusion Detection System using Convolutional-LSTM Network}, 43 | author={M. A., Khan; Karim, Md. Rezaul; Y. 
Kim }, 44 | booktitle={Symmetry — Open Access Journal}, 45 | year={2019} 46 | } 47 | 48 | ## Contributing 49 | For any questions, feel free to open an issue or contact at rezaul.karim@rwth-aachen.de 50 | -------------------------------------------------------------------------------- /SparkML/src/main/scala/RandomForest.scala: -------------------------------------------------------------------------------- 1 | package iscx 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.sql._ 5 | import org.apache.spark.sql.functions._ 6 | import utils.{loadISCX, initSpark} 7 | import org.apache.spark.sql.Row 8 | 9 | 10 | import org.apache.spark.ml.{Pipeline, PipelineStage} 11 | import org.apache.spark.ml.classification.{RandomForestClassificationModel, RandomForestClassifier} 12 | import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator 13 | import org.apache.spark.ml.feature.{IndexToString, StringIndexer, VectorIndexer, VectorAssembler} 14 | 15 | 16 | 17 | object RandomForest { 18 | def main(args: Array[String]) { 19 | val datasetPath = args match { 20 | case Array(p,_*) => p 21 | case _ => "/var/spark/datasets/iscxids/labeled/" 22 | } 23 | val (sc,sqlContext) = initSpark() 24 | // Array[(String, DataFrame)] 25 | val dataframes = loadISCX(sqlContext,datasetPath) 26 | 27 | Array(dataframes(2)).foreach { d => 28 | val data = d._2.select( 29 | "Tag" 30 | , "appName" 31 | , "destination" 32 | , "destinationPort" 33 | , "destinationTCPFlagsDescription" 34 | , "direction" 35 | , "protocolName" 36 | , "source" 37 | , "sourcePort" 38 | , "sourceTCPFlagsDescription" 39 | , "startDateTime" 40 | , "stopDateTime" 41 | , "totalDestinationBytes" 42 | , "totalDestinationPackets" 43 | , "totalSourceBytes" 44 | , "totalSourcePackets" 45 | ).na.fill("N/A") 46 | 47 | // MinMax 48 | // val (dstByMin, dstByMax) = data.agg(min($"totalDestinationBytes"), max($"totalDestinationBytes")).first match { 49 | // case Row(x: Double, y: Double) => (x, y) 50 | // } 51 | 52 | // val scaledRange = lit(1) // Range of the scaled variable 53 | // val scaledMin = lit(0) // Min value of the scaled variable 54 | // val vNormalized = ($"totalDestinationBytes" - vMin) / (vMax - vMin) // v normalized to (0, 1) range 55 | 56 | // val vScaled = scaledRange * vNormalized + scaledMin 57 | // /MinMax 58 | val filteredData = sqlContext.createDataFrame(data.map { row => 59 | Row( 60 | row.getString(0) // tag 61 | , row.getString(1) // appName 62 | , row.getString(2).split("\\.").take(2).mkString(".") // destination 63 | , row.getLong(3) // destinationPort 64 | , row.getString(4) // destinationTCPFlagsDescription 65 | , row.getString(5) // direction 66 | , row.getString(6) // protocolName 67 | , row.getString(7).split("\\.").take(2).mkString(".") // destination 68 | , row.getLong(8) // sourcePort 69 | , row.getString(9) // sourceTCPFlagsDescription 70 | , row.getString(10).drop(11).take(2) // startDateTime 71 | , row.getString(11).drop(11).take(2)// stopDateTime 72 | , row.getLong(12) // totalDestinationBytes 73 | , row.getLong(13) // totalDestinationPackets 74 | , row.getLong(14) // totalSourceBytes 75 | , row.getLong(15) // totalSourcePackets 76 | ) 77 | }, data.schema) 78 | 79 | 80 | // Transform the non-numerical features using the pipeline api 81 | val stringColumns = filteredData.columns 82 | .filter(!_.contains("Payload")) 83 | .filter(!_.contains("total")) 84 | .filter(!_.contains("Port")) 85 | 86 | val longColumns = filteredData.columns 87 | .filter(c => c.contains("total") || c.contains("Port")) 88 | 89 | 
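// Hedged sketch (kept commented out, not part of the original pipeline): the
// commented-out MinMax block above references vMin/vMax that were never defined.
// One way to complete it for a single column, using the min/max/col functions
// already imported in this file, could be:
// val Row(vMin: Long, vMax: Long) =
//   data.agg(min("totalDestinationBytes"), max("totalDestinationBytes")).first
// val normalizedCol = (col("totalDestinationBytes") - vMin) / (vMax - vMin)
// val scaledData = data.withColumn("totalDestinationBytesScaled", normalizedCol)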
// minMax 90 | 91 | // Index labels, adding metadata to the label column. 92 | // Fit on whole dataset to include all labels in index. 93 | val labelIndexer = new StringIndexer() 94 | .setInputCol("Tag") 95 | .setOutputCol("indexedLabel") 96 | 97 | val transformers: Array[PipelineStage] = stringColumns 98 | .map(cname => new StringIndexer() 99 | .setInputCol(cname) 100 | .setOutputCol(s"${cname}_index") 101 | ) 102 | 103 | val assembler = new VectorAssembler() 104 | .setInputCols((stringColumns 105 | .map(cname => s"${cname}_index")) ++ longColumns) 106 | .setOutputCol("features") 107 | 108 | // Automatically identify categorical features, and index them. 109 | // Set maxCategories so features with > 10 distinct values are treated as continuous. 110 | val featureIndexer = new VectorIndexer() 111 | .setInputCol("features") 112 | .setOutputCol("indexedFeatures") 113 | .setMaxCategories(10) 114 | 115 | // Split the data into training and test sets (30% held out for testing) 116 | 117 | // Train a RandomForest model. 118 | val rf = new RandomForestClassifier() 119 | .setLabelCol("indexedLabel") 120 | .setFeaturesCol("indexedFeatures") 121 | .setNumTrees(32) 122 | .setMaxBins(10000) 123 | 124 | // Convert indexed labels back to original labels. 125 | val labelConverter = new IndexToString() 126 | .setInputCol("prediction") 127 | .setOutputCol("predictedLabel") 128 | .setLabels(Array("Normal","Attack")) 129 | 130 | // Chain indexers and forest in a Pipeline 131 | 132 | val transformationStages : Array[PipelineStage] = 133 | Array(labelIndexer) ++ 134 | transformers :+ 135 | assembler :+ 136 | featureIndexer 137 | val preProcessers = new Pipeline().setStages(transformationStages) 138 | 139 | val stages : Array[PipelineStage] = 140 | Array(rf,labelConverter) 141 | 142 | val dataModel = preProcessers.fit(filteredData) 143 | val transformedData = dataModel.transform(filteredData) 144 | 145 | transformedData.write 146 | .format("com.databricks.spark.csv") 147 | .option("header", "true") 148 | .save("/var/spark/datasets/iscx-processed/" + d._1) 149 | 150 | 151 | val pipeline = new Pipeline() 152 | .setStages(stages) 153 | 154 | val Array(trainingData, testData) = transformedData.randomSplit(Array(0.7, 0.3)) 155 | trainingData.cache() 156 | testData.cache() 157 | // Train model. This also runs the indexers. 158 | val model = pipeline.fit(trainingData) 159 | 160 | // // Make predictions. 161 | val predictions = model.transform(testData) 162 | 163 | // // Select example rows to display. 
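// Hedged sketch (not part of the original code): the evaluator further below
// reports only Spark 1.6's "precision" metric. The same evaluator also accepts
// "weightedPrecision", "weightedRecall" and "f1", e.g.:
// val f1 = new MulticlassClassificationEvaluator()
//   .setLabelCol("indexedLabel")
//   .setPredictionCol("prediction")
//   .setMetricName("f1")
//   .evaluate(predictions)
// println("F1 = " + f1)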
164 | predictions.select("predictedLabel", "Tag", "features").show(5) 165 | 166 | val rfModel = model.stages.init.last.asInstanceOf[RandomForestClassificationModel] 167 | println("Learned classification forest model:\n" + rfModel.toDebugString) 168 | val featuresImportance = rfModel.featureImportances.toArray.mkString(",") 169 | println(s"Feature Importances for" + d._1) 170 | println(featuresImportance) 171 | 172 | // // Select (prediction, true label) and compute test error 173 | val evaluator = new MulticlassClassificationEvaluator() 174 | .setLabelCol("indexedLabel") 175 | .setPredictionCol("prediction") 176 | .setMetricName("precision") 177 | val accuracy = evaluator.evaluate(predictions) 178 | println("Test Error = " + (1.0 - accuracy)) 179 | 180 | } 181 | sc.stop() 182 | } 183 | } 184 | -------------------------------------------------------------------------------- /SparkML/src/main/scala/RandomForestCluster.scala: -------------------------------------------------------------------------------- 1 | package iscx 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.SparkConf 5 | import org.apache.spark.sql._ 6 | import org.apache.spark.sql.functions._ 7 | import utils.{loadISCX, initSpark} 8 | import org.apache.spark.sql.Row 9 | 10 | 11 | import org.apache.spark.ml.{Pipeline, PipelineStage} 12 | import org.apache.spark.ml.classification.{RandomForestClassificationModel, RandomForestClassifier} 13 | import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator 14 | import org.apache.spark.ml.feature.{IndexToString, StringIndexer, VectorIndexer, VectorAssembler} 15 | 16 | object RandomForestCluster { 17 | def main(args: Array[String]) { 18 | val datasetPath = args match { 19 | case Array(p,_*) => p 20 | case _ => "/var/spark/datasets/iscxids/labeled/" 21 | } 22 | val conf = new SparkConf().setAppName("Simple Application") 23 | .setMaster("spark://10.90.67.77:7077") 24 | val sc = new SparkContext(conf) 25 | sc.setLogLevel("WARN") 26 | val sqlContext = new org.apache.spark.sql.SQLContext(sc) 27 | 28 | // Array[(String, DataFrame)] 29 | val dataframes = loadISCX(sqlContext,datasetPath) 30 | 31 | Array(dataframes(0)).foreach { d => 32 | val data = d._2.select( 33 | "Tag" 34 | , "appName" 35 | , "destination" 36 | , "destinationPort" 37 | , "destinationTCPFlagsDescription" 38 | , "direction" 39 | , "protocolName" 40 | , "source" 41 | , "sourcePort" 42 | , "sourceTCPFlagsDescription" 43 | , "startDateTime" 44 | , "stopDateTime" 45 | , "totalDestinationBytes" 46 | , "totalDestinationPackets" 47 | , "totalSourceBytes" 48 | , "totalSourcePackets" 49 | ).na.fill("N/A") 50 | 51 | // MinMax 52 | // val (dstByMin, dstByMax) = data.agg(min($"totalDestinationBytes"), max($"totalDestinationBytes")).first match { 53 | // case Row(x: Double, y: Double) => (x, y) 54 | // } 55 | 56 | // val scaledRange = lit(1) // Range of the scaled variable 57 | // val scaledMin = lit(0) // Min value of the scaled variable 58 | // val vNormalized = ($"totalDestinationBytes" - vMin) / (vMax - vMin) // v normalized to (0, 1) range 59 | 60 | // val vScaled = scaledRange * vNormalized + scaledMin 61 | // /MinMax 62 | val filteredData = sqlContext.createDataFrame(data.map { row => 63 | Row( 64 | row.getString(0) // tag 65 | , row.getString(1) // appName 66 | , row.getString(2).split("\\.").take(2).mkString(".") // destination 67 | , row.getLong(3) // destinationPort 68 | , row.getString(4) // destinationTCPFlagsDescription 69 | , row.getString(5) // direction 70 | , row.getString(6) // 
protocolName 71 | , row.getString(7).split("\\.").take(2).mkString(".") // destination 72 | , row.getLong(8) // sourcePort 73 | , row.getString(9) // sourceTCPFlagsDescription 74 | , row.getString(10).drop(11).take(2) // startDateTime 75 | , row.getString(11).drop(11).take(2)// stopDateTime 76 | , row.getLong(12) // totalDestinationBytes 77 | , row.getLong(13) // totalDestinationPackets 78 | , row.getLong(14) // totalSourceBytes 79 | , row.getLong(15) // totalSourcePackets 80 | ) 81 | }, data.schema) 82 | 83 | 84 | // Transform the non-numerical features using the pipeline api 85 | val stringColumns = filteredData.columns 86 | .filter(!_.contains("Payload")) 87 | .filter(!_.contains("total")) 88 | .filter(!_.contains("Port")) 89 | .filter(!_.contains("Tag")) 90 | 91 | val longColumns = filteredData.columns 92 | .filter(c => c.contains("total") || c.contains("Port")) 93 | 94 | // minMax 95 | 96 | // Index labels, adding metadata to the label column. 97 | // Fit on whole dataset to include all labels in index. 98 | val labelIndexer = new StringIndexer() 99 | .setInputCol("Tag") 100 | .setOutputCol("indexedLabel") 101 | 102 | val transformers: Array[PipelineStage] = stringColumns 103 | .map(cname => new StringIndexer() 104 | .setInputCol(cname) 105 | .setOutputCol(s"${cname}_index") 106 | ) 107 | 108 | val assembler = new VectorAssembler() 109 | .setInputCols((stringColumns 110 | .map(cname => s"${cname}_index")) ++ longColumns) 111 | .setOutputCol("features") 112 | 113 | // Automatically identify categorical features, and index them. 114 | // Set maxCategories so features with > 10 distinct values are treated as continuous. 115 | val featureIndexer = new VectorIndexer() 116 | .setInputCol("features") 117 | .setOutputCol("indexedFeatures") 118 | .setMaxCategories(10) 119 | 120 | // Split the data into training and test sets (30% held out for testing) 121 | 122 | // Train a RandomForest model. 123 | val rf = new RandomForestClassifier() 124 | .setLabelCol("indexedLabel") 125 | .setFeaturesCol("indexedFeatures") 126 | .setNumTrees(32) 127 | .setMaxBins(10000) 128 | 129 | // Convert indexed labels back to original labels. 130 | val labelConverter = new IndexToString() 131 | .setInputCol("prediction") 132 | .setOutputCol("predictedLabel") 133 | .setLabels(Array("Normal","Attack")) 134 | 135 | // Chain indexers and forest in a Pipeline 136 | 137 | val transformationStages : Array[PipelineStage] = 138 | Array(labelIndexer) ++ 139 | transformers :+ 140 | assembler :+ 141 | featureIndexer 142 | val preProcessers = new Pipeline().setStages(transformationStages) 143 | 144 | val stages : Array[PipelineStage] = 145 | Array(rf,labelConverter) 146 | 147 | val dataModel = preProcessers.fit(filteredData) 148 | val transformedData = dataModel.transform(filteredData) 149 | 150 | val pipeline = new Pipeline() 151 | .setStages(stages) 152 | 153 | val Array(trainingData, testData) = transformedData.randomSplit(Array(0.7, 0.3)) 154 | trainingData.cache() 155 | testData.cache() 156 | // Train model. This also runs the indexers. 157 | val model = pipeline.fit(trainingData) 158 | 159 | // // Make predictions. 160 | val predictions = model.transform(testData) 161 | 162 | // // Select example rows to display. 
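// Hedged sketch (not part of the original code): instead of hard-coding
// Array("Normal","Attack") in the IndexToString stage above, the label order
// actually learned by the StringIndexer could be read back from the fitted
// pre-processing pipeline (labelIndexer is its first stage):
// val learnedLabels = dataModel.stages.head
//   .asInstanceOf[org.apache.spark.ml.feature.StringIndexerModel].labels
// println("Label order learned for Tag: " + learnedLabels.mkString(", "))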
163 | predictions.select("predictedLabel", "Tag", "features").show(5) 164 | 165 | val rfModel = model.stages.init.last.asInstanceOf[RandomForestClassificationModel] 166 | println("Learned classification forest model:\n" + rfModel.toDebugString) 167 | val featuresImportance = rfModel.featureImportances.toArray.mkString(",") 168 | println(s"Feature Importances for" + d._1) 169 | println(featuresImportance) 170 | 171 | // // Select (prediction, true label) and compute test error 172 | val evaluator = new MulticlassClassificationEvaluator() 173 | .setLabelCol("indexedLabel") 174 | .setPredictionCol("prediction") 175 | .setMetricName("precision") 176 | val accuracy = evaluator.evaluate(predictions) 177 | println("Test Error = " + (1.0 - accuracy)) 178 | 179 | } 180 | sc.stop() 181 | } 182 | } 183 | -------------------------------------------------------------------------------- /SparkML/src/main/scala/RandomForestAllDaysBinary.scala: -------------------------------------------------------------------------------- 1 | package iscx 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.sql._ 5 | import org.apache.spark.sql.functions._ 6 | import utils.{loadISCX, initSpark} 7 | import org.apache.spark.sql.Row 8 | 9 | 10 | import org.apache.spark.ml.{Pipeline, PipelineStage} 11 | import org.apache.spark.ml.classification.{RandomForestClassificationModel, RandomForestClassifier} 12 | import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator 13 | import org.apache.spark.ml.feature.{IndexToString, StringIndexer, VectorIndexer, VectorAssembler} 14 | 15 | 16 | 17 | object RandomForestAllDaysBinary { 18 | def main(args: Array[String]) { 19 | val datasetPath = args match { 20 | case Array(p,_*) => p 21 | case _ => "/var/spark/datasets/iscxids/labeled/" 22 | } 23 | val (sc,sqlContext) = initSpark() 24 | // Array[(String, DataFrame)] 25 | val dataframes = loadISCX(sqlContext,datasetPath).map(_._2) 26 | val days = dataframes.map(_.select( 27 | "Tag" 28 | , "appName" 29 | , "destination" 30 | , "destinationPort" 31 | , "destinationTCPFlagsDescription" 32 | , "direction" 33 | , "protocolName" 34 | , "source" 35 | , "sourcePort" 36 | , "sourceTCPFlagsDescription" 37 | , "startDateTime" 38 | , "stopDateTime" 39 | , "totalDestinationBytes" 40 | , "totalDestinationPackets" 41 | , "totalSourceBytes" 42 | , "totalSourcePackets" 43 | ).na.fill("N/A") 44 | ) 45 | val data = days.reduceLeft((a,b) => 46 | a.unionAll(b)) 47 | 48 | 49 | // MinMax 50 | // val (dstByMin, dstByMax) = data.agg(min($"totalDestinationBytes"), max($"totalDestinationBytes")).first match { 51 | // case Row(x: Double, y: Double) => (x, y) 52 | // } 53 | 54 | // val scaledRange = lit(1) // Range of the scaled variable 55 | // val scaledMin = lit(0) // Min value of the scaled variable 56 | // val vNormalized = ($"totalDestinationBytes" - vMin) / (vMax - vMin) // v normalized to (0, 1) range 57 | 58 | // val vScaled = scaledRange * vNormalized + scaledMin 59 | // /MinMax 60 | val filteredData = sqlContext.createDataFrame(data.map { row => 61 | Row( 62 | row.getString(0) // tag 63 | , row.getString(1) // appName 64 | , row.getString(2).split("\\.").take(2).mkString(".") // destination 65 | , row.getLong(3) // destinationPort 66 | , row.getString(4) // destinationTCPFlagsDescription 67 | , row.getString(5) // direction 68 | , row.getString(6) // protocolName 69 | , row.getString(7).split("\\.").take(2).mkString(".") // destination 70 | , row.getLong(8) // sourcePort 71 | , row.getString(9) // sourceTCPFlagsDescription 72 
| , row.getString(10).drop(11).take(2) // startDateTime 73 | , row.getString(11).drop(11).take(2)// stopDateTime 74 | , row.getLong(12) // totalDestinationBytes 75 | , row.getLong(13) // totalDestinationPackets 76 | , row.getLong(14) // totalSourceBytes 77 | , row.getLong(15) // totalSourcePackets 78 | ) 79 | }, data.schema) 80 | 81 | 82 | // Transform the non-numerical features using the pipeline api 83 | val stringColumns = filteredData.columns 84 | .filter(!_.contains("Payload")) 85 | .filter(!_.contains("total")) 86 | .filter(!_.contains("Port")) 87 | .filter(!_.contains("Tag")) 88 | 89 | val longColumns = filteredData.columns 90 | .filter(c => c.contains("total") || c.contains("Port")) 91 | 92 | // minMax 93 | 94 | // Index labels, adding metadata to the label column. 95 | // Fit on whole dataset to include all labels in index. 96 | val labelIndexer = new StringIndexer() 97 | .setInputCol("Tag") 98 | .setOutputCol("indexedLabel") 99 | 100 | val transformers: Array[PipelineStage] = stringColumns 101 | .map(cname => new StringIndexer() 102 | .setInputCol(cname) 103 | .setOutputCol(s"${cname}_index") 104 | ) 105 | 106 | val assembler = new VectorAssembler() 107 | .setInputCols((stringColumns 108 | .map(cname => s"${cname}_index")) ++ longColumns) 109 | .setOutputCol("features") 110 | 111 | // Automatically identify categorical features, and index them. 112 | // Set maxCategories so features with > 10 distinct values are treated as continuous. 113 | val featureIndexer = new VectorIndexer() 114 | .setInputCol("features") 115 | .setOutputCol("indexedFeatures") 116 | .setMaxCategories(10) 117 | 118 | // Split the data into training and test sets (30% held out for testing) 119 | 120 | // Train a RandomForest model. 121 | val rf = new RandomForestClassifier() 122 | .setLabelCol("indexedLabel") 123 | .setFeaturesCol("indexedFeatures") 124 | .setNumTrees(32) 125 | .setMaxBins(20000) 126 | 127 | // Convert indexed labels back to original labels. 128 | val labelConverter = new IndexToString() 129 | .setInputCol("prediction") 130 | .setOutputCol("predictedLabel") 131 | .setLabels(Array("Normal","Attack")) 132 | 133 | // Chain indexers and forest in a Pipeline 134 | 135 | val transformationStages : Array[PipelineStage] = 136 | Array(labelIndexer) ++ 137 | transformers :+ 138 | assembler :+ 139 | featureIndexer 140 | val preProcessers = new Pipeline().setStages(transformationStages) 141 | 142 | val stages : Array[PipelineStage] = 143 | Array(rf,labelConverter) 144 | 145 | val dataModel = preProcessers.fit(filteredData) 146 | val transformedData = dataModel.transform(filteredData) 147 | 148 | // transformedData.write 149 | // .format("com.databricks.spark.csv") 150 | // .option("header", "true") 151 | // .save("/var/spark/datasets/iscx-processed/alldays") 152 | 153 | 154 | val pipeline = new Pipeline() 155 | .setStages(stages) 156 | 157 | val Array(trainingData, testData) = transformedData.randomSplit(Array(0.7, 0.3)) 158 | trainingData.cache() 159 | testData.cache() 160 | // Train model. This also runs the indexers. 161 | val model = pipeline.fit(trainingData) 162 | 163 | // // Make predictions. 164 | val predictions = model.transform(testData) 165 | 166 | // // Select example rows to display. 
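// Hedged sketch (not part of the original code): the 70/30 randomSplit above is
// unseeded, so the all-days run is not reproducible across executions. A fixed
// seed, plus a quick class-balance check on the unioned data, could look like:
// val Array(train, test) = transformedData.randomSplit(Array(0.7, 0.3), seed = 42L)
// transformedData.groupBy("Tag").count().show()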
167 | predictions.select("predictedLabel", "Tag", "features").show(5) 168 | 169 | val rfModel = model.stages.init.last.asInstanceOf[RandomForestClassificationModel] 170 | println("Learned classification forest model:\n" + rfModel.toDebugString) 171 | val featuresImportance = rfModel.featureImportances.toArray.mkString(",") 172 | println(s"Feature Importances") 173 | println(featuresImportance) 174 | 175 | // // Select (prediction, true label) and compute test error 176 | val evaluator = new MulticlassClassificationEvaluator() 177 | .setLabelCol("indexedLabel") 178 | .setPredictionCol("prediction") 179 | .setMetricName("precision") 180 | val accuracy = evaluator.evaluate(predictions) 181 | println("Test Error = " + (1.0 - accuracy)) 182 | 183 | sc.stop() 184 | } 185 | } 186 | -------------------------------------------------------------------------------- /SparkML/src/main/scala/RandomForestIndividualDays.scala: -------------------------------------------------------------------------------- 1 | package iscx 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.sql._ 5 | import org.apache.spark.sql.functions._ 6 | import utils.{loadISCX, initSpark} 7 | import org.apache.spark.sql.Row 8 | 9 | import scala.collection.mutable.{ArrayBuffer} 10 | 11 | 12 | import org.apache.spark.ml.{Pipeline, PipelineStage} 13 | import org.apache.spark.ml.classification.{RandomForestClassificationModel, RandomForestClassifier} 14 | import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator 15 | import org.apache.spark.ml.feature.{IndexToString, StringIndexer, VectorIndexer, VectorAssembler} 16 | 17 | 18 | 19 | object RandomForestIndividualDays { 20 | def main(args: Array[String]) { 21 | val datasetPath = args match { 22 | case Array(p,_*) => p 23 | case _ => "/var/spark/datasets/iscxids/labeled/" 24 | } 25 | val (sc,sqlContext) = initSpark() 26 | // Array[(String, DataFrame)] 27 | val dataframes = loadISCX(sqlContext,datasetPath) 28 | var featuresPerDay = new ArrayBuffer[(String, String)]() 29 | dataframes.foreach { d => 30 | val data = d._2.select( 31 | "Tag" 32 | , "appName" 33 | , "destination" 34 | , "destinationPort" 35 | , "destinationTCPFlagsDescription" 36 | , "direction" 37 | , "protocolName" 38 | , "source" 39 | , "sourcePort" 40 | , "sourceTCPFlagsDescription" 41 | , "startDateTime" 42 | , "stopDateTime" 43 | , "totalDestinationBytes" 44 | , "totalDestinationPackets" 45 | , "totalSourceBytes" 46 | , "totalSourcePackets" 47 | ).na.fill("N/A") 48 | 49 | // MinMax 50 | // val (dstByMin, dstByMax) = data.agg(min($"totalDestinationBytes"), max($"totalDestinationBytes")).first match { 51 | // case Row(x: Double, y: Double) => (x, y) 52 | // } 53 | 54 | // val scaledRange = lit(1) // Range of the scaled variable 55 | // val scaledMin = lit(0) // Min value of the scaled variable 56 | // val vNormalized = ($"totalDestinationBytes" - vMin) / (vMax - vMin) // v normalized to (0, 1) range 57 | 58 | // val vScaled = scaledRange * vNormalized + scaledMin 59 | // /MinMax 60 | val filteredData = sqlContext.createDataFrame(data.map { row => 61 | Row( 62 | row.getString(0) // tag 63 | , row.getString(1) // appName 64 | , row.getString(2).split("\\.").take(2).mkString(".") // destination 65 | , row.getLong(3) // destinationPort 66 | , row.getString(4) // destinationTCPFlagsDescription 67 | , row.getString(5) // direction 68 | , row.getString(6) // protocolName 69 | , row.getString(7).split("\\.").take(2).mkString(".") // destination 70 | , row.getLong(8) // sourcePort 71 | , 
row.getString(9) // sourceTCPFlagsDescription 72 | , row.getString(10).drop(11).take(2) // startDateTime 73 | , row.getString(11).drop(11).take(2)// stopDateTime 74 | , row.getLong(12) // totalDestinationBytes 75 | , row.getLong(13) // totalDestinationPackets 76 | , row.getLong(14) // totalSourceBytes 77 | , row.getLong(15) // totalSourcePackets 78 | ) 79 | }, data.schema) 80 | 81 | 82 | // Transform the non-numerical features using the pipeline api 83 | val stringColumns = filteredData.columns 84 | .filter(!_.contains("Payload")) 85 | .filter(!_.contains("total")) 86 | .filter(!_.contains("Port")) 87 | .filter(!_.contains("Tag")) 88 | 89 | val longColumns = filteredData.columns 90 | .filter(c => c.contains("total") || c.contains("Port")) 91 | 92 | // minMax 93 | 94 | // Index labels, adding metadata to the label column. 95 | // Fit on whole dataset to include all labels in index. 96 | val labelIndexer = new StringIndexer() 97 | .setInputCol("Tag") 98 | .setOutputCol("indexedLabel") 99 | 100 | val transformers: Array[PipelineStage] = stringColumns 101 | .map(cname => new StringIndexer() 102 | .setInputCol(cname) 103 | .setOutputCol(s"${cname}_index") 104 | ) 105 | 106 | val assembler = new VectorAssembler() 107 | .setInputCols((stringColumns 108 | .map(cname => s"${cname}_index")) ++ longColumns) 109 | .setOutputCol("features") 110 | 111 | // Automatically identify categorical features, and index them. 112 | // Set maxCategories so features with > 10 distinct values are treated as continuous. 113 | val featureIndexer = new VectorIndexer() 114 | .setInputCol("features") 115 | .setOutputCol("indexedFeatures") 116 | .setMaxCategories(10) 117 | 118 | // Split the data into training and test sets (30% held out for testing) 119 | 120 | // Train a RandomForest model. 121 | val rf = new RandomForestClassifier() 122 | .setLabelCol("indexedLabel") 123 | .setFeaturesCol("indexedFeatures") 124 | .setNumTrees(32) 125 | .setMaxBins(10000) 126 | 127 | // Convert indexed labels back to original labels. 128 | val labelConverter = new IndexToString() 129 | .setInputCol("prediction") 130 | .setOutputCol("predictedLabel") 131 | .setLabels(Array("Normal","Attack")) 132 | 133 | // Chain indexers and forest in a Pipeline 134 | 135 | val transformationStages : Array[PipelineStage] = 136 | Array(labelIndexer) ++ 137 | transformers :+ 138 | assembler :+ 139 | featureIndexer 140 | val preProcessers = new Pipeline().setStages(transformationStages) 141 | 142 | val stages : Array[PipelineStage] = 143 | Array(rf,labelConverter) 144 | 145 | val dataModel = preProcessers.fit(filteredData) 146 | val transformedData = dataModel.transform(filteredData) 147 | 148 | transformedData.write 149 | .format("com.databricks.spark.csv") 150 | .option("header", "true") 151 | .save("/var/spark/datasets/iscx-processed/" + d._1) 152 | 153 | 154 | val pipeline = new Pipeline() 155 | .setStages(stages) 156 | 157 | val Array(trainingData, testData) = transformedData.randomSplit(Array(0.7, 0.3)) 158 | trainingData.cache() 159 | testData.cache() 160 | // Train model. This also runs the indexers. 161 | val model = pipeline.fit(trainingData) 162 | 163 | // // Make predictions. 164 | val predictions = model.transform(testData) 165 | 166 | // // Select example rows to display. 
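// Hedged sketch (not part of the original code): the raw importance vector
// collected into featuresPerDay below is hard to interpret on its own; zipping it
// with the VectorAssembler's input column order (once rfModel is extracted a few
// lines further down) would name each value:
// val featureNames = stringColumns.map(c => s"${c}_index") ++ longColumns
// val namedImportances = featureNames.zip(rfModel.featureImportances.toArray)
// namedImportances.sortBy(-_._2).foreach { case (n, v) => println(n + " -> " + v) }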
167 | predictions.select("predictedLabel", "Tag", "features").show(5) 168 | 169 | val rfModel = model.stages.init.last.asInstanceOf[RandomForestClassificationModel] 170 | println("Learned classification forest model:\n" + rfModel.toDebugString) 171 | val featuresImportance = rfModel.featureImportances.toArray.mkString(",") 172 | featuresPerDay += ((d._1, featuresImportance)) 173 | println(s"Feature Importances for" + d._1) 174 | println(featuresImportance) 175 | 176 | // // Select (prediction, true label) and compute test error 177 | val evaluator = new MulticlassClassificationEvaluator() 178 | .setLabelCol("indexedLabel") 179 | .setPredictionCol("prediction") 180 | .setMetricName("precision") 181 | val accuracy = evaluator.evaluate(predictions) 182 | println("Test Error = " + (1.0 - accuracy)) 183 | 184 | } 185 | println("Features Importance for individual days:") 186 | featuresPerDay.foreach(println) 187 | sc.stop() 188 | } 189 | } 190 | --------------------------------------------------------------------------------
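The top-level README lists Logistic Regression, Decision Trees, and MLP alongside Random Forest, but only the Random Forest pipelines are included under SparkML/. As a rough, untested sketch of how another Spark MLlib classifier could reuse the same pre-processing stages and variables defined in RandomForest.scala (labelIndexer, transformers, assembler, featureIndexer, labelConverter, trainingData, testData), a Logistic Regression stage could be swapped in for the RandomForestClassifier:

    import org.apache.spark.ml.Pipeline
    import org.apache.spark.ml.classification.LogisticRegression

    // Hypothetical drop-in for the RandomForestClassifier stage: it consumes the
    // same "indexedFeatures"/"indexedLabel" columns produced by the pre-processing
    // pipeline and feeds its "prediction" column to the existing IndexToString stage.
    val lr = new LogisticRegression()
      .setLabelCol("indexedLabel")
      .setFeaturesCol("indexedFeatures")
      .setMaxIter(100)

    val lrPipeline = new Pipeline().setStages(Array(lr, labelConverter))
    val lrModel = lrPipeline.fit(trainingData)
    val lrPredictions = lrModel.transform(testData)

Note that in Spark 1.6 the spark.ml LogisticRegression supports binary labels only, so this sketch applies to the Normal/Attack setting used in RandomForestAllDaysBinary.scala rather than to a multiclass attack-type labelling.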