├── SparkML ├── project │ ├── build.properties │ ├── plugins.sbt │ └── assembly.sbt ├── readme.md ├── build.sbt ├── src │ └── main │ │ └── scala │ │ ├── Main.scala │ │ ├── utils.scala │ │ ├── RandomForest.scala │ │ ├── RandomForestCluster.scala │ │ ├── RandomForestAllDaysBinary.scala │ │ └── RandomForestIndividualDays.scala └── LICENSE ├── Conv-LSTM-Keras ├── README.md └── Data_Extraction_Revised.py └── README.md /SparkML/project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version = 0.13.8 -------------------------------------------------------------------------------- /SparkML/project/plugins.sbt: -------------------------------------------------------------------------------- 1 | logLevel := Level.Warn -------------------------------------------------------------------------------- /SparkML/project/assembly.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.3") 2 | -------------------------------------------------------------------------------- /SparkML/readme.md: -------------------------------------------------------------------------------- 1 | This repo contains code for a [Spark](https://spark.apache.org) application that defines a pipeline for intrusion detection (supervised machine learning -- multiclass classification with Random Forest) based on IP flow data. 2 | 3 | The dataset can be found [here](http://www.unb.ca/research/iscx/dataset/iscx-IDS-dataset.html); it contains several well-known types of network attacks, such as DDoS and brute-force SSH connections. 4 | -------------------------------------------------------------------------------- /SparkML/build.sbt: -------------------------------------------------------------------------------- 1 | name := "iscx-ids-spark" 2 | 3 | version := "1.0" 4 | 5 | scalaVersion := "2.10.5" 6 | 7 | libraryDependencies ++= Seq( 8 | ("org.apache.spark" %% "spark-core" % "1.6.2" % "provided"). 9 | exclude("org.mortbay.jetty", "servlet-api"). 10 | exclude("commons-beanutils", "commons-beanutils-core"). 11 | exclude("commons-collections", "commons-collections"). 12 | exclude("commons-logging", "commons-logging"). 
13 | exclude("com.esotericsoftware.minlog", "minlog"), 14 | "org.apache.spark" %% "spark-mllib" % "1.6.2" % "provided", 15 | "com.databricks" %% "spark-csv" % "1.4.0", 16 | "com.databricks" %% "spark-xml" % "0.3.3" 17 | ) 18 | -------------------------------------------------------------------------------- /SparkML/src/main/scala/Main.scala: -------------------------------------------------------------------------------- 1 | package iscx 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.sql._ 5 | import org.apache.spark.sql.functions._ 6 | import utils.{loadISCX, initSpark} 7 | 8 | 9 | object Stats { 10 | def main(args: Array[String]) { 11 | val datasetPath = args match { 12 | case Array(p,_*) => p 13 | case _ => "/var/spark/datasets/iscxids/labeled/" 14 | } 15 | val (sc,sqlContext) = initSpark() 16 | val dataframes = loadISCX(sqlContext,datasetPath) 17 | 18 | dataframes.foreach { d => 19 | println("Dia: " + d._1) 20 | println("Número de fluxos: " + d._2.count.toString) 21 | val groupedByTag = d._2 22 | .groupBy("Tag") 23 | .agg(count("Tag").as("count")) 24 | // val normal = groupedByTag.filter(""Tag".equals("Normal")) 25 | println("Proporção normal/ataque: ") 26 | groupedByTag.show 27 | } 28 | sc.stop() 29 | } 30 | 31 | 32 | } 33 | -------------------------------------------------------------------------------- /SparkML/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright Luis Fernando Milano Oliveira (c) 2016 2 | 3 | All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | * Redistributions of source code must retain the above copyright 9 | notice, this list of conditions and the following disclaimer. 10 | 11 | * Redistributions in binary form must reproduce the above 12 | copyright notice, this list of conditions and the following 13 | disclaimer in the documentation and/or other materials provided 14 | with the distribution. 15 | 16 | * Neither the name of Luis Fernando Milano Oliveira nor the names of other 17 | contributors may be used to endorse or promote products derived 18 | from this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 21 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 22 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 23 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 24 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 25 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 26 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 27 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 28 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 30 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
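A note on Main.scala above: the commented-out line `// val normal = groupedByTag.filter(""Tag".equals("Normal"))` is malformed (unbalanced quotes) and never used. Below is a minimal sketch, assuming the same `d` loop variable and the `org.apache.spark.sql.functions._` import already present in Main.scala, of how the Normal-vs-Attack proportion printed by the Stats job could be computed explicitly:

    // Inside the dataframes.foreach loop of Stats.main: count Normal flows
    // against the total for this day's DataFrame (d._2) and print the ratio.
    val total  = d._2.count().toDouble
    val normal = d._2.filter(col("Tag") === "Normal").count().toDouble
    println(f"Normal: ${normal / total}%.3f  Attack: ${(total - normal) / total}%.3f")
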
-------------------------------------------------------------------------------- /Conv-LSTM-Keras/README.md: -------------------------------------------------------------------------------- 1 | # Intrusion Detection System using Deep Learning 2 | 3 | VGG-19 deep learning model trained using the ISCX 2012 IDS dataset 4 | 5 | # Frameworks & APIs 6 | 7 | * TensorFlow-GPU 8 | * Keras 9 | * NVIDIA CUDA Toolkit 9.0 10 | * NVIDIA cuDNN 7.0 11 | 12 | # Tools 13 | 14 | * Anaconda (Python 3.6) 15 | * PyCharm 16 | 17 | 18 | # How to use 19 | Download the ISCX 2012 dataset from: 20 | 21 | http://www.unb.ca/cic/datasets/ids.html 22 | 23 | Then run the Java flow-exporter tool (ISCX FlowMeter), available on GitHub; any Java IDE will do: 24 | 25 | https://github.com/ISCX/CICFlowMeter (if this does not convert .pcap to .xml, try the one below) 26 | 27 | https://github.com/ISCX/ISCXFlowMeter 28 | 29 | Next, make sure that your system is capable of running deep learning software. You can follow this guide: 30 | 31 | https://towardsdatascience.com/python-environment-setup-for-deep-learning-on-windows-10-c373786e36d1 32 | 33 | #### Note: if your system is inadequate, stop here; training will be very slow and a great deal of time will be wasted. 34 | 35 | Next, run the following Python script on the labelled XML files (change the input and output paths in the code). It extracts the relevant fields from each XML file and converts them into NumPy arrays: 36 | 37 | Data_Extraction_Revised.py 38 | 39 | When it completes, you can run the notebook below (assuming you have Jupyter Notebook). 40 | Change the save-file location in the code to the file produced by the revised data extraction script: 41 | 42 | FYP-Revised.ipynb 43 | 44 | Then you can begin training. 45 | 46 | ## GOOD LUCK :) 47 | -------------------------------------------------------------------------------- /Conv-LSTM-Keras/Data_Extraction_Revised.py: -------------------------------------------------------------------------------- 1 | import xml.etree.ElementTree as ET 2 | import numpy as np 3 | import os 4 | import time 5 | 6 | import_directory = 'C:\\Users\Tamim Mirza\Documents\ISCX\labeled_flows_xml\\' 7 | 8 | files = os.listdir(import_directory) 9 | 10 | errors = []  # XML files that could not be parsed 11 | 12 | start_time = time.time() 13 | i = -1 14 | data_array = np.empty((0, 2)) 15 | counter = 0 16 | actual = (50**2) * 3  # 50*50*3 = 7500 payload characters kept per flow 17 | for file in files: 18 | print(file) 19 | try: 20 | tree = ET.parse(import_directory + file) 21 | print('Reading File ', file) 22 | root = tree.getroot() 23 | except Exception: 24 | errors.append(file)  # record the file that failed to parse 25 | continue 26 | for child in root: 27 | for next_child in child: 28 | if next_child.tag == 'destinationPayloadAsUTF': 29 | if next_child.text is not None: 30 | x = next_child.text 31 | if len(x) > actual: 32 | x = x[: actual] 33 | else: 34 | while len(x) < actual: 35 | x += x 36 | x = x[:actual] 37 | if child.find('Tag').text == 'Normal': 38 | data_array = np.vstack((data_array, np.array([np.fromstring(x, dtype=np.uint8), 0]))) 39 | else: 40 | data_array = np.vstack((data_array, np.array([np.fromstring(x, dtype=np.uint8), 1]))) 41 | counter += 1 42 | print('Time taken: {}'.format(time.time() - start_time)) 43 | start_time = time.time() 44 | np.save('Database2\\destinationPayload_' + file, np.array(data_array)) 45 | data_array = np.empty((0, 2)) 46 | 47 | print('Error in Opening Files = ', errors) 48 | print('Counter = ', 
counter) 49 | print('DONE!') -------------------------------------------------------------------------------- /SparkML/src/main/scala/utils.scala: -------------------------------------------------------------------------------- 1 | package iscx 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.SparkContext._ 5 | import org.apache.spark.SparkConf 6 | import org.apache.spark.sql._ 7 | 8 | object utils { 9 | def initSpark() : (SparkContext,SQLContext) = { 10 | val conf = new SparkConf().setAppName("Simple Application") 11 | .setMaster("spark://10.90.67.77:7077") 12 | // .setMaster("local[4]") 13 | val sc = new SparkContext(conf) 14 | sc.setLogLevel("WARN") 15 | val sqlContext = new org.apache.spark.sql.SQLContext(sc) 16 | (sc,sqlContext) 17 | } 18 | 19 | def loadISCX(sqlContext : SQLContext, path : String) : Array[(String, DataFrame)] = { 20 | val days : Array[String] = Array( 21 | "TestbedSatJun12" 22 | , "TestbedSunJun13" 23 | , "TestbedMonJun14" 24 | , "TestbedTueJun15" 25 | , "TestbedWedJun16" 26 | , "TestbedThuJun17" 27 | ) 28 | 29 | val xmlFiles = days.map(d => path + d + ".xml") 30 | val zipped = days.zip(xmlFiles) 31 | 32 | zipped.map { d => 33 | (d._1.drop(10), sqlContext 34 | .read 35 | .format("com.databricks.spark.xml") 36 | .option("rowTag",d._1 + "Flows") 37 | .load(d._2) 38 | ) 39 | } 40 | // TestbedJun12 41 | // val jun12 = sqlContext.read 42 | // .format("com.databricks.spark.xml") 43 | // .option("rowTag",days(0)) 44 | // .load(xmlFiles(0)) 45 | // // TestbedJun13 46 | // val jun13 = sqlContext.read 47 | // .format("com.databricks.spark.xml") 48 | // .option("rowTag",days(1) + "Flows") 49 | // .load(xmlFiles(1)) 50 | // // TestbedJun14 51 | // val jun14 = sqlContext.read 52 | // .format("com.databricks.spark.xml") 53 | // .option("rowTag",days(2) + "Flows") 54 | // .load(xmlFiles(2)) 55 | // // TestbedJun15 56 | // val jun15 = sqlContext.read 57 | // .format("com.databricks.spark.xml") 58 | // .option("rowTag",days(3) + "Flows") 59 | // .load(xmlFiles(3)) 60 | // // TestbedJun16 61 | // val jun16 = sqlContext.read 62 | // .format("com.databricks.spark.xml") 63 | // .option("rowTag",days(4) + "Flows") 64 | // .load(xmlFiles(4)) 65 | // // TestbedJun17 66 | // val jun13 = sqlContext.read 67 | // .format("com.databricks.spark.xml") 68 | // .option("rowTag",days(1) + "Flows") 69 | // .load(xmlFiles(2)) 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # A Scalable Deep Learning-based Intrusion Detection System using Conv-LSTM Network 2 | Intrusion detection system with Apache Spark and deep learning 3 | 4 | # Why and how to use this repository? 5 | This repository contains the implementation details and the code for our paper titled "A Scalable and Hybrid Deep Learning-based Intrusion Detection System using Convolutional-LSTM Network". This papers has been submitted to "Symmetry — Open Access Journal" (see http://www.mdpi.com/journal/symmetry). 6 | 7 | In network intrusion detection (IDS), anomaly-based approaches in particular suffer from accurate evaluation, comparison, and deployment which originates from the scarcity of adequate datasets. Many such datasets are internal and cannot be shared due to privacy issues, others are heavily anonymized and do not reflect current trends, or they lack certain statistical characteristics. These deficiencies are primarily the reasons why a perfect dataset is yet to exist. 
Thus, researchers must resort to whatever datasets they can obtain, which are often suboptimal. 8 | 9 | As network behaviours and patterns change and intrusions evolve, it has become necessary to move away from static, one-time datasets toward dynamically generated datasets that not only reflect the traffic compositions and intrusions of their time, but are also modifiable, extensible, and reproducible. 10 | 11 | As a proof of concept, we use the Intrusion Detection Evaluation Dataset (ISCXIDS2012) to solve a classification problem that accurately identifies anomalies. 12 | 13 | To show the effectiveness of our proposed approach, we implemented the first stage in Scala using Spark MLlib as the ML platform. The Conv-LSTM network, on the other hand, was implemented in Python using Keras. 14 | 15 | Experiments were performed on a computing cluster with 32 cores running 64-bit Ubuntu 14.04 OS. The software stack consisted of Apache Spark v2.3.0, Java (JDK) 1.8, Scala 2.11.8, and Keras. The Conv-LSTM network was trained on an Nvidia TitanX GPU with CUDA and cuDNN enabled to improve overall pipeline speed. 16 | 17 | ## Spark MLlib-based classifiers: 18 | The following classifiers have been implemented to solve both classification problems in a two-stage cascading style: 19 | - Logistic Regression 20 | - Decision Trees 21 | - Random Forest 22 | - Multilayer Perceptron (MLP). 23 | 24 | We also implemented Spark + H2O (a.k.a. Sparkling Water) versions. Take a look at the ArrhythmiaPredictionH2O.scala and URLReputationH2O.scala classes for classifying cardiac arrhythmia and identifying suspicious URLs, respectively. 25 | 26 | Make sure that Spark is properly configured. You also need to have Maven installed on Linux. If you prefer Eclipse or IntelliJ IDEA, make sure that the Maven and Scala plugins are installed. 27 | 28 | If everything is properly configured, you can create an uber JAR containing all the dependencies and execute it. Alternatively, you can execute each implementation as a stand-alone Scala project from your favourite IDE. 29 | 30 | ## DeepLearning4j-based LSTM networks: 31 | A Long Short-Term Memory (LSTM) network has been implemented to solve the classification problem. The following are prerequisites when working with DL4J: 32 | - Java 1.8+ (64-bit only) 33 | - Apache Maven as the automated build and dependency manager 34 | - IntelliJ IDEA or Eclipse IDE. 35 | 36 | For more information on how to configure DeepLearning4j, please refer to https://deeplearning4j.org/. If everything is properly configured, you can create an uber JAR containing all the dependencies and execute it. Alternatively, you can execute each implementation as a stand-alone Java project from your favourite IDE. 37 | 38 | ## Citation request 39 | If you reuse this implementation, please cite our paper: 40 | 41 | @inproceedings{khan2018bigdata, 42 | title={A Scalable and Hybrid Deep Learning-based Intrusion Detection System using Convolutional-LSTM Network}, 43 | author={M. A., Khan; Karim, Md. Rezaul; Y. 
Kim }, 44 | booktitle={Symmetry — Open Access Journal}, 45 | year={2019} 46 | } 47 | 48 | ## Contributing 49 | For any questions, feel free to open an issue or contact at rezaul.karim@rwth-aachen.de 50 | -------------------------------------------------------------------------------- /SparkML/src/main/scala/RandomForest.scala: -------------------------------------------------------------------------------- 1 | package iscx 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.sql._ 5 | import org.apache.spark.sql.functions._ 6 | import utils.{loadISCX, initSpark} 7 | import org.apache.spark.sql.Row 8 | 9 | 10 | import org.apache.spark.ml.{Pipeline, PipelineStage} 11 | import org.apache.spark.ml.classification.{RandomForestClassificationModel, RandomForestClassifier} 12 | import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator 13 | import org.apache.spark.ml.feature.{IndexToString, StringIndexer, VectorIndexer, VectorAssembler} 14 | 15 | 16 | 17 | object RandomForest { 18 | def main(args: Array[String]) { 19 | val datasetPath = args match { 20 | case Array(p,_*) => p 21 | case _ => "/var/spark/datasets/iscxids/labeled/" 22 | } 23 | val (sc,sqlContext) = initSpark() 24 | // Array[(String, DataFrame)] 25 | val dataframes = loadISCX(sqlContext,datasetPath) 26 | 27 | Array(dataframes(2)).foreach { d => 28 | val data = d._2.select( 29 | "Tag" 30 | , "appName" 31 | , "destination" 32 | , "destinationPort" 33 | , "destinationTCPFlagsDescription" 34 | , "direction" 35 | , "protocolName" 36 | , "source" 37 | , "sourcePort" 38 | , "sourceTCPFlagsDescription" 39 | , "startDateTime" 40 | , "stopDateTime" 41 | , "totalDestinationBytes" 42 | , "totalDestinationPackets" 43 | , "totalSourceBytes" 44 | , "totalSourcePackets" 45 | ).na.fill("N/A") 46 | 47 | // MinMax 48 | // val (dstByMin, dstByMax) = data.agg(min($"totalDestinationBytes"), max($"totalDestinationBytes")).first match { 49 | // case Row(x: Double, y: Double) => (x, y) 50 | // } 51 | 52 | // val scaledRange = lit(1) // Range of the scaled variable 53 | // val scaledMin = lit(0) // Min value of the scaled variable 54 | // val vNormalized = ($"totalDestinationBytes" - vMin) / (vMax - vMin) // v normalized to (0, 1) range 55 | 56 | // val vScaled = scaledRange * vNormalized + scaledMin 57 | // /MinMax 58 | val filteredData = sqlContext.createDataFrame(data.map { row => 59 | Row( 60 | row.getString(0) // tag 61 | , row.getString(1) // appName 62 | , row.getString(2).split("\\.").take(2).mkString(".") // destination 63 | , row.getLong(3) // destinationPort 64 | , row.getString(4) // destinationTCPFlagsDescription 65 | , row.getString(5) // direction 66 | , row.getString(6) // protocolName 67 | , row.getString(7).split("\\.").take(2).mkString(".") // destination 68 | , row.getLong(8) // sourcePort 69 | , row.getString(9) // sourceTCPFlagsDescription 70 | , row.getString(10).drop(11).take(2) // startDateTime 71 | , row.getString(11).drop(11).take(2)// stopDateTime 72 | , row.getLong(12) // totalDestinationBytes 73 | , row.getLong(13) // totalDestinationPackets 74 | , row.getLong(14) // totalSourceBytes 75 | , row.getLong(15) // totalSourcePackets 76 | ) 77 | }, data.schema) 78 | 79 | 80 | // Transform the non-numerical features using the pipeline api 81 | val stringColumns = filteredData.columns 82 | .filter(!_.contains("Payload")) 83 | .filter(!_.contains("total")) 84 | .filter(!_.contains("Port")) 85 | 86 | val longColumns = filteredData.columns 87 | .filter(c => c.contains("total") || c.contains("Port")) 88 | 89 | 
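// Hedged sketch (kept commented out, not part of the original pipeline): the
// commented-out MinMax block above references vMin/vMax that were never defined.
// One way to complete it for a single column, using the min/max/col functions
// already imported in this file, could be:
// val Row(vMin: Long, vMax: Long) =
//   data.agg(min("totalDestinationBytes"), max("totalDestinationBytes")).first
// val normalizedCol = (col("totalDestinationBytes") - vMin) / (vMax - vMin)
// val scaledData = data.withColumn("totalDestinationBytesScaled", normalizedCol)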
// minMax 90 | 91 | // Index labels, adding metadata to the label column. 92 | // Fit on whole dataset to include all labels in index. 93 | val labelIndexer = new StringIndexer() 94 | .setInputCol("Tag") 95 | .setOutputCol("indexedLabel") 96 | 97 | val transformers: Array[PipelineStage] = stringColumns 98 | .map(cname => new StringIndexer() 99 | .setInputCol(cname) 100 | .setOutputCol(s"${cname}_index") 101 | ) 102 | 103 | val assembler = new VectorAssembler() 104 | .setInputCols((stringColumns 105 | .map(cname => s"${cname}_index")) ++ longColumns) 106 | .setOutputCol("features") 107 | 108 | // Automatically identify categorical features, and index them. 109 | // Set maxCategories so features with > 10 distinct values are treated as continuous. 110 | val featureIndexer = new VectorIndexer() 111 | .setInputCol("features") 112 | .setOutputCol("indexedFeatures") 113 | .setMaxCategories(10) 114 | 115 | // Split the data into training and test sets (30% held out for testing) 116 | 117 | // Train a RandomForest model. 118 | val rf = new RandomForestClassifier() 119 | .setLabelCol("indexedLabel") 120 | .setFeaturesCol("indexedFeatures") 121 | .setNumTrees(32) 122 | .setMaxBins(10000) 123 | 124 | // Convert indexed labels back to original labels. 125 | val labelConverter = new IndexToString() 126 | .setInputCol("prediction") 127 | .setOutputCol("predictedLabel") 128 | .setLabels(Array("Normal","Attack")) 129 | 130 | // Chain indexers and forest in a Pipeline 131 | 132 | val transformationStages : Array[PipelineStage] = 133 | Array(labelIndexer) ++ 134 | transformers :+ 135 | assembler :+ 136 | featureIndexer 137 | val preProcessers = new Pipeline().setStages(transformationStages) 138 | 139 | val stages : Array[PipelineStage] = 140 | Array(rf,labelConverter) 141 | 142 | val dataModel = preProcessers.fit(filteredData) 143 | val transformedData = dataModel.transform(filteredData) 144 | 145 | transformedData.write 146 | .format("com.databricks.spark.csv") 147 | .option("header", "true") 148 | .save("/var/spark/datasets/iscx-processed/" + d._1) 149 | 150 | 151 | val pipeline = new Pipeline() 152 | .setStages(stages) 153 | 154 | val Array(trainingData, testData) = transformedData.randomSplit(Array(0.7, 0.3)) 155 | trainingData.cache() 156 | testData.cache() 157 | // Train model. This also runs the indexers. 158 | val model = pipeline.fit(trainingData) 159 | 160 | // // Make predictions. 161 | val predictions = model.transform(testData) 162 | 163 | // // Select example rows to display. 
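// Hedged sketch (not part of the original code): the evaluator further below
// reports only Spark 1.6's "precision" metric. The same evaluator also accepts
// "weightedPrecision", "weightedRecall" and "f1", e.g.:
// val f1 = new MulticlassClassificationEvaluator()
//   .setLabelCol("indexedLabel")
//   .setPredictionCol("prediction")
//   .setMetricName("f1")
//   .evaluate(predictions)
// println("F1 = " + f1)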
164 | predictions.select("predictedLabel", "Tag", "features").show(5) 165 | 166 | val rfModel = model.stages.init.last.asInstanceOf[RandomForestClassificationModel] 167 | println("Learned classification forest model:\n" + rfModel.toDebugString) 168 | val featuresImportance = rfModel.featureImportances.toArray.mkString(",") 169 | println(s"Feature Importances for" + d._1) 170 | println(featuresImportance) 171 | 172 | // // Select (prediction, true label) and compute test error 173 | val evaluator = new MulticlassClassificationEvaluator() 174 | .setLabelCol("indexedLabel") 175 | .setPredictionCol("prediction") 176 | .setMetricName("precision") 177 | val accuracy = evaluator.evaluate(predictions) 178 | println("Test Error = " + (1.0 - accuracy)) 179 | 180 | } 181 | sc.stop() 182 | } 183 | } 184 | -------------------------------------------------------------------------------- /SparkML/src/main/scala/RandomForestCluster.scala: -------------------------------------------------------------------------------- 1 | package iscx 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.SparkConf 5 | import org.apache.spark.sql._ 6 | import org.apache.spark.sql.functions._ 7 | import utils.{loadISCX, initSpark} 8 | import org.apache.spark.sql.Row 9 | 10 | 11 | import org.apache.spark.ml.{Pipeline, PipelineStage} 12 | import org.apache.spark.ml.classification.{RandomForestClassificationModel, RandomForestClassifier} 13 | import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator 14 | import org.apache.spark.ml.feature.{IndexToString, StringIndexer, VectorIndexer, VectorAssembler} 15 | 16 | object RandomForestCluster { 17 | def main(args: Array[String]) { 18 | val datasetPath = args match { 19 | case Array(p,_*) => p 20 | case _ => "/var/spark/datasets/iscxids/labeled/" 21 | } 22 | val conf = new SparkConf().setAppName("Simple Application") 23 | .setMaster("spark://10.90.67.77:7077") 24 | val sc = new SparkContext(conf) 25 | sc.setLogLevel("WARN") 26 | val sqlContext = new org.apache.spark.sql.SQLContext(sc) 27 | 28 | // Array[(String, DataFrame)] 29 | val dataframes = loadISCX(sqlContext,datasetPath) 30 | 31 | Array(dataframes(0)).foreach { d => 32 | val data = d._2.select( 33 | "Tag" 34 | , "appName" 35 | , "destination" 36 | , "destinationPort" 37 | , "destinationTCPFlagsDescription" 38 | , "direction" 39 | , "protocolName" 40 | , "source" 41 | , "sourcePort" 42 | , "sourceTCPFlagsDescription" 43 | , "startDateTime" 44 | , "stopDateTime" 45 | , "totalDestinationBytes" 46 | , "totalDestinationPackets" 47 | , "totalSourceBytes" 48 | , "totalSourcePackets" 49 | ).na.fill("N/A") 50 | 51 | // MinMax 52 | // val (dstByMin, dstByMax) = data.agg(min($"totalDestinationBytes"), max($"totalDestinationBytes")).first match { 53 | // case Row(x: Double, y: Double) => (x, y) 54 | // } 55 | 56 | // val scaledRange = lit(1) // Range of the scaled variable 57 | // val scaledMin = lit(0) // Min value of the scaled variable 58 | // val vNormalized = ($"totalDestinationBytes" - vMin) / (vMax - vMin) // v normalized to (0, 1) range 59 | 60 | // val vScaled = scaledRange * vNormalized + scaledMin 61 | // /MinMax 62 | val filteredData = sqlContext.createDataFrame(data.map { row => 63 | Row( 64 | row.getString(0) // tag 65 | , row.getString(1) // appName 66 | , row.getString(2).split("\\.").take(2).mkString(".") // destination 67 | , row.getLong(3) // destinationPort 68 | , row.getString(4) // destinationTCPFlagsDescription 69 | , row.getString(5) // direction 70 | , row.getString(6) // 
protocolName 71 | , row.getString(7).split("\\.").take(2).mkString(".") // destination 72 | , row.getLong(8) // sourcePort 73 | , row.getString(9) // sourceTCPFlagsDescription 74 | , row.getString(10).drop(11).take(2) // startDateTime 75 | , row.getString(11).drop(11).take(2)// stopDateTime 76 | , row.getLong(12) // totalDestinationBytes 77 | , row.getLong(13) // totalDestinationPackets 78 | , row.getLong(14) // totalSourceBytes 79 | , row.getLong(15) // totalSourcePackets 80 | ) 81 | }, data.schema) 82 | 83 | 84 | // Transform the non-numerical features using the pipeline api 85 | val stringColumns = filteredData.columns 86 | .filter(!_.contains("Payload")) 87 | .filter(!_.contains("total")) 88 | .filter(!_.contains("Port")) 89 | .filter(!_.contains("Tag")) 90 | 91 | val longColumns = filteredData.columns 92 | .filter(c => c.contains("total") || c.contains("Port")) 93 | 94 | // minMax 95 | 96 | // Index labels, adding metadata to the label column. 97 | // Fit on whole dataset to include all labels in index. 98 | val labelIndexer = new StringIndexer() 99 | .setInputCol("Tag") 100 | .setOutputCol("indexedLabel") 101 | 102 | val transformers: Array[PipelineStage] = stringColumns 103 | .map(cname => new StringIndexer() 104 | .setInputCol(cname) 105 | .setOutputCol(s"${cname}_index") 106 | ) 107 | 108 | val assembler = new VectorAssembler() 109 | .setInputCols((stringColumns 110 | .map(cname => s"${cname}_index")) ++ longColumns) 111 | .setOutputCol("features") 112 | 113 | // Automatically identify categorical features, and index them. 114 | // Set maxCategories so features with > 10 distinct values are treated as continuous. 115 | val featureIndexer = new VectorIndexer() 116 | .setInputCol("features") 117 | .setOutputCol("indexedFeatures") 118 | .setMaxCategories(10) 119 | 120 | // Split the data into training and test sets (30% held out for testing) 121 | 122 | // Train a RandomForest model. 123 | val rf = new RandomForestClassifier() 124 | .setLabelCol("indexedLabel") 125 | .setFeaturesCol("indexedFeatures") 126 | .setNumTrees(32) 127 | .setMaxBins(10000) 128 | 129 | // Convert indexed labels back to original labels. 130 | val labelConverter = new IndexToString() 131 | .setInputCol("prediction") 132 | .setOutputCol("predictedLabel") 133 | .setLabels(Array("Normal","Attack")) 134 | 135 | // Chain indexers and forest in a Pipeline 136 | 137 | val transformationStages : Array[PipelineStage] = 138 | Array(labelIndexer) ++ 139 | transformers :+ 140 | assembler :+ 141 | featureIndexer 142 | val preProcessers = new Pipeline().setStages(transformationStages) 143 | 144 | val stages : Array[PipelineStage] = 145 | Array(rf,labelConverter) 146 | 147 | val dataModel = preProcessers.fit(filteredData) 148 | val transformedData = dataModel.transform(filteredData) 149 | 150 | val pipeline = new Pipeline() 151 | .setStages(stages) 152 | 153 | val Array(trainingData, testData) = transformedData.randomSplit(Array(0.7, 0.3)) 154 | trainingData.cache() 155 | testData.cache() 156 | // Train model. This also runs the indexers. 157 | val model = pipeline.fit(trainingData) 158 | 159 | // // Make predictions. 160 | val predictions = model.transform(testData) 161 | 162 | // // Select example rows to display. 
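// Hedged sketch (not part of the original code): instead of hard-coding
// Array("Normal","Attack") in the IndexToString stage above, the label order
// actually learned by the StringIndexer could be read back from the fitted
// pre-processing pipeline (labelIndexer is its first stage):
// val learnedLabels = dataModel.stages.head
//   .asInstanceOf[org.apache.spark.ml.feature.StringIndexerModel].labels
// println("Label order learned for Tag: " + learnedLabels.mkString(", "))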
163 | predictions.select("predictedLabel", "Tag", "features").show(5) 164 | 165 | val rfModel = model.stages.init.last.asInstanceOf[RandomForestClassificationModel] 166 | println("Learned classification forest model:\n" + rfModel.toDebugString) 167 | val featuresImportance = rfModel.featureImportances.toArray.mkString(",") 168 | println(s"Feature Importances for" + d._1) 169 | println(featuresImportance) 170 | 171 | // // Select (prediction, true label) and compute test error 172 | val evaluator = new MulticlassClassificationEvaluator() 173 | .setLabelCol("indexedLabel") 174 | .setPredictionCol("prediction") 175 | .setMetricName("precision") 176 | val accuracy = evaluator.evaluate(predictions) 177 | println("Test Error = " + (1.0 - accuracy)) 178 | 179 | } 180 | sc.stop() 181 | } 182 | } 183 | -------------------------------------------------------------------------------- /SparkML/src/main/scala/RandomForestAllDaysBinary.scala: -------------------------------------------------------------------------------- 1 | package iscx 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.sql._ 5 | import org.apache.spark.sql.functions._ 6 | import utils.{loadISCX, initSpark} 7 | import org.apache.spark.sql.Row 8 | 9 | 10 | import org.apache.spark.ml.{Pipeline, PipelineStage} 11 | import org.apache.spark.ml.classification.{RandomForestClassificationModel, RandomForestClassifier} 12 | import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator 13 | import org.apache.spark.ml.feature.{IndexToString, StringIndexer, VectorIndexer, VectorAssembler} 14 | 15 | 16 | 17 | object RandomForestAllDaysBinary { 18 | def main(args: Array[String]) { 19 | val datasetPath = args match { 20 | case Array(p,_*) => p 21 | case _ => "/var/spark/datasets/iscxids/labeled/" 22 | } 23 | val (sc,sqlContext) = initSpark() 24 | // Array[(String, DataFrame)] 25 | val dataframes = loadISCX(sqlContext,datasetPath).map(_._2) 26 | val days = dataframes.map(_.select( 27 | "Tag" 28 | , "appName" 29 | , "destination" 30 | , "destinationPort" 31 | , "destinationTCPFlagsDescription" 32 | , "direction" 33 | , "protocolName" 34 | , "source" 35 | , "sourcePort" 36 | , "sourceTCPFlagsDescription" 37 | , "startDateTime" 38 | , "stopDateTime" 39 | , "totalDestinationBytes" 40 | , "totalDestinationPackets" 41 | , "totalSourceBytes" 42 | , "totalSourcePackets" 43 | ).na.fill("N/A") 44 | ) 45 | val data = days.reduceLeft((a,b) => 46 | a.unionAll(b)) 47 | 48 | 49 | // MinMax 50 | // val (dstByMin, dstByMax) = data.agg(min($"totalDestinationBytes"), max($"totalDestinationBytes")).first match { 51 | // case Row(x: Double, y: Double) => (x, y) 52 | // } 53 | 54 | // val scaledRange = lit(1) // Range of the scaled variable 55 | // val scaledMin = lit(0) // Min value of the scaled variable 56 | // val vNormalized = ($"totalDestinationBytes" - vMin) / (vMax - vMin) // v normalized to (0, 1) range 57 | 58 | // val vScaled = scaledRange * vNormalized + scaledMin 59 | // /MinMax 60 | val filteredData = sqlContext.createDataFrame(data.map { row => 61 | Row( 62 | row.getString(0) // tag 63 | , row.getString(1) // appName 64 | , row.getString(2).split("\\.").take(2).mkString(".") // destination 65 | , row.getLong(3) // destinationPort 66 | , row.getString(4) // destinationTCPFlagsDescription 67 | , row.getString(5) // direction 68 | , row.getString(6) // protocolName 69 | , row.getString(7).split("\\.").take(2).mkString(".") // destination 70 | , row.getLong(8) // sourcePort 71 | , row.getString(9) // sourceTCPFlagsDescription 72 
| , row.getString(10).drop(11).take(2) // startDateTime 73 | , row.getString(11).drop(11).take(2)// stopDateTime 74 | , row.getLong(12) // totalDestinationBytes 75 | , row.getLong(13) // totalDestinationPackets 76 | , row.getLong(14) // totalSourceBytes 77 | , row.getLong(15) // totalSourcePackets 78 | ) 79 | }, data.schema) 80 | 81 | 82 | // Transform the non-numerical features using the pipeline api 83 | val stringColumns = filteredData.columns 84 | .filter(!_.contains("Payload")) 85 | .filter(!_.contains("total")) 86 | .filter(!_.contains("Port")) 87 | .filter(!_.contains("Tag")) 88 | 89 | val longColumns = filteredData.columns 90 | .filter(c => c.contains("total") || c.contains("Port")) 91 | 92 | // minMax 93 | 94 | // Index labels, adding metadata to the label column. 95 | // Fit on whole dataset to include all labels in index. 96 | val labelIndexer = new StringIndexer() 97 | .setInputCol("Tag") 98 | .setOutputCol("indexedLabel") 99 | 100 | val transformers: Array[PipelineStage] = stringColumns 101 | .map(cname => new StringIndexer() 102 | .setInputCol(cname) 103 | .setOutputCol(s"${cname}_index") 104 | ) 105 | 106 | val assembler = new VectorAssembler() 107 | .setInputCols((stringColumns 108 | .map(cname => s"${cname}_index")) ++ longColumns) 109 | .setOutputCol("features") 110 | 111 | // Automatically identify categorical features, and index them. 112 | // Set maxCategories so features with > 10 distinct values are treated as continuous. 113 | val featureIndexer = new VectorIndexer() 114 | .setInputCol("features") 115 | .setOutputCol("indexedFeatures") 116 | .setMaxCategories(10) 117 | 118 | // Split the data into training and test sets (30% held out for testing) 119 | 120 | // Train a RandomForest model. 121 | val rf = new RandomForestClassifier() 122 | .setLabelCol("indexedLabel") 123 | .setFeaturesCol("indexedFeatures") 124 | .setNumTrees(32) 125 | .setMaxBins(20000) 126 | 127 | // Convert indexed labels back to original labels. 128 | val labelConverter = new IndexToString() 129 | .setInputCol("prediction") 130 | .setOutputCol("predictedLabel") 131 | .setLabels(Array("Normal","Attack")) 132 | 133 | // Chain indexers and forest in a Pipeline 134 | 135 | val transformationStages : Array[PipelineStage] = 136 | Array(labelIndexer) ++ 137 | transformers :+ 138 | assembler :+ 139 | featureIndexer 140 | val preProcessers = new Pipeline().setStages(transformationStages) 141 | 142 | val stages : Array[PipelineStage] = 143 | Array(rf,labelConverter) 144 | 145 | val dataModel = preProcessers.fit(filteredData) 146 | val transformedData = dataModel.transform(filteredData) 147 | 148 | // transformedData.write 149 | // .format("com.databricks.spark.csv") 150 | // .option("header", "true") 151 | // .save("/var/spark/datasets/iscx-processed/alldays") 152 | 153 | 154 | val pipeline = new Pipeline() 155 | .setStages(stages) 156 | 157 | val Array(trainingData, testData) = transformedData.randomSplit(Array(0.7, 0.3)) 158 | trainingData.cache() 159 | testData.cache() 160 | // Train model. This also runs the indexers. 161 | val model = pipeline.fit(trainingData) 162 | 163 | // // Make predictions. 164 | val predictions = model.transform(testData) 165 | 166 | // // Select example rows to display. 
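// Hedged sketch (not part of the original code): the 70/30 randomSplit above is
// unseeded, so the all-days run is not reproducible across executions. A fixed
// seed, plus a quick class-balance check on the unioned data, could look like:
// val Array(train, test) = transformedData.randomSplit(Array(0.7, 0.3), seed = 42L)
// transformedData.groupBy("Tag").count().show()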
167 | predictions.select("predictedLabel", "Tag", "features").show(5) 168 | 169 | val rfModel = model.stages.init.last.asInstanceOf[RandomForestClassificationModel] 170 | println("Learned classification forest model:\n" + rfModel.toDebugString) 171 | val featuresImportance = rfModel.featureImportances.toArray.mkString(",") 172 | println(s"Feature Importances") 173 | println(featuresImportance) 174 | 175 | // // Select (prediction, true label) and compute test error 176 | val evaluator = new MulticlassClassificationEvaluator() 177 | .setLabelCol("indexedLabel") 178 | .setPredictionCol("prediction") 179 | .setMetricName("precision") 180 | val accuracy = evaluator.evaluate(predictions) 181 | println("Test Error = " + (1.0 - accuracy)) 182 | 183 | sc.stop() 184 | } 185 | } 186 | -------------------------------------------------------------------------------- /SparkML/src/main/scala/RandomForestIndividualDays.scala: -------------------------------------------------------------------------------- 1 | package iscx 2 | 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.sql._ 5 | import org.apache.spark.sql.functions._ 6 | import utils.{loadISCX, initSpark} 7 | import org.apache.spark.sql.Row 8 | 9 | import scala.collection.mutable.{ArrayBuffer} 10 | 11 | 12 | import org.apache.spark.ml.{Pipeline, PipelineStage} 13 | import org.apache.spark.ml.classification.{RandomForestClassificationModel, RandomForestClassifier} 14 | import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator 15 | import org.apache.spark.ml.feature.{IndexToString, StringIndexer, VectorIndexer, VectorAssembler} 16 | 17 | 18 | 19 | object RandomForestIndividualDays { 20 | def main(args: Array[String]) { 21 | val datasetPath = args match { 22 | case Array(p,_*) => p 23 | case _ => "/var/spark/datasets/iscxids/labeled/" 24 | } 25 | val (sc,sqlContext) = initSpark() 26 | // Array[(String, DataFrame)] 27 | val dataframes = loadISCX(sqlContext,datasetPath) 28 | var featuresPerDay = new ArrayBuffer[(String, String)]() 29 | dataframes.foreach { d => 30 | val data = d._2.select( 31 | "Tag" 32 | , "appName" 33 | , "destination" 34 | , "destinationPort" 35 | , "destinationTCPFlagsDescription" 36 | , "direction" 37 | , "protocolName" 38 | , "source" 39 | , "sourcePort" 40 | , "sourceTCPFlagsDescription" 41 | , "startDateTime" 42 | , "stopDateTime" 43 | , "totalDestinationBytes" 44 | , "totalDestinationPackets" 45 | , "totalSourceBytes" 46 | , "totalSourcePackets" 47 | ).na.fill("N/A") 48 | 49 | // MinMax 50 | // val (dstByMin, dstByMax) = data.agg(min($"totalDestinationBytes"), max($"totalDestinationBytes")).first match { 51 | // case Row(x: Double, y: Double) => (x, y) 52 | // } 53 | 54 | // val scaledRange = lit(1) // Range of the scaled variable 55 | // val scaledMin = lit(0) // Min value of the scaled variable 56 | // val vNormalized = ($"totalDestinationBytes" - vMin) / (vMax - vMin) // v normalized to (0, 1) range 57 | 58 | // val vScaled = scaledRange * vNormalized + scaledMin 59 | // /MinMax 60 | val filteredData = sqlContext.createDataFrame(data.map { row => 61 | Row( 62 | row.getString(0) // tag 63 | , row.getString(1) // appName 64 | , row.getString(2).split("\\.").take(2).mkString(".") // destination 65 | , row.getLong(3) // destinationPort 66 | , row.getString(4) // destinationTCPFlagsDescription 67 | , row.getString(5) // direction 68 | , row.getString(6) // protocolName 69 | , row.getString(7).split("\\.").take(2).mkString(".") // destination 70 | , row.getLong(8) // sourcePort 71 | , 
row.getString(9) // sourceTCPFlagsDescription 72 | , row.getString(10).drop(11).take(2) // startDateTime 73 | , row.getString(11).drop(11).take(2)// stopDateTime 74 | , row.getLong(12) // totalDestinationBytes 75 | , row.getLong(13) // totalDestinationPackets 76 | , row.getLong(14) // totalSourceBytes 77 | , row.getLong(15) // totalSourcePackets 78 | ) 79 | }, data.schema) 80 | 81 | 82 | // Transform the non-numerical features using the pipeline api 83 | val stringColumns = filteredData.columns 84 | .filter(!_.contains("Payload")) 85 | .filter(!_.contains("total")) 86 | .filter(!_.contains("Port")) 87 | .filter(!_.contains("Tag")) 88 | 89 | val longColumns = filteredData.columns 90 | .filter(c => c.contains("total") || c.contains("Port")) 91 | 92 | // minMax 93 | 94 | // Index labels, adding metadata to the label column. 95 | // Fit on whole dataset to include all labels in index. 96 | val labelIndexer = new StringIndexer() 97 | .setInputCol("Tag") 98 | .setOutputCol("indexedLabel") 99 | 100 | val transformers: Array[PipelineStage] = stringColumns 101 | .map(cname => new StringIndexer() 102 | .setInputCol(cname) 103 | .setOutputCol(s"${cname}_index") 104 | ) 105 | 106 | val assembler = new VectorAssembler() 107 | .setInputCols((stringColumns 108 | .map(cname => s"${cname}_index")) ++ longColumns) 109 | .setOutputCol("features") 110 | 111 | // Automatically identify categorical features, and index them. 112 | // Set maxCategories so features with > 10 distinct values are treated as continuous. 113 | val featureIndexer = new VectorIndexer() 114 | .setInputCol("features") 115 | .setOutputCol("indexedFeatures") 116 | .setMaxCategories(10) 117 | 118 | // Split the data into training and test sets (30% held out for testing) 119 | 120 | // Train a RandomForest model. 121 | val rf = new RandomForestClassifier() 122 | .setLabelCol("indexedLabel") 123 | .setFeaturesCol("indexedFeatures") 124 | .setNumTrees(32) 125 | .setMaxBins(10000) 126 | 127 | // Convert indexed labels back to original labels. 128 | val labelConverter = new IndexToString() 129 | .setInputCol("prediction") 130 | .setOutputCol("predictedLabel") 131 | .setLabels(Array("Normal","Attack")) 132 | 133 | // Chain indexers and forest in a Pipeline 134 | 135 | val transformationStages : Array[PipelineStage] = 136 | Array(labelIndexer) ++ 137 | transformers :+ 138 | assembler :+ 139 | featureIndexer 140 | val preProcessers = new Pipeline().setStages(transformationStages) 141 | 142 | val stages : Array[PipelineStage] = 143 | Array(rf,labelConverter) 144 | 145 | val dataModel = preProcessers.fit(filteredData) 146 | val transformedData = dataModel.transform(filteredData) 147 | 148 | transformedData.write 149 | .format("com.databricks.spark.csv") 150 | .option("header", "true") 151 | .save("/var/spark/datasets/iscx-processed/" + d._1) 152 | 153 | 154 | val pipeline = new Pipeline() 155 | .setStages(stages) 156 | 157 | val Array(trainingData, testData) = transformedData.randomSplit(Array(0.7, 0.3)) 158 | trainingData.cache() 159 | testData.cache() 160 | // Train model. This also runs the indexers. 161 | val model = pipeline.fit(trainingData) 162 | 163 | // // Make predictions. 164 | val predictions = model.transform(testData) 165 | 166 | // // Select example rows to display. 
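// Hedged sketch (not part of the original code): the raw importance vector
// collected into featuresPerDay below is hard to interpret on its own; zipping it
// with the VectorAssembler's input column order (once rfModel is extracted a few
// lines further down) would name each value:
// val featureNames = stringColumns.map(c => s"${c}_index") ++ longColumns
// val namedImportances = featureNames.zip(rfModel.featureImportances.toArray)
// namedImportances.sortBy(-_._2).foreach { case (n, v) => println(n + " -> " + v) }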
167 | predictions.select("predictedLabel", "Tag", "features").show(5) 168 | 169 | val rfModel = model.stages.init.last.asInstanceOf[RandomForestClassificationModel] 170 | println("Learned classification forest model:\n" + rfModel.toDebugString) 171 | val featuresImportance = rfModel.featureImportances.toArray.mkString(",") 172 | featuresPerDay += ((d._1, featuresImportance)) 173 | println(s"Feature Importances for" + d._1) 174 | println(featuresImportance) 175 | 176 | // // Select (prediction, true label) and compute test error 177 | val evaluator = new MulticlassClassificationEvaluator() 178 | .setLabelCol("indexedLabel") 179 | .setPredictionCol("prediction") 180 | .setMetricName("precision") 181 | val accuracy = evaluator.evaluate(predictions) 182 | println("Test Error = " + (1.0 - accuracy)) 183 | 184 | } 185 | println("Features Importance for individual days:") 186 | featuresPerDay.foreach(println) 187 | sc.stop() 188 | } 189 | } 190 | --------------------------------------------------------------------------------
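The top-level README lists Logistic Regression, Decision Trees, and MLP alongside Random Forest, but only the Random Forest pipelines are included under SparkML/. As a rough, untested sketch of how another Spark MLlib classifier could reuse the same pre-processing stages and variables defined in RandomForest.scala (labelIndexer, transformers, assembler, featureIndexer, labelConverter, trainingData, testData), a Logistic Regression stage could be swapped in for the RandomForestClassifier:

    import org.apache.spark.ml.Pipeline
    import org.apache.spark.ml.classification.LogisticRegression

    // Hypothetical drop-in for the RandomForestClassifier stage: it consumes the
    // same "indexedFeatures"/"indexedLabel" columns produced by the pre-processing
    // pipeline and feeds its "prediction" column to the existing IndexToString stage.
    val lr = new LogisticRegression()
      .setLabelCol("indexedLabel")
      .setFeaturesCol("indexedFeatures")
      .setMaxIter(100)

    val lrPipeline = new Pipeline().setStages(Array(lr, labelConverter))
    val lrModel = lrPipeline.fit(trainingData)
    val lrPredictions = lrModel.transform(testData)

Note that in Spark 1.6 the spark.ml LogisticRegression supports binary labels only, so this sketch applies to the Normal/Attack setting used in RandomForestAllDaysBinary.scala rather than to a multiclass attack-type labelling.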