├── Chapter01 ├── .classpath ├── .project ├── data │ └── Cryotherapy.csv ├── pom.xml └── src │ └── main │ └── scala │ └── GettingStartedML │ └── CryotherapyPrediction.scala ├── Chapter02 ├── pom.xml └── src │ └── main │ └── scala │ └── RegressionAnalysis │ ├── EDA.scala │ ├── UrbanTrafficGeneralizedLinearRegression.scala │ └── UrbanTrafficLinearRegression.scala ├── Chapter03 ├── pom.xml └── src │ └── main │ └── scala │ └── ScalaClassification │ ├── ChurnPredictionLR.scala │ ├── ChurnPredictionNB.scala │ ├── ChurnPredictionSVM.scala │ ├── Describe.scala │ ├── PipelineConstruction.scala │ └── Preprocessing.scala ├── Chapter04 ├── pom.xml └── src │ └── main │ └── scala │ └── ScalaTreeEnsimbles │ ├── ChurnPredictionDT.scala │ ├── ChurnPredictionGBT.scala │ ├── ChurnPredictionRF.scala │ ├── Preproessing.scala │ ├── UrbanTrafficDTRegressor.scala │ ├── UrbanTrafficGBTRegressor.scala │ └── UrbanTrafficRFRegressor.scala ├── Chapter05 ├── pom.xml └── src │ └── main │ └── scala │ └── org │ └── fit │ └── genomics │ ├── PCA.scala │ └── PopStratClustering.scala ├── Chapter06 ├── pom.xml └── src │ └── main │ └── scala │ └── ScalaBookRecommendation │ └── BookRecommendation.scala ├── Chapter07 ├── pom.xml └── src │ ├── main │ └── scala │ │ └── GettingStartedDL │ │ ├── CancerDataPreprocessor.scala │ │ └── CancerTypePrediction.scala │ └── test │ └── scala │ └── com │ └── packt │ └── ScalaMLQuickStartGuide │ └── AppTest.java ├── LICENSE └── README.md /Chapter01/.classpath: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | -------------------------------------------------------------------------------- /Chapter01/.project: -------------------------------------------------------------------------------- 1 | 2 | 3 | ScalaMLQuickStartGuide 4 | 5 | 6 | 7 | 8 | 9 | org.scala-ide.sdt.core.scalabuilder 10 | 11 | 12 | 13 | 14 | org.eclipse.m2e.core.maven2Builder 15 | 16 | 17 | 18 | 19 | 20 | org.scala-ide.sdt.core.scalanature 21 | org.eclipse.jdt.core.javanature 22 | org.eclipse.m2e.core.maven2Nature 23 | 24 | 25 | -------------------------------------------------------------------------------- /Chapter01/data/Cryotherapy.csv: -------------------------------------------------------------------------------- 1 | sex,age,Time,Number_of_Warts,Type,Area,Result_of_Treatment 2 | 1,35,12,5,1,100,0 3 | 1,29,7,5,1,96,1 4 | 1,50,8,1,3,132,0 5 | 1,32,11.75,7,3,750,0 6 | 1,67,9.25,1,1,42,0 7 | 1,41,8,2,2,20,1 8 | 1,36,11,2,1,8,0 9 | 1,59,3.5,3,3,20,0 10 | 1,20,4.5,12,1,6,1 11 | 2,34,11.25,3,3,150,0 12 | 2,21,10.75,5,1,35,0 13 | 2,15,6,2,1,30,1 14 | 2,15,2,3,1,4,1 15 | 2,15,3.75,2,3,70,1 16 | 2,17,11,2,1,10,0 17 | 2,17,5.25,3,1,63,1 18 | 2,23,11.75,12,3,72,0 19 | 2,27,8.75,2,1,6,0 20 | 2,15,4.25,1,1,6,1 21 | 2,18,5.75,1,1,80,1 22 | 1,22,5.5,2,1,70,1 23 | 2,16,8.5,1,2,60,1 24 | 1,28,4.75,3,1,100,1 25 | 2,40,9.75,1,2,80,0 26 | 1,30,2.5,2,1,115,1 27 | 2,34,12,3,3,95,0 28 | 1,20,0.5,2,1,75,1 29 | 2,35,12,5,3,100,0 30 | 2,24,9.5,3,3,20,0 31 | 2,19,8.75,6,1,160,1 32 | 1,35,9.25,9,1,100,1 33 | 1,29,7.25,6,1,96,1 34 | 1,50,8.75,11,3,132,0 35 | 2,32,12,4,3,750,0 36 | 2,67,12,12,3,42,0 37 | 2,41,10.5,2,2,20,1 38 | 2,36,11,6,1,8,0 39 | 1,63,2.75,3,3,20,0 40 | 1,20,5,3,1,6,1 41 | 1,34,12,1,3,150,0 42 | 2,21,10.5,5,1,35,0 43 | 2,15,8,12,1,30,1 44 | 1,15,3.5,2,1,4,1 45 | 2,15,1.5,12,3,70,1 46 | 1,17,11.5,2,1,10,0 47 | 1,17,5.25,4,1,63,1 48 | 2,23,9.5,5,3,72,0 49 | 1,27,10,5,1,6,0 50 | 1,15,4,7,1,6,1 51 | 
2,18,4.5,8,1,80,1 52 | 2,22,5,9,1,70,1 53 | 1,16,10.25,3,2,60,1 54 | 2,28,4,11,1,100,1 55 | 2,40,8.75,6,2,80,0 56 | 2,30,0.5,8,3,115,1 57 | 1,34,10.75,1,3,95,0 58 | 1,20,3.75,11,1,75,1 59 | 2,35,8.5,6,3,100,0 60 | 1,24,9.5,8,1,20,1 61 | 2,19,8,9,1,160,1 62 | 1,35,7.25,2,1,100,1 63 | 1,29,11.75,5,1,96,0 64 | 2,50,9.5,4,3,132,0 65 | 2,32,12,12,3,750,0 66 | 1,67,10,7,1,42,0 67 | 2,41,7.75,5,2,20,1 68 | 2,36,10.5,4,1,8,0 69 | 1,67,3.75,11,3,20,0 70 | 1,20,4,3,1,6,1 71 | 1,34,11.25,1,3,150,0 72 | 2,21,10.75,7,1,35,0 73 | 1,15,10.5,11,1,30,1 74 | 1,15,2,11,1,4,1 75 | 2,15,2,10,3,70,1 76 | 1,17,9.25,12,1,10,0 77 | 1,17,5.75,10,1,63,1 78 | 1,23,10.25,7,3,72,0 79 | 1,27,10.5,7,1,6,0 80 | 1,15,5.5,5,1,6,1 81 | 1,18,4,1,1,80,1 82 | 2,22,4.5,2,1,70,1 83 | 1,16,11,3,2,60,1 84 | 2,28,5,9,1,100,1 85 | 1,40,11.5,9,2,80,0 86 | 1,30,0.25,10,1,115,1 87 | 2,34,12,3,3,95,0 88 | 2,20,3.5,6,1,75,1 89 | 2,35,8.25,8,3,100,0 90 | 1,24,10.75,10,1,20,1 91 | 1,19,8,8,1,160,1 92 | -------------------------------------------------------------------------------- /Chapter01/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | 5 | com.packt.AnomalyDetection 6 | RandomForest 7 | 0.0.1-SNAPSHOT 8 | jar 9 | 10 | ScalaMLQuickStartGuide 11 | http://maven.apache.org 12 | 13 | 14 | UTF-8 15 | 1.8 16 | 2.2.0 17 | 1.0.0-alpha 18 | 1.0.0-alpha 19 | 1.0.0-alpha 20 | 1.0.0-alpha 21 | 1.2.3 22 | 23 | 24 | 25 | 26 | jdk.tools 27 | jdk.tools 28 | 1.8.0_171 29 | system 30 | C:/Program Files/Java/jdk1.8.0_171/lib/tools.jar 31 | 32 | 33 | org.apache.directory.studio 34 | org.apache.commons.io 35 | 2.4 36 | 37 | 38 | org.deeplearning4j 39 | scalnet_2.11 40 | 1.0.0-alpha 41 | 42 | 43 | org.apache.spark 44 | spark-core_2.11 45 | ${spark.version} 46 | 47 | 48 | com.github.tototoshi 49 | scala-csv_2.10 50 | 1.3.5 51 | 52 | 53 | org.apache.spark 54 | spark-sql_2.11 55 | ${spark.version} 56 | 57 | 58 | com.github.scopt 59 | scopt_2.11 60 | 3.3.0 61 | 62 | 63 | com.typesafe 64 | config 65 | 1.2.1 66 | 67 | 68 | org.apache.directory.api 69 | api-util 70 | 1.0.0 71 | 72 | 73 | commons-io 74 | commons-io 75 | 2.6 76 | 77 | 78 | com.esotericsoftware.kryo 79 | kryo 80 | 2.10 81 | 82 | 83 | edu.stanford.nlp 84 | stanford-corenlp 85 | 3.6.0 86 | 87 | 88 | edu.stanford.nlp 89 | stanford-corenlp 90 | 3.6.0 91 | models 92 | 93 | 94 | org.apache.hadoop 95 | hadoop-common 96 | 2.6.0 97 | 98 | 99 | org.sameersingh.scalaplot 100 | scalaplot 101 | 0.0.4 102 | 103 | 104 | org.apache.spark 105 | spark-mllib_2.11 106 | ${spark.version} 107 | 108 | 109 | org.apache.spark 110 | spark-graphx_2.11 111 | ${spark.version} 112 | 113 | 114 | org.apache.spark 115 | spark-yarn_2.11 116 | ${spark.version} 117 | 118 | 119 | org.apache.spark 120 | spark-network-shuffle_2.11 121 | ${spark.version} 122 | 123 | 124 | com.databricks 125 | spark-csv_2.11 126 | 1.3.0 127 | 128 | 129 | com.holdenkarau 130 | spark-testing-base_2.10 131 | 2.0.0_0.6.0 132 | 133 | 134 | com.databricks 135 | spark-avro_2.11 136 | 4.0.0 137 | 138 | 139 | org.apache.commons 140 | commons-math3 141 | 3.2 142 | 143 | 144 | org.apache.hive 145 | hive-exec 146 | 2.3.2 147 | 148 | 149 | junit 150 | junit 151 | 3.8.1 152 | test 153 | 154 | 155 | org.nd4j 156 | nd4j-native 157 | ${nd4j.version} 158 | 159 | 160 | org.deeplearning4j 161 | deeplearning4j-ui_2.11 162 | ${dl4j.version} 163 | 164 | 165 | org.deeplearning4j 166 | deeplearning4j-core 167 | ${dl4j.version} 168 | 169 | 170 | org.deeplearning4j 171 | deeplearning4j-nlp 172 | ${dl4j.version} 173 | 174 | 175 
| org.deeplearning4j 176 | deeplearning4j-zoo 177 | ${dl4j.version} 178 | 179 | 180 | org.deeplearning4j 181 | arbiter-deeplearning4j 182 | ${arbiter.version} 183 | 184 | 185 | org.deeplearning4j 186 | arbiter-ui_2.11 187 | ${arbiter.version} 188 | 189 | 190 | datavec-data-codec 191 | org.datavec 192 | ${datavec.version} 193 | 194 | 195 | org.apache.httpcomponents 196 | httpclient 197 | 4.3.5 198 | 199 | 200 | ch.qos.logback 201 | logback-classic 202 | ${logback.version} 203 | 204 | 205 | org.datavec 206 | datavec-data-image 207 | ${dl4j.version} 208 | 209 | 210 | org.bytedeco 211 | javacv-platform 212 | 1.4.1 213 | 214 | 215 | org.datavec 216 | datavec-hadoop 217 | ${datavec.version} 218 | 219 | 220 | 221 | org.deeplearning4j 222 | arbiter-deeplearning4j 223 | ${arbiter.version} 224 | 225 | 226 | org.deeplearning4j 227 | arbiter-ui_2.11 228 | ${arbiter.version} 229 | 230 | 231 | org.apache.httpcomponents 232 | httpclient 233 | 4.3.5 234 | 235 | 236 | ch.qos.logback 237 | logback-classic 238 | ${logback.version} 239 | 240 | 241 | 242 | jfree 243 | jfreechart 244 | 1.0.13 245 | 246 | 247 | org.jcodec 248 | jcodec 249 | 0.2.3 250 | 251 | 252 | 253 | 254 | 255 | 256 | org.apache.maven.plugins 257 | maven-eclipse-plugin 258 | 2.9 259 | 260 | true 261 | false 262 | 263 | 264 | 265 | 266 | org.apache.maven.plugins 267 | maven-compiler-plugin 268 | 3.5.1 269 | 270 | ${jdk.version} 271 | ${jdk.version} 272 | 273 | 274 | 275 | maven-shade-plugin 276 | 2.4.3 277 | 278 | 279 | package 280 | 281 | shade 282 | 283 | 284 | false 285 | 286 | 287 | 288 | *:* 289 | 290 | META-INF/*.SF 291 | META-INF/*.DSA 292 | META-INF/*.RSA 293 | 294 | 295 | 296 | 297 | 299 | 300 | 301 | 302 | 303 | 304 | 305 | 306 | org.apache.maven.plugins 307 | maven-assembly-plugin 308 | 2.4.1 309 | 310 | 311 | 312 | jar-with-dependencies 313 | 314 | 315 | 316 | 317 | com.packt.ScalaML.ProductionEngineering.BoschProductionLinePerformance2 318 | 319 | 320 | 321 | 322 | oozie.launcher.mapreduce.job.user.classpath.first 323 | true 324 | 325 | 326 | 327 | 328 | 329 | make-assembly 330 | 331 | package 332 | 333 | single 334 | 335 | 336 | 337 | 338 | 339 | 340 | 341 | -------------------------------------------------------------------------------- /Chapter01/src/main/scala/GettingStartedML/CryotherapyPrediction.scala: -------------------------------------------------------------------------------- 1 | package GettingStartedML 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.ml._ 5 | import org.apache.spark.ml.feature._ 6 | import org.apache.spark.sql.functions._ 7 | import org.apache.spark.sql.DataFrame 8 | import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator 9 | import org.apache.spark.ml.classification.DecisionTreeClassificationModel 10 | import org.apache.spark.ml.classification.DecisionTreeClassifier 11 | 12 | object CryotherapyPrediction { 13 | def main(args: Array[String]) { 14 | val spark = SparkSession 15 | .builder 16 | .master("local[*]") 17 | .config("spark.sql.warehouse.dir", "E:/Exp/") 18 | .appName("CryotherapyPrediction") 19 | .getOrCreate() 20 | 21 | import spark.implicits._ 22 | 23 | var CryotherapyDF = spark.read.option("header", "true") 24 | .option("inferSchema", "true") 25 | .csv("data/Cryotherapy.csv") 26 | 27 | CryotherapyDF.printSchema() 28 | CryotherapyDF.show(10) 29 | 30 | //Since Spark ML algorithm expect a 'label' column, which is in our case 'Survived". 
Let's rename it to 'label' 31 | CryotherapyDF = CryotherapyDF.withColumnRenamed("Result_of_Treatment", "label") 32 | CryotherapyDF.printSchema() 33 | 34 | //Select columns for preparing training data using VectorAssembler() 35 | val selectedCols = Array("sex", "age", "Time", "Number_of_Warts", "Type", "Area") 36 | 37 | val vectorAssembler = new VectorAssembler() 38 | .setInputCols(selectedCols) 39 | .setOutputCol("features") 40 | 41 | // We prepare the training data containing "label" and "features", where "features" holds the assembled numeric columns: 42 | val numericDF = vectorAssembler.transform(CryotherapyDF) 43 | .select("label", "features") 44 | numericDF.show(10) 45 | 46 | // Splitting the data into training and test sets: 80% for training and the remaining 20% for testing 47 | val splits = numericDF.randomSplit(Array(0.8, 0.2)) 48 | val trainDF = splits(0) 49 | val testDF = splits(1) 50 | 51 | // Create a DecisionTree classifier. 52 | val dt = new DecisionTreeClassifier() 53 | .setImpurity("gini") 54 | .setMaxBins(10) 55 | .setMaxDepth(30) 56 | .setLabelCol("label") 57 | .setFeaturesCol("features") 58 | 59 | // Train the DecisionTree model on the training set. 60 | val dtModel = dt.fit(trainDF) 61 | 62 | // Since it's a binary classification problem, we need the BinaryClassificationEvaluator() estimator to evaluate the model's performance on the test set 63 | val evaluator = new BinaryClassificationEvaluator() 64 | .setLabelCol("label") 65 | 66 | // Making predictions on the test set 67 | val predictionDF = dtModel.transform(testDF) 68 | 69 | //Computing classification accuracy 70 | val accuracy = evaluator.evaluate(predictionDF) 71 | println("Accuracy = " + accuracy) 72 | 73 | // Finally, we stop the Spark session by invoking the stop() method 74 | spark.stop() 75 | } 76 | } -------------------------------------------------------------------------------- /Chapter02/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | 5 | com.packt.AnomalyDetection 6 | RandomForest 7 | 0.0.1-SNAPSHOT 8 | jar 9 | 10 | ScalaMLQuickStartGuide 11 | http://maven.apache.org 12 | 13 | 14 | UTF-8 15 | 1.8 16 | 2.4.4 17 | 18 | 19 | 20 | 21 | jdk.tools 22 | jdk.tools 23 | 1.8.0_171 24 | system 25 | C:/Program Files/Java/jdk1.8.0_171/lib/tools.jar 26 | 27 | 28 | org.apache.directory.studio 29 | org.apache.commons.io 30 | 2.4 31 | 32 | 33 | org.apache.spark 34 | spark-core_2.11 35 | ${spark.version} 36 | 37 | 38 | com.github.tototoshi 39 | scala-csv_2.10 40 | 1.3.5 41 | 42 | 43 | org.apache.spark 44 | spark-sql_2.11 45 | ${spark.version} 46 | 47 | 48 | com.github.scopt 49 | scopt_2.11 50 | 3.3.0 51 | 52 | 53 | com.typesafe 54 | config 55 | 1.2.1 56 | 57 | 58 | org.apache.directory.api 59 | api-util 60 | 1.0.0 61 | 62 | 63 | commons-io 64 | commons-io 65 | 2.6 66 | 67 | 68 | com.esotericsoftware.kryo 69 | kryo 70 | 2.10 71 | 72 | 73 | edu.stanford.nlp 74 | stanford-corenlp 75 | 3.6.0 76 | 77 | 78 | edu.stanford.nlp 79 | stanford-corenlp 80 | 3.6.0 81 | models 82 | 83 | 84 | org.apache.hadoop 85 | hadoop-common 86 | 2.6.0 87 | 88 | 89 | org.sameersingh.scalaplot 90 | scalaplot 91 | 0.0.4 92 | 93 | 94 | org.apache.spark 95 | spark-mllib_2.11 96 | ${spark.version} 97 | 98 | 99 | org.apache.spark 100 | spark-graphx_2.11 101 | ${spark.version} 102 | 103 | 104 | org.apache.spark 105 | spark-yarn_2.11 106 | ${spark.version} 107 | 108 | 109 | org.apache.spark 110 | spark-network-shuffle_2.11 111 | ${spark.version} 112 | 113 | 114 | com.databricks 115 |
spark-csv_2.11 116 | 1.3.0 117 | 118 | 119 | com.holdenkarau 120 | spark-testing-base_2.10 121 | 2.0.0_0.6.0 122 | 123 | 124 | com.databricks 125 | spark-avro_2.11 126 | 4.0.0 127 | 128 | 129 | org.apache.commons 130 | commons-math3 131 | 3.2 132 | 133 | 134 | org.apache.hive 135 | hive-exec 136 | 2.3.2 137 | 138 | 139 | junit 140 | junit 141 | 3.8.1 142 | test 143 | 144 | 145 | 146 | 147 | 148 | 149 | org.apache.maven.plugins 150 | maven-eclipse-plugin 151 | 2.9 152 | 153 | true 154 | false 155 | 156 | 157 | 158 | 159 | org.apache.maven.plugins 160 | maven-compiler-plugin 161 | 3.5.1 162 | 163 | ${jdk.version} 164 | ${jdk.version} 165 | 166 | 167 | 168 | maven-shade-plugin 169 | 2.4.3 170 | 171 | 172 | package 173 | 174 | shade 175 | 176 | 177 | false 178 | 179 | 180 | 181 | *:* 182 | 183 | META-INF/*.SF 184 | META-INF/*.DSA 185 | META-INF/*.RSA 186 | 187 | 188 | 189 | 190 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | 199 | org.apache.maven.plugins 200 | maven-assembly-plugin 201 | 2.4.1 202 | 203 | 204 | 205 | jar-with-dependencies 206 | 207 | 208 | 209 | 210 | com.packt.ScalaML.ProductionEngineering.BoschProductionLinePerformance2 211 | 212 | 213 | 214 | 215 | oozie.launcher.mapreduce.job.user.classpath.first 216 | true 217 | 218 | 219 | 220 | 221 | 222 | make-assembly 223 | 224 | package 225 | 226 | single 227 | 228 | 229 | 230 | 231 | 232 | 233 | 234 | -------------------------------------------------------------------------------- /Chapter02/src/main/scala/RegressionAnalysis/EDA.scala: -------------------------------------------------------------------------------- 1 | package RegressionAnalysis 2 | 3 | import org.apache.spark.sql._ 4 | import org.apache.spark.sql.functions._ 5 | 6 | object EDA { 7 | def main(args: Array[String]): Unit = { 8 | val spark = SparkSession 9 | .builder 10 | .master("local[*]") 11 | .config("spark.sql.warehouse.dir", "E:/Exp/") 12 | .appName(s"OneVsRestExample") 13 | .getOrCreate() 14 | 15 | import spark.implicits._ 16 | 17 | val rawTrafficDF = spark.read 18 | .option("header", "true") 19 | .option("inferSchema", "true") 20 | .option("delimiter", ";") 21 | .format("com.databricks.spark.csv") 22 | .load("data/Behavior of the urban traffic of the city of Sao Paulo in Brazil.csv") 23 | .cache 24 | 25 | rawTrafficDF.select("Hour (Coded)", "Immobilized bus", "Broken Truck", "Vehicle excess", "Fire", "Slowness in traffic (%)").show(5) 26 | println(rawTrafficDF.count()) 27 | rawTrafficDF.printSchema() 28 | 29 | rawTrafficDF.select("Hour (Coded)", "Immobilized bus", "Broken Truck", "Point of flooding", "Fire", "Slowness in traffic (%)").describe().show() 30 | 31 | var newTrafficDF = rawTrafficDF.withColumnRenamed("Slowness in traffic (%)", "label") 32 | 33 | // Let's explore two other important features Point of flooding and Vehicle excess. 
We can rename these two columns as follows: 34 | 35 | newTrafficDF = newTrafficDF.withColumnRenamed("Point of flooding", "NoOfFloodPoint") 36 | 37 | newTrafficDF.createOrReplaceTempView("slDF") 38 | spark.sql("SELECT avg(label) as avgSlowness FROM slDF").show() 39 | 40 | spark.sql("SELECT max(NoOfFloodPoint) FROM slDF").show() 41 | } 42 | } -------------------------------------------------------------------------------- /Chapter02/src/main/scala/RegressionAnalysis/UrbanTrafficGeneralizedLinearRegression.scala: -------------------------------------------------------------------------------- 1 | package RegressionAnalysis 2 | 3 | import org.apache.spark.ml.regression.{ GeneralizedLinearRegression, GeneralizedLinearRegressionModel } 4 | import org.apache.spark.ml.{ Pipeline, PipelineModel } 5 | import org.apache.spark.ml.tuning.{ CrossValidator, ParamGridBuilder } 6 | import org.apache.spark.ml.evaluation.RegressionEvaluator 7 | import org.apache.spark.sql._ 8 | import org.apache.spark.sql.functions._ 9 | import org.apache.spark.mllib.evaluation.RegressionMetrics 10 | import org.apache.spark.ml.feature.VectorAssembler 11 | 12 | import org.apache.log4j.Logger 13 | import org.apache.log4j.Level 14 | 15 | object UrbanTrafficGeneralizedLinearRegression { 16 | def main(args: Array[String]) { 17 | val spark = SparkSession 18 | .builder 19 | .master("local[*]") 20 | .config("spark.sql.warehouse.dir", "E:/Exp/") 21 | .appName(s"OneVsRestExample") 22 | .getOrCreate() 23 | 24 | Logger.getLogger("org").setLevel(Level.FATAL) 25 | Logger.getLogger("akka").setLevel(Level.ERROR) 26 | 27 | import spark.implicits._ 28 | 29 | val rawTrafficDF = spark.read 30 | .option("header", "true") 31 | .option("inferSchema", "true") 32 | .option("delimiter", ";") 33 | .format("com.databricks.spark.csv") 34 | .load("data/Behavior of the urban traffic of the city of Sao Paulo in Brazil.csv") 35 | .cache 36 | 37 | rawTrafficDF.show() 38 | rawTrafficDF.printSchema() 39 | rawTrafficDF.describe().show() 40 | 41 | val newTrafficDF = rawTrafficDF.withColumnRenamed("Slowness in traffic (%)", "label") 42 | val colNames = newTrafficDF.columns.dropRight(1) 43 | 44 | colNames.foreach(println) 45 | 46 | newTrafficDF.printSchema() 47 | 48 | // VectorAssembler for training features 49 | val assembler = new VectorAssembler() 50 | .setInputCols(colNames) 51 | .setOutputCol("features") 52 | 53 | val assembleDF = assembler.transform(newTrafficDF).select("features", "label") 54 | assembleDF.printSchema() 55 | 56 | val seed = 1357911L 57 | val splits = assembleDF.randomSplit(Array(0.60, 0.40), seed) 58 | val (trainingData, testData) = (splits(0), splits(1)) 59 | 60 | trainingData.cache 61 | testData.cache 62 | 63 | // Create an LinerRegression estimator 64 | val glr = new GeneralizedLinearRegression() 65 | .setFeaturesCol("features") 66 | .setLabelCol("label") 67 | 68 | // Building the Pipeline model for transformations and predictor 69 | println("Building ML regression model") 70 | val glrModel = glr.fit(trainingData) 71 | 72 | // ********************************************************************** 73 | println("Evaluating the model on the test set and calculating the regression metrics") 74 | // ********************************************************************** 75 | val trainPredictionsAndLabels = glrModel.transform(testData).select("label", "prediction") 76 | .map { case Row(label: Double, prediction: Double) => (label, prediction) }.rdd 77 | 78 | val testRegressionMetrics = new RegressionMetrics(trainPredictionsAndLabels) 79 | 80 | 
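// The RegressionMetrics object derives its figures from the (label, prediction) pairs built above.
// For labels y_i, predictions yhat_i and n test rows:
//   MSE  = (1/n) * sum((y_i - yhat_i)^2),   RMSE = sqrt(MSE),
//   MAE  = (1/n) * sum(|y_i - yhat_i|),     R^2  = 1 - sum((y_i - yhat_i)^2) / sum((y_i - ybar)^2).
// Lower MSE/RMSE/MAE and an R^2 closer to 1 indicate a better fit on the test set; these are the figures printed below.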
val results = "\n=====================================================================\n" + 81 | s"TrainingData count: ${trainingData.count}\n" + 82 | s"TestData count: ${testData.count}\n" + 83 | "=====================================================================\n" + 84 | s"TestData MSE = ${testRegressionMetrics.meanSquaredError}\n" + 85 | s"TestData RMSE = ${testRegressionMetrics.rootMeanSquaredError}\n" + 86 | s"TestData R-squared = ${testRegressionMetrics.r2}\n" + 87 | s"TestData MAE = ${testRegressionMetrics.meanAbsoluteError}\n" + 88 | s"TestData explained variance = ${testRegressionMetrics.explainedVariance}\n" + 89 | "=====================================================================\n" 90 | println(results) 91 | 92 | // *********************************************************** 93 | println("Preparing K-fold Cross Validation and Grid Search") 94 | // *********************************************************** 95 | val paramGrid = new ParamGridBuilder() 96 | .addGrid(glr.maxIter, Array(10, 20, 30, 50, 100, 500, 1000)) 97 | .addGrid(glr.regParam, Array(0.001, 0.01, 0.1)) 98 | .addGrid(glr.tol, Array(0.01, 0.1)) 99 | .build() 100 | 101 | val numFolds = 10 //10-fold cross-validation 102 | val cv = new CrossValidator() 103 | .setEstimator(glr) 104 | .setEvaluator(new RegressionEvaluator) 105 | .setEstimatorParamMaps(paramGrid) 106 | .setNumFolds(numFolds) 107 | 108 | // ************************************************************ 109 | println("Training model with Linear Regression algorithm") 110 | // ************************************************************ 111 | val cvModel = cv.fit(trainingData) 112 | 113 | // Save the workflow 114 | //cvModel.write.overwrite().save("model/GLR_model") 115 | 116 | // Load the workflow back 117 | //val sameCV = CrossValidatorModel.load("model/GLR_model") 118 | 119 | // ********************************************************************** 120 | println("Evaluating the cross validated model on the validation set and calculating the regression metrics") 121 | // ********************************************************************** 122 | val trainPredictionsAndLabelsCV = cvModel.transform(testData).select("label", "prediction") 123 | .map { case Row(label: Double, prediction: Double) => (label, prediction) }.rdd 124 | 125 | val testRegressionMetricsCV = new RegressionMetrics(trainPredictionsAndLabelsCV) 126 | 127 | val cvResults = "\n=====================================================================\n" + 128 | s"TrainingData count: ${trainingData.count}\n" + 129 | s"TestData count: ${testData.count}\n" + 130 | "=====================================================================\n" + 131 | s"TestData MSE = ${testRegressionMetricsCV.meanSquaredError}\n" + 132 | s"TestData RMSE = ${testRegressionMetricsCV.rootMeanSquaredError}\n" + 133 | s"TestData R-squared = ${testRegressionMetricsCV.r2}\n" + 134 | s"TestData MAE = ${testRegressionMetricsCV.meanAbsoluteError}\n" + 135 | s"TestData explained variance = ${testRegressionMetricsCV.explainedVariance}\n" + 136 | "=====================================================================\n" 137 | println(cvResults) 138 | 139 | // Print the coefficients and intercept for generalized linear regression model 140 | println(s"Coefficients: ${glrModel.coefficients}") 141 | println(s"Intercept: ${glrModel.intercept}") 142 | 143 | spark.stop() 144 | } 145 | } -------------------------------------------------------------------------------- 
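The CrossValidator fitted above returns a CrossValidatorModel, and its grid-search results can be inspected after training. The following is a minimal sketch of how that might look, assuming the cvModel value from the preceding program; the helper name inspectBestGlr is illustrative only and does not appear in the original source.

import org.apache.spark.ml.regression.GeneralizedLinearRegressionModel
import org.apache.spark.ml.tuning.CrossValidatorModel

def inspectBestGlr(cvModel: CrossValidatorModel): Unit = {
  // Average evaluator metric (RMSE for the default RegressionEvaluator) per parameter combination
  cvModel.getEstimatorParamMaps.zip(cvModel.avgMetrics).foreach {
    case (paramMap, metric) => println(s"$paramMap => $metric")
  }
  // The winning model; the cast is safe here because the estimator passed to the CrossValidator was a GLR
  val best = cvModel.bestModel.asInstanceOf[GeneralizedLinearRegressionModel]
  println(s"Best maxIter = ${best.getMaxIter}, regParam = ${best.getRegParam}, tol = ${best.getTol}")
  println(s"Best model coefficients: ${best.coefficients}, intercept: ${best.intercept}")
}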
/Chapter02/src/main/scala/RegressionAnalysis/UrbanTrafficLinearRegression.scala: -------------------------------------------------------------------------------- 1 | package RegressionAnalysis 2 | 3 | import org.apache.spark.ml.regression.{ LinearRegression, LinearRegressionModel } 4 | import org.apache.spark.ml.{ Pipeline, PipelineModel } 5 | import org.apache.spark.ml.tuning.{ CrossValidator, ParamGridBuilder } 6 | import org.apache.spark.ml.evaluation.RegressionEvaluator 7 | import org.apache.spark.sql._ 8 | import org.apache.spark.sql.functions._ 9 | import org.apache.spark.mllib.evaluation.RegressionMetrics 10 | import org.apache.spark.ml.feature.VectorAssembler 11 | 12 | import org.apache.log4j.Logger 13 | import org.apache.log4j.Level 14 | 15 | object UrbanTrafficLinearRegression { 16 | def main(args: Array[String]) { 17 | val spark = SparkSession 18 | .builder 19 | .master("local[*]") 20 | .config("spark.sql.warehouse.dir", "E:/Exp/") 21 | .appName(s"OneVsRestExample") 22 | .getOrCreate() 23 | 24 | Logger.getLogger("org").setLevel(Level.FATAL) 25 | Logger.getLogger("akka").setLevel(Level.ERROR) 26 | 27 | import spark.implicits._ 28 | 29 | val rawTrafficDF = spark.read 30 | .option("header", "true") 31 | .option("inferSchema", "true") 32 | .option("delimiter", ";") 33 | .format("com.databricks.spark.csv") 34 | .load("data/Behavior of the urban traffic of the city of Sao Paulo in Brazil.csv") 35 | .cache 36 | 37 | rawTrafficDF.show() 38 | rawTrafficDF.printSchema() 39 | rawTrafficDF.describe().show() 40 | 41 | val newTrafficDF = rawTrafficDF.withColumnRenamed("Slowness in traffic (%)", "label") 42 | 43 | newTrafficDF.createOrReplaceTempView("slDF") 44 | spark.sql("SELECT avg(label) FROM slDF").show() 45 | 46 | val colNames = newTrafficDF.columns.dropRight(1) 47 | 48 | // VectorAssembler for training features 49 | val assembler = new VectorAssembler() 50 | .setInputCols(colNames) 51 | .setOutputCol("features") 52 | 53 | val assembleDF = assembler.transform(newTrafficDF).select("features", "label") 54 | assembleDF.show(10) 55 | 56 | val seed = 12345 57 | val splits = assembleDF.randomSplit(Array(0.60, 0.40), seed) 58 | val (trainingData, testData) = (splits(0), splits(1)) 59 | 60 | trainingData.cache 61 | testData.cache 62 | 63 | // Create an LinerRegression estimator 64 | val lr = new LinearRegression() 65 | .setFeaturesCol("features") 66 | .setLabelCol("label") 67 | 68 | // Building the Pipeline model for transformations and predictor 69 | println("Building ML regression model") 70 | val lrModel = lr.fit(trainingData) 71 | 72 | // Save the workflow 73 | //lrModel.write.overwrite().save("model/LR_model") 74 | 75 | // Load the workflow back 76 | //val sameLRModel = CrossValidatorModel.load("model/GLR_model") 77 | 78 | // ********************************************************************** 79 | println("Evaluating the model on the test set and calculating the regression metrics") 80 | // ********************************************************************** 81 | val trainPredictionsAndLabels = lrModel.transform(testData).select("label", "prediction") 82 | .map { case Row(label: Double, prediction: Double) => (label, prediction) }.rdd 83 | 84 | val testRegressionMetrics = new RegressionMetrics(trainPredictionsAndLabels) 85 | 86 | val results = "\n=====================================================================\n" + 87 | s"TrainingData count: ${trainingData.count}\n" + 88 | s"TestData count: ${testData.count}\n" + 89 | 
"=====================================================================\n" + 90 | s"TestData MSE = ${testRegressionMetrics.meanSquaredError}\n" + 91 | s"TestData RMSE = ${testRegressionMetrics.rootMeanSquaredError}\n" + 92 | s"TestData R-squared = ${testRegressionMetrics.r2}\n" + 93 | s"TestData MAE = ${testRegressionMetrics.meanAbsoluteError}\n" + 94 | s"TestData explained variance = ${testRegressionMetrics.explainedVariance}\n" + 95 | "=====================================================================\n" 96 | println(results) 97 | 98 | // *********************************************************** 99 | println("Preparing K-fold Cross Validation and Grid Search") 100 | // *********************************************************** 101 | val paramGrid = new ParamGridBuilder() 102 | .addGrid(lr.maxIter, Array(10, 20, 30, 50, 100, 500, 1000)) 103 | .addGrid(lr.regParam, Array(0.001, 0.01, 0.1)) 104 | .addGrid(lr.tol, Array(0.01, 0.1)) 105 | .build() 106 | 107 | val numFolds = 10 //10-fold cross-validation 108 | val cv = new CrossValidator() 109 | .setEstimator(lr) 110 | .setEvaluator(new RegressionEvaluator()) 111 | .setEstimatorParamMaps(paramGrid) 112 | .setNumFolds(numFolds) 113 | 114 | // ************************************************************ 115 | println("Training model with Linear Regression algorithm") 116 | // ************************************************************ 117 | val cvModel = cv.fit(trainingData) 118 | 119 | // Save the workflow 120 | cvModel.write.overwrite().save("model/LR_model") 121 | 122 | // Load the workflow back 123 | val sameCVModel = LinearRegressionModel.load("model/LR_model") 124 | 125 | // ********************************************************************** 126 | println("Evaluating the cross validated model on the validation set and calculating the regression metrics") 127 | // ********************************************************************** 128 | val trainPredictionsAndLabelsCV = cvModel.transform(testData).select("label", "prediction") 129 | .map { case Row(label: Double, prediction: Double) => (label, prediction) }.rdd 130 | 131 | val testRegressionMetricsCV = new RegressionMetrics(trainPredictionsAndLabelsCV) 132 | 133 | val cvResults = "\n=====================================================================\n" + 134 | s"TrainingData count: ${trainingData.count}\n" + 135 | s"TestData count: ${testData.count}\n" + 136 | "=====================================================================\n" + 137 | s"TestData MSE = ${testRegressionMetricsCV.meanSquaredError}\n" + 138 | s"TestData RMSE = ${testRegressionMetricsCV.rootMeanSquaredError}\n" + 139 | s"TestData R-squared = ${testRegressionMetricsCV.r2}\n" + 140 | s"TestData MAE = ${testRegressionMetricsCV.meanAbsoluteError}\n" + 141 | s"TestData explained variance = ${testRegressionMetricsCV.explainedVariance}\n" + 142 | "=====================================================================\n" 143 | println(cvResults) 144 | 145 | spark.stop() 146 | } 147 | } -------------------------------------------------------------------------------- /Chapter03/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | 5 | com.packt.AnomalyDetection 6 | RandomForest 7 | 0.0.1-SNAPSHOT 8 | jar 9 | 10 | ScalaMLQuickStartGuide 11 | http://maven.apache.org 12 | 13 | 14 | UTF-8 15 | 1.8 16 | 2.2.0 17 | 18 | 19 | 20 | 21 | jdk.tools 22 | jdk.tools 23 | 1.8.0_171 24 | system 25 | C:/Program Files/Java/jdk1.8.0_171/lib/tools.jar 26 | 27 | 28 | 
org.apache.directory.studio 29 | org.apache.commons.io 30 | 2.4 31 | 32 | 33 | org.apache.spark 34 | spark-core_2.11 35 | ${spark.version} 36 | 37 | 38 | com.github.tototoshi 39 | scala-csv_2.10 40 | 1.3.5 41 | 42 | 43 | org.apache.spark 44 | spark-sql_2.11 45 | ${spark.version} 46 | 47 | 48 | com.github.scopt 49 | scopt_2.11 50 | 3.3.0 51 | 52 | 53 | com.typesafe 54 | config 55 | 1.2.1 56 | 57 | 58 | org.apache.directory.api 59 | api-util 60 | 1.0.0 61 | 62 | 63 | commons-io 64 | commons-io 65 | 2.6 66 | 67 | 68 | com.esotericsoftware.kryo 69 | kryo 70 | 2.10 71 | 72 | 73 | edu.stanford.nlp 74 | stanford-corenlp 75 | 3.6.0 76 | 77 | 78 | edu.stanford.nlp 79 | stanford-corenlp 80 | 3.6.0 81 | models 82 | 83 | 84 | org.apache.hadoop 85 | hadoop-common 86 | 2.6.0 87 | 88 | 89 | org.sameersingh.scalaplot 90 | scalaplot 91 | 0.0.4 92 | 93 | 94 | org.apache.spark 95 | spark-mllib_2.11 96 | ${spark.version} 97 | 98 | 99 | org.apache.spark 100 | spark-graphx_2.11 101 | ${spark.version} 102 | 103 | 104 | org.apache.spark 105 | spark-yarn_2.11 106 | ${spark.version} 107 | 108 | 109 | org.apache.spark 110 | spark-network-shuffle_2.11 111 | ${spark.version} 112 | 113 | 114 | com.databricks 115 | spark-csv_2.11 116 | 1.3.0 117 | 118 | 119 | com.holdenkarau 120 | spark-testing-base_2.10 121 | 2.0.0_0.6.0 122 | 123 | 124 | com.databricks 125 | spark-avro_2.11 126 | 4.0.0 127 | 128 | 129 | org.apache.commons 130 | commons-math3 131 | 3.2 132 | 133 | 134 | org.apache.hive 135 | hive-exec 136 | 2.3.2 137 | 138 | 139 | junit 140 | junit 141 | 3.8.1 142 | test 143 | 144 | 145 | 146 | 147 | 148 | 149 | org.apache.maven.plugins 150 | maven-eclipse-plugin 151 | 2.9 152 | 153 | true 154 | false 155 | 156 | 157 | 158 | 159 | org.apache.maven.plugins 160 | maven-compiler-plugin 161 | 3.5.1 162 | 163 | ${jdk.version} 164 | ${jdk.version} 165 | 166 | 167 | 168 | maven-shade-plugin 169 | 2.4.3 170 | 171 | 172 | package 173 | 174 | shade 175 | 176 | 177 | false 178 | 179 | 180 | 181 | *:* 182 | 183 | META-INF/*.SF 184 | META-INF/*.DSA 185 | META-INF/*.RSA 186 | 187 | 188 | 189 | 190 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | 199 | org.apache.maven.plugins 200 | maven-assembly-plugin 201 | 2.4.1 202 | 203 | 204 | 205 | jar-with-dependencies 206 | 207 | 208 | 209 | 210 | com.packt.ScalaML.ProductionEngineering.BoschProductionLinePerformance2 211 | 212 | 213 | 214 | 215 | oozie.launcher.mapreduce.job.user.classpath.first 216 | true 217 | 218 | 219 | 220 | 221 | 222 | make-assembly 223 | 224 | package 225 | 226 | single 227 | 228 | 229 | 230 | 231 | 232 | 233 | 234 | -------------------------------------------------------------------------------- /Chapter03/src/main/scala/ScalaClassification/ChurnPredictionLR.scala: -------------------------------------------------------------------------------- 1 | package ScalaClassification 2 | 3 | import org.apache.spark._ 4 | import org.apache.spark.sql.SparkSession 5 | import org.apache.spark.sql.functions._ 6 | import org.apache.spark.ml.classification.{ BinaryLogisticRegressionSummary, LogisticRegression, LogisticRegressionModel } 7 | import org.apache.spark.ml.Pipeline 8 | import org.apache.spark.ml.tuning.{ ParamGridBuilder, CrossValidator } 9 | import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics 10 | import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator 11 | 12 | object ChurnPredictionLR { 13 | def main(args: Array[String]) { 14 | val spark = SparkSession 15 | .builder 16 | .master("local[*]") 17 | .config("spark.sql.warehouse.dir", "E:/Exp/") 18 
| .appName("ChurnPrediction") 19 | .getOrCreate() 20 | import spark.implicits._ 21 | 22 | val numFolds = 10 23 | val MaxIter: Seq[Int] = Seq(100) 24 | val RegParam: Seq[Double] = Seq(0.01) // L2 regularization param, set 0.10 with L1 reguarization 25 | val Tol: Seq[Double] = Seq(1e-4) 26 | val ElasticNetParam: Seq[Double] = Seq(1.0) // Combination of L1 and L2 27 | 28 | val lr = new LogisticRegression() 29 | .setLabelCol("label") 30 | .setFeaturesCol("features") 31 | 32 | // Chain indexers and tree in a Pipeline. 33 | val pipeline = new Pipeline() 34 | .setStages(Array(PipelineConstruction.ipindexer, 35 | PipelineConstruction.labelindexer, 36 | PipelineConstruction.assembler, 37 | lr)) 38 | 39 | // Search through decision tree's maxDepth parameter for best model 40 | val paramGrid = new ParamGridBuilder() 41 | .addGrid(lr.maxIter, MaxIter) 42 | .addGrid(lr.regParam, RegParam) 43 | .addGrid(lr.tol, Tol) 44 | .addGrid(lr.elasticNetParam, ElasticNetParam) 45 | .build() 46 | 47 | val evaluator = new BinaryClassificationEvaluator() 48 | .setLabelCol("label") 49 | .setRawPredictionCol("prediction") 50 | 51 | // Set up 10-fold cross validation 52 | val crossval = new CrossValidator() 53 | .setEstimator(pipeline) 54 | .setEvaluator(evaluator) 55 | .setEstimatorParamMaps(paramGrid) 56 | .setNumFolds(numFolds) 57 | 58 | val cvModel = crossval.fit(Preprocessing.trainDF) 59 | 60 | val predDF = cvModel.transform(Preprocessing.testSet) 61 | val result = predDF.select("label", "prediction", "probability") 62 | val resutDF = result.withColumnRenamed("prediction", "Predicted_label") 63 | resutDF.show(10) 64 | 65 | val accuracy = evaluator.evaluate(predDF) 66 | println("Classification accuracy: " + accuracy) 67 | 68 | // Compute other performence metrices 69 | val predictionAndLabels = predDF 70 | .select("prediction", "label") 71 | .rdd.map(x => (x(0).asInstanceOf[Double], x(1) 72 | .asInstanceOf[Double])) 73 | 74 | val metrics = new BinaryClassificationMetrics(predictionAndLabels) 75 | val areaUnderPR = metrics.areaUnderPR 76 | println("Area under the precision-recall curve: " + areaUnderPR) 77 | 78 | val areaUnderROC = metrics.areaUnderROC 79 | println("Area under the receiver operating characteristic (ROC) curve: " + areaUnderROC) 80 | 81 | val tVSpDF = predDF.select("label", "prediction") // True vs predicted labels 82 | val TC = predDF.count() //Total count 83 | 84 | val tp = tVSpDF.filter($"prediction" === 0.0).filter($"label" === $"prediction").count() / TC.toDouble 85 | val tn = tVSpDF.filter($"prediction" === 1.0).filter($"label" === $"prediction").count() / TC.toDouble 86 | val fp = tVSpDF.filter($"prediction" === 1.0).filter(not($"label" === $"prediction")).count() / TC.toDouble 87 | val fn = tVSpDF.filter($"prediction" === 0.0).filter(not($"label" === $"prediction")).count() / TC.toDouble 88 | 89 | val MCC = (tp * tn - fp * fn) / math.sqrt((tp + fp) * (tp + fn) * (fp + tn) * (tn + fn)) // Calculating Matthews correlation coefficient 90 | 91 | println("True positive rate: " + tp *100 + "%") 92 | println("False positive rate: " + fp * 100 + "%") 93 | println("True negative rate: " + tn * 100 + "%") 94 | println("False negative rate: " + fn * 100 + "%") 95 | println("Matthews correlation coefficient: " + MCC) 96 | } 97 | } -------------------------------------------------------------------------------- /Chapter03/src/main/scala/ScalaClassification/ChurnPredictionNB.scala: -------------------------------------------------------------------------------- 1 | package ScalaClassification 2 | 3 | import 
org.apache.spark._ 4 | import org.apache.spark.sql.SparkSession 5 | import org.apache.spark.sql.functions._ 6 | import org.apache.spark.ml.classification.{ BinaryLogisticRegressionSummary, NaiveBayes, NaiveBayesModel } 7 | import org.apache.spark.ml.Pipeline 8 | import org.apache.spark.ml.tuning.{ ParamGridBuilder, CrossValidator } 9 | import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics 10 | import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator 11 | import org.apache.spark.mllib.stat.{MultivariateStatisticalSummary, Statistics} 12 | 13 | /* 14 | class Stats(val tp: Int, val tn: Int, val fp: Int, val fn: Int) { 15 | val TPR = tp / (tp + fn).toDouble 16 | val recall = TPR 17 | val sensitivity = TPR 18 | val TNR = tn / (tn + fp).toDouble 19 | val specificity = TNR 20 | val PPV = tp / (tp + fp).toDouble 21 | val precision = PPV 22 | val NPV = tn / (tn + fn).toDouble 23 | val FPR = 1.0 - specificity 24 | val FNR = 1.0 - recall 25 | val FDR = 1.0 - precision 26 | val ACC = (tp + tn) / (tp + fp + fn + tn).toDouble 27 | val accuracy = ACC 28 | val F1 = 2 * PPV * TPR / (PPV + TPR).toDouble 29 | val MCC = (tp * tn - fp * fn).toDouble / math.sqrt((tp + fp).toDouble * (tp + fn).toDouble * (fp + tn).toDouble * (tn + fn).toDouble) 30 | } */ 31 | 32 | object ChurnPredictionNB { 33 | def main(args: Array[String]) { 34 | val spark = SparkSession 35 | .builder 36 | .master("local[*]") 37 | .config("spark.sql.warehouse.dir", "E:/Exp/") 38 | .appName("ChurnPrediction") 39 | .getOrCreate() 40 | 41 | import spark.implicits._ 42 | 43 | val numFolds = 10 44 | val nb = new NaiveBayes() 45 | .setLabelCol("label") 46 | .setFeaturesCol("features") 47 | 48 | // Chain indexers and tree in a Pipeline. 49 | val pipeline = new Pipeline().setStages(Array(PipelineConstruction.ipindexer, 50 | PipelineConstruction.labelindexer, 51 | PipelineConstruction.assembler, 52 | nb)) 53 | 54 | // Search through Naive Bayes's smoothing parameter for best model 55 | val paramGrid = new ParamGridBuilder() 56 | .addGrid(nb.smoothing, Array(1e-2, 1e-4, 1e-6, 1e-8)) 57 | .build() 58 | 59 | val evaluator = new BinaryClassificationEvaluator() 60 | .setLabelCol("label") 61 | .setRawPredictionCol("prediction") 62 | 63 | // Set up 10-fold cross validation 64 | val crossval = new CrossValidator() 65 | .setEstimator(pipeline) 66 | .setEvaluator(evaluator) 67 | .setEstimatorParamMaps(paramGrid) 68 | .setNumFolds(numFolds) 69 | 70 | val cvModel = crossval.fit(Preprocessing.trainDF) 71 | 72 | val predDF = cvModel.transform(Preprocessing.testSet) 73 | val result = predDF.select("label", "prediction", "probability") 74 | val resutDF = result.withColumnRenamed("prediction", "Predicted_label") 75 | resutDF.show(10) 76 | 77 | val accuracy = evaluator.evaluate(predDF) 78 | println("Classification accuracy: " + accuracy) 79 | 80 | // Compute other performence metrices 81 | val predictionAndLabels = predDF 82 | .select("prediction", "label") 83 | .rdd.map(x => (x(0).asInstanceOf[Double], x(1) 84 | .asInstanceOf[Double])) 85 | 86 | val metrics = new BinaryClassificationMetrics(predictionAndLabels) 87 | val areaUnderPR = metrics.areaUnderPR 88 | println("Area under the precision-recall curve: " + areaUnderPR) 89 | 90 | val areaUnderROC = metrics.areaUnderROC 91 | println("Area under the receiver operating characteristic (ROC) curve: " + areaUnderROC) 92 | 93 | val tVSpDF = predDF.select("label", "prediction") // True vs predicted labels 94 | val TC = predDF.count() //Total count 95 | 96 | val tp = tVSpDF.filter($"prediction" === 
0.0).filter($"label" === $"prediction").count() / TC.toDouble 97 | val tn = tVSpDF.filter($"prediction" === 1.0).filter($"label" === $"prediction").count() / TC.toDouble 98 | val fp = tVSpDF.filter($"prediction" === 1.0).filter(not($"label" === $"prediction")).count() / TC.toDouble 99 | val fn = tVSpDF.filter($"prediction" === 0.0).filter(not($"label" === $"prediction")).count() / TC.toDouble 100 | 101 | val MCC = (tp * tn - fp * fn) / math.sqrt((tp + fp) * (tp + fn) * (fp + tn) * (tn + fn)) // Calculating Matthews correlation coefficient 102 | 103 | println("True positive rate: " + tp *100 + "%") 104 | println("False positive rate: " + fp * 100 + "%") 105 | println("True negative rate: " + tn * 100 + "%") 106 | println("False negative rate: " + fn * 100 + "%") 107 | println("Matthews correlation coefficient: " + MCC) 108 | } 109 | } -------------------------------------------------------------------------------- /Chapter03/src/main/scala/ScalaClassification/ChurnPredictionSVM.scala: -------------------------------------------------------------------------------- 1 | package ScalaClassification 2 | 3 | import org.apache.spark._ 4 | import org.apache.spark.sql.SparkSession 5 | import org.apache.spark.sql.functions._ 6 | import org.apache.spark.ml.classification.{ LinearSVC, LinearSVCModel } 7 | import org.apache.spark.sql.SparkSession 8 | import org.apache.spark.sql.functions.max 9 | import org.apache.spark.ml.Pipeline 10 | import org.apache.spark.ml.tuning.{ ParamGridBuilder, CrossValidator } 11 | import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics 12 | import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator 13 | 14 | object ChurnPredictionSVM { 15 | def main(args: Array[String]) { 16 | val spark = SparkSession 17 | .builder 18 | .master("local[*]") 19 | .config("spark.sql.warehouse.dir", "E:/Exp/") 20 | .appName("ChurnPrediction") 21 | .getOrCreate() 22 | 23 | import spark.implicits._ 24 | 25 | val numFolds = 10 26 | val MaxIter: Seq[Int] = Seq(10000) 27 | val RegParam: Seq[Double] = Seq(0.10) // L2 regularization param, set 0.10 with L1 reguarization 28 | val Tol: Seq[Double] = Seq(1e-4) 29 | val ElasticNetParam: Seq[Double] = Seq(0.00001) // Combination of L1 and L2 30 | 31 | val svm = new LinearSVC() 32 | 33 | // Chain indexers and tree in a Pipeline. 
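// The stages come from the shared PipelineConstruction object: ipindexer is a StringIndexer that
// encodes the categorical international_plan column as iplanIndex, labelindexer is a StringIndexer
// that maps the churn string onto the numeric label column, assembler is a VectorAssembler that packs
// the selected numeric columns plus iplanIndex into a single features vector, and LinearSVC is the
// estimator trained on that vector.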
34 | val pipeline = new Pipeline() 35 | .setStages(Array(PipelineConstruction.ipindexer, 36 | PipelineConstruction.labelindexer, 37 | PipelineConstruction.assembler, 38 | svm)) 39 | 40 | // Search through decision tree's maxDepth parameter for best model 41 | val paramGrid = new ParamGridBuilder() 42 | .addGrid(svm.maxIter, MaxIter) 43 | .addGrid(svm.regParam, RegParam) 44 | .addGrid(svm.tol, Tol) 45 | .build() 46 | 47 | val evaluator = new BinaryClassificationEvaluator() 48 | .setLabelCol("label") 49 | .setRawPredictionCol("prediction") 50 | 51 | // Set up 3-fold cross validation 52 | val crossval = new CrossValidator() 53 | .setEstimator(pipeline) 54 | .setEvaluator(evaluator) 55 | .setEstimatorParamMaps(paramGrid) 56 | .setNumFolds(numFolds) 57 | 58 | val cvModel = crossval.fit(Preprocessing.trainDF) 59 | 60 | val predDF = cvModel.transform(Preprocessing.testSet) 61 | val result = predDF.select("label", "prediction", "probability") 62 | val resutDF = result.withColumnRenamed("prediction", "Predicted_label") 63 | resutDF.show(10) 64 | 65 | val accuracy = evaluator.evaluate(predDF) 66 | println("Classification accuracy: " + accuracy) 67 | 68 | // Compute other performence metrices 69 | val predictionAndLabels = predDF 70 | .select("prediction", "label") 71 | .rdd.map(x => (x(0).asInstanceOf[Double], x(1) 72 | .asInstanceOf[Double])) 73 | 74 | val metrics = new BinaryClassificationMetrics(predictionAndLabels) 75 | val areaUnderPR = metrics.areaUnderPR 76 | println("Area under the precision-recall curve: " + areaUnderPR) 77 | 78 | val areaUnderROC = metrics.areaUnderROC 79 | println("Area under the receiver operating characteristic (ROC) curve: " + areaUnderROC) 80 | 81 | val tVSpDF = predDF.select("label", "prediction") // True vs predicted labels 82 | val TC = predDF.count() //Total count 83 | 84 | val tp = tVSpDF.filter($"prediction" === 0.0).filter($"label" === $"prediction").count() / TC.toDouble 85 | val tn = tVSpDF.filter($"prediction" === 1.0).filter($"label" === $"prediction").count() / TC.toDouble 86 | val fp = tVSpDF.filter($"prediction" === 1.0).filter(not($"label" === $"prediction")).count() / TC.toDouble 87 | val fn = tVSpDF.filter($"prediction" === 0.0).filter(not($"label" === $"prediction")).count() / TC.toDouble 88 | 89 | val MCC = (tp * tn - fp * fn) / math.sqrt((tp + fp) * (tp + fn) * (fp + tn) * (tn + fn)) // Calculating Matthews correlation coefficient 90 | 91 | println("True positive rate: " + tp *100 + "%") 92 | println("False positive rate: " + fp * 100 + "%") 93 | println("True negative rate: " + tn * 100 + "%") 94 | println("False negative rate: " + fn * 100 + "%") 95 | println("Matthews correlation coefficient: " + MCC) 96 | } 97 | } -------------------------------------------------------------------------------- /Chapter03/src/main/scala/ScalaClassification/Describe.scala: -------------------------------------------------------------------------------- 1 | package com.packt.ScalaML.ChrunPrediction 2 | 3 | import org.apache.spark._ 4 | import org.apache.spark.sql.SparkSession 5 | import org.apache.spark.sql.functions._ 6 | import org.apache.spark.ml.classification.{ BinaryLogisticRegressionSummary, LogisticRegression, LogisticRegressionModel } 7 | import org.apache.spark.sql.SparkSession 8 | import org.apache.spark.sql.functions.max 9 | import org.apache.spark.ml.Pipeline 10 | import org.apache.spark.ml.tuning.{ ParamGridBuilder, CrossValidator } 11 | import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics 12 | import 
org.apache.spark.ml.evaluation.BinaryClassificationEvaluator 13 | 14 | import org.apache.spark._ 15 | import org.apache.spark.sql.functions._ 16 | import org.apache.spark.sql.types._ 17 | import org.apache.spark.sql._ 18 | import org.apache.spark.sql.Dataset 19 | 20 | import org.apache.spark.ml.linalg.{ Matrix, Vectors } 21 | import org.apache.spark.ml.stat.Correlation 22 | import org.apache.spark.sql.Row 23 | 24 | object Describe { 25 | case class CustomerAccount(state_code: String, account_length: Integer, area_code: String, 26 | international_plan: String, voice_mail_plan: String, num_voice_mail: Double, 27 | total_day_mins: Double, total_day_calls: Double, total_day_charge: Double, 28 | total_evening_mins: Double, total_evening_calls: Double, total_evening_charge: Double, 29 | total_night_mins: Double, total_night_calls: Double, total_night_charge: Double, 30 | total_international_mins: Double, total_international_calls: Double, total_international_charge: Double, 31 | total_international_num_calls: Double, churn: String) 32 | 33 | val schema = StructType(Array( 34 | StructField("state_code", StringType, true), 35 | StructField("account_length", IntegerType, true), 36 | StructField("area_code", StringType, true), 37 | StructField("international_plan", StringType, true), 38 | StructField("voice_mail_plan", StringType, true), 39 | StructField("num_voice_mail", DoubleType, true), 40 | StructField("total_day_mins", DoubleType, true), 41 | StructField("total_day_calls", DoubleType, true), 42 | StructField("total_day_charge", DoubleType, true), 43 | StructField("total_evening_mins", DoubleType, true), 44 | StructField("total_evening_calls", DoubleType, true), 45 | StructField("total_evening_charge", DoubleType, true), 46 | StructField("total_night_mins", DoubleType, true), 47 | StructField("total_night_calls", DoubleType, true), 48 | StructField("total_night_charge", DoubleType, true), 49 | StructField("total_international_mins", DoubleType, true), 50 | StructField("total_international_calls", DoubleType, true), 51 | StructField("total_international_charge", DoubleType, true), 52 | StructField("total_international_num_calls", DoubleType, true), 53 | StructField("churn", StringType, true))) 54 | 55 | def main(args: Array[String]) { 56 | val spark = SparkSession 57 | .builder 58 | .master("local[*]") 59 | .config("spark.sql.warehouse.dir", "E:/Exp/") 60 | .appName("Desribe") 61 | .getOrCreate() 62 | 63 | spark.conf.set("spark.debug.maxToStringFields", 10000) 64 | val DEFAULT_MAX_TO_STRING_FIELDS = 2500 65 | if (SparkEnv.get != null) { 66 | SparkEnv.get.conf.getInt("spark.debug.maxToStringFields", DEFAULT_MAX_TO_STRING_FIELDS) 67 | } else { 68 | DEFAULT_MAX_TO_STRING_FIELDS 69 | } 70 | import spark.implicits._ 71 | 72 | val trainSet: Dataset[CustomerAccount] = spark.read. 
73 | option("inferSchema", "false") 74 | .format("com.databricks.spark.csv") 75 | .schema(schema) 76 | .load("data/churn-bigml-80.csv") 77 | .as[CustomerAccount] 78 | 79 | val statsDF = trainSet.describe() 80 | statsDF.show() 81 | 82 | trainSet.createOrReplaceTempView("UserAccount") 83 | spark.catalog.cacheTable("UserAccount") 84 | 85 | spark.sqlContext.sql("SELECT churn, SUM(total_day_mins) + SUM(total_evening_mins) + SUM(total_night_mins) + SUM(total_international_mins) as Total_minutes FROM UserAccount GROUP BY churn").show() 86 | spark.sqlContext.sql("SELECT churn, SUM(total_day_charge) as TDC, SUM(total_evening_charge) as TEC, SUM(total_night_charge) as TNC, SUM(total_international_charge) as TIC, SUM(total_day_charge) + SUM(total_evening_charge) + SUM(total_night_charge) + SUM(total_international_charge) as Total_charge FROM UserAccount GROUP BY churn ORDER BY Total_charge DESC").show() 87 | trainSet.groupBy("churn").count.show() 88 | spark.sqlContext.sql("SELECT churn,SUM(total_international_num_calls) FROM UserAccount GROUP BY churn") 89 | 90 | } 91 | } -------------------------------------------------------------------------------- /Chapter03/src/main/scala/ScalaClassification/PipelineConstruction.scala: -------------------------------------------------------------------------------- 1 | package ScalaClassification 2 | 3 | import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler} 4 | 5 | object PipelineConstruction { 6 | // Index labels, adding metadata to the label column. Fit on whole dataset to include all labels in index. 7 | val ipindexer = new StringIndexer() 8 | .setInputCol("international_plan") 9 | .setOutputCol("iplanIndex") 10 | 11 | val labelindexer = new StringIndexer() 12 | .setInputCol("churn") 13 | .setOutputCol("label") 14 | 15 | val featureCols = Array("account_length", "iplanIndex", "num_voice_mail", "total_day_mins", "total_day_calls", "total_evening_mins", "total_evening_calls", "total_night_mins", "total_night_calls", "total_international_mins", "total_international_calls", "total_international_num_calls") 16 | 17 | val assembler = new VectorAssembler() 18 | .setInputCols(featureCols) 19 | .setOutputCol("features") 20 | } -------------------------------------------------------------------------------- /Chapter03/src/main/scala/ScalaClassification/Preprocessing.scala: -------------------------------------------------------------------------------- 1 | package ScalaClassification 2 | 3 | import org.apache.spark._ 4 | import org.apache.spark.sql.functions._ 5 | import org.apache.spark.sql.types._ 6 | import org.apache.spark.sql._ 7 | import org.apache.spark.sql.Dataset 8 | 9 | /* 10 | * Dataset schema 11 | State 12 | Account length 13 | Area code 14 | International plan 15 | Voice mail plan 16 | Number vmail messages 17 | Total day minutes 18 | Total day calls 19 | Total day charge 20 | Total eve minutes 21 | Total eve calls Total eve charge 22 | Total night minutes 23 | Total night calls 24 | Total night charge 25 | Total intl minutes 26 | Total intl calls 27 | Total intl charge 28 | Customer service calls 29 | Churn 30 | */ 31 | 32 | object Preprocessing { 33 | case class CustomerAccount(state_code: String, account_length: Integer, area_code: String, 34 | international_plan: String, voice_mail_plan: String, num_voice_mail: Double, 35 | total_day_mins: Double, total_day_calls: Double, total_day_charge: Double, 36 | total_evening_mins: Double, total_evening_calls: Double, total_evening_charge: Double, 37 | total_night_mins: Double, total_night_calls: 
Double, total_night_charge: Double, 38 | total_international_mins: Double, total_international_calls: Double, total_international_charge: Double, 39 | total_international_num_calls: Double, churn: String) 40 | 41 | val schema = StructType(Array( 42 | StructField("state_code", StringType, true), 43 | StructField("account_length", IntegerType, true), 44 | StructField("area_code", StringType, true), 45 | StructField("international_plan", StringType, true), 46 | StructField("voice_mail_plan", StringType, true), 47 | StructField("num_voice_mail", DoubleType, true), 48 | StructField("total_day_mins", DoubleType, true), 49 | StructField("total_day_calls", DoubleType, true), 50 | StructField("total_day_charge", DoubleType, true), 51 | StructField("total_evening_mins", DoubleType, true), 52 | StructField("total_evening_calls", DoubleType, true), 53 | StructField("total_evening_charge", DoubleType, true), 54 | StructField("total_night_mins", DoubleType, true), 55 | StructField("total_night_calls", DoubleType, true), 56 | StructField("total_night_charge", DoubleType, true), 57 | StructField("total_international_mins", DoubleType, true), 58 | StructField("total_international_calls", DoubleType, true), 59 | StructField("total_international_charge", DoubleType, true), 60 | StructField("total_international_num_calls", DoubleType, true), 61 | StructField("churn", StringType, true))) 62 | 63 | val spark = SparkSession 64 | .builder 65 | .master("local[*]") 66 | .config("spark.sql.warehouse.dir", "E:/Exp/") 67 | .appName("ChurnPrediction") 68 | .getOrCreate() 69 | import spark.implicits._ 70 | 71 | val trainSet: Dataset[CustomerAccount] = spark.read. 72 | option("inferSchema", "false") 73 | .format("com.databricks.spark.csv") 74 | .schema(schema) 75 | .load("data/churn-bigml-80.csv") 76 | .as[CustomerAccount] 77 | 78 | val statsDF = trainSet.describe() 79 | statsDF.show() 80 | trainSet.cache() 81 | 82 | trainSet.groupBy("churn").sum("total_international_num_calls").show() 83 | trainSet.groupBy("churn").sum("total_international_charge").show() 84 | 85 | val testSet: Dataset[CustomerAccount] = spark.read. 86 | option("inferSchema", "false") 87 | .format("com.databricks.spark.csv") 88 | .schema(schema) 89 | .load("data/churn-bigml-20.csv") 90 | .as[CustomerAccount] 91 | 92 | testSet.describe() 93 | testSet.cache() 94 | 95 | trainSet.printSchema() 96 | trainSet.show() 97 | 98 | trainSet.createOrReplaceTempView("UserAccount") 99 | spark.catalog.cacheTable("UserAccount") 100 | 101 | /////////////// Feature engineering 102 | spark.sqlContext.sql("SELECT churn, SUM(total_day_mins) + SUM(total_evening_mins) + SUM(total_night_mins) + SUM(total_international_mins) as Total_minutes FROM UserAccount GROUP BY churn").show() 103 | spark.sqlContext.sql("SELECT churn, SUM(total_day_charge) as TDC, SUM(total_evening_charge) as TEC, SUM(total_night_charge) as TNC, SUM(total_international_charge) as TIC, SUM(total_day_charge) + SUM(total_evening_charge) + SUM(total_night_charge) + SUM(total_international_charge) as Total_charge FROM UserAccount GROUP BY churn ORDER BY Total_charge DESC").show() 104 | trainSet.groupBy("churn").count.show() 105 | spark.sqlContext.sql("SELECT churn,SUM(total_international_num_calls) as Total_intl_call FROM UserAccount GROUP BY churn").show() 106 | 107 | val fractions = Map("False" -> 0.1675, "True" -> 1.0) 108 | 109 | //Here we're keeping all instances of the Churn=True class, but downsampling the Churn=False class to a fraction of 388/2278. 
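// sampleBy() performs stratified sampling without replacement on the churn column: for each key in
// the fractions map it keeps approximately that fraction of the matching rows, so every "True" row
// is retained (fraction 1.0) while "False" rows are down-sampled to roughly 16.75%; the final long
// argument is the random seed that makes the sample reproducible.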
110 | val churnDF = trainSet.stat.sampleBy("churn", fractions, 123456L) 111 | 112 | churnDF.groupBy("churn").count.show() 113 | 114 | val trainDF = churnDF 115 | .drop("state_code") 116 | .drop("area_code") 117 | .drop("voice_mail_plan") 118 | .drop("total_day_charge") 119 | .drop("total_evening_charge") 120 | 121 | println(trainDF.count) 122 | trainDF.select("account_length", "international_plan", "num_voice_mail", "total_day_calls", "total_international_num_calls", "churn").show(10) 123 | } -------------------------------------------------------------------------------- /Chapter04/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | 5 | com.packt.AnomalyDetection 6 | RandomForest 7 | 0.0.1-SNAPSHOT 8 | jar 9 | 10 | ScalaMLQuickStartGuide 11 | http://maven.apache.org 12 | 13 | 14 | UTF-8 15 | 1.8 16 | 2.2.0 17 | 18 | 19 | 20 | 21 | jdk.tools 22 | jdk.tools 23 | 1.8.0_171 24 | system 25 | C:/Program Files/Java/jdk1.8.0_171/lib/tools.jar 26 | 27 | 28 | org.apache.directory.studio 29 | org.apache.commons.io 30 | 2.4 31 | 32 | 33 | org.apache.spark 34 | spark-core_2.11 35 | ${spark.version} 36 | 37 | 38 | com.github.tototoshi 39 | scala-csv_2.10 40 | 1.3.5 41 | 42 | 43 | org.apache.spark 44 | spark-sql_2.11 45 | ${spark.version} 46 | 47 | 48 | com.github.scopt 49 | scopt_2.11 50 | 3.3.0 51 | 52 | 53 | com.typesafe 54 | config 55 | 1.2.1 56 | 57 | 58 | org.apache.directory.api 59 | api-util 60 | 1.0.0 61 | 62 | 63 | commons-io 64 | commons-io 65 | 2.6 66 | 67 | 68 | com.esotericsoftware.kryo 69 | kryo 70 | 2.10 71 | 72 | 73 | edu.stanford.nlp 74 | stanford-corenlp 75 | 3.6.0 76 | 77 | 78 | edu.stanford.nlp 79 | stanford-corenlp 80 | 3.6.0 81 | models 82 | 83 | 84 | org.apache.hadoop 85 | hadoop-common 86 | 2.6.0 87 | 88 | 89 | org.sameersingh.scalaplot 90 | scalaplot 91 | 0.0.4 92 | 93 | 94 | org.apache.spark 95 | spark-mllib_2.11 96 | ${spark.version} 97 | 98 | 99 | org.apache.spark 100 | spark-graphx_2.11 101 | ${spark.version} 102 | 103 | 104 | org.apache.spark 105 | spark-yarn_2.11 106 | ${spark.version} 107 | 108 | 109 | org.apache.spark 110 | spark-network-shuffle_2.11 111 | ${spark.version} 112 | 113 | 114 | com.databricks 115 | spark-csv_2.11 116 | 1.3.0 117 | 118 | 119 | com.holdenkarau 120 | spark-testing-base_2.10 121 | 2.0.0_0.6.0 122 | 123 | 124 | com.databricks 125 | spark-avro_2.11 126 | 4.0.0 127 | 128 | 129 | org.apache.commons 130 | commons-math3 131 | 3.2 132 | 133 | 134 | org.apache.hive 135 | hive-exec 136 | 2.3.2 137 | 138 | 139 | junit 140 | junit 141 | 3.8.1 142 | test 143 | 144 | 145 | 146 | 147 | 148 | 149 | org.apache.maven.plugins 150 | maven-eclipse-plugin 151 | 2.9 152 | 153 | true 154 | false 155 | 156 | 157 | 158 | 159 | org.apache.maven.plugins 160 | maven-compiler-plugin 161 | 3.5.1 162 | 163 | ${jdk.version} 164 | ${jdk.version} 165 | 166 | 167 | 168 | maven-shade-plugin 169 | 2.4.3 170 | 171 | 172 | package 173 | 174 | shade 175 | 176 | 177 | false 178 | 179 | 180 | 181 | *:* 182 | 183 | META-INF/*.SF 184 | META-INF/*.DSA 185 | META-INF/*.RSA 186 | 187 | 188 | 189 | 190 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | 199 | org.apache.maven.plugins 200 | maven-assembly-plugin 201 | 2.4.1 202 | 203 | 204 | 205 | jar-with-dependencies 206 | 207 | 208 | 209 | 210 | com.packt.ScalaML.ProductionEngineering.BoschProductionLinePerformance2 211 | 212 | 213 | 214 | 215 | oozie.launcher.mapreduce.job.user.classpath.first 216 | true 217 | 218 | 219 | 220 | 221 | 222 | make-assembly 223 | 224 | package 225 | 226 | single 
227 | 228 | 229 | 230 | 231 | 232 | 233 | 234 | -------------------------------------------------------------------------------- /Chapter04/src/main/scala/ScalaTreeEnsimbles/ChurnPredictionDT.scala: -------------------------------------------------------------------------------- 1 | package ScalaTreeEnsimbles 2 | 3 | import org.apache.spark._ 4 | import org.apache.spark.sql.SparkSession 5 | import org.apache.spark.sql.functions._ 6 | import org.apache.spark.sql.types._ 7 | import org.apache.spark.sql.SQLContext 8 | import org.apache.spark.sql.SQLImplicits 9 | import org.apache.spark.sql._ 10 | import org.apache.spark.sql.Dataset 11 | import org.apache.spark.ml.Pipeline 12 | import org.apache.spark.ml.classification.{ DecisionTreeClassifier, DecisionTreeClassificationModel } 13 | import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics 14 | import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator 15 | import org.apache.spark.ml.tuning.{ ParamGridBuilder, CrossValidator } 16 | 17 | object ChurnPredictionDT { 18 | def main(args: Array[String]) { 19 | val spark = SparkSession 20 | .builder 21 | .master("local[*]") 22 | .config("spark.sql.warehouse.dir", "E:/Exp/") 23 | .appName("ChurnPrediction") 24 | .getOrCreate() 25 | 26 | import spark.implicits._ 27 | 28 | val dTree = new DecisionTreeClassifier() 29 | .setLabelCol("label") 30 | .setFeaturesCol("features") 31 | .setSeed(12357L) 32 | 33 | // Chain indexers and tree in a Pipeline. 34 | val pipeline = new Pipeline() 35 | .setStages(Array(ScalaClassification.PipelineConstruction.ipindexer, 36 | ScalaClassification.PipelineConstruction.labelindexer, 37 | ScalaClassification.PipelineConstruction.assembler, 38 | dTree)) 39 | 40 | // Search through decision tree's maxDepth parameter for best model 41 | var paramGrid = new ParamGridBuilder() 42 | .addGrid(dTree.impurity, "gini" :: "entropy" :: Nil) 43 | .addGrid(dTree.maxBins, 3 :: 5 :: 9 :: 10 :: Nil) 44 | .addGrid(dTree.maxDepth, 5 :: 10 :: 15 :: Nil) 45 | .build() 46 | 47 | val evaluator = new BinaryClassificationEvaluator() 48 | .setLabelCol("label") 49 | .setRawPredictionCol("prediction") 50 | 51 | // Set up 10-fold cross validation 52 | val numFolds = 10 53 | val crossval = new CrossValidator() 54 | .setEstimator(pipeline) 55 | .setEvaluator(evaluator) 56 | .setEstimatorParamMaps(paramGrid) 57 | .setNumFolds(numFolds) 58 | 59 | val cvModel = crossval.fit(ScalaClassification.Preprocessing.trainDF) 60 | 61 | val bestModel = cvModel.bestModel 62 | println("The Best Model and Parameters:\n--------------------") 63 | println(bestModel.asInstanceOf[org.apache.spark.ml.PipelineModel].stages(3)) 64 | 65 | bestModel.asInstanceOf[org.apache.spark.ml.PipelineModel] 66 | .stages(3) 67 | .extractParamMap 68 | 69 | val treeModel = bestModel.asInstanceOf[org.apache.spark.ml.PipelineModel] 70 | .stages(3) 71 | .asInstanceOf[DecisionTreeClassificationModel] 72 | 73 | println("Learned classification tree model:\n" + treeModel.toDebugString) 74 | println("Feature 11:" + ScalaClassification.Preprocessing.trainDF.select(ScalaClassification.PipelineConstruction.featureCols(11))) 75 | println("Feature 3:" + ScalaClassification.Preprocessing.trainDF.select(ScalaClassification.PipelineConstruction.featureCols(3))) 76 | 77 | val predDF = cvModel.transform(ScalaClassification.Preprocessing.testSet) 78 | val result = predDF.select("label", "prediction", "probability") 79 | val resutDF = result.withColumnRenamed("prediction", "Predicted_label") 80 | resutDF.show(10) 81 | 82 | val accuracy = 
evaluator.evaluate(predDF) 83 | println("Classification accuracy: " + accuracy) 84 | 85 | // Compute other performence metrices 86 | val predictionAndLabels = predDF 87 | .select("prediction", "label") 88 | .rdd.map(x => (x(0).asInstanceOf[Double], x(1) 89 | .asInstanceOf[Double])) 90 | 91 | val metrics = new BinaryClassificationMetrics(predictionAndLabels) 92 | val areaUnderPR = metrics.areaUnderPR 93 | println("Area under the precision-recall curve: " + areaUnderPR) 94 | 95 | val areaUnderROC = metrics.areaUnderROC 96 | println("Area under the receiver operating characteristic (ROC) curve: " + areaUnderROC) 97 | 98 | val tVSpDF = predDF.select("label", "prediction") // True vs predicted labels 99 | val TC = predDF.count() //Total count 100 | 101 | val tp = tVSpDF.filter($"prediction" === 0.0).filter($"label" === $"prediction").count() / TC.toDouble 102 | val tn = tVSpDF.filter($"prediction" === 1.0).filter($"label" === $"prediction").count() / TC.toDouble 103 | val fp = tVSpDF.filter($"prediction" === 1.0).filter(not($"label" === $"prediction")).count() / TC.toDouble 104 | val fn = tVSpDF.filter($"prediction" === 0.0).filter(not($"label" === $"prediction")).count() / TC.toDouble 105 | 106 | val MCC = (tp * tn - fp * fn) / math.sqrt((tp + fp) * (tp + fn) * (fp + tn) * (tn + fn)) // Calculating Matthews correlation coefficient 107 | 108 | println("True positive rate: " + tp *100 + "%") 109 | println("False positive rate: " + fp * 100 + "%") 110 | println("True negative rate: " + tn * 100 + "%") 111 | println("False negative rate: " + fn * 100 + "%") 112 | println("Matthews correlation coefficient: " + MCC) 113 | } 114 | } -------------------------------------------------------------------------------- /Chapter04/src/main/scala/ScalaTreeEnsimbles/ChurnPredictionGBT.scala: -------------------------------------------------------------------------------- 1 | package ScalaTreeEnsimbles 2 | 3 | import org.apache.spark._ 4 | import org.apache.spark.sql.SparkSession 5 | import org.apache.spark.sql.functions._ 6 | import org.apache.spark.sql.types._ 7 | import org.apache.spark.sql._ 8 | import org.apache.spark.ml.Pipeline 9 | import org.apache.spark.ml.classification.{GBTClassifier, GBTClassificationModel} 10 | import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics 11 | import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator 12 | import org.apache.spark.ml.tuning.{ ParamGridBuilder, CrossValidator } 13 | 14 | object ChurnPredictionGBT { 15 | def main(args: Array[String]) { 16 | val spark = SparkSession 17 | .builder 18 | .master("local[*]") 19 | .config("spark.sql.warehouse.dir", "E:/Exp/") 20 | .appName("ChurnPrediction") 21 | .getOrCreate() 22 | 23 | import spark.implicits._ 24 | 25 | val gbt = new GBTClassifier() 26 | .setLabelCol("label") 27 | .setFeaturesCol("features") 28 | .setSeed(1234567L) 29 | 30 | // Chain indexers and tree in a Pipeline. 
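// The stage order matters here: ipindexer and labelindexer first turn the string columns
// "international_plan" and "churn" into numeric indices, assembler then packs the
// feature columns into a single "features" vector, and the classifier is appended as
// the fourth stage. That is why the fitted PipelineModel is unpacked with stages(3)
// further down to recover the trained classification model. Roughly, for the fitted
// pipeline used below:
//   stages(0) -> iplanIndex, stages(1) -> label, stages(2) -> features, stages(3) -> classifier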
31 | val pipeline = new Pipeline() 32 | .setStages(Array(ScalaClassification.PipelineConstruction.ipindexer, 33 | ScalaClassification.PipelineConstruction.labelindexer, 34 | ScalaClassification.PipelineConstruction.assembler, 35 | gbt)) 36 | 37 | // Search through decision tree's maxDepth parameter for best model 38 | val paramGrid = new ParamGridBuilder() 39 | .addGrid(gbt.maxDepth, 3 :: 5 :: 10 :: Nil) // :: 15 :: 20 :: 25 :: 30 :: Nil) 40 | .addGrid(gbt.impurity, "gini" :: "entropy" :: Nil) 41 | .addGrid(gbt.maxBins, 5 :: 10 :: 20 :: Nil) //10 :: 15 :: 25 :: 35 :: 45 :: Nil) 42 | .build() 43 | 44 | val evaluator = new BinaryClassificationEvaluator() 45 | .setLabelCol("label") 46 | .setRawPredictionCol("prediction") 47 | 48 | // Set up 10-fold cross validation 49 | val numFolds = 10 50 | val crossval = new CrossValidator() 51 | .setEstimator(pipeline) 52 | .setEvaluator(evaluator) 53 | .setEstimatorParamMaps(paramGrid) 54 | .setNumFolds(numFolds) 55 | 56 | val cvModel = crossval.fit(ScalaClassification.Preprocessing.trainDF) 57 | 58 | // Save the workflow 59 | cvModel.write.overwrite().save("model/RF_model_churn") 60 | 61 | val bestModel = cvModel.bestModel 62 | println("The Best Model and Parameters:\n--------------------") 63 | println(bestModel.asInstanceOf[org.apache.spark.ml.PipelineModel].stages(3)) 64 | 65 | bestModel.asInstanceOf[org.apache.spark.ml.PipelineModel] 66 | .stages(3) 67 | .extractParamMap 68 | 69 | val treeModel = bestModel.asInstanceOf[org.apache.spark.ml.PipelineModel] 70 | .stages(3) 71 | .asInstanceOf[GBTClassificationModel] 72 | 73 | println("Learned classification tree model:\n" + treeModel.toDebugString) 74 | println("Feature 11:" + ScalaClassification.Preprocessing.trainDF.select(ScalaClassification.PipelineConstruction.featureCols(11))) 75 | println("Feature 3:" + ScalaClassification.Preprocessing.trainDF.select(ScalaClassification.PipelineConstruction.featureCols(3))) 76 | 77 | val predDF = cvModel.transform(ScalaClassification.Preprocessing.testSet) 78 | val result = predDF.select("label", "prediction", "probability") 79 | val resutDF = result.withColumnRenamed("prediction", "Predicted_label") 80 | resutDF.show(10) 81 | 82 | val accuracy = evaluator.evaluate(predDF) 83 | println("Classification accuracy: " + accuracy) 84 | 85 | // Compute other performence metrices 86 | val predictionAndLabels = predDF 87 | .select("prediction", "label") 88 | .rdd.map(x => (x(0).asInstanceOf[Double], x(1) 89 | .asInstanceOf[Double])) 90 | 91 | val metrics = new BinaryClassificationMetrics(predictionAndLabels) 92 | val areaUnderPR = metrics.areaUnderPR 93 | println("Area under the precision-recall curve: " + areaUnderPR) 94 | 95 | val areaUnderROC = metrics.areaUnderROC 96 | println("Area under the receiver operating characteristic (ROC) curve: " + areaUnderROC) 97 | 98 | val tVSpDF = predDF.select("label", "prediction") // True vs predicted labels 99 | val TC = predDF.count() //Total count 100 | 101 | val tp = tVSpDF.filter($"prediction" === 0.0).filter($"label" === $"prediction").count() / TC.toDouble 102 | val tn = tVSpDF.filter($"prediction" === 1.0).filter($"label" === $"prediction").count() / TC.toDouble 103 | val fp = tVSpDF.filter($"prediction" === 1.0).filter(not($"label" === $"prediction")).count() / TC.toDouble 104 | val fn = tVSpDF.filter($"prediction" === 0.0).filter(not($"label" === $"prediction")).count() / TC.toDouble 105 | 106 | val MCC = (tp * tn - fp * fn) / math.sqrt((tp + fp) * (tp + fn) * (fp + tn) * (tn + fn)) // Calculating Matthews correlation 
coefficient 107 | 108 | println("True positive rate: " + tp *100 + "%") 109 | println("False positive rate: " + fp * 100 + "%") 110 | println("True negative rate: " + tn * 100 + "%") 111 | println("False negative rate: " + fn * 100 + "%") 112 | println("Matthews correlation coefficient: " + MCC) 113 | } 114 | } -------------------------------------------------------------------------------- /Chapter04/src/main/scala/ScalaTreeEnsimbles/ChurnPredictionRF.scala: -------------------------------------------------------------------------------- 1 | package ScalaTreeEnsimbles 2 | 3 | import org.apache.spark._ 4 | import org.apache.spark.sql.SparkSession 5 | import org.apache.spark.sql.functions._ 6 | import org.apache.spark.sql.types._ 7 | import org.apache.spark.sql._ 8 | import org.apache.spark.ml.Pipeline 9 | import org.apache.spark.ml.classification.{ RandomForestClassifier, RandomForestClassificationModel } 10 | import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics 11 | import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator 12 | import org.apache.spark.ml.tuning.{ ParamGridBuilder, CrossValidator } 13 | 14 | object ChurnPredictionRF { 15 | def main(args: Array[String]) { 16 | val spark = SparkSession 17 | .builder 18 | .master("local[*]") 19 | .config("spark.sql.warehouse.dir", "E:/Exp/") 20 | .appName("ChurnPrediction") 21 | .getOrCreate() 22 | 23 | import spark.implicits._ 24 | 25 | val rf = new RandomForestClassifier() 26 | .setLabelCol("label") 27 | .setFeaturesCol("features") 28 | .setSeed(1234567L) 29 | 30 | // Chain indexers and tree in a Pipeline. 31 | val pipeline = new Pipeline() 32 | .setStages(Array(ScalaClassification.PipelineConstruction.ipindexer, 33 | ScalaClassification.PipelineConstruction.labelindexer, 34 | ScalaClassification.PipelineConstruction.assembler, 35 | rf)) 36 | 37 | // Search through decision tree's maxDepth parameter for best model 38 | val paramGrid = new ParamGridBuilder() 39 | .addGrid(rf.maxDepth, 3 :: 5 :: 10 :: Nil) // :: 15 :: 20 :: 25 :: 30 :: Nil) 40 | .addGrid(rf.featureSubsetStrategy, "auto" :: "all" :: Nil) 41 | .addGrid(rf.impurity, "gini" :: "entropy" :: Nil) 42 | .addGrid(rf.maxBins, 5 :: 10 :: 20 :: Nil) //10 :: 15 :: 25 :: 35 :: 45 :: Nil) 43 | .addGrid(rf.numTrees, 5 :: 10 :: 20 :: Nil) // :: 100 :: Nil) 44 | .build() 45 | 46 | val evaluator = new BinaryClassificationEvaluator() 47 | .setLabelCol("label") 48 | .setRawPredictionCol("prediction") 49 | 50 | // Set up 10-fold cross validation 51 | val numFolds = 10 52 | val crossval = new CrossValidator() 53 | .setEstimator(pipeline) 54 | .setEvaluator(evaluator) 55 | .setEstimatorParamMaps(paramGrid) 56 | .setNumFolds(numFolds) 57 | 58 | val cvModel = crossval.fit(ScalaClassification.Preprocessing.trainDF) 59 | 60 | // Save the workflow 61 | cvModel.write.overwrite().save("model/RF_model_churn") 62 | 63 | val bestModel = cvModel.bestModel 64 | println("The Best Model and Parameters:\n--------------------") 65 | println(bestModel.asInstanceOf[org.apache.spark.ml.PipelineModel].stages(3)) 66 | 67 | bestModel.asInstanceOf[org.apache.spark.ml.PipelineModel] 68 | .stages(3) 69 | .extractParamMap 70 | 71 | val treeModel = bestModel.asInstanceOf[org.apache.spark.ml.PipelineModel] 72 | .stages(3) 73 | .asInstanceOf[RandomForestClassificationModel] 74 | 75 | println("Learned classification tree model:\n" + treeModel.toDebugString) 76 | println("Feature 11:" + ScalaClassification.Preprocessing.trainDF.select(ScalaClassification.PipelineConstruction.featureCols(11))) 77 | 
println("Feature 3:" + ScalaClassification.Preprocessing.trainDF.select(ScalaClassification.PipelineConstruction.featureCols(3))) 78 | 79 | val predDF = cvModel.transform(ScalaClassification.Preprocessing.testSet) 80 | val result = predDF.select("label", "prediction", "probability") 81 | val resutDF = result.withColumnRenamed("prediction", "Predicted_label") 82 | resutDF.show(10) 83 | 84 | val accuracy = evaluator.evaluate(predDF) 85 | println("Classification accuracy: " + accuracy) 86 | 87 | // Compute other performence metrices 88 | val predictionAndLabels = predDF 89 | .select("prediction", "label") 90 | .rdd.map(x => (x(0).asInstanceOf[Double], x(1) 91 | .asInstanceOf[Double])) 92 | 93 | val metrics = new BinaryClassificationMetrics(predictionAndLabels) 94 | val areaUnderPR = metrics.areaUnderPR 95 | println("Area under the precision-recall curve: " + areaUnderPR) 96 | 97 | val areaUnderROC = metrics.areaUnderROC 98 | println("Area under the receiver operating characteristic (ROC) curve: " + areaUnderROC) 99 | 100 | val tVSpDF = predDF.select("label", "prediction") // True vs predicted labels 101 | val TC = predDF.count() //Total count 102 | 103 | val tp = tVSpDF.filter($"prediction" === 0.0).filter($"label" === $"prediction").count() / TC.toDouble 104 | val tn = tVSpDF.filter($"prediction" === 1.0).filter($"label" === $"prediction").count() / TC.toDouble 105 | val fp = tVSpDF.filter($"prediction" === 1.0).filter(not($"label" === $"prediction")).count() / TC.toDouble 106 | val fn = tVSpDF.filter($"prediction" === 0.0).filter(not($"label" === $"prediction")).count() / TC.toDouble 107 | 108 | val MCC = (tp * tn - fp * fn) / math.sqrt((tp + fp) * (tp + fn) * (fp + tn) * (tn + fn)) // Calculating Matthews correlation coefficient 109 | 110 | println("True positive rate: " + tp *100 + "%") 111 | println("False positive rate: " + fp * 100 + "%") 112 | println("True negative rate: " + tn * 100 + "%") 113 | println("False negative rate: " + fn * 100 + "%") 114 | println("Matthews correlation coefficient: " + MCC) 115 | } 116 | } -------------------------------------------------------------------------------- /Chapter04/src/main/scala/ScalaTreeEnsimbles/Preproessing.scala: -------------------------------------------------------------------------------- 1 | package ScalaTreeEnsimbles 2 | 3 | import org.apache.spark.ml.feature.{ StringIndexer, StringIndexerModel } 4 | import org.apache.spark.ml.feature.VectorAssembler 5 | import org.apache.spark.sql._ 6 | import org.apache.spark.sql.functions._ 7 | 8 | object Preproessing { 9 | var trainSample = 1.0 10 | var testSample = 1.0 11 | val train = "data/insurance_train.csv" 12 | val test = "data/insurance_test.csv" 13 | 14 | val spark = SparkSession 15 | .builder 16 | .master("local[*]") 17 | .config("spark.sql.warehouse.dir", "E:/Exp/") 18 | .appName(s"OneVsRestExample") 19 | .getOrCreate() 20 | 21 | import spark.implicits._ 22 | println("Reading data from " + train + " file") 23 | 24 | val trainInput = spark.read 25 | .option("header", "true") 26 | .option("inferSchema", "true") 27 | .format("com.databricks.spark.csv") 28 | .load(train) 29 | .cache 30 | 31 | val testInput = spark.read 32 | .option("header", "true") 33 | .option("inferSchema", "true") 34 | .format("com.databricks.spark.csv") 35 | .load(test) 36 | .cache 37 | 38 | println("Preparing data for training model") 39 | var data = trainInput.withColumnRenamed("loss", "label").sample(false, trainSample) 40 | var DF = data.na.drop() 41 | 42 | // Null check 43 | if (data == DF) 44 | println("No 
null values in the DataFrame") 45 | 46 | else { 47 | println("Null values exist in the DataFrame") 48 | data = DF 49 | } 50 | 51 | val seed = 23579L 52 | val splits = data.randomSplit(Array(0.80, 0.20), seed) 53 | val (trainingData, validData) = (splits(0), splits(1)) 54 | 55 | trainingData.cache 56 | validData.cache 57 | 58 | val testData = testInput.sample(false, testSample).cache 59 | 60 | def isCateg(c: String): Boolean = c.startsWith("cat") 61 | def categNewCol(c: String): String = if (isCateg(c)) s"idx_${c}" else c 62 | 63 | // Function to remove categorical columns with too many categories 64 | def removeTooManyCategs(c: String): Boolean = !(c matches "cat(109$|110$|112$|113$|116$)") 65 | 66 | // Function to select only feature columns (omit id and label) 67 | def onlyFeatureCols(c: String): Boolean = !(c matches "id|label") 68 | 69 | // Definitive set of feature columns 70 | val featureCols = trainingData.columns 71 | .filter(removeTooManyCategs) 72 | .filter(onlyFeatureCols) 73 | .map(categNewCol) 74 | 75 | // StringIndexer for categorical columns (OneHotEncoder should be evaluated as well) 76 | val stringIndexerStages = trainingData.columns.filter(isCateg) 77 | .map(c => new StringIndexer() 78 | .setInputCol(c) 79 | .setOutputCol(categNewCol(c)) 80 | .fit(trainInput.select(c).union(testInput.select(c)))) 81 | 82 | // VectorAssembler for training features 83 | val assembler = new VectorAssembler() 84 | .setInputCols(featureCols) 85 | .setOutputCol("features") 86 | } -------------------------------------------------------------------------------- /Chapter04/src/main/scala/ScalaTreeEnsimbles/UrbanTrafficDTRegressor.scala: -------------------------------------------------------------------------------- 1 | package ScalaTreeEnsimbles 2 | 3 | import org.apache.spark.ml.regression.{DecisionTreeRegressor, DecisionTreeRegressionModel} 4 | import org.apache.spark.ml.{ Pipeline, PipelineModel } 5 | import org.apache.spark.ml.evaluation.RegressionEvaluator 6 | import org.apache.spark.ml.tuning.ParamGridBuilder 7 | import org.apache.spark.ml.tuning.CrossValidator 8 | import org.apache.spark.sql._ 9 | import org.apache.spark.sql.functions._ 10 | import org.apache.spark.mllib.evaluation.RegressionMetrics 11 | import org.apache.log4j.LogManager 12 | import org.apache.spark.ml.feature.VectorAssembler 13 | 14 | object UrbanTrafficDTRegressor { 15 | def main(args: Array[String]) { 16 | val spark = SparkSession 17 | .builder 18 | .master("local[*]") 19 | .config("spark.sql.warehouse.dir", "E:/Exp/") 20 | .appName(s"DecisionTreeRegressor") 21 | .getOrCreate() 22 | import spark.implicits._ 23 | 24 | val rawTrafficDF = spark.read 25 | .option("header", "true") 26 | .option("inferSchema", "true") 27 | .option("delimiter", ";") 28 | .format("com.databricks.spark.csv") 29 | .load("data/Behavior of the urban traffic of the city of Sao Paulo in Brazil.csv") 30 | .cache 31 | 32 | val newTrafficDF = rawTrafficDF.withColumnRenamed("Slowness in traffic (%)", "label") 33 | val colNames = newTrafficDF.columns.dropRight(1) 34 | 35 | // VectorAssembler for training features 36 | val assembler = new VectorAssembler() 37 | .setInputCols(colNames) 38 | .setOutputCol("features") 39 | 40 | val assembleDF = assembler.transform(newTrafficDF).select("features", "label") 41 | assembleDF.printSchema() 42 | 43 | val seed = 12345 44 | val splits = assembleDF.randomSplit(Array(0.60, 0.40), seed) 45 | val (trainingData, testData) = (splits(0), splits(1)) 46 | 47 | trainingData.cache 48 | testData.cache 49 | 50 | // Estimator 
algorithm 51 | val gbtModel = new DecisionTreeRegressor().setFeaturesCol("features").setLabelCol("label") 52 | 53 | // *********************************************************** 54 | println("Preparing K-fold Cross Validation and Grid Search") 55 | // *********************************************************** 56 | 57 | // Search through decision tree's maxDepth parameter for best model 58 | var paramGrid = new ParamGridBuilder() 59 | .addGrid(gbtModel.impurity, "variance" :: Nil)// variance for regression 60 | .addGrid(gbtModel.maxBins, 25 :: 30 :: 35 :: Nil) 61 | .addGrid(gbtModel.maxDepth, 5 :: 10 :: 15 :: Nil) 62 | .build() 63 | 64 | val numFolds = 10 65 | val cv = new CrossValidator() 66 | .setEstimator(gbtModel) 67 | .setEvaluator(new RegressionEvaluator) 68 | .setEstimatorParamMaps(paramGrid) 69 | .setNumFolds(numFolds) 70 | 71 | // ************************************************************ 72 | println("Training model with GradientBoostedTrees algorithm") 73 | // ************************************************************ 74 | val cvModel = cv.fit(trainingData) 75 | 76 | // ********************************************************************** 77 | println("Evaluating the model on the test set and calculating the regression metrics") 78 | // ********************************************************************** 79 | val trainPredictionsAndLabels = cvModel.transform(testData).select("label", "prediction") 80 | .map { case Row(label: Double, prediction: Double) => (label, prediction) }.rdd 81 | 82 | val testRegressionMetrics = new RegressionMetrics(trainPredictionsAndLabels) 83 | 84 | val results = "\n=====================================================================\n" + 85 | s"TrainingData count: ${trainingData.count}\n" + 86 | s"TestData count: ${testData.count}\n" + 87 | "=====================================================================\n" + 88 | s"TestData MSE = ${testRegressionMetrics.meanSquaredError}\n" + 89 | s"TestData RMSE = ${testRegressionMetrics.rootMeanSquaredError}\n" + 90 | s"TestData R-squared = ${testRegressionMetrics.r2}\n" + 91 | s"TestData MAE = ${testRegressionMetrics.meanAbsoluteError}\n" + 92 | s"TestData explained variance = ${testRegressionMetrics.explainedVariance}\n" + 93 | "=====================================================================\n" 94 | println(results) 95 | 96 | val bestModel = cvModel.bestModel.asInstanceOf[DecisionTreeRegressionModel] 97 | 98 | println("Decison tree from best cross-validated model: " + bestModel.toDebugString) 99 | 100 | val featureImportances = bestModel.featureImportances.toArray 101 | 102 | val FI_to_List_sorted = featureImportances.toList.sorted.toArray 103 | println("Feature importance generated by the best model: ") 104 | for(x <- FI_to_List_sorted) println(x) 105 | } 106 | } -------------------------------------------------------------------------------- /Chapter04/src/main/scala/ScalaTreeEnsimbles/UrbanTrafficGBTRegressor.scala: -------------------------------------------------------------------------------- 1 | package ScalaTreeEnsimbles 2 | 3 | import org.apache.spark.ml.regression.{ GBTRegressor, GBTRegressionModel } 4 | import org.apache.spark.ml.{ Pipeline, PipelineModel } 5 | import org.apache.spark.ml.evaluation.RegressionEvaluator 6 | import org.apache.spark.sql._ 7 | import org.apache.spark.sql.functions._ 8 | import org.apache.spark.mllib.evaluation.RegressionMetrics 9 | import org.apache.log4j.LogManager 10 | import org.apache.spark.ml.tuning.{ CrossValidator, ParamGridBuilder } 11 | import 
org.apache.spark.ml.feature.VectorAssembler 12 | 13 | 14 | object UrbanTrafficGBTRegressor { 15 | def main(args: Array[String]) { 16 | val spark = SparkSession 17 | .builder 18 | .master("local[*]") 19 | .config("spark.sql.warehouse.dir", "E:/Exp/") 20 | .appName(s"OneVsRestExample") 21 | .getOrCreate() 22 | 23 | import spark.implicits._ 24 | 25 | val rawTrafficDF = spark.read 26 | .option("header", "true") 27 | .option("inferSchema", "true") 28 | .option("delimiter", ";") 29 | .format("com.databricks.spark.csv") 30 | .load("data/Behavior of the urban traffic of the city of Sao Paulo in Brazil.csv") 31 | .cache 32 | 33 | val newTrafficDF = rawTrafficDF.withColumnRenamed("Slowness in traffic (%)", "label") 34 | val colNames = newTrafficDF.columns.dropRight(1) 35 | 36 | // VectorAssembler for training features 37 | val assembler = new VectorAssembler() 38 | .setInputCols(colNames) 39 | .setOutputCol("features") 40 | 41 | val assembleDF = assembler.transform(newTrafficDF).select("features", "label") 42 | assembleDF.printSchema() 43 | 44 | val seed = 12345 45 | val splits = assembleDF.randomSplit(Array(0.60, 0.40), seed) 46 | val (trainingData, testData) = (splits(0), splits(1)) 47 | 48 | trainingData.cache 49 | testData.cache 50 | 51 | // Estimator algorithm 52 | val gbtModel = new GBTRegressor().setFeaturesCol("features").setLabelCol("label") 53 | 54 | // *********************************************************** 55 | println("Preparing K-fold Cross Validation and Grid Search") 56 | // *********************************************************** 57 | 58 | // Search through decision tree's maxDepth parameter for best model 59 | var paramGrid = new ParamGridBuilder() 60 | .addGrid(gbtModel.impurity, "variance" :: Nil)// variance for regression 61 | .addGrid(gbtModel.maxBins, 3 :: 5 :: 10 :: Nil) 62 | .addGrid(gbtModel.maxDepth, 2 :: 5 :: 10 :: Nil) 63 | .build() 64 | 65 | val numFolds = 10 66 | val cv = new CrossValidator() 67 | .setEstimator(gbtModel) 68 | .setEvaluator(new RegressionEvaluator) 69 | .setEstimatorParamMaps(paramGrid) 70 | .setNumFolds(numFolds) 71 | 72 | // ************************************************************ 73 | println("Training model with GradientBoostedTrees algorithm") 74 | // ************************************************************ 75 | val cvModel = cv.fit(trainingData) 76 | 77 | // ********************************************************************** 78 | println("Evaluating the model on the test set and calculating the regression metrics") 79 | // ********************************************************************** 80 | val trainPredictionsAndLabels = cvModel.transform(testData).select("label", "prediction") 81 | .map { case Row(label: Double, prediction: Double) => (label, prediction) }.rdd 82 | 83 | val testRegressionMetrics = new RegressionMetrics(trainPredictionsAndLabels) 84 | 85 | val results = "\n=====================================================================\n" + 86 | s"TrainingData count: ${trainingData.count}\n" + 87 | s"TestData count: ${testData.count}\n" + 88 | "=====================================================================\n" + 89 | s"TestData MSE = ${testRegressionMetrics.meanSquaredError}\n" + 90 | s"TestData RMSE = ${testRegressionMetrics.rootMeanSquaredError}\n" + 91 | s"TestData R-squared = ${testRegressionMetrics.r2}\n" + 92 | s"TestData MAE = ${testRegressionMetrics.meanAbsoluteError}\n" + 93 | s"TestData explained variance = ${testRegressionMetrics.explainedVariance}\n" + 94 | 
"=====================================================================\n" 95 | println(results) 96 | 97 | val bestModel = cvModel.bestModel.asInstanceOf[GBTRegressionModel] 98 | 99 | println("Decison tree from best cross-validated model" + bestModel.toDebugString) 100 | 101 | val featureImportances = bestModel.featureImportances.toArray 102 | 103 | val FI_to_List_sorted = featureImportances.toList.sorted.toArray 104 | println("Feature importance generated by the best model: ") 105 | for(x <- FI_to_List_sorted) println(x) 106 | } 107 | } -------------------------------------------------------------------------------- /Chapter04/src/main/scala/ScalaTreeEnsimbles/UrbanTrafficRFRegressor.scala: -------------------------------------------------------------------------------- 1 | package ScalaTreeEnsimbles 2 | 3 | import org.apache.spark.ml.regression.{RandomForestRegressor, RandomForestRegressionModel} 4 | import org.apache.spark.ml.{Pipeline, PipelineModel} 5 | import org.apache.spark.ml.evaluation.RegressionEvaluator 6 | import org.apache.spark.sql._ 7 | import org.apache.spark.sql.functions._ 8 | import org.apache.spark.mllib.evaluation.RegressionMetrics 9 | import org.apache.log4j.LogManager 10 | import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder, CrossValidatorModel} 11 | import org.apache.spark.ml.feature.VectorAssembler 12 | 13 | object UrbanTrafficRFRegressor { 14 | def main(args: Array[String]) { 15 | val spark = SparkSession 16 | .builder 17 | .master("local[*]") 18 | .config("spark.sql.warehouse.dir", "E:/Exp/") 19 | .appName(s"RandomForestRegression") 20 | .getOrCreate() 21 | import spark.implicits._ 22 | 23 | val rawTrafficDF = spark.read 24 | .option("header", "true") 25 | .option("inferSchema", "true") 26 | .option("delimiter", ";") 27 | .format("com.databricks.spark.csv") 28 | .load("data/Behavior of the urban traffic of the city of Sao Paulo in Brazil.csv") 29 | .cache 30 | 31 | val newTrafficDF = rawTrafficDF.withColumnRenamed("Slowness in traffic (%)", "label") 32 | val colNames = newTrafficDF.columns.dropRight(1) 33 | 34 | // VectorAssembler for training features 35 | val assembler = new VectorAssembler() 36 | .setInputCols(colNames) 37 | .setOutputCol("features") 38 | 39 | val assembleDF = assembler.transform(newTrafficDF).select("features", "label") 40 | assembleDF.printSchema() 41 | 42 | val seed = 12345 43 | val splits = assembleDF.randomSplit(Array(0.60, 0.40), seed) 44 | val (trainingData, testData) = (splits(0), splits(1)) 45 | 46 | trainingData.cache 47 | testData.cache 48 | 49 | // Estimator algorithm 50 | val rfModel = new RandomForestRegressor().setFeaturesCol("features").setLabelCol("label") 51 | 52 | // *********************************************************** 53 | println("Preparing K-fold Cross Validation and Grid Search") 54 | // *********************************************************** 55 | 56 | // Search through decision tree's maxDepth parameter for best model 57 | val paramGrid = new ParamGridBuilder() 58 | .addGrid(rfModel.impurity, "variance" :: Nil)// variance for regression 59 | .addGrid(rfModel.maxBins, 25 :: 30 :: 35 :: Nil) 60 | .addGrid(rfModel.maxDepth, 5 :: 10 :: 15 :: Nil) 61 | .addGrid(rfModel.numTrees, 3 :: 5 :: 10 :: 15 :: Nil) 62 | .build() 63 | 64 | val numFolds = 10 65 | val cv = new CrossValidator() 66 | .setEstimator(rfModel) 67 | .setEvaluator(new RegressionEvaluator) 68 | .setEstimatorParamMaps(paramGrid) 69 | .setNumFolds(numFolds) 70 | 71 | // ************************************************************ 72 | 
println("Training model with RandomForestRegressor algorithm") 73 | // ************************************************************ 74 | val cvModel = cv.fit(trainingData) 75 | 76 | // ********************************************************************** 77 | println("Evaluating the model on the test set and calculating the regression metrics") 78 | // ********************************************************************** 79 | val trainPredictionsAndLabels = cvModel.transform(testData).select("label", "prediction") 80 | .map { case Row(label: Double, prediction: Double) => (label, prediction) }.rdd 81 | 82 | val testRegressionMetrics = new RegressionMetrics(trainPredictionsAndLabels) 83 | 84 | val results = "\n=====================================================================\n" + 85 | s"TrainingData count: ${trainingData.count}\n" + 86 | s"TestData count: ${testData.count}\n" + 87 | "=====================================================================\n" + 88 | s"TestData MSE = ${testRegressionMetrics.meanSquaredError}\n" + 89 | s"TestData RMSE = ${testRegressionMetrics.rootMeanSquaredError}\n" + 90 | s"TestData R-squared = ${testRegressionMetrics.r2}\n" + 91 | s"TestData MAE = ${testRegressionMetrics.meanAbsoluteError}\n" + 92 | s"TestData explained variance = ${testRegressionMetrics.explainedVariance}\n" + 93 | "=====================================================================\n" 94 | println(results) 95 | 96 | val bestModel = cvModel.bestModel.asInstanceOf[RandomForestRegressionModel] 97 | 98 | println("Decison tree from best cross-validated model: " + bestModel.toDebugString) 99 | 100 | val featureImportances = bestModel.featureImportances.toArray 101 | 102 | val FI_to_List_sorted = featureImportances.toList.sorted.toArray 103 | println("Feature importance generated by the best model: ") 104 | for(x <- FI_to_List_sorted) println(x) 105 | 106 | spark.stop() 107 | } 108 | } -------------------------------------------------------------------------------- /Chapter05/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | com.deri.sels 5 | PopulationClustering_v2 6 | 0.1-SNAPSHOT 7 | 8 | 2.4.0 9 | 2.11.8 10 | 3.22.1.1 11 | 2.4.1 12 | 0.23.0 13 | 14 | 15 | 16 | 17 | scala-tools.org 18 | Scala-tools Maven2 Repository 19 | http://scala-tools.org/repo-releases 20 | 21 | 22 | 23 | 24 | org.bdgenomics.adam 25 | adam-core_2.11 26 | ${adam.version} 27 | 28 | 29 | 30 | ai.h2o 31 | sparkling-water-core_2.11 32 | ${sparklingwater.version} 33 | 34 | 35 | ai.h2o 36 | sparkling-water-examples_2.11 37 | ${sparklingwater.version} 38 | 39 | 40 | org.apache.directory.studio 41 | org.apache.commons.io 42 | 2.4 43 | 44 | 45 | org.apache.spark 46 | spark-core_2.11 47 | ${spark.version} 48 | 49 | 50 | 51 | ai.h2o 52 | h2o-core 53 | ${h2o.version} 54 | 55 | 56 | ai.h2o 57 | h2o-scala_2.11 58 | ${h2o.version} 59 | 60 | 61 | ai.h2o 62 | h2o-algos 63 | ${h2o.version} 64 | 65 | 66 | ai.h2o 67 | h2o-app 68 | ${h2o.version} 69 | 70 | 71 | ai.h2o 72 | h2o-persist-hdfs 73 | ${h2o.version} 74 | 75 | 76 | scala-library 77 | org.scala-lang 78 | ${scala.version} 79 | 80 | 81 | ai.h2o 82 | google-analytics-java 83 | 1.1.2-H2O-CUSTOM 84 | 85 | 86 | joda-time 87 | joda-time 88 | 2.9.9 89 | 90 | 91 | 92 | 93 | snapshots-repo 94 | https://oss.sonatype.org/content/repositories/snapshots 95 | 96 | false 97 | 98 | 99 | true 100 | daily 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | org.apache.maven.plugins 109 | maven-eclipse-plugin 110 | 2.9 111 | 112 | true 113 | false 
114 | 115 | 116 | 117 | 118 | org.apache.maven.plugins 119 | maven-compiler-plugin 120 | 3.5.1 121 | 122 | ${jdk.version} 123 | ${jdk.version} 124 | 125 | 126 | 127 | org.apache.maven.plugins 128 | maven-shade-plugin 129 | 2.4.3 130 | 131 | true 132 | 133 | 134 | 135 | 136 | org.apache.maven.plugins 137 | maven-assembly-plugin 138 | 2.4.1 139 | 140 | 141 | 142 | jar-with-dependencies 143 | 144 | 145 | 146 | 147 | org.fit.genomics.PopStratClassification 148 | 149 | 150 | 151 | 152 | oozie.launcher.mapreduce.job.user.classpath.first 153 | true 154 | 155 | 156 | 157 | 158 | 159 | make-assembly 160 | 161 | package 162 | 163 | single 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | -------------------------------------------------------------------------------- /Chapter05/src/main/scala/org/fit/genomics/PCA.scala: -------------------------------------------------------------------------------- 1 | package org.fit.genomics 2 | 3 | import org.apache.spark.sql._ 4 | import org.apache.spark.sql.functions._ 5 | import org.apache.spark.ml.feature.PCA 6 | import org.apache.spark.ml.linalg.Vectors 7 | 8 | object PCAExample { 9 | def main(args: Array[String]): Unit = { 10 | val spark: SparkSession = SparkSession.builder.appName("PopStrat").master("local[*]").getOrCreate() 11 | 12 | val data = Array( 13 | Vectors.dense(3.5, 2.0, 5.0, 6.3, 5.60, 2.4), 14 | Vectors.dense(4.40, 0.10, 3.0, 9.0, 7.0, 8.75), 15 | Vectors.dense(3.20, 2.40, 0.0, 6.0, 7.4, 3.34)) 16 | 17 | val df = spark.createDataFrame(data.map(Tuple1.apply)).toDF("features") 18 | df.show(false) 19 | 20 | val pca = new PCA() 21 | .setInputCol("features") 22 | .setOutputCol("pcaFeatures") 23 | .setK(4) 24 | .fit(df) 25 | 26 | val result = pca.transform(df).select("pcaFeatures") 27 | result.show(false) 28 | 29 | } 30 | } -------------------------------------------------------------------------------- /Chapter05/src/main/scala/org/fit/genomics/PopStratClustering.scala: -------------------------------------------------------------------------------- 1 | package org.fit.genomics 2 | 3 | import org.apache.spark.rdd.RDD 4 | import org.apache.spark.sql._ 5 | import org.apache.spark.{ SparkConf, SparkContext } 6 | import org.apache.spark._ 7 | import org.apache.spark.rdd.RDD 8 | import org.apache.spark.mllib.linalg.{ Vectors, Vector } 9 | import org.apache.spark.ml.clustering.KMeans 10 | import org.apache.spark.ml.evaluation.ClusteringEvaluator 11 | import org.apache.spark.SparkContext 12 | import org.apache.spark.sql.types.{ IntegerType, StringType, StructField, StructType } 13 | import org.apache.spark.ml.feature.{ VectorAssembler, Normalizer } 14 | import org.apache.spark.ml.Pipeline 15 | import org.apache.spark.ml.feature.VectorIndexer 16 | import org.apache.spark.ml.feature.PCA 17 | 18 | import water._ 19 | import water.fvec.Frame 20 | import water.{ Job, Key } 21 | import water.fvec.Frame 22 | import hex.FrameSplitter 23 | import org.apache.spark.h2o._ 24 | import org.apache.spark.h2o.H2OContext 25 | 26 | import org.bdgenomics.adam.rdd.ADAMContext._ 27 | import org.bdgenomics.formats.avro.{ Genotype, GenotypeAllele } 28 | 29 | import java.io.File 30 | import java.io._ 31 | import scala.collection.JavaConverters._ 32 | import scala.collection.immutable.Range.inclusive 33 | import scala.io.Source 34 | 35 | object PopStratClusterings { 36 | def main(args: Array[String]): Unit = { 37 | val genotypeFile = "C:/Users/admin-karim/Downloads/1.vcf" 38 | val panelFile = "C:/Users/admin-karim/Downloads/genotypes.panel" 39 | 40 | val sparkSession: SparkSession = 
SparkSession.builder.appName("PopStrat").master("local[*]").getOrCreate() 41 | val sc: SparkContext = sparkSession.sparkContext 42 | 43 | val populations = Set("GBR", "MXL", "ASW", "CHB", "CLM") 44 | def extract(file: String, filter: (String, String) => Boolean): Map[String, String] = { 45 | Source 46 | .fromFile(file) 47 | .getLines() 48 | .map(line => { 49 | val tokens = line.split(Array('\t', ' ')).toList 50 | tokens(0) -> tokens(1) 51 | }) 52 | .toMap 53 | .filter(tuple => filter(tuple._1, tuple._2)) 54 | } 55 | 56 | val panel: Map[String, String] = extract( 57 | panelFile, 58 | (sampleID: String, pop: String) => populations.contains(pop)) 59 | val allGenotypes: RDD[Genotype] = sc.loadGenotypes(genotypeFile).rdd 60 | val genotypes: RDD[Genotype] = allGenotypes.filter(genotype => { 61 | panel.contains(genotype.getSampleId) 62 | }) 63 | 64 | // Convert the Genotype objects to our own SampleVariant objects to try and conserve memory 65 | case class SampleVariant(sampleId: String, 66 | variantId: Int, 67 | alternateCount: Int) 68 | 69 | def variantId(genotype: Genotype): String = { 70 | val name = genotype.getVariant.getContigName 71 | val start = genotype.getVariant.getStart 72 | val end = genotype.getVariant.getEnd 73 | s"$name:$start:$end" 74 | } 75 | 76 | def alternateCount(genotype: Genotype): Int = { 77 | genotype.getAlleles.asScala.count(_ != GenotypeAllele.REF) 78 | } 79 | 80 | def toVariant(genotype: Genotype): SampleVariant = { 81 | // Intern sample IDs as they will be repeated a lot 82 | new SampleVariant(genotype.getSampleId.intern(), 83 | variantId(genotype).hashCode(), 84 | alternateCount(genotype)) 85 | } 86 | 87 | val variantsRDD: RDD[SampleVariant] = genotypes.map(toVariant) 88 | val variantsBySampleId: RDD[(String, Iterable[SampleVariant])] = 89 | variantsRDD.groupBy(_.sampleId) 90 | val sampleCount: Long = variantsBySampleId.count() 91 | println("Found " + sampleCount + " samples") 92 | 93 | val variantsByVariantId: RDD[(Int, Iterable[SampleVariant])] = 94 | variantsRDD.groupBy(_.variantId).filter { 95 | case (_, sampleVariants) => sampleVariants.size == sampleCount 96 | } 97 | 98 | val variantFrequencies: collection.Map[Int, Int] = variantsByVariantId 99 | .map { 100 | case (variantId, sampleVariants) => 101 | (variantId, sampleVariants.count(_.alternateCount > 0)) 102 | } 103 | .collectAsMap() 104 | 105 | val permittedRange = inclusive(11, 11) 106 | val filteredVariantsBySampleId: RDD[(String, Iterable[SampleVariant])] = 107 | variantsBySampleId.map { 108 | case (sampleId, sampleVariants) => 109 | val filteredSampleVariants = sampleVariants.filter( 110 | variant => 111 | permittedRange.contains( 112 | variantFrequencies.getOrElse(variant.variantId, -1))) 113 | (sampleId, filteredSampleVariants) 114 | } 115 | 116 | val sortedVariantsBySampleId: RDD[(String, Array[SampleVariant])] = 117 | filteredVariantsBySampleId.map { 118 | case (sampleId, variants) => 119 | (sampleId, variants.toArray.sortBy(_.variantId)) 120 | } 121 | 122 | println(s"Sorted by Sample ID RDD: " + sortedVariantsBySampleId.first()) 123 | 124 | val header = StructType( 125 | Array(StructField("Region", StringType)) ++ 126 | sortedVariantsBySampleId 127 | .first() 128 | ._2 129 | .map(variant => { 130 | StructField(variant.variantId.toString, IntegerType) 131 | })) 132 | 133 | val rowRDD: RDD[Row] = sortedVariantsBySampleId.map { 134 | case (sampleId, sortedVariants) => 135 | val region: Array[String] = Array(panel.getOrElse(sampleId, "Unknown")) 136 | val alternateCounts: Array[Int] = 
sortedVariants.map(_.alternateCount) 137 | Row.fromSeq(region ++ alternateCounts) 138 | } 139 | 140 | // Create the SchemaRDD from the header and rows and convert the SchemaRDD into a Spark dataframe 141 | val sqlContext = sparkSession.sqlContext 142 | val schemaDF = sqlContext.createDataFrame(rowRDD, header).drop("Region") 143 | schemaDF.printSchema() 144 | schemaDF.show(10) 145 | 146 | println(schemaDF.columns.length) 147 | 148 | // Using vector assembler to create feature vector 149 | val featureCols = schemaDF.columns 150 | val assembler = new VectorAssembler() 151 | .setInputCols(featureCols) 152 | .setOutputCol("features") 153 | 154 | val assembleDF = assembler.transform(schemaDF).select("features") 155 | assembleDF.show() 156 | 157 | // Elbow method with reduced dimension 158 | val pca = new PCA() 159 | .setInputCol("features") 160 | .setOutputCol("pcaFeatures") 161 | .setK(5) 162 | .fit(assembleDF) 163 | 164 | val pcaDF = pca.transform(assembleDF).select("pcaFeatures").withColumnRenamed("pcaFeatures", "features") 165 | pcaDF.show() 166 | 167 | val iterations = 20 168 | for (i <- 2 to iterations) { 169 | // Trains a k-means model. 170 | val kmeans = new KMeans().setK(i).setSeed(12345L) 171 | val model = kmeans.fit(pcaDF) 172 | 173 | // Evaluate clustering by computing Within Set Sum of Squared Errors. 174 | val WCSS = model.computeCost(pcaDF) 175 | println("Within Set Sum of Squared Errors for k = " + i + " is " + WCSS) 176 | } 177 | /* 178 | Within Set Sum of Squared Errors for k = 2 is 135.0048361804504 179 | Within Set Sum of Squared Errors for k = 3 is 90.95271589232344 180 | Within Set Sum of Squared Errors for k = 4 is 73.03991105363087 181 | Within Set Sum of Squared Errors for k = 5 is 52.712937492025276 182 | Within Set Sum of Squared Errors for k = 6 is 35.0048649663809 183 | Within Set Sum of Squared Errors for k = 7 is 33.11707134428616 184 | Within Set Sum of Squared Errors for k = 8 is 30.546631341918243 185 | Within Set Sum of Squared Errors for k = 9 is 28.453155497711535 186 | Within Set Sum of Squared Errors for k = 10 is 24.93179715697327 187 | Within Set Sum of Squared Errors for k = 11 is 25.56839205985354 188 | Within Set Sum of Squared Errors for k = 12 is 18.76755804955161 189 | Within Set Sum of Squared Errors for k = 13 is 18.55123407031501 190 | Within Set Sum of Squared Errors for k = 14 is 16.140301237245204 191 | Within Set Sum of Squared Errors for k = 15 is 14.143806816130821 192 | Within Set Sum of Squared Errors for k = 16 is 15.017971347008297 193 | Within Set Sum of Squared Errors for k = 17 is 12.266417893931926 194 | Within Set Sum of Squared Errors for k = 18 is 11.108546956133177 195 | Within Set Sum of Squared Errors for k = 19 is 11.505990055606803 196 | Within Set Sum of Squared Errors for k = 20 is 12.26634441065655 197 | */ 198 | 199 | // Evaluate clustering by computing Silhouette score 200 | val evaluator = new ClusteringEvaluator() 201 | 202 | for (k <- 2 to 20 by 1) { 203 | val kmeans = new KMeans().setK(k).setSeed(12345L) 204 | val model = kmeans.fit(pcaDF) 205 | val transformedDF = model.transform(pcaDF) 206 | val score = evaluator.evaluate(transformedDF) 207 | println("Silhouette with squared euclidean distance for k = " + k + " is " + score) 208 | } 209 | /* 210 | * Silhouette with squared euclidean distance for k = 2 is 0.9175803927739566 211 | Silhouette with squared euclidean distance for k = 3 is 0.8288633816548874 212 | Silhouette with squared euclidean distance for k = 4 is 0.6376477607336495 213 | Silhouette with squared 
euclidean distance for k = 5 is 0.6731472765720269 214 | Silhouette with squared euclidean distance for k = 6 is 0.6641908680884869 215 | Silhouette with squared euclidean distance for k = 7 is 0.5758081075880451 216 | Silhouette with squared euclidean distance for k = 8 is 0.588881352222969 217 | Silhouette with squared euclidean distance for k = 9 is 0.6485153435398991 218 | Silhouette with squared euclidean distance for k = 10 is 0.48949118556376964 219 | Silhouette with squared euclidean distance for k = 11 is 0.5371218728964895 220 | Silhouette with squared euclidean distance for k = 12 is 0.5569086502410784 221 | Silhouette with squared euclidean distance for k = 13 is 0.3990728491364654 222 | Silhouette with squared euclidean distance for k = 14 is 0.5311155969749914 223 | Silhouette with squared euclidean distance for k = 15 is 0.5457021641983345 224 | Silhouette with squared euclidean distance for k = 16 is 0.4891629883332554 225 | Silhouette with squared euclidean distance for k = 17 is 0.5452872742013583 226 | Silhouette with squared euclidean distance for k = 18 is 0.5304994251201304 227 | Silhouette with squared euclidean distance for k = 19 is 0.5327466913746908 228 | Silhouette with squared euclidean distance for k = 20 is 0.45336547054142284 229 | */ 230 | 231 | val kmeansOptimal = new KMeans().setK(2).setSeed(12345L) 232 | val modelOptimal = kmeansOptimal.fit(pcaDF) 233 | 234 | // Making predictions 235 | val predictionsOptimalDF = modelOptimal.transform(pcaDF) 236 | predictionsOptimalDF.show() 237 | 238 | // Evaluate clustering by computing Silhouette score 239 | val evaluatorOptimal = new ClusteringEvaluator() 240 | 241 | val silhouette = evaluatorOptimal.evaluate(predictionsOptimalDF) 242 | println(s"Silhouette with squared euclidean distance = $silhouette") 243 | 244 | sparkSession.stop() 245 | } 246 | } 247 | -------------------------------------------------------------------------------- /Chapter06/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | 5 | com.packt.AnomalyDetection 6 | RandomForest 7 | 0.0.1-SNAPSHOT 8 | jar 9 | 10 | ScalaMLQuickStartGuide 11 | http://maven.apache.org 12 | 13 | 14 | UTF-8 15 | 1.8 16 | 2.2.0 17 | 18 | 19 | 20 | 21 | jdk.tools 22 | jdk.tools 23 | 1.8.0_171 24 | system 25 | C:/Program Files/Java/jdk1.8.0_171/lib/tools.jar 26 | 27 | 28 | org.apache.directory.studio 29 | org.apache.commons.io 30 | 2.4 31 | 32 | 33 | org.apache.spark 34 | spark-core_2.11 35 | ${spark.version} 36 | 37 | 38 | com.github.tototoshi 39 | scala-csv_2.10 40 | 1.3.5 41 | 42 | 43 | org.apache.spark 44 | spark-sql_2.11 45 | ${spark.version} 46 | 47 | 48 | com.github.scopt 49 | scopt_2.11 50 | 3.3.0 51 | 52 | 53 | com.typesafe 54 | config 55 | 1.2.1 56 | 57 | 58 | org.apache.directory.api 59 | api-util 60 | 1.0.0 61 | 62 | 63 | commons-io 64 | commons-io 65 | 2.6 66 | 67 | 68 | com.esotericsoftware.kryo 69 | kryo 70 | 2.10 71 | 72 | 73 | edu.stanford.nlp 74 | stanford-corenlp 75 | 3.6.0 76 | 77 | 78 | edu.stanford.nlp 79 | stanford-corenlp 80 | 3.6.0 81 | models 82 | 83 | 84 | org.apache.hadoop 85 | hadoop-common 86 | 2.6.0 87 | 88 | 89 | org.sameersingh.scalaplot 90 | scalaplot 91 | 0.0.4 92 | 93 | 94 | org.apache.spark 95 | spark-mllib_2.11 96 | ${spark.version} 97 | 98 | 99 | org.apache.spark 100 | spark-graphx_2.11 101 | ${spark.version} 102 | 103 | 104 | org.apache.spark 105 | spark-yarn_2.11 106 | ${spark.version} 107 | 108 | 109 | org.apache.spark 110 | spark-network-shuffle_2.11 111 | ${spark.version} 112 
| 113 | 114 | com.databricks 115 | spark-csv_2.11 116 | 1.3.0 117 | 118 | 119 | com.holdenkarau 120 | spark-testing-base_2.10 121 | 2.0.0_0.6.0 122 | 123 | 124 | com.databricks 125 | spark-avro_2.11 126 | 4.0.0 127 | 128 | 129 | org.apache.commons 130 | commons-math3 131 | 3.2 132 | 133 | 134 | org.apache.hive 135 | hive-exec 136 | 2.3.2 137 | 138 | 139 | junit 140 | junit 141 | 3.8.1 142 | test 143 | 144 | 145 | 146 | 147 | 148 | 149 | org.apache.maven.plugins 150 | maven-eclipse-plugin 151 | 2.9 152 | 153 | true 154 | false 155 | 156 | 157 | 158 | 159 | org.apache.maven.plugins 160 | maven-compiler-plugin 161 | 3.5.1 162 | 163 | ${jdk.version} 164 | ${jdk.version} 165 | 166 | 167 | 168 | maven-shade-plugin 169 | 2.4.3 170 | 171 | 172 | package 173 | 174 | shade 175 | 176 | 177 | false 178 | 179 | 180 | 181 | *:* 182 | 183 | META-INF/*.SF 184 | META-INF/*.DSA 185 | META-INF/*.RSA 186 | 187 | 188 | 189 | 190 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | 199 | org.apache.maven.plugins 200 | maven-assembly-plugin 201 | 2.4.1 202 | 203 | 204 | 205 | jar-with-dependencies 206 | 207 | 208 | 209 | 210 | com.packt.ScalaML.ProductionEngineering.BoschProductionLinePerformance2 211 | 212 | 213 | 214 | 215 | oozie.launcher.mapreduce.job.user.classpath.first 216 | true 217 | 218 | 219 | 220 | 221 | 222 | make-assembly 223 | 224 | package 225 | 226 | single 227 | 228 | 229 | 230 | 231 | 232 | 233 | 234 | -------------------------------------------------------------------------------- /Chapter06/src/main/scala/ScalaBookRecommendation/BookRecommendation.scala: -------------------------------------------------------------------------------- 1 | package ScalaBookRecommendation 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.functions._ 5 | import org.apache.spark.sql.types._ 6 | import org.apache.spark.sql.SQLContext 7 | import org.apache.spark.sql.SQLImplicits 8 | import org.apache.spark.sql._ 9 | import org.apache.spark.sql.Dataset 10 | import org.apache.spark.mllib.recommendation.ALS 11 | import org.apache.spark.mllib.recommendation.MatrixFactorizationModel 12 | import org.apache.spark.mllib.recommendation.Rating 13 | import scala.Tuple2 14 | 15 | import org.apache.spark.rdd.RDD 16 | 17 | object BookRecommendation { 18 | //Compute the RMSE to evaluate the model. Less the RMSE better the model and it's prediction capability. 
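// Concretely, RMSE = sqrt( mean( (predictedRating - actualRating)^2 ) ): the model's
// predictions are joined back to the held-out ratings on the (user, product) key and
// the squared differences are averaged before taking the square root. A tiny worked
// example with made-up numbers: predictions (8, 7, 6) against actual ratings (9, 7, 4)
// give squared errors (1, 0, 4), so MSE = 5/3 ≈ 1.67 and RMSE ≈ 1.29.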
19 | def computeRmse(model: MatrixFactorizationModel, data: RDD[Rating], implicitPrefs: Boolean): Double = { 20 | val predictions: RDD[Rating] = model.predict(data.map(x => (x.user, x.product))) 21 | val predictionsAndRatings = predictions.map { x => ((x.user, x.product), x.rating) 22 | }.join(data.map(x => ((x.user, x.product), x.rating))).values 23 | math.sqrt(predictionsAndRatings.map(x => (x._1 - x._2) * (x._1 - x._2)).mean()) 24 | } 25 | 26 | def main(args: Array[String]) { 27 | val spark = SparkSession 28 | .builder 29 | .master("local[*]") 30 | .config("spark.sql.warehouse.dir", "E:/Exp/") 31 | .appName("BookRecommendation") 32 | .getOrCreate() 33 | 34 | import spark.implicits._ 35 | 36 | println("Loading Ratings data...") 37 | 38 | val ratigsFile = "data/BX-Book-Ratings.csv" 39 | var ratingDF = spark.read.format("com.databricks.spark.csv") 40 | .option("delimiter", ";") 41 | .option("header", true) 42 | .load(ratigsFile) 43 | 44 | ratingDF = ratingDF.withColumnRenamed("User-ID", "UserID").withColumnRenamed("Book-Rating", "Rating") 45 | ratingDF.printSchema() 46 | 47 | /* Explore and Query with Spark DataFrames */ 48 | val numRatings = ratingDF.count() 49 | val numUsers = ratingDF.select(ratingDF.col("UserID")).distinct().count() 50 | val numBooks = ratingDF.select(ratingDF.col("ISBN")).distinct().count() 51 | println("Got " + numRatings + " ratings from " + numUsers + " users on " + numBooks + " books") /* Got 1149780 ratings from 105283 users on 340556 books */ 52 | 53 | val booksFile = "data/BX-Books.csv" 54 | var bookDF = spark.read.format("com.databricks.spark.csv").option("header", "true").option("delimiter", ";").load(booksFile) 55 | bookDF.show() 56 | 57 | bookDF = bookDF.select(bookDF.col("ISBN"), bookDF.col("Book-Title"), bookDF.col("Book-Author"), bookDF.col("Year-Of-Publication")) 58 | bookDF = bookDF.withColumnRenamed("Book-Title", "Title").withColumnRenamed("Book-Author", "Author").withColumnRenamed("Year-Of-Publication", "Year") 59 | bookDF.show(10) 60 | /* 61 | * +----------+--------------------+--------------------+----+ 62 | | ISBN| Title| Author|Year| 63 | +----------+--------------------+--------------------+----+ 64 | |0195153448| Classical Mythology| Mark P. O. Morford|2002| 65 | |0002005018| Clara Callan|Richard Bruce Wright|2001| 66 | |0060973129|Decision in Normandy| Carlo D'Este|1991| 67 | |0374157065|Flu: The Story of...| Gina Bari Kolata|1999| 68 | |0393045218|The Mummies of Ur...| E. J. W. Barber|1999| 69 | |0399135782|The Kitchen God's...| Amy Tan|1991| 70 | |0425176428|What If?: The Wor...| Robert Cowley|2000| 71 | |0671870432| PLEADING GUILTY| Scott Turow|1993| 72 | |0679425608|Under the Black F...| David Cordingly|1996| 73 | |074322678X|Where You'll Find...| Ann Beattie|2002| 74 | +----------+--------------------+--------------------+----+ 75 | only showing top 10 rows 76 | */ 77 | 78 | ratingDF.createOrReplaceTempView("ratings") 79 | bookDF.createOrReplaceTempView("books") 80 | 81 | spark.sql("SELECT max(Rating) FROM ratings").show() 82 | 83 | // Get the max, min ratings along with the count of users who have rated a book. 
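// The query below works in two steps: the inner sub-select groups the ratings by ISBN
// and computes, per book, the maximum rating (maxr), the minimum rating (minr) and the
// number of distinct readers (cntu); the outer select joins those aggregates back to
// the books table to pick up the title and sorts by cntu descending, so the most-rated
// books appear first (as the sample output that follows shows).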
84 | val statDF = spark.sql("select books.Title, bookrates.maxr, bookrates.minr, bookrates.cntu " 85 | + "from(SELECT ratings.ISBN,max(ratings.Rating) as maxr," 86 | + "min(ratings.Rating) as minr,count(distinct UserID) as cntu " 87 | + "FROM ratings group by ratings.ISBN) bookrates " 88 | + "join books on bookrates.ISBN=books.ISBN " + "order by bookrates.cntu desc") 89 | 90 | statDF.show(10) 91 | /* 92 | * +--------------------+----+----+----+ 93 | | Title|maxr|minr|cntu| 94 | +--------------------+----+----+----+ 95 | | Wild Animus| 9| 0|2502| 96 | |The Lovely Bones:...| 9| 0|1295| 97 | | The Da Vinci Code| 9| 0| 883| 98 | |Divine Secrets of...| 9| 0| 732| 99 | |The Red Tent (Bes...| 9| 0| 723| 100 | | A Painted House| 9| 0| 647| 101 | |The Secret Life o...| 9| 0| 615| 102 | |Snow Falling on C...| 9| 0| 614| 103 | | Angels & Demons| 9| 0| 586| 104 | |Where the Heart I...| 9| 0| 585| 105 | +--------------------+----+----+----+ 106 | only showing top 10 rows 107 | */ 108 | 109 | // Show the top 10 most-active users and how many times they rated a book 110 | val mostActiveReaders = spark.sql("SELECT ratings.UserID, count(*) as CT from ratings " 111 | + "group by ratings.UserID order by CT desc limit 10") 112 | mostActiveReaders.show() 113 | /* 114 | * +------+-----+ 115 | |UserID| CT| 116 | +------+-----+ 117 | | 11676|13602| 118 | |198711| 7550| 119 | |153662| 6109| 120 | | 98391| 5891| 121 | | 35859| 5850| 122 | |212898| 4785| 123 | |278418| 4533| 124 | | 76352| 3367| 125 | |110973| 3100| 126 | |235105| 3067| 127 | +------+-----+ 128 | */ 129 | 130 | // Find the movies that user 276744 rated higher than 5 131 | val ratingBySpecificReader = spark.sql( 132 | "SELECT ratings.UserID, ratings.ISBN," 133 | + "ratings.Rating, books.Title FROM ratings JOIN books " 134 | + "ON books.ISBN=ratings.ISBN " 135 | + "where ratings.UserID=276744 and ratings.Rating > 4") 136 | 137 | ratingBySpecificReader.show(false) 138 | 139 | /* 140 | * +------+----------+------+---------------+ 141 | |UserID|ISBN |Rating|Title | 142 | +------+----------+------+---------------+ 143 | |276744|038550120X|7 |A Painted House| 144 | +------+----------+------+---------------+ 145 | */ 146 | 147 | // Feature engineering 148 | ratingDF = ratingDF.withColumn("ISBN_1", hash($"ISBN")) 149 | ratingDF = ratingDF.select("UserID", "ISBN_1", "Rating") 150 | ratingDF = ratingDF.withColumn("ISBN", abs($"ISBN_1")) 151 | ratingDF = ratingDF.select("UserID", "ISBN", "Rating") 152 | 153 | ratingDF.printSchema() 154 | /* 155 | * root 156 | |-- UserID: string (nullable = true) 157 | |-- ISBN: integer (nullable = false) 158 | |-- Rating: string (nullable = true) 159 | */ 160 | 161 | val seed = 12345 162 | val splits = ratingDF.randomSplit(Array(0.60, 0.40), seed) 163 | val (trainingData, testData) = (splits(0), splits(1)) 164 | 165 | trainingData.cache 166 | testData.cache 167 | 168 | val numTrainingSample = trainingData.count() 169 | val numTestSample = testData.count() 170 | println("Training: " + numTrainingSample + " test: " + numTestSample) // Training: 689144 test: 345774 171 | 172 | val trainRatingsRDD = trainingData.rdd.map(row => { 173 | val userID = row.getString(0) 174 | val ISBN = row.getInt(1) 175 | val ratings = row.getString(2) 176 | Rating(userID.toInt, ISBN, ratings.toDouble) 177 | }) 178 | 179 | val testRatingsRDD = testData.rdd.map(row => { 180 | val userID = row.getString(0) 181 | val ISBN = row.getInt(1) 182 | val ratings = row.getString(2) 183 | Rating(userID.toInt, ISBN, ratings.toDouble) 184 | }) 185 | 186 | val model : 
MatrixFactorizationModel = new ALS() 187 | .setIterations(10) 188 | .setBlocks(-1) 189 | .setAlpha(1.0) 190 | .setLambda(0.01) 191 | .setRank(25) 192 | .setSeed(1234579L) 193 | .setImplicitPrefs(false) 194 | .run(trainRatingsRDD) 195 | 196 | //Saving the model for future use 197 | //val savedALSModel = model.save(spark.sparkContext, "model/MovieRecomModel") 198 | 199 | //Load the workflow back 200 | //val same_model = MatrixFactorizationModel.load(spark.sparkContext, "model/MovieRecomModel/") 201 | 202 | //Book recommendation for a specific user. Get the top 10 book predictions for reader 276747 203 | println("Recommendations: (ISBN, Rating)") 204 | println("----------------------------------") 205 | val recommendationsUser = model.recommendProducts(276747, 10) 206 | recommendationsUser.map(rating => (rating.product, rating.rating)).foreach(println) 207 | println("----------------------------------") 208 | 209 | /* 210 | Recommendations: (ISBN => Rating) 211 | (1051401851,15.127044702142243) 212 | (2056910662,15.11531283195148) 213 | (1013412890,14.75898119158678) 214 | (603241602,14.53024153450836) 215 | (1868529062,14.180262929540024) 216 | (746990712,14.121654522195225) 217 | (1630827789,13.741728003481194) 218 | (1179316963,13.571754513473993) 219 | (505970947,13.506755847456258) 220 | (632523982,13.46591014905454) 221 | ---------------------------------- 222 | */ 223 | 224 | // Evaluating the Model: we expect lower RMSE because smaller the calculated error, the better the model 225 | var rmseTest = computeRmse(model, testRatingsRDD, true) 226 | println("Test RMSE: = " + rmseTest) //Less is better // Test RMSE: = 1.6867585251053991 227 | 228 | val new_user_ID = 300000 // new user ID randomly chosen 229 | 230 | //The format of each line is (UserID, ISBN, Rating) 231 | val new_user_ratings = Seq( 232 | (new_user_ID, 817930596, 15.127044702142243), 233 | (new_user_ID, 1149373895, 15.11531283195148), 234 | (new_user_ID, 1885291767, 14.75898119158678), 235 | (new_user_ID, 459716613, 14.53024153450836), 236 | (new_user_ID, 3362860, 14.180262929540024), 237 | (new_user_ID, 1178102612, 14.121654522195225), 238 | (new_user_ID, 158895996, 13.741728003481194), 239 | (new_user_ID, 1007741925, 13.571754513473993), 240 | (new_user_ID, 1033268461, 13.506755847456258), 241 | (new_user_ID, 651677816, 13.46591014905454)) 242 | 243 | val new_user_ratings_RDD = spark.sparkContext.parallelize(new_user_ratings) 244 | val new_user_ratings_DF = spark.createDataFrame(new_user_ratings_RDD).toDF("UserID", "ISBN", "Rating") 245 | 246 | val newRatingsRDD = new_user_ratings_DF.rdd.map(row => { 247 | val userId = row.getInt(0) 248 | val movieId = row.getInt(1) 249 | val ratings = row.getDouble(2) 250 | Rating(userId, movieId, ratings) 251 | }) 252 | 253 | val complete_data_with_new_ratings_RDD = trainRatingsRDD.union(newRatingsRDD) 254 | 255 | val newModel : MatrixFactorizationModel = new ALS() 256 | .setIterations(10) 257 | .setBlocks(-1) 258 | .setAlpha(1.0) 259 | .setLambda(0.01) 260 | .setRank(25) 261 | .setSeed(123457L) 262 | .setImplicitPrefs(false) 263 | .run(complete_data_with_new_ratings_RDD) 264 | 265 | // Making Predictions. Get the top 10 book predictions for user 276724 266 | //Book recommendation for a specific user. 
Get the top 10 book predictions for reader 276747 267 | println("Recommendations: (ISBN, Rating)") 268 | println("----------------------------------") 269 | val newPredictions = newModel.recommendProducts(276747, 10) 270 | newPredictions.map(rating => (rating.product, rating.rating)).foreach(println) 271 | println("----------------------------------") 272 | 273 | var newrmseTest = computeRmse(newModel, testRatingsRDD, true) 274 | println("Test RMSE: = " + newrmseTest) //Less is better 275 | } 276 | } -------------------------------------------------------------------------------- /Chapter07/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | 5 | com.packt.AnomalyDetection 6 | RandomForest 7 | 0.0.1-SNAPSHOT 8 | jar 9 | 10 | ScalaMLQuickStartGuide 11 | http://maven.apache.org 12 | 13 | 14 | UTF-8 15 | 1.8 16 | 2.2.0 17 | 1.0.0-alpha 18 | 1.0.0-alpha 19 | 1.0.0-alpha 20 | 1.0.0-alpha 21 | 1.2.3 22 | 23 | 24 | 25 | 26 | jdk.tools 27 | jdk.tools 28 | 1.8.0_171 29 | system 30 | C:/Program Files/Java/jdk1.8.0_171/lib/tools.jar 31 | 32 | 33 | org.apache.directory.studio 34 | org.apache.commons.io 35 | 2.4 36 | 37 | 38 | org.deeplearning4j 39 | scalnet_2.11 40 | 1.0.0-alpha 41 | 42 | 43 | org.apache.spark 44 | spark-core_2.11 45 | ${spark.version} 46 | 47 | 48 | com.github.tototoshi 49 | scala-csv_2.10 50 | 1.3.5 51 | 52 | 53 | org.apache.spark 54 | spark-sql_2.11 55 | ${spark.version} 56 | 57 | 58 | com.github.scopt 59 | scopt_2.11 60 | 3.3.0 61 | 62 | 63 | com.typesafe 64 | config 65 | 1.2.1 66 | 67 | 68 | org.apache.directory.api 69 | api-util 70 | 1.0.0 71 | 72 | 73 | commons-io 74 | commons-io 75 | 2.6 76 | 77 | 78 | com.esotericsoftware.kryo 79 | kryo 80 | 2.10 81 | 82 | 83 | edu.stanford.nlp 84 | stanford-corenlp 85 | 3.6.0 86 | 87 | 88 | edu.stanford.nlp 89 | stanford-corenlp 90 | 3.6.0 91 | models 92 | 93 | 94 | org.apache.hadoop 95 | hadoop-common 96 | 2.6.0 97 | 98 | 99 | org.sameersingh.scalaplot 100 | scalaplot 101 | 0.0.4 102 | 103 | 104 | org.apache.spark 105 | spark-mllib_2.11 106 | ${spark.version} 107 | 108 | 109 | org.apache.spark 110 | spark-graphx_2.11 111 | ${spark.version} 112 | 113 | 114 | org.apache.spark 115 | spark-yarn_2.11 116 | ${spark.version} 117 | 118 | 119 | org.apache.spark 120 | spark-network-shuffle_2.11 121 | ${spark.version} 122 | 123 | 124 | com.databricks 125 | spark-csv_2.11 126 | 1.3.0 127 | 128 | 129 | com.holdenkarau 130 | spark-testing-base_2.10 131 | 2.0.0_0.6.0 132 | 133 | 134 | com.databricks 135 | spark-avro_2.11 136 | 4.0.0 137 | 138 | 139 | org.apache.commons 140 | commons-math3 141 | 3.2 142 | 143 | 144 | org.apache.hive 145 | hive-exec 146 | 2.3.2 147 | 148 | 149 | junit 150 | junit 151 | 3.8.1 152 | test 153 | 154 | 155 | org.nd4j 156 | nd4j-native 157 | ${nd4j.version} 158 | 159 | 160 | org.deeplearning4j 161 | deeplearning4j-ui_2.11 162 | ${dl4j.version} 163 | 164 | 165 | org.deeplearning4j 166 | deeplearning4j-core 167 | ${dl4j.version} 168 | 169 | 170 | org.deeplearning4j 171 | deeplearning4j-nlp 172 | ${dl4j.version} 173 | 174 | 175 | org.deeplearning4j 176 | deeplearning4j-zoo 177 | ${dl4j.version} 178 | 179 | 180 | org.deeplearning4j 181 | arbiter-deeplearning4j 182 | ${arbiter.version} 183 | 184 | 185 | org.deeplearning4j 186 | arbiter-ui_2.11 187 | ${arbiter.version} 188 | 189 | 190 | datavec-data-codec 191 | org.datavec 192 | ${datavec.version} 193 | 194 | 195 | org.apache.httpcomponents 196 | httpclient 197 | 4.3.5 198 | 199 | 200 | ch.qos.logback 201 | logback-classic 202 | 
${logback.version} 203 | 204 | 205 | org.datavec 206 | datavec-data-image 207 | ${dl4j.version} 208 | 209 | 210 | org.bytedeco 211 | javacv-platform 212 | 1.4.1 213 | 214 | 215 | org.datavec 216 | datavec-hadoop 217 | ${datavec.version} 218 | 219 | 220 | 221 | org.deeplearning4j 222 | arbiter-deeplearning4j 223 | ${arbiter.version} 224 | 225 | 226 | org.deeplearning4j 227 | arbiter-ui_2.11 228 | ${arbiter.version} 229 | 230 | 231 | org.apache.httpcomponents 232 | httpclient 233 | 4.3.5 234 | 235 | 236 | ch.qos.logback 237 | logback-classic 238 | ${logback.version} 239 | 240 | 241 | 242 | jfree 243 | jfreechart 244 | 1.0.13 245 | 246 | 247 | org.jcodec 248 | jcodec 249 | 0.2.3 250 | 251 | 252 | 253 | 254 | 255 | 256 | org.apache.maven.plugins 257 | maven-eclipse-plugin 258 | 2.9 259 | 260 | true 261 | false 262 | 263 | 264 | 265 | 266 | org.apache.maven.plugins 267 | maven-compiler-plugin 268 | 3.5.1 269 | 270 | ${jdk.version} 271 | ${jdk.version} 272 | 273 | 274 | 275 | maven-shade-plugin 276 | 2.4.3 277 | 278 | 279 | package 280 | 281 | shade 282 | 283 | 284 | false 285 | 286 | 287 | 288 | *:* 289 | 290 | META-INF/*.SF 291 | META-INF/*.DSA 292 | META-INF/*.RSA 293 | 294 | 295 | 296 | 297 | 299 | 300 | 301 | 302 | 303 | 304 | 305 | 306 | org.apache.maven.plugins 307 | maven-assembly-plugin 308 | 2.4.1 309 | 310 | 311 | 312 | jar-with-dependencies 313 | 314 | 315 | 316 | 317 | com.packt.ScalaML.ProductionEngineering.BoschProductionLinePerformance2 318 | 319 | 320 | 321 | 322 | oozie.launcher.mapreduce.job.user.classpath.first 323 | true 324 | 325 | 326 | 327 | 328 | 329 | make-assembly 330 | 331 | package 332 | 333 | single 334 | 335 | 336 | 337 | 338 | 339 | 340 | 341 | -------------------------------------------------------------------------------- /Chapter07/src/main/scala/GettingStartedDL/CancerDataPreprocessor.scala: -------------------------------------------------------------------------------- 1 | package GettingStartedDL 2 | 3 | import org.apache.spark._ 4 | import org.apache.spark.sql.SparkSession 5 | import org.apache.spark.sql.functions._ 6 | import org.apache.spark.sql.types._ 7 | import org.apache.spark.sql._ 8 | import org.apache.spark.sql.Dataset 9 | import org.apache.spark.ml.Pipeline 10 | import org.apache.spark.ml.classification.RandomForestClassifier 11 | import org.apache.spark.ml.classification.RandomForestClassificationModel 12 | import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics 13 | import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator 14 | import org.apache.spark.ml.feature.StringIndexer 15 | import org.apache.spark.ml.tuning.ParamGridBuilder 16 | import org.apache.spark.ml.tuning.CrossValidator 17 | import org.apache.spark.ml.feature.VectorAssembler 18 | 19 | object CancerDataPreprocessor { 20 | def main(args: Array[String]) = { 21 | val spark: SparkSession = SparkSession.builder(). 
22 | appName("churn") 23 | .master("local[*]") 24 | .config("spark.sql.warehouse.dir", "E:/Exp/") 25 | .config("spark.sql.crossJoin.enabled", "true") 26 | .getOrCreate() 27 | 28 | val data = spark.read.option("maxColumns", 25000).format("com.databricks.spark.csv") 29 | .option("header", "true") // Use first line of all files as header 30 | .option("inferSchema", "true") // Automatically infer data types 31 | .load("C:/Users/admin-karim/Desktop/old2/TCGA-PANCAN/TCGA-PANCAN-HiSeq-801x20531/data.csv"); // set your path accordingly 32 | 33 | val numFeatures = data.columns.length 34 | val numSamples = data.count() 35 | println("Number of features: " + numFeatures) 36 | println("Number of samples: " + numSamples) 37 | 38 | val numericDF = data.drop("id") // now 20531 features left 39 | 40 | val labels = spark.read.format("com.databricks.spark.csv").option("header", "true") // Use first line of all files as header 41 | .option("inferSchema", "true") // Automatically infer data types 42 | .load("C:/Users/admin-karim/Desktop/old2/TCGA-PANCAN/TCGA-PANCAN-HiSeq-801x20531/labels.csv") 43 | 44 | labels.show(10) 45 | 46 | val indexer = new StringIndexer().setInputCol("Class").setOutputCol("label").setHandleInvalid("skip"); // skip null/invalid values 47 | val indexedDF = indexer.fit(labels).transform(labels).select(col("label").cast(DataTypes.IntegerType)); // casting data types to integer 48 | 49 | indexedDF.show() 50 | 51 | val combinedDF = numericDF.join(indexedDF) 52 | 53 | val splits = combinedDF.randomSplit(Array(0.7, 0.3), 12345L) //70% for training, 30% for testing 54 | val trainingData = splits(0) 55 | val testData = splits(1) 56 | 57 | println(trainingData.count()); // number of samples in training set 58 | println(testData.count()); // number of samples in test set 59 | 60 | trainingData.coalesce(1).write 61 | .format("com.databricks.spark.csv") 62 | .option("header", "false") 63 | .option("delimiter", ",") 64 | .save("output/TCGA_train.csv") 65 | 66 | testData.coalesce(1).write 67 | .format("com.databricks.spark.csv") 68 | .option("header", "false") 69 | .option("delimiter", ",") 70 | .save("output/TCGA_test.csv") 71 | 72 | } 73 | } -------------------------------------------------------------------------------- /Chapter07/src/main/scala/GettingStartedDL/CancerTypePrediction.scala: -------------------------------------------------------------------------------- 1 | package GettingStartedDL 2 | 3 | import java.io.File 4 | import java.io.IOException 5 | import org.datavec.api.records.reader.RecordReader 6 | import org.datavec.api.records.reader.impl.csv.CSVRecordReader 7 | import org.datavec.api.split.FileSplit 8 | import org.deeplearning4j.datasets.datavec.RecordReaderDataSetIterator 9 | import org.deeplearning4j.eval.Evaluation 10 | import org.deeplearning4j.nn.api.Layer 11 | import org.deeplearning4j.nn.api.OptimizationAlgorithm 12 | import org.deeplearning4j.nn.conf.MultiLayerConfiguration 13 | import org.deeplearning4j.nn.conf.NeuralNetConfiguration 14 | import org.deeplearning4j.nn.conf.layers.LSTM 15 | import org.deeplearning4j.nn.conf.layers.RnnOutputLayer 16 | import org.deeplearning4j.nn.multilayer.MultiLayerNetwork 17 | import org.deeplearning4j.nn.weights.WeightInit 18 | import org.deeplearning4j.optimize.listeners.ScoreIterationListener 19 | import org.nd4j.linalg.activations.Activation 20 | import org.nd4j.linalg.api.ndarray.INDArray 21 | import org.nd4j.linalg.dataset.DataSet 22 | import org.nd4j.linalg.dataset.api.iterator.DataSetIterator 23 | import 
org.nd4j.linalg.learning.config.Adam 24 | import org.nd4j.linalg.lossfunctions.LossFunctions.LossFunction 25 | 26 | object CancerTypePrediction { 27 | def readCSVDataset(csvFileClasspath:String, batchSize:Int, labelIndex:Int, numClasses:Int) : DataSetIterator = { 28 | val rr:RecordReader = new CSVRecordReader() 29 | val input:File = new File(csvFileClasspath) 30 | rr.initialize(new FileSplit(input)) 31 | val iterator:DataSetIterator = new RecordReaderDataSetIterator(rr, batchSize, labelIndex, numClasses) 32 | return iterator 33 | } 34 | 35 | def main(args: Array[String]): Unit = { 36 | val numEpochs = 10 37 | // Show data paths 38 | val trainPath = "C:/Users/admin-karim/Desktop/old2/TCGA-PANCAN/TCGA_train.csv" 39 | val testPath = "C:/Users/admin-karim/Desktop/old2/TCGA-PANCAN/TCGA_test.csv" 40 | 41 | // ---------------------------------- 42 | // Preparing training and test set. 43 | val labelIndex = 20531 44 | val numClasses = 5 45 | val batchSize = 128 46 | 47 | // This dataset is used for training 48 | val trainingDataIt: DataSetIterator = readCSVDataset(trainPath, batchSize, labelIndex, numClasses) 49 | 50 | // This is the data we want to classify 51 | val testDataIt:DataSetIterator = readCSVDataset(testPath, batchSize, labelIndex, numClasses) 52 | 53 | // ---------------------------------- 54 | // Network hyperparameters 55 | val seed = 12345 56 | val numInputs = labelIndex 57 | val numOutputs = numClasses 58 | val numHiddenNodes = 5000 59 | 60 | //First LSTM layer 61 | val layer_0 = new LSTM.Builder() 62 | .nIn(numInputs) 63 | .nOut(numHiddenNodes) 64 | .activation(Activation.RELU) 65 | .build() 66 | 67 | //Second LSTM layer 68 | val layer_1 = new LSTM.Builder() 69 | .nIn(numHiddenNodes) 70 | .nOut(numHiddenNodes) 71 | .activation(Activation.RELU) 72 | .build() 73 | 74 | //Third LSTM layer 75 | val layer_2 = new LSTM.Builder() 76 | .nIn(numHiddenNodes) 77 | .nOut(numHiddenNodes) 78 | .activation(Activation.RELU) 79 | .build() 80 | 81 | //RNN output layer 82 | val layer_3 = new RnnOutputLayer.Builder() 83 | .activation(Activation.SOFTMAX) 84 | .lossFunction(LossFunction.MCXENT) 85 | .nIn(numHiddenNodes) 86 | .nOut(numOutputs) 87 | .build() 88 | 89 | // Create network configuration and conduct network training 90 | val LSTMconf: MultiLayerConfiguration = new NeuralNetConfiguration.Builder() 91 | .seed(seed) //Random number generator seed for improved repeatability. Optional. 
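// The calls that follow choose stochastic gradient descent as the optimization algorithm, Xavier weight
// initialization, the Adam updater with a learning rate of 5e-3, and an L2 penalty of 1e-5, and then stack
// the three LSTM layers and the softmax output layer defined above into a single multi-layer configuration.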
92 | .optimizationAlgo(OptimizationAlgorithm.STOCHASTIC_GRADIENT_DESCENT) 93 | .weightInit(WeightInit.XAVIER) 94 | .updater(new Adam(5e-3)) 95 | .l2(1e-5) 96 | .list() 97 | .layer(0, layer_0) 98 | .layer(1, layer_1) 99 | .layer(2, layer_2) 100 | .layer(3, layer_3) 101 | .pretrain(false).backprop(true).build() 102 | 103 | // Create and initialize the multilayer network 104 | val model: MultiLayerNetwork = new MultiLayerNetwork(LSTMconf) 105 | model.init() 106 | 107 | //print the score at every iteration 108 | model.setListeners(new ScoreIterationListener(1)); 109 | 110 | //Print the number of parameters in the network (and for each layer) 111 | val layers = model.getLayers() 112 | var totalNumParams = 0 113 | var i = 0 114 | 115 | for (i <- 0 to layers.length-1) { 116 | val nParams = layers(i).numParams() 117 | println("Number of parameters in layer " + i + ": " + nParams) 118 | totalNumParams = totalNumParams + nParams 119 | } 120 | 121 | println("Total number of network parameters: " + totalNumParams) 122 | 123 | var j = 0 124 | println("Train model....") 125 | for (j <- 0 to numEpochs-1) { 126 | model.fit(trainingDataIt) 127 | } 128 | 129 | println("Evaluate model....") 130 | val eval: Evaluation = new Evaluation(5) //create an evaluation object with 5 possible classes 131 | 132 | while (testDataIt.hasNext()) { 133 | val next:DataSet = testDataIt.next() 134 | val output:INDArray = model.output(next.getFeatureMatrix()) //get the network's prediction 135 | eval.eval(next.getLabels(), output) //check the prediction against the true class 136 | } 137 | 138 | println(eval.stats()) 139 | println("****************Example finished********************") 140 | } 141 | } 142 | -------------------------------------------------------------------------------- /Chapter07/src/test/scala/com/packt/ScalaMLQuickStartGuide/AppTest.java: -------------------------------------------------------------------------------- 1 | package com.packt.ScalaMLQuickStartGuide; 2 | 3 | import junit.framework.Test; 4 | import junit.framework.TestCase; 5 | import junit.framework.TestSuite; 6 | 7 | /** 8 | * Unit test for simple App. 9 | */ 10 | public class AppTest 11 | extends TestCase 12 | { 13 | /** 14 | * Create the test case 15 | * 16 | * @param testName name of the test case 17 | */ 18 | public AppTest( String testName ) 19 | { 20 | super( testName ); 21 | } 22 | 23 | /** 24 | * @return the suite of tests being tested 25 | */ 26 | public static Test suite() 27 | { 28 | return new TestSuite( AppTest.class ); 29 | } 30 | 31 | /** 32 | * Rigorous Test :-) 33 | */ 34 | public void testApp() 35 | { 36 | assertTrue( true ); 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Packt 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software.
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | # Machine Learning with Scala Quick Start Guide 5 | 6 | 7 | This is the code repository for [Machine Learning with Scala Quick Start Guide](https://www.packtpub.com/big-data-and-business-intelligence/machine-learning-scala-quick-start-guide), published by Packt. 8 | 9 | **Leverage popular machine learning algorithms and techniques and implement them in Scala** 10 | 11 | ## What is this book about? 12 | Scala combines object-oriented and functional programming concepts, which makes it easy to build scalable and complex big data applications. This book is a handy guide for machine learning developers and data scientists who want to develop and train effective machine learning models in Scala. 13 | 14 | This book covers the following exciting features: 15 | * Get acquainted with JVM-based machine learning libraries for Scala such as Spark ML and Deeplearning4j 16 | * Learn RDDs, DataFrames, and Spark SQL for analyzing structured and unstructured data 17 | * Understand supervised and unsupervised learning techniques with best practices and pitfalls 18 | * Learn classification and regression analysis with linear regression, logistic regression, Naïve Bayes, support vector machines, and tree-based ensemble techniques 19 | * Learn effective ways of clustering analysis with dimensionality reduction techniques 20 | 21 | If you feel this book is for you, get your [copy](https://www.amazon.com/dp/1789345073) today! 22 | 23 | 25 | 26 | 27 | ## Instructions and Navigations 28 | All of the code is organized into folders. For example, Chapter02. 29 | 30 | The code will look like the following: 31 | ``` 32 | rawTrafficDF.select("Hour (Coded)", "Immobilized bus", "Broken Truck", 33 | "Vehicle excess", "Fire", "Slowness in traffic (%)").show(5) 34 | ``` 35 | 36 | **The following is what you need for this book:** 37 | This book is for machine learning developers looking to train machine learning models in Scala without spending too much time and effort. Some fundamental knowledge of Scala programming and some basics of statistics and linear algebra are all you need to get started with this book. 38 | 39 | With the following software and hardware list, you can run all the code files present in the book (Chapters 1-7).
40 | 41 | ### Software and Hardware List 42 | 43 | | Chapter | Software required | OS required | 44 | | -------- | ------------------------------------| -----------------------------------| 45 | | 1-3,6 | Spark: 2.3.0 (or higher), Hadoop: 2.7 (or higher), Java (JDK and JRE): 1.8+, Scala: 2.11.x (or higher), Eclipse Mars/Luna: latest, Maven Eclipse plugin: 2.9 or higher, Maven compiler plugin for Eclipse: 2.3.2 or higher, Maven assembly plugin for Eclipse: 2.4.1 or higher. Importantly, reuse the provided pom.xml files from the Packt supplementary material and update the versions and APIs mentioned above; everything will then be managed accordingly.| Windows, Mac OS X, and Linux (Any) | 46 | | 5 | Same as above, plus the following: H2O version: 3.22.1.1, Sparkling Water version: 2.4.1, ADAM version: 0.23.0 | Windows, Mac OS X, and Linux (Any) | 47 | | 7 | Same as above, plus the following: spark-csv_2.11 version: 1.3.0, ND4J backend: nd4j-cuda-9.0-platform if a GPU is configured, otherwise nd4j-native, ND4J version: 1.0.0-alpha, DL4J version: 1.0.0-alpha, DataVec version: 1.0.0-alpha, Arbiter version: 1.0.0-alpha, Logback version: 1.2.3. | Windows, Mac OS X, and Linux (Any) | 48 | 49 | 50 | ## Code in Action 51 | 52 | Click on the following link to see the Code in Action: 53 | 54 | [http://bit.ly/2WhQf2i](http://bit.ly/2WhQf2i) 55 | 56 | ### Related products 57 | * Scala Machine Learning Projects [[Packt]](https://prod.packtpub.com/in/big-data-and-business-intelligence/scala-machine-learning-projects?utm_source=github&utm_medium=repository&utm_campaign=9781788479042) [[Amazon]](https://www.amazon.com/dp/1788479041) 58 | 59 | * Scala and Spark for Big Data Analytics [[Packt]](https://prod.packtpub.com/in/big-data-and-business-intelligence/scala-and-spark-big-data-analytics?utm_source=github&utm_medium=repository&utm_campaign=9781785280849) [[Amazon]](https://www.amazon.com/dp/1785280848) 60 | 61 | ## Get to Know the Author 62 | **Md. Rezaul Karim** 63 | Md. Rezaul Karim is a researcher, author, and data science enthusiast with a strong computer science background and 10 years of R&D experience in applying machine learning, deep learning, and data mining algorithms to emerging bioinformatics research problems and in making the results explainable. He is passionate about applied machine learning, knowledge graphs, and explainable artificial intelligence (XAI). 64 | Currently, he is working as a research scientist at Fraunhofer FIT, Germany. He is also a Ph.D. candidate at RWTH Aachen University, Germany. Before joining FIT, he worked as a researcher at the Insight Centre for Data Analytics, Ireland. Previously, he worked as a lead software engineer at Samsung Electronics, Korea. 65 | 66 | ### Suggestions and Feedback 67 | [Click here](https://docs.google.com/forms/d/e/1FAIpQLSdy7dATC6QmEL81FIUuymZ0Wy9vH1jHkvpY57OiMeKGqib_Ow/viewform) if you have any feedback or suggestions. 68 | ### Download a free PDF 69 | 70 | If you have already purchased a print or Kindle version of this book, you can get a DRM-free PDF version at no cost.
Simply click on the link to claim your free PDF.
71 | https://packt.link/free-ebook/9781789345070
--------------------------------------------------------------------------------