├── Chapter01
│   ├── .classpath
│   ├── .project
│   ├── data
│   │   └── Cryotherapy.csv
│   ├── pom.xml
│   └── src
│       └── main
│           └── scala
│               └── GettingStartedML
│                   └── CryotherapyPrediction.scala
├── Chapter02
│   ├── pom.xml
│   └── src
│       └── main
│           └── scala
│               └── RegressionAnalysis
│                   ├── EDA.scala
│                   ├── UrbanTrafficGeneralizedLinearRegression.scala
│                   └── UrbanTrafficLinearRegression.scala
├── Chapter03
│   ├── pom.xml
│   └── src
│       └── main
│           └── scala
│               └── ScalaClassification
│                   ├── ChurnPredictionLR.scala
│                   ├── ChurnPredictionNB.scala
│                   ├── ChurnPredictionSVM.scala
│                   ├── Describe.scala
│                   ├── PipelineConstruction.scala
│                   └── Preprocessing.scala
├── Chapter04
│   ├── pom.xml
│   └── src
│       └── main
│           └── scala
│               └── ScalaTreeEnsimbles
│                   ├── ChurnPredictionDT.scala
│                   ├── ChurnPredictionGBT.scala
│                   ├── ChurnPredictionRF.scala
│                   ├── Preproessing.scala
│                   ├── UrbanTrafficDTRegressor.scala
│                   ├── UrbanTrafficGBTRegressor.scala
│                   └── UrbanTrafficRFRegressor.scala
├── Chapter05
│   ├── pom.xml
│   └── src
│       └── main
│           └── scala
│               └── org
│                   └── fit
│                       └── genomics
│                           ├── PCA.scala
│                           └── PopStratClustering.scala
├── Chapter06
│   ├── pom.xml
│   └── src
│       └── main
│           └── scala
│               └── ScalaBookRecommendation
│                   └── BookRecommendation.scala
├── Chapter07
│   ├── pom.xml
│   └── src
│       ├── main
│       │   └── scala
│       │       └── GettingStartedDL
│       │           ├── CancerDataPreprocessor.scala
│       │           └── CancerTypePrediction.scala
│       └── test
│           └── scala
│               └── com
│                   └── packt
│                       └── ScalaMLQuickStartGuide
│                           └── AppTest.java
├── LICENSE
└── README.md
/Chapter01/.classpath:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/Chapter01/.project:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <projectDescription>
3 |   <name>ScalaMLQuickStartGuide</name>
4 |   <comment></comment>
5 |   <projects>
6 |   </projects>
7 |   <buildSpec>
8 |     <buildCommand>
9 |       <name>org.scala-ide.sdt.core.scalabuilder</name>
10 |       <arguments>
11 |       </arguments>
12 |     </buildCommand>
13 |     <buildCommand>
14 |       <name>org.eclipse.m2e.core.maven2Builder</name>
15 |       <arguments>
16 |       </arguments>
17 |     </buildCommand>
18 |   </buildSpec>
19 |   <natures>
20 |     <nature>org.scala-ide.sdt.core.scalanature</nature>
21 |     <nature>org.eclipse.jdt.core.javanature</nature>
22 |     <nature>org.eclipse.m2e.core.maven2Nature</nature>
23 |   </natures>
24 | </projectDescription>
25 |
--------------------------------------------------------------------------------
/Chapter01/data/Cryotherapy.csv:
--------------------------------------------------------------------------------
1 | sex,age,Time,Number_of_Warts,Type,Area,Result_of_Treatment
2 | 1,35,12,5,1,100,0
3 | 1,29,7,5,1,96,1
4 | 1,50,8,1,3,132,0
5 | 1,32,11.75,7,3,750,0
6 | 1,67,9.25,1,1,42,0
7 | 1,41,8,2,2,20,1
8 | 1,36,11,2,1,8,0
9 | 1,59,3.5,3,3,20,0
10 | 1,20,4.5,12,1,6,1
11 | 2,34,11.25,3,3,150,0
12 | 2,21,10.75,5,1,35,0
13 | 2,15,6,2,1,30,1
14 | 2,15,2,3,1,4,1
15 | 2,15,3.75,2,3,70,1
16 | 2,17,11,2,1,10,0
17 | 2,17,5.25,3,1,63,1
18 | 2,23,11.75,12,3,72,0
19 | 2,27,8.75,2,1,6,0
20 | 2,15,4.25,1,1,6,1
21 | 2,18,5.75,1,1,80,1
22 | 1,22,5.5,2,1,70,1
23 | 2,16,8.5,1,2,60,1
24 | 1,28,4.75,3,1,100,1
25 | 2,40,9.75,1,2,80,0
26 | 1,30,2.5,2,1,115,1
27 | 2,34,12,3,3,95,0
28 | 1,20,0.5,2,1,75,1
29 | 2,35,12,5,3,100,0
30 | 2,24,9.5,3,3,20,0
31 | 2,19,8.75,6,1,160,1
32 | 1,35,9.25,9,1,100,1
33 | 1,29,7.25,6,1,96,1
34 | 1,50,8.75,11,3,132,0
35 | 2,32,12,4,3,750,0
36 | 2,67,12,12,3,42,0
37 | 2,41,10.5,2,2,20,1
38 | 2,36,11,6,1,8,0
39 | 1,63,2.75,3,3,20,0
40 | 1,20,5,3,1,6,1
41 | 1,34,12,1,3,150,0
42 | 2,21,10.5,5,1,35,0
43 | 2,15,8,12,1,30,1
44 | 1,15,3.5,2,1,4,1
45 | 2,15,1.5,12,3,70,1
46 | 1,17,11.5,2,1,10,0
47 | 1,17,5.25,4,1,63,1
48 | 2,23,9.5,5,3,72,0
49 | 1,27,10,5,1,6,0
50 | 1,15,4,7,1,6,1
51 | 2,18,4.5,8,1,80,1
52 | 2,22,5,9,1,70,1
53 | 1,16,10.25,3,2,60,1
54 | 2,28,4,11,1,100,1
55 | 2,40,8.75,6,2,80,0
56 | 2,30,0.5,8,3,115,1
57 | 1,34,10.75,1,3,95,0
58 | 1,20,3.75,11,1,75,1
59 | 2,35,8.5,6,3,100,0
60 | 1,24,9.5,8,1,20,1
61 | 2,19,8,9,1,160,1
62 | 1,35,7.25,2,1,100,1
63 | 1,29,11.75,5,1,96,0
64 | 2,50,9.5,4,3,132,0
65 | 2,32,12,12,3,750,0
66 | 1,67,10,7,1,42,0
67 | 2,41,7.75,5,2,20,1
68 | 2,36,10.5,4,1,8,0
69 | 1,67,3.75,11,3,20,0
70 | 1,20,4,3,1,6,1
71 | 1,34,11.25,1,3,150,0
72 | 2,21,10.75,7,1,35,0
73 | 1,15,10.5,11,1,30,1
74 | 1,15,2,11,1,4,1
75 | 2,15,2,10,3,70,1
76 | 1,17,9.25,12,1,10,0
77 | 1,17,5.75,10,1,63,1
78 | 1,23,10.25,7,3,72,0
79 | 1,27,10.5,7,1,6,0
80 | 1,15,5.5,5,1,6,1
81 | 1,18,4,1,1,80,1
82 | 2,22,4.5,2,1,70,1
83 | 1,16,11,3,2,60,1
84 | 2,28,5,9,1,100,1
85 | 1,40,11.5,9,2,80,0
86 | 1,30,0.25,10,1,115,1
87 | 2,34,12,3,3,95,0
88 | 2,20,3.5,6,1,75,1
89 | 2,35,8.25,8,3,100,0
90 | 1,24,10.75,10,1,20,1
91 | 1,19,8,8,1,160,1
92 |
--------------------------------------------------------------------------------
/Chapter01/pom.xml:
--------------------------------------------------------------------------------
1 |
3 | 4.0.0
4 |
5 | com.packt.AnomalyDetection
6 | RandomForest
7 | 0.0.1-SNAPSHOT
8 | jar
9 |
10 | ScalaMLQuickStartGuide
11 | http://maven.apache.org
12 |
13 |
14 | UTF-8
15 | 1.8
16 | 2.2.0
17 | 1.0.0-alpha
18 | 1.0.0-alpha
19 | 1.0.0-alpha
20 | 1.0.0-alpha
21 | 1.2.3
22 |
23 |
24 |
25 |
26 | jdk.tools
27 | jdk.tools
28 | 1.8.0_171
29 | system
30 | C:/Program Files/Java/jdk1.8.0_171/lib/tools.jar
31 |
32 |
33 | org.apache.directory.studio
34 | org.apache.commons.io
35 | 2.4
36 |
37 |
38 | org.deeplearning4j
39 | scalnet_2.11
40 | 1.0.0-alpha
41 |
42 |
43 | org.apache.spark
44 | spark-core_2.11
45 | ${spark.version}
46 |
47 |
48 | com.github.tototoshi
49 | scala-csv_2.10
50 | 1.3.5
51 |
52 |
53 | org.apache.spark
54 | spark-sql_2.11
55 | ${spark.version}
56 |
57 |
58 | com.github.scopt
59 | scopt_2.11
60 | 3.3.0
61 |
62 |
63 | com.typesafe
64 | config
65 | 1.2.1
66 |
67 |
68 | org.apache.directory.api
69 | api-util
70 | 1.0.0
71 |
72 |
73 | commons-io
74 | commons-io
75 | 2.6
76 |
77 |
78 | com.esotericsoftware.kryo
79 | kryo
80 | 2.10
81 |
82 |
83 | edu.stanford.nlp
84 | stanford-corenlp
85 | 3.6.0
86 |
87 |
88 | edu.stanford.nlp
89 | stanford-corenlp
90 | 3.6.0
91 | models
92 |
93 |
94 | org.apache.hadoop
95 | hadoop-common
96 | 2.6.0
97 |
98 |
99 | org.sameersingh.scalaplot
100 | scalaplot
101 | 0.0.4
102 |
103 |
104 | org.apache.spark
105 | spark-mllib_2.11
106 | ${spark.version}
107 |
108 |
109 | org.apache.spark
110 | spark-graphx_2.11
111 | ${spark.version}
112 |
113 |
114 | org.apache.spark
115 | spark-yarn_2.11
116 | ${spark.version}
117 |
118 |
119 | org.apache.spark
120 | spark-network-shuffle_2.11
121 | ${spark.version}
122 |
123 |
124 | com.databricks
125 | spark-csv_2.11
126 | 1.3.0
127 |
128 |
129 | com.holdenkarau
130 | spark-testing-base_2.10
131 | 2.0.0_0.6.0
132 |
133 |
134 | com.databricks
135 | spark-avro_2.11
136 | 4.0.0
137 |
138 |
139 | org.apache.commons
140 | commons-math3
141 | 3.2
142 |
143 |
144 | org.apache.hive
145 | hive-exec
146 | 2.3.2
147 |
148 |
149 | junit
150 | junit
151 | 3.8.1
152 | test
153 |
154 |
155 | org.nd4j
156 | nd4j-native
157 | ${nd4j.version}
158 |
159 |
160 | org.deeplearning4j
161 | deeplearning4j-ui_2.11
162 | ${dl4j.version}
163 |
164 |
165 | org.deeplearning4j
166 | deeplearning4j-core
167 | ${dl4j.version}
168 |
169 |
170 | org.deeplearning4j
171 | deeplearning4j-nlp
172 | ${dl4j.version}
173 |
174 |
175 | org.deeplearning4j
176 | deeplearning4j-zoo
177 | ${dl4j.version}
178 |
179 |
180 | org.deeplearning4j
181 | arbiter-deeplearning4j
182 | ${arbiter.version}
183 |
184 |
185 | org.deeplearning4j
186 | arbiter-ui_2.11
187 | ${arbiter.version}
188 |
189 |
190 | datavec-data-codec
191 | org.datavec
192 | ${datavec.version}
193 |
194 |
195 | org.apache.httpcomponents
196 | httpclient
197 | 4.3.5
198 |
199 |
200 | ch.qos.logback
201 | logback-classic
202 | ${logback.version}
203 |
204 |
205 | org.datavec
206 | datavec-data-image
207 | ${dl4j.version}
208 |
209 |
210 | org.bytedeco
211 | javacv-platform
212 | 1.4.1
213 |
214 |
215 | org.datavec
216 | datavec-hadoop
217 | ${datavec.version}
218 |
219 |
220 |
221 | org.deeplearning4j
222 | arbiter-deeplearning4j
223 | ${arbiter.version}
224 |
225 |
226 | org.deeplearning4j
227 | arbiter-ui_2.11
228 | ${arbiter.version}
229 |
230 |
231 | org.apache.httpcomponents
232 | httpclient
233 | 4.3.5
234 |
235 |
236 | ch.qos.logback
237 | logback-classic
238 | ${logback.version}
239 |
240 |
241 |
242 | jfree
243 | jfreechart
244 | 1.0.13
245 |
246 |
247 | org.jcodec
248 | jcodec
249 | 0.2.3
250 |
251 |
252 |
253 |
254 |
255 |
256 | org.apache.maven.plugins
257 | maven-eclipse-plugin
258 | 2.9
259 |
260 | true
261 | false
262 |
263 |
264 |
265 |
266 | org.apache.maven.plugins
267 | maven-compiler-plugin
268 | 3.5.1
269 |
270 | ${jdk.version}
271 | ${jdk.version}
272 |
273 |
274 |
275 | maven-shade-plugin
276 | 2.4.3
277 |
278 |
279 | package
280 |
281 | shade
282 |
283 |
284 | false
285 |
286 |
287 |
288 | *:*
289 |
290 | META-INF/*.SF
291 | META-INF/*.DSA
292 | META-INF/*.RSA
293 |
294 |
295 |
296 |
297 |
299 |
300 |
301 |
302 |
303 |
304 |
305 |
306 | org.apache.maven.plugins
307 | maven-assembly-plugin
308 | 2.4.1
309 |
310 |
311 |
312 | jar-with-dependencies
313 |
314 |
315 |
316 |
317 | com.packt.ScalaML.ProductionEngineering.BoschProductionLinePerformance2
318 |
319 |
320 |
321 |
322 | oozie.launcher.mapreduce.job.user.classpath.first
323 | true
324 |
325 |
326 |
327 |
328 |
329 | make-assembly
330 |
331 | package
332 |
333 | single
334 |
335 |
336 |
337 |
338 |
339 |
340 |
341 |
--------------------------------------------------------------------------------
/Chapter01/src/main/scala/GettingStartedML/CryotherapyPrediction.scala:
--------------------------------------------------------------------------------
1 | package GettingStartedML
2 |
3 | import org.apache.spark.sql.SparkSession
4 | import org.apache.spark.ml._
5 | import org.apache.spark.ml.feature._
6 | import org.apache.spark.sql.functions._
7 | import org.apache.spark.sql.DataFrame
8 | import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
9 | import org.apache.spark.ml.classification.DecisionTreeClassificationModel
10 | import org.apache.spark.ml.classification.DecisionTreeClassifier
11 |
12 | object CryotherapyPrediction {
13 | def main(args: Array[String]) {
14 | val spark = SparkSession
15 | .builder
16 | .master("local[*]")
17 | .config("spark.sql.warehouse.dir", "E:/Exp/")
18 | .appName("CryotherapyPrediction")
19 | .getOrCreate()
20 |
21 | import spark.implicits._
22 |
23 | var CryotherapyDF = spark.read.option("header", "true")
24 | .option("inferSchema", "true")
25 | .csv("data/Cryotherapy.csv")
26 |
27 | CryotherapyDF.printSchema()
28 | CryotherapyDF.show(10)
29 |
30 | // Since Spark ML algorithms expect a 'label' column, which in our case is 'Result_of_Treatment', let's rename it to 'label'
31 | CryotherapyDF = CryotherapyDF.withColumnRenamed("Result_of_Treatment", "label")
32 | CryotherapyDF.printSchema()
33 |
34 | //Select columns for preparing training data using VectorAssembler()
35 | val selectedCols = Array("sex", "age", "Time", "Number_of_Warts", "Type", "Area")
36 |
37 | val vectorAssembler = new VectorAssembler()
38 | .setInputCols(selectedCols)
39 | .setOutputCol("features")
40 |
41 | // Prepare the training data containing "label" and "features", where "features" holds the assembled numeric columns:
42 | val numericDF = vectorAssembler.transform(CryotherapyDF)
43 | .select("label", "features")
44 | numericDF.show(10)
45 |
46 | // Splitting the data into train and test sets: 80% for training and the remaining 20% for testing
47 | val splits = numericDF.randomSplit(Array(0.8, 0.2))
48 | val trainDF = splits(0)
49 | val testDF = splits(1)
50 |
51 | // Train a DecisionTree model.
52 | val dt = new DecisionTreeClassifier()
53 | .setImpurity("gini")
54 | .setMaxBins(10)
55 | .setMaxDepth(30)
56 | .setLabelCol("label")
57 | .setFeaturesCol("features")
58 |
59 | // Train the Decision Tree model on the training set.
60 | val dtModel = dt.fit(trainDF)
61 |
62 | // Since it's a binary classification problem, we use a BinaryClassificationEvaluator to evaluate the model's performance on the test set
63 | val evaluator = new BinaryClassificationEvaluator()
64 | .setLabelCol("label")
65 |
66 | // Making predictions on test set
67 | val predictionDF = dtModel.transform(testDF)
68 |
69 | // Computing the evaluation metric (BinaryClassificationEvaluator returns the area under the ROC curve by default)
70 | val accuracy = evaluator.evaluate(predictionDF)
71 | println("Area under ROC = " + accuracy)
72 |
73 | // Finally, we stop the Spark session by invoking the stop() method
74 | spark.stop()
75 | }
76 | }
--------------------------------------------------------------------------------
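Note that BinaryClassificationEvaluator scores the area under the ROC curve by default rather than plain accuracy. A minimal sketch (not part of the original source) of computing plain accuracy on the predictionDF produced above, using Spark's MulticlassClassificationEvaluator:

    import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator

    // Plain classification accuracy over the "label" and "prediction" columns of predictionDF
    val accEvaluator = new MulticlassClassificationEvaluator()
      .setLabelCol("label")
      .setPredictionCol("prediction")
      .setMetricName("accuracy")
    println("Plain accuracy = " + accEvaluator.evaluate(predictionDF))
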
/Chapter02/pom.xml:
--------------------------------------------------------------------------------
1 |
3 | 4.0.0
4 |
5 | com.packt.AnomalyDetection
6 | RandomForest
7 | 0.0.1-SNAPSHOT
8 | jar
9 |
10 | ScalaMLQuickStartGuide
11 | http://maven.apache.org
12 |
13 |
14 | UTF-8
15 | 1.8
16 | 2.4.4
17 |
18 |
19 |
20 |
21 | jdk.tools
22 | jdk.tools
23 | 1.8.0_171
24 | system
25 | C:/Program Files/Java/jdk1.8.0_171/lib/tools.jar
26 |
27 |
28 | org.apache.directory.studio
29 | org.apache.commons.io
30 | 2.4
31 |
32 |
33 | org.apache.spark
34 | spark-core_2.11
35 | ${spark.version}
36 |
37 |
38 | com.github.tototoshi
39 | scala-csv_2.10
40 | 1.3.5
41 |
42 |
43 | org.apache.spark
44 | spark-sql_2.11
45 | ${spark.version}
46 |
47 |
48 | com.github.scopt
49 | scopt_2.11
50 | 3.3.0
51 |
52 |
53 | com.typesafe
54 | config
55 | 1.2.1
56 |
57 |
58 | org.apache.directory.api
59 | api-util
60 | 1.0.0
61 |
62 |
63 | commons-io
64 | commons-io
65 | 2.6
66 |
67 |
68 | com.esotericsoftware.kryo
69 | kryo
70 | 2.10
71 |
72 |
73 | edu.stanford.nlp
74 | stanford-corenlp
75 | 3.6.0
76 |
77 |
78 | edu.stanford.nlp
79 | stanford-corenlp
80 | 3.6.0
81 | models
82 |
83 |
84 | org.apache.hadoop
85 | hadoop-common
86 | 2.6.0
87 |
88 |
89 | org.sameersingh.scalaplot
90 | scalaplot
91 | 0.0.4
92 |
93 |
94 | org.apache.spark
95 | spark-mllib_2.11
96 | ${spark.version}
97 |
98 |
99 | org.apache.spark
100 | spark-graphx_2.11
101 | ${spark.version}
102 |
103 |
104 | org.apache.spark
105 | spark-yarn_2.11
106 | ${spark.version}
107 |
108 |
109 | org.apache.spark
110 | spark-network-shuffle_2.11
111 | ${spark.version}
112 |
113 |
114 | com.databricks
115 | spark-csv_2.11
116 | 1.3.0
117 |
118 |
119 | com.holdenkarau
120 | spark-testing-base_2.10
121 | 2.0.0_0.6.0
122 |
123 |
124 | com.databricks
125 | spark-avro_2.11
126 | 4.0.0
127 |
128 |
129 | org.apache.commons
130 | commons-math3
131 | 3.2
132 |
133 |
134 | org.apache.hive
135 | hive-exec
136 | 2.3.2
137 |
138 |
139 | junit
140 | junit
141 | 3.8.1
142 | test
143 |
144 |
145 |
146 |
147 |
148 |
149 | org.apache.maven.plugins
150 | maven-eclipse-plugin
151 | 2.9
152 |
153 | true
154 | false
155 |
156 |
157 |
158 |
159 | org.apache.maven.plugins
160 | maven-compiler-plugin
161 | 3.5.1
162 |
163 | ${jdk.version}
164 | ${jdk.version}
165 |
166 |
167 |
168 | maven-shade-plugin
169 | 2.4.3
170 |
171 |
172 | package
173 |
174 | shade
175 |
176 |
177 | false
178 |
179 |
180 |
181 | *:*
182 |
183 | META-INF/*.SF
184 | META-INF/*.DSA
185 | META-INF/*.RSA
186 |
187 |
188 |
189 |
190 |
192 |
193 |
194 |
195 |
196 |
197 |
198 |
199 | org.apache.maven.plugins
200 | maven-assembly-plugin
201 | 2.4.1
202 |
203 |
204 |
205 | jar-with-dependencies
206 |
207 |
208 |
209 |
210 | com.packt.ScalaML.ProductionEngineering.BoschProductionLinePerformance2
211 |
212 |
213 |
214 |
215 | oozie.launcher.mapreduce.job.user.classpath.first
216 | true
217 |
218 |
219 |
220 |
221 |
222 | make-assembly
223 |
224 | package
225 |
226 | single
227 |
228 |
229 |
230 |
231 |
232 |
233 |
234 |
--------------------------------------------------------------------------------
/Chapter02/src/main/scala/RegressionAnalysis/EDA.scala:
--------------------------------------------------------------------------------
1 | package RegressionAnalysis
2 |
3 | import org.apache.spark.sql._
4 | import org.apache.spark.sql.functions._
5 |
6 | object EDA {
7 | def main(args: Array[String]): Unit = {
8 | val spark = SparkSession
9 | .builder
10 | .master("local[*]")
11 | .config("spark.sql.warehouse.dir", "E:/Exp/")
12 | .appName(s"OneVsRestExample")
13 | .getOrCreate()
14 |
15 | import spark.implicits._
16 |
17 | val rawTrafficDF = spark.read
18 | .option("header", "true")
19 | .option("inferSchema", "true")
20 | .option("delimiter", ";")
21 | .format("com.databricks.spark.csv")
22 | .load("data/Behavior of the urban traffic of the city of Sao Paulo in Brazil.csv")
23 | .cache
24 |
25 | rawTrafficDF.select("Hour (Coded)", "Immobilized bus", "Broken Truck", "Vehicle excess", "Fire", "Slowness in traffic (%)").show(5)
26 | println(rawTrafficDF.count())
27 | rawTrafficDF.printSchema()
28 |
29 | rawTrafficDF.select("Hour (Coded)", "Immobilized bus", "Broken Truck", "Point of flooding", "Fire", "Slowness in traffic (%)").describe().show()
30 |
31 | var newTrafficDF = rawTrafficDF.withColumnRenamed("Slowness in traffic (%)", "label")
32 |
33 | // Let's explore another important feature, Point of flooding. We can rename this column as follows:
34 |
35 | newTrafficDF = newTrafficDF.withColumnRenamed("Point of flooding", "NoOfFloodPoint")
36 |
37 | newTrafficDF.createOrReplaceTempView("slDF")
38 | spark.sql("SELECT avg(label) as avgSlowness FROM slDF").show()
39 |
40 | spark.sql("SELECT max(NoOfFloodPoint) FROM slDF").show()
41 | }
42 | }
--------------------------------------------------------------------------------
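As a small extension of this EDA (a sketch, not part of the original source), the linear relationship between a feature and the slowness label can be quantified with a Pearson correlation on the DataFrame built above; the label is normalised to a dot-decimal double in case schema inference read it as a string:

    import org.apache.spark.sql.functions.{col, regexp_replace}

    // Pearson correlation between the number of flood points and traffic slowness
    val numericDF = newTrafficDF.withColumn("label", regexp_replace(col("label"), ",", ".").cast("double"))
    println("Correlation(NoOfFloodPoint, label) = " + numericDF.stat.corr("NoOfFloodPoint", "label"))
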
/Chapter02/src/main/scala/RegressionAnalysis/UrbanTrafficGeneralizedLinearRegression.scala:
--------------------------------------------------------------------------------
1 | package RegressionAnalysis
2 |
3 | import org.apache.spark.ml.regression.{ GeneralizedLinearRegression, GeneralizedLinearRegressionModel }
4 | import org.apache.spark.ml.{ Pipeline, PipelineModel }
5 | import org.apache.spark.ml.tuning.{ CrossValidator, ParamGridBuilder }
6 | import org.apache.spark.ml.evaluation.RegressionEvaluator
7 | import org.apache.spark.sql._
8 | import org.apache.spark.sql.functions._
9 | import org.apache.spark.mllib.evaluation.RegressionMetrics
10 | import org.apache.spark.ml.feature.VectorAssembler
11 |
12 | import org.apache.log4j.Logger
13 | import org.apache.log4j.Level
14 |
15 | object UrbanTrafficGeneralizedLinearRegression {
16 | def main(args: Array[String]) {
17 | val spark = SparkSession
18 | .builder
19 | .master("local[*]")
20 | .config("spark.sql.warehouse.dir", "E:/Exp/")
21 | .appName(s"OneVsRestExample")
22 | .getOrCreate()
23 |
24 | Logger.getLogger("org").setLevel(Level.FATAL)
25 | Logger.getLogger("akka").setLevel(Level.ERROR)
26 |
27 | import spark.implicits._
28 |
29 | val rawTrafficDF = spark.read
30 | .option("header", "true")
31 | .option("inferSchema", "true")
32 | .option("delimiter", ";")
33 | .format("com.databricks.spark.csv")
34 | .load("data/Behavior of the urban traffic of the city of Sao Paulo in Brazil.csv")
35 | .cache
36 |
37 | rawTrafficDF.show()
38 | rawTrafficDF.printSchema()
39 | rawTrafficDF.describe().show()
40 |
41 | val newTrafficDF = rawTrafficDF.withColumnRenamed("Slowness in traffic (%)", "label")
42 | val colNames = newTrafficDF.columns.dropRight(1)
43 |
44 | colNames.foreach(println)
45 |
46 | newTrafficDF.printSchema()
47 |
48 | // VectorAssembler for training features
49 | val assembler = new VectorAssembler()
50 | .setInputCols(colNames)
51 | .setOutputCol("features")
52 |
53 | val assembleDF = assembler.transform(newTrafficDF).select("features", "label")
54 | assembleDF.printSchema()
55 |
56 | val seed = 1357911L
57 | val splits = assembleDF.randomSplit(Array(0.60, 0.40), seed)
58 | val (trainingData, testData) = (splits(0), splits(1))
59 |
60 | trainingData.cache
61 | testData.cache
62 |
63 | // Create a GeneralizedLinearRegression estimator
64 | val glr = new GeneralizedLinearRegression()
65 | .setFeaturesCol("features")
66 | .setLabelCol("label")
67 |
68 | // Fit the generalized linear regression model on the training set
69 | println("Building ML regression model")
70 | val glrModel = glr.fit(trainingData)
71 |
72 | // **********************************************************************
73 | println("Evaluating the model on the test set and calculating the regression metrics")
74 | // **********************************************************************
75 | val trainPredictionsAndLabels = glrModel.transform(testData).select("label", "prediction")
76 | .map { case Row(label: Double, prediction: Double) => (label, prediction) }.rdd
77 |
78 | val testRegressionMetrics = new RegressionMetrics(trainPredictionsAndLabels)
79 |
80 | val results = "\n=====================================================================\n" +
81 | s"TrainingData count: ${trainingData.count}\n" +
82 | s"TestData count: ${testData.count}\n" +
83 | "=====================================================================\n" +
84 | s"TestData MSE = ${testRegressionMetrics.meanSquaredError}\n" +
85 | s"TestData RMSE = ${testRegressionMetrics.rootMeanSquaredError}\n" +
86 | s"TestData R-squared = ${testRegressionMetrics.r2}\n" +
87 | s"TestData MAE = ${testRegressionMetrics.meanAbsoluteError}\n" +
88 | s"TestData explained variance = ${testRegressionMetrics.explainedVariance}\n" +
89 | "=====================================================================\n"
90 | println(results)
91 |
92 | // ***********************************************************
93 | println("Preparing K-fold Cross Validation and Grid Search")
94 | // ***********************************************************
95 | val paramGrid = new ParamGridBuilder()
96 | .addGrid(glr.maxIter, Array(10, 20, 30, 50, 100, 500, 1000))
97 | .addGrid(glr.regParam, Array(0.001, 0.01, 0.1))
98 | .addGrid(glr.tol, Array(0.01, 0.1))
99 | .build()
100 |
101 | val numFolds = 10 //10-fold cross-validation
102 | val cv = new CrossValidator()
103 | .setEstimator(glr)
104 | .setEvaluator(new RegressionEvaluator)
105 | .setEstimatorParamMaps(paramGrid)
106 | .setNumFolds(numFolds)
107 |
108 | // ************************************************************
109 | println("Training model with Linear Regression algorithm")
110 | // ************************************************************
111 | val cvModel = cv.fit(trainingData)
112 |
113 | // Save the workflow
114 | //cvModel.write.overwrite().save("model/GLR_model")
115 |
116 | // Load the workflow back
117 | //val sameCV = CrossValidatorModel.load("model/GLR_model")
118 |
119 | // **********************************************************************
120 | println("Evaluating the cross validated model on the validation set and calculating the regression metrics")
121 | // **********************************************************************
122 | val trainPredictionsAndLabelsCV = cvModel.transform(testData).select("label", "prediction")
123 | .map { case Row(label: Double, prediction: Double) => (label, prediction) }.rdd
124 |
125 | val testRegressionMetricsCV = new RegressionMetrics(trainPredictionsAndLabelsCV)
126 |
127 | val cvResults = "\n=====================================================================\n" +
128 | s"TrainingData count: ${trainingData.count}\n" +
129 | s"TestData count: ${testData.count}\n" +
130 | "=====================================================================\n" +
131 | s"TestData MSE = ${testRegressionMetricsCV.meanSquaredError}\n" +
132 | s"TestData RMSE = ${testRegressionMetricsCV.rootMeanSquaredError}\n" +
133 | s"TestData R-squared = ${testRegressionMetricsCV.r2}\n" +
134 | s"TestData MAE = ${testRegressionMetricsCV.meanAbsoluteError}\n" +
135 | s"TestData explained variance = ${testRegressionMetricsCV.explainedVariance}\n" +
136 | "=====================================================================\n"
137 | println(cvResults)
138 |
139 | // Print the coefficients and intercept for generalized linear regression model
140 | println(s"Coefficients: ${glrModel.coefficients}")
141 | println(s"Intercept: ${glrModel.intercept}")
142 |
143 | spark.stop()
144 | }
145 | }
--------------------------------------------------------------------------------
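Once the CrossValidator has been fitted, the tuned model can be inspected directly. A minimal sketch (not part of the original source), assuming the cvModel built above; since the estimator passed to the CrossValidator is the plain glr estimator, bestModel can be cast back to GeneralizedLinearRegressionModel:

    import org.apache.spark.ml.regression.GeneralizedLinearRegressionModel

    // Inspect the best model found by cross-validation and the average metric per parameter setting
    val bestGlrModel = cvModel.bestModel.asInstanceOf[GeneralizedLinearRegressionModel]
    println("Best model coefficients: " + bestGlrModel.coefficients)
    println("Best model intercept: " + bestGlrModel.intercept)
    println("Average CV metrics: " + cvModel.avgMetrics.mkString(", "))
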
/Chapter02/src/main/scala/RegressionAnalysis/UrbanTrafficLinearRegression.scala:
--------------------------------------------------------------------------------
1 | package RegressionAnalysis
2 |
3 | import org.apache.spark.ml.regression.{ LinearRegression, LinearRegressionModel }
4 | import org.apache.spark.ml.{ Pipeline, PipelineModel }
5 | import org.apache.spark.ml.tuning.{ CrossValidator, CrossValidatorModel, ParamGridBuilder }
6 | import org.apache.spark.ml.evaluation.RegressionEvaluator
7 | import org.apache.spark.sql._
8 | import org.apache.spark.sql.functions._
9 | import org.apache.spark.mllib.evaluation.RegressionMetrics
10 | import org.apache.spark.ml.feature.VectorAssembler
11 |
12 | import org.apache.log4j.Logger
13 | import org.apache.log4j.Level
14 |
15 | object UrbanTrafficLinearRegression {
16 | def main(args: Array[String]) {
17 | val spark = SparkSession
18 | .builder
19 | .master("local[*]")
20 | .config("spark.sql.warehouse.dir", "E:/Exp/")
21 | .appName(s"OneVsRestExample")
22 | .getOrCreate()
23 |
24 | Logger.getLogger("org").setLevel(Level.FATAL)
25 | Logger.getLogger("akka").setLevel(Level.ERROR)
26 |
27 | import spark.implicits._
28 |
29 | val rawTrafficDF = spark.read
30 | .option("header", "true")
31 | .option("inferSchema", "true")
32 | .option("delimiter", ";")
33 | .format("com.databricks.spark.csv")
34 | .load("data/Behavior of the urban traffic of the city of Sao Paulo in Brazil.csv")
35 | .cache
36 |
37 | rawTrafficDF.show()
38 | rawTrafficDF.printSchema()
39 | rawTrafficDF.describe().show()
40 |
41 | val newTrafficDF = rawTrafficDF.withColumnRenamed("Slowness in traffic (%)", "label")
42 |
43 | newTrafficDF.createOrReplaceTempView("slDF")
44 | spark.sql("SELECT avg(label) FROM slDF").show()
45 |
46 | val colNames = newTrafficDF.columns.dropRight(1)
47 |
48 | // VectorAssembler for training features
49 | val assembler = new VectorAssembler()
50 | .setInputCols(colNames)
51 | .setOutputCol("features")
52 |
53 | val assembleDF = assembler.transform(newTrafficDF).select("features", "label")
54 | assembleDF.show(10)
55 |
56 | val seed = 12345
57 | val splits = assembleDF.randomSplit(Array(0.60, 0.40), seed)
58 | val (trainingData, testData) = (splits(0), splits(1))
59 |
60 | trainingData.cache
61 | testData.cache
62 |
63 | // Create a LinearRegression estimator
64 | val lr = new LinearRegression()
65 | .setFeaturesCol("features")
66 | .setLabelCol("label")
67 |
68 | // Fit the linear regression model on the training set
69 | println("Building ML regression model")
70 | val lrModel = lr.fit(trainingData)
71 |
72 | // Save the workflow
73 | //lrModel.write.overwrite().save("model/LR_model")
74 |
75 | // Load the workflow back
76 | //val sameLRModel = CrossValidatorModel.load("model/GLR_model")
77 |
78 | // **********************************************************************
79 | println("Evaluating the model on the test set and calculating the regression metrics")
80 | // **********************************************************************
81 | val trainPredictionsAndLabels = lrModel.transform(testData).select("label", "prediction")
82 | .map { case Row(label: Double, prediction: Double) => (label, prediction) }.rdd
83 |
84 | val testRegressionMetrics = new RegressionMetrics(trainPredictionsAndLabels)
85 |
86 | val results = "\n=====================================================================\n" +
87 | s"TrainingData count: ${trainingData.count}\n" +
88 | s"TestData count: ${testData.count}\n" +
89 | "=====================================================================\n" +
90 | s"TestData MSE = ${testRegressionMetrics.meanSquaredError}\n" +
91 | s"TestData RMSE = ${testRegressionMetrics.rootMeanSquaredError}\n" +
92 | s"TestData R-squared = ${testRegressionMetrics.r2}\n" +
93 | s"TestData MAE = ${testRegressionMetrics.meanAbsoluteError}\n" +
94 | s"TestData explained variance = ${testRegressionMetrics.explainedVariance}\n" +
95 | "=====================================================================\n"
96 | println(results)
97 |
98 | // ***********************************************************
99 | println("Preparing K-fold Cross Validation and Grid Search")
100 | // ***********************************************************
101 | val paramGrid = new ParamGridBuilder()
102 | .addGrid(lr.maxIter, Array(10, 20, 30, 50, 100, 500, 1000))
103 | .addGrid(lr.regParam, Array(0.001, 0.01, 0.1))
104 | .addGrid(lr.tol, Array(0.01, 0.1))
105 | .build()
106 |
107 | val numFolds = 10 //10-fold cross-validation
108 | val cv = new CrossValidator()
109 | .setEstimator(lr)
110 | .setEvaluator(new RegressionEvaluator())
111 | .setEstimatorParamMaps(paramGrid)
112 | .setNumFolds(numFolds)
113 |
114 | // ************************************************************
115 | println("Training model with Linear Regression algorithm")
116 | // ************************************************************
117 | val cvModel = cv.fit(trainingData)
118 |
119 | // Save the workflow
120 | cvModel.write.overwrite().save("model/LR_model")
121 |
122 | // Load the workflow back
123 | val sameCVModel = CrossValidatorModel.load("model/LR_model")
124 |
125 | // **********************************************************************
126 | println("Evaluating the cross validated model on the validation set and calculating the regression metrics")
127 | // **********************************************************************
128 | val trainPredictionsAndLabelsCV = cvModel.transform(testData).select("label", "prediction")
129 | .map { case Row(label: Double, prediction: Double) => (label, prediction) }.rdd
130 |
131 | val testRegressionMetricsCV = new RegressionMetrics(trainPredictionsAndLabelsCV)
132 |
133 | val cvResults = "\n=====================================================================\n" +
134 | s"TrainingData count: ${trainingData.count}\n" +
135 | s"TestData count: ${testData.count}\n" +
136 | "=====================================================================\n" +
137 | s"TestData MSE = ${testRegressionMetricsCV.meanSquaredError}\n" +
138 | s"TestData RMSE = ${testRegressionMetricsCV.rootMeanSquaredError}\n" +
139 | s"TestData R-squared = ${testRegressionMetricsCV.r2}\n" +
140 | s"TestData MAE = ${testRegressionMetricsCV.meanAbsoluteError}\n" +
141 | s"TestData explained variance = ${testRegressionMetricsCV.explainedVariance}\n" +
142 | "=====================================================================\n"
143 | println(cvResults)
144 |
145 | spark.stop()
146 | }
147 | }
--------------------------------------------------------------------------------
/Chapter03/pom.xml:
--------------------------------------------------------------------------------
1 |
3 | 4.0.0
4 |
5 | com.packt.AnomalyDetection
6 | RandomForest
7 | 0.0.1-SNAPSHOT
8 | jar
9 |
10 | ScalaMLQuickStartGuide
11 | http://maven.apache.org
12 |
13 |
14 | UTF-8
15 | 1.8
16 | 2.2.0
17 |
18 |
19 |
20 |
21 | jdk.tools
22 | jdk.tools
23 | 1.8.0_171
24 | system
25 | C:/Program Files/Java/jdk1.8.0_171/lib/tools.jar
26 |
27 |
28 | org.apache.directory.studio
29 | org.apache.commons.io
30 | 2.4
31 |
32 |
33 | org.apache.spark
34 | spark-core_2.11
35 | ${spark.version}
36 |
37 |
38 | com.github.tototoshi
39 | scala-csv_2.10
40 | 1.3.5
41 |
42 |
43 | org.apache.spark
44 | spark-sql_2.11
45 | ${spark.version}
46 |
47 |
48 | com.github.scopt
49 | scopt_2.11
50 | 3.3.0
51 |
52 |
53 | com.typesafe
54 | config
55 | 1.2.1
56 |
57 |
58 | org.apache.directory.api
59 | api-util
60 | 1.0.0
61 |
62 |
63 | commons-io
64 | commons-io
65 | 2.6
66 |
67 |
68 | com.esotericsoftware.kryo
69 | kryo
70 | 2.10
71 |
72 |
73 | edu.stanford.nlp
74 | stanford-corenlp
75 | 3.6.0
76 |
77 |
78 | edu.stanford.nlp
79 | stanford-corenlp
80 | 3.6.0
81 | models
82 |
83 |
84 | org.apache.hadoop
85 | hadoop-common
86 | 2.6.0
87 |
88 |
89 | org.sameersingh.scalaplot
90 | scalaplot
91 | 0.0.4
92 |
93 |
94 | org.apache.spark
95 | spark-mllib_2.11
96 | ${spark.version}
97 |
98 |
99 | org.apache.spark
100 | spark-graphx_2.11
101 | ${spark.version}
102 |
103 |
104 | org.apache.spark
105 | spark-yarn_2.11
106 | ${spark.version}
107 |
108 |
109 | org.apache.spark
110 | spark-network-shuffle_2.11
111 | ${spark.version}
112 |
113 |
114 | com.databricks
115 | spark-csv_2.11
116 | 1.3.0
117 |
118 |
119 | com.holdenkarau
120 | spark-testing-base_2.10
121 | 2.0.0_0.6.0
122 |
123 |
124 | com.databricks
125 | spark-avro_2.11
126 | 4.0.0
127 |
128 |
129 | org.apache.commons
130 | commons-math3
131 | 3.2
132 |
133 |
134 | org.apache.hive
135 | hive-exec
136 | 2.3.2
137 |
138 |
139 | junit
140 | junit
141 | 3.8.1
142 | test
143 |
144 |
145 |
146 |
147 |
148 |
149 | org.apache.maven.plugins
150 | maven-eclipse-plugin
151 | 2.9
152 |
153 | true
154 | false
155 |
156 |
157 |
158 |
159 | org.apache.maven.plugins
160 | maven-compiler-plugin
161 | 3.5.1
162 |
163 | ${jdk.version}
164 | ${jdk.version}
165 |
166 |
167 |
168 | maven-shade-plugin
169 | 2.4.3
170 |
171 |
172 | package
173 |
174 | shade
175 |
176 |
177 | false
178 |
179 |
180 |
181 | *:*
182 |
183 | META-INF/*.SF
184 | META-INF/*.DSA
185 | META-INF/*.RSA
186 |
187 |
188 |
189 |
190 |
192 |
193 |
194 |
195 |
196 |
197 |
198 |
199 | org.apache.maven.plugins
200 | maven-assembly-plugin
201 | 2.4.1
202 |
203 |
204 |
205 | jar-with-dependencies
206 |
207 |
208 |
209 |
210 | com.packt.ScalaML.ProductionEngineering.BoschProductionLinePerformance2
211 |
212 |
213 |
214 |
215 | oozie.launcher.mapreduce.job.user.classpath.first
216 | true
217 |
218 |
219 |
220 |
221 |
222 | make-assembly
223 |
224 | package
225 |
226 | single
227 |
228 |
229 |
230 |
231 |
232 |
233 |
234 |
--------------------------------------------------------------------------------
/Chapter03/src/main/scala/ScalaClassification/ChurnPredictionLR.scala:
--------------------------------------------------------------------------------
1 | package ScalaClassification
2 |
3 | import org.apache.spark._
4 | import org.apache.spark.sql.SparkSession
5 | import org.apache.spark.sql.functions._
6 | import org.apache.spark.ml.classification.{ BinaryLogisticRegressionSummary, LogisticRegression, LogisticRegressionModel }
7 | import org.apache.spark.ml.Pipeline
8 | import org.apache.spark.ml.tuning.{ ParamGridBuilder, CrossValidator }
9 | import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
10 | import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
11 |
12 | object ChurnPredictionLR {
13 | def main(args: Array[String]) {
14 | val spark = SparkSession
15 | .builder
16 | .master("local[*]")
17 | .config("spark.sql.warehouse.dir", "E:/Exp/")
18 | .appName("ChurnPrediction")
19 | .getOrCreate()
20 | import spark.implicits._
21 |
22 | val numFolds = 10
23 | val MaxIter: Seq[Int] = Seq(100)
24 | val RegParam: Seq[Double] = Seq(0.01) // L2 regularization param, set 0.10 with L1 regularization
25 | val Tol: Seq[Double] = Seq(1e-4)
26 | val ElasticNetParam: Seq[Double] = Seq(1.0) // Combination of L1 and L2
27 |
28 | val lr = new LogisticRegression()
29 | .setLabelCol("label")
30 | .setFeaturesCol("features")
31 |
32 | // Chain indexers, the feature assembler, and the logistic regression estimator in a Pipeline.
33 | val pipeline = new Pipeline()
34 | .setStages(Array(PipelineConstruction.ipindexer,
35 | PipelineConstruction.labelindexer,
36 | PipelineConstruction.assembler,
37 | lr))
38 |
39 | // Search through logistic regression's hyperparameters for the best model
40 | val paramGrid = new ParamGridBuilder()
41 | .addGrid(lr.maxIter, MaxIter)
42 | .addGrid(lr.regParam, RegParam)
43 | .addGrid(lr.tol, Tol)
44 | .addGrid(lr.elasticNetParam, ElasticNetParam)
45 | .build()
46 |
47 | val evaluator = new BinaryClassificationEvaluator()
48 | .setLabelCol("label")
49 | .setRawPredictionCol("prediction")
50 |
51 | // Set up 10-fold cross validation
52 | val crossval = new CrossValidator()
53 | .setEstimator(pipeline)
54 | .setEvaluator(evaluator)
55 | .setEstimatorParamMaps(paramGrid)
56 | .setNumFolds(numFolds)
57 |
58 | val cvModel = crossval.fit(Preprocessing.trainDF)
59 |
60 | val predDF = cvModel.transform(Preprocessing.testSet)
61 | val result = predDF.select("label", "prediction", "probability")
62 | val resultDF = result.withColumnRenamed("prediction", "Predicted_label")
63 | resultDF.show(10)
64 |
65 | val accuracy = evaluator.evaluate(predDF)
66 | println("Classification accuracy: " + accuracy)
67 |
68 | // Compute other performance metrics
69 | val predictionAndLabels = predDF
70 | .select("prediction", "label")
71 | .rdd.map(x => (x(0).asInstanceOf[Double], x(1)
72 | .asInstanceOf[Double]))
73 |
74 | val metrics = new BinaryClassificationMetrics(predictionAndLabels)
75 | val areaUnderPR = metrics.areaUnderPR
76 | println("Area under the precision-recall curve: " + areaUnderPR)
77 |
78 | val areaUnderROC = metrics.areaUnderROC
79 | println("Area under the receiver operating characteristic (ROC) curve: " + areaUnderROC)
80 |
81 | val tVSpDF = predDF.select("label", "prediction") // True vs predicted labels
82 | val TC = predDF.count() //Total count
83 |
84 | val tp = tVSpDF.filter($"prediction" === 0.0).filter($"label" === $"prediction").count() / TC.toDouble
85 | val tn = tVSpDF.filter($"prediction" === 1.0).filter($"label" === $"prediction").count() / TC.toDouble
86 | val fp = tVSpDF.filter($"prediction" === 1.0).filter(not($"label" === $"prediction")).count() / TC.toDouble
87 | val fn = tVSpDF.filter($"prediction" === 0.0).filter(not($"label" === $"prediction")).count() / TC.toDouble
88 |
89 | val MCC = (tp * tn - fp * fn) / math.sqrt((tp + fp) * (tp + fn) * (fp + tn) * (tn + fn)) // Calculating Matthews correlation coefficient
90 |
91 | println("True positive rate: " + tp *100 + "%")
92 | println("False positive rate: " + fp * 100 + "%")
93 | println("True negative rate: " + tn * 100 + "%")
94 | println("False negative rate: " + fn * 100 + "%")
95 | println("Matthews correlation coefficient: " + MCC)
96 | }
97 | }
--------------------------------------------------------------------------------
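The tp/tn/fp/fn values above are fractions of the total test count rather than absolute counts. As an alternative (a sketch, not part of the original source), the raw confusion matrix can be obtained from the same predictionAndLabels RDD with MulticlassMetrics:

    import org.apache.spark.mllib.evaluation.MulticlassMetrics

    // Confusion matrix and overall accuracy from the (prediction, label) pairs
    val mcMetrics = new MulticlassMetrics(predictionAndLabels)
    println("Confusion matrix:\n" + mcMetrics.confusionMatrix)
    println("Overall accuracy: " + mcMetrics.accuracy)
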
/Chapter03/src/main/scala/ScalaClassification/ChurnPredictionNB.scala:
--------------------------------------------------------------------------------
1 | package ScalaClassification
2 |
3 | import org.apache.spark._
4 | import org.apache.spark.sql.SparkSession
5 | import org.apache.spark.sql.functions._
6 | import org.apache.spark.ml.classification.{ BinaryLogisticRegressionSummary, NaiveBayes, NaiveBayesModel }
7 | import org.apache.spark.ml.Pipeline
8 | import org.apache.spark.ml.tuning.{ ParamGridBuilder, CrossValidator }
9 | import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
10 | import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
11 | import org.apache.spark.mllib.stat.{MultivariateStatisticalSummary, Statistics}
12 |
13 | /*
14 | class Stats(val tp: Int, val tn: Int, val fp: Int, val fn: Int) {
15 | val TPR = tp / (tp + fn).toDouble
16 | val recall = TPR
17 | val sensitivity = TPR
18 | val TNR = tn / (tn + fp).toDouble
19 | val specificity = TNR
20 | val PPV = tp / (tp + fp).toDouble
21 | val precision = PPV
22 | val NPV = tn / (tn + fn).toDouble
23 | val FPR = 1.0 - specificity
24 | val FNR = 1.0 - recall
25 | val FDR = 1.0 - precision
26 | val ACC = (tp + tn) / (tp + fp + fn + tn).toDouble
27 | val accuracy = ACC
28 | val F1 = 2 * PPV * TPR / (PPV + TPR).toDouble
29 | val MCC = (tp * tn - fp * fn).toDouble / math.sqrt((tp + fp).toDouble * (tp + fn).toDouble * (fp + tn).toDouble * (tn + fn).toDouble)
30 | } */
31 |
32 | object ChurnPredictionNB {
33 | def main(args: Array[String]) {
34 | val spark = SparkSession
35 | .builder
36 | .master("local[*]")
37 | .config("spark.sql.warehouse.dir", "E:/Exp/")
38 | .appName("ChurnPrediction")
39 | .getOrCreate()
40 |
41 | import spark.implicits._
42 |
43 | val numFolds = 10
44 | val nb = new NaiveBayes()
45 | .setLabelCol("label")
46 | .setFeaturesCol("features")
47 |
48 | // Chain indexers, the feature assembler, and the Naive Bayes estimator in a Pipeline.
49 | val pipeline = new Pipeline().setStages(Array(PipelineConstruction.ipindexer,
50 | PipelineConstruction.labelindexer,
51 | PipelineConstruction.assembler,
52 | nb))
53 |
54 | // Search through Naive Bayes's smoothing parameter for best model
55 | val paramGrid = new ParamGridBuilder()
56 | .addGrid(nb.smoothing, Array(1e-2, 1e-4, 1e-6, 1e-8))
57 | .build()
58 |
59 | val evaluator = new BinaryClassificationEvaluator()
60 | .setLabelCol("label")
61 | .setRawPredictionCol("prediction")
62 |
63 | // Set up 10-fold cross validation
64 | val crossval = new CrossValidator()
65 | .setEstimator(pipeline)
66 | .setEvaluator(evaluator)
67 | .setEstimatorParamMaps(paramGrid)
68 | .setNumFolds(numFolds)
69 |
70 | val cvModel = crossval.fit(Preprocessing.trainDF)
71 |
72 | val predDF = cvModel.transform(Preprocessing.testSet)
73 | val result = predDF.select("label", "prediction", "probability")
74 | val resultDF = result.withColumnRenamed("prediction", "Predicted_label")
75 | resultDF.show(10)
76 |
77 | val accuracy = evaluator.evaluate(predDF)
78 | println("Classification accuracy: " + accuracy)
79 |
80 | // Compute other performance metrics
81 | val predictionAndLabels = predDF
82 | .select("prediction", "label")
83 | .rdd.map(x => (x(0).asInstanceOf[Double], x(1)
84 | .asInstanceOf[Double]))
85 |
86 | val metrics = new BinaryClassificationMetrics(predictionAndLabels)
87 | val areaUnderPR = metrics.areaUnderPR
88 | println("Area under the precision-recall curve: " + areaUnderPR)
89 |
90 | val areaUnderROC = metrics.areaUnderROC
91 | println("Area under the receiver operating characteristic (ROC) curve: " + areaUnderROC)
92 |
93 | val tVSpDF = predDF.select("label", "prediction") // True vs predicted labels
94 | val TC = predDF.count() //Total count
95 |
96 | val tp = tVSpDF.filter($"prediction" === 0.0).filter($"label" === $"prediction").count() / TC.toDouble
97 | val tn = tVSpDF.filter($"prediction" === 1.0).filter($"label" === $"prediction").count() / TC.toDouble
98 | val fp = tVSpDF.filter($"prediction" === 1.0).filter(not($"label" === $"prediction")).count() / TC.toDouble
99 | val fn = tVSpDF.filter($"prediction" === 0.0).filter(not($"label" === $"prediction")).count() / TC.toDouble
100 |
101 | val MCC = (tp * tn - fp * fn) / math.sqrt((tp + fp) * (tp + fn) * (fp + tn) * (tn + fn)) // Calculating Matthews correlation coefficient
102 |
103 | println("True positive rate: " + tp *100 + "%")
104 | println("False positive rate: " + fp * 100 + "%")
105 | println("True negative rate: " + tn * 100 + "%")
106 | println("False negative rate: " + fn * 100 + "%")
107 | println("Matthews correlation coefficient: " + MCC)
108 | }
109 | }
--------------------------------------------------------------------------------
/Chapter03/src/main/scala/ScalaClassification/ChurnPredictionSVM.scala:
--------------------------------------------------------------------------------
1 | package ScalaClassification
2 |
3 | import org.apache.spark._
4 | import org.apache.spark.sql.SparkSession
5 | import org.apache.spark.sql.functions._
6 | import org.apache.spark.ml.classification.{ LinearSVC, LinearSVCModel }
7 | import org.apache.spark.sql.SparkSession
8 | import org.apache.spark.sql.functions.max
9 | import org.apache.spark.ml.Pipeline
10 | import org.apache.spark.ml.tuning.{ ParamGridBuilder, CrossValidator }
11 | import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
12 | import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
13 |
14 | object ChurnPredictionSVM {
15 | def main(args: Array[String]) {
16 | val spark = SparkSession
17 | .builder
18 | .master("local[*]")
19 | .config("spark.sql.warehouse.dir", "E:/Exp/")
20 | .appName("ChurnPrediction")
21 | .getOrCreate()
22 |
23 | import spark.implicits._
24 |
25 | val numFolds = 10
26 | val MaxIter: Seq[Int] = Seq(10000)
27 | val RegParam: Seq[Double] = Seq(0.10) // L2 regularization param, set 0.10 with L1 regularization
28 | val Tol: Seq[Double] = Seq(1e-4)
29 | val ElasticNetParam: Seq[Double] = Seq(0.00001) // Combination of L1 and L2
30 |
31 | val svm = new LinearSVC()
32 |
33 | // Chain indexers, the feature assembler, and the linear SVM estimator in a Pipeline.
34 | val pipeline = new Pipeline()
35 | .setStages(Array(PipelineConstruction.ipindexer,
36 | PipelineConstruction.labelindexer,
37 | PipelineConstruction.assembler,
38 | svm))
39 |
40 | // Search through the linear SVM's hyperparameters for the best model
41 | val paramGrid = new ParamGridBuilder()
42 | .addGrid(svm.maxIter, MaxIter)
43 | .addGrid(svm.regParam, RegParam)
44 | .addGrid(svm.tol, Tol)
45 | .build()
46 |
47 | val evaluator = new BinaryClassificationEvaluator()
48 | .setLabelCol("label")
49 | .setRawPredictionCol("prediction")
50 |
51 | // Set up 10-fold cross validation
52 | val crossval = new CrossValidator()
53 | .setEstimator(pipeline)
54 | .setEvaluator(evaluator)
55 | .setEstimatorParamMaps(paramGrid)
56 | .setNumFolds(numFolds)
57 |
58 | val cvModel = crossval.fit(Preprocessing.trainDF)
59 |
60 | val predDF = cvModel.transform(Preprocessing.testSet)
61 | val result = predDF.select("label", "prediction", "rawPrediction") // LinearSVC does not produce a "probability" column
62 | val resultDF = result.withColumnRenamed("prediction", "Predicted_label")
63 | resultDF.show(10)
64 |
65 | val accuracy = evaluator.evaluate(predDF)
66 | println("Classification accuracy: " + accuracy)
67 |
68 | // Compute other performance metrics
69 | val predictionAndLabels = predDF
70 | .select("prediction", "label")
71 | .rdd.map(x => (x(0).asInstanceOf[Double], x(1)
72 | .asInstanceOf[Double]))
73 |
74 | val metrics = new BinaryClassificationMetrics(predictionAndLabels)
75 | val areaUnderPR = metrics.areaUnderPR
76 | println("Area under the precision-recall curve: " + areaUnderPR)
77 |
78 | val areaUnderROC = metrics.areaUnderROC
79 | println("Area under the receiver operating characteristic (ROC) curve: " + areaUnderROC)
80 |
81 | val tVSpDF = predDF.select("label", "prediction") // True vs predicted labels
82 | val TC = predDF.count() //Total count
83 |
84 | val tp = tVSpDF.filter($"prediction" === 0.0).filter($"label" === $"prediction").count() / TC.toDouble
85 | val tn = tVSpDF.filter($"prediction" === 1.0).filter($"label" === $"prediction").count() / TC.toDouble
86 | val fp = tVSpDF.filter($"prediction" === 1.0).filter(not($"label" === $"prediction")).count() / TC.toDouble
87 | val fn = tVSpDF.filter($"prediction" === 0.0).filter(not($"label" === $"prediction")).count() / TC.toDouble
88 |
89 | val MCC = (tp * tn - fp * fn) / math.sqrt((tp + fp) * (tp + fn) * (fp + tn) * (tn + fn)) // Calculating Matthews correlation coefficient
90 |
91 | println("True positive rate: " + tp *100 + "%")
92 | println("False positive rate: " + fp * 100 + "%")
93 | println("True negative rate: " + tn * 100 + "%")
94 | println("False negative rate: " + fn * 100 + "%")
95 | println("Matthews correlation coefficient: " + MCC)
96 | }
97 | }
--------------------------------------------------------------------------------
/Chapter03/src/main/scala/ScalaClassification/Describe.scala:
--------------------------------------------------------------------------------
1 | package ScalaClassification
2 |
3 | import org.apache.spark._
4 | import org.apache.spark.sql.SparkSession
5 | import org.apache.spark.sql.functions._
6 | import org.apache.spark.ml.classification.{ BinaryLogisticRegressionSummary, LogisticRegression, LogisticRegressionModel }
7 | import org.apache.spark.sql.SparkSession
8 | import org.apache.spark.sql.functions.max
9 | import org.apache.spark.ml.Pipeline
10 | import org.apache.spark.ml.tuning.{ ParamGridBuilder, CrossValidator }
11 | import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
12 | import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
13 |
14 | import org.apache.spark._
15 | import org.apache.spark.sql.functions._
16 | import org.apache.spark.sql.types._
17 | import org.apache.spark.sql._
18 | import org.apache.spark.sql.Dataset
19 |
20 | import org.apache.spark.ml.linalg.{ Matrix, Vectors }
21 | import org.apache.spark.ml.stat.Correlation
22 | import org.apache.spark.sql.Row
23 |
24 | object Describe {
25 | case class CustomerAccount(state_code: String, account_length: Integer, area_code: String,
26 | international_plan: String, voice_mail_plan: String, num_voice_mail: Double,
27 | total_day_mins: Double, total_day_calls: Double, total_day_charge: Double,
28 | total_evening_mins: Double, total_evening_calls: Double, total_evening_charge: Double,
29 | total_night_mins: Double, total_night_calls: Double, total_night_charge: Double,
30 | total_international_mins: Double, total_international_calls: Double, total_international_charge: Double,
31 | total_international_num_calls: Double, churn: String)
32 |
33 | val schema = StructType(Array(
34 | StructField("state_code", StringType, true),
35 | StructField("account_length", IntegerType, true),
36 | StructField("area_code", StringType, true),
37 | StructField("international_plan", StringType, true),
38 | StructField("voice_mail_plan", StringType, true),
39 | StructField("num_voice_mail", DoubleType, true),
40 | StructField("total_day_mins", DoubleType, true),
41 | StructField("total_day_calls", DoubleType, true),
42 | StructField("total_day_charge", DoubleType, true),
43 | StructField("total_evening_mins", DoubleType, true),
44 | StructField("total_evening_calls", DoubleType, true),
45 | StructField("total_evening_charge", DoubleType, true),
46 | StructField("total_night_mins", DoubleType, true),
47 | StructField("total_night_calls", DoubleType, true),
48 | StructField("total_night_charge", DoubleType, true),
49 | StructField("total_international_mins", DoubleType, true),
50 | StructField("total_international_calls", DoubleType, true),
51 | StructField("total_international_charge", DoubleType, true),
52 | StructField("total_international_num_calls", DoubleType, true),
53 | StructField("churn", StringType, true)))
54 |
55 | def main(args: Array[String]) {
56 | val spark = SparkSession
57 | .builder
58 | .master("local[*]")
59 | .config("spark.sql.warehouse.dir", "E:/Exp/")
60 | .appName("Describe")
61 | .getOrCreate()
62 |
63 | spark.conf.set("spark.debug.maxToStringFields", 10000)
64 | val DEFAULT_MAX_TO_STRING_FIELDS = 2500
65 | if (SparkEnv.get != null) {
66 | SparkEnv.get.conf.getInt("spark.debug.maxToStringFields", DEFAULT_MAX_TO_STRING_FIELDS)
67 | } else {
68 | DEFAULT_MAX_TO_STRING_FIELDS
69 | }
70 | import spark.implicits._
71 |
72 | val trainSet: Dataset[CustomerAccount] = spark.read.
73 | option("inferSchema", "false")
74 | .format("com.databricks.spark.csv")
75 | .schema(schema)
76 | .load("data/churn-bigml-80.csv")
77 | .as[CustomerAccount]
78 |
79 | val statsDF = trainSet.describe()
80 | statsDF.show()
81 |
82 | trainSet.createOrReplaceTempView("UserAccount")
83 | spark.catalog.cacheTable("UserAccount")
84 |
85 | spark.sqlContext.sql("SELECT churn, SUM(total_day_mins) + SUM(total_evening_mins) + SUM(total_night_mins) + SUM(total_international_mins) as Total_minutes FROM UserAccount GROUP BY churn").show()
86 | spark.sqlContext.sql("SELECT churn, SUM(total_day_charge) as TDC, SUM(total_evening_charge) as TEC, SUM(total_night_charge) as TNC, SUM(total_international_charge) as TIC, SUM(total_day_charge) + SUM(total_evening_charge) + SUM(total_night_charge) + SUM(total_international_charge) as Total_charge FROM UserAccount GROUP BY churn ORDER BY Total_charge DESC").show()
87 | trainSet.groupBy("churn").count.show()
88 | spark.sqlContext.sql("SELECT churn,SUM(total_international_num_calls) FROM UserAccount GROUP BY churn")
89 |
90 | }
91 | }
--------------------------------------------------------------------------------
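Describe.scala imports org.apache.spark.ml.stat.Correlation but never uses it. A minimal sketch (an assumption, not part of the original source) of how it could be applied inside main to a few numeric columns of the trainSet loaded above:

    import org.apache.spark.ml.feature.VectorAssembler
    import org.apache.spark.ml.stat.Correlation
    import org.apache.spark.ml.linalg.Matrix
    import org.apache.spark.sql.Row

    // Assemble a few numeric columns and compute their Pearson correlation matrix
    val corrInput = new VectorAssembler()
      .setInputCols(Array("total_day_mins", "total_evening_mins", "total_night_mins"))
      .setOutputCol("corr_features")
      .transform(trainSet)
    val Row(corrMatrix: Matrix) = Correlation.corr(corrInput, "corr_features").head
    println("Pearson correlation matrix:\n" + corrMatrix)
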
/Chapter03/src/main/scala/ScalaClassification/PipelineConstruction.scala:
--------------------------------------------------------------------------------
1 | package ScalaClassification
2 |
3 | import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler}
4 |
5 | object PipelineConstruction {
6 | // Index labels, adding metadata to the label column. Fit on whole dataset to include all labels in index.
7 | val ipindexer = new StringIndexer()
8 | .setInputCol("international_plan")
9 | .setOutputCol("iplanIndex")
10 |
11 | val labelindexer = new StringIndexer()
12 | .setInputCol("churn")
13 | .setOutputCol("label")
14 |
15 | val featureCols = Array("account_length", "iplanIndex", "num_voice_mail", "total_day_mins", "total_day_calls", "total_evening_mins", "total_evening_calls", "total_night_mins", "total_night_calls", "total_international_mins", "total_international_calls", "total_international_num_calls")
16 |
17 | val assembler = new VectorAssembler()
18 | .setInputCols(featureCols)
19 | .setOutputCol("features")
20 | }
--------------------------------------------------------------------------------
/Chapter03/src/main/scala/ScalaClassification/Preprocessing.scala:
--------------------------------------------------------------------------------
1 | package ScalaClassification
2 |
3 | import org.apache.spark._
4 | import org.apache.spark.sql.functions._
5 | import org.apache.spark.sql.types._
6 | import org.apache.spark.sql._
7 | import org.apache.spark.sql.Dataset
8 |
9 | /*
10 | * Dataset schema
11 | State
12 | Account length
13 | Area code
14 | International plan
15 | Voice mail plan
16 | Number vmail messages
17 | Total day minutes
18 | Total day calls
19 | Total day charge
20 | Total eve minutes
21 | Total eve calls Total eve charge
22 | Total night minutes
23 | Total night calls
24 | Total night charge
25 | Total intl minutes
26 | Total intl calls
27 | Total intl charge
28 | Customer service calls
29 | Churn
30 | */
31 |
32 | object Preprocessing {
33 | case class CustomerAccount(state_code: String, account_length: Integer, area_code: String,
34 | international_plan: String, voice_mail_plan: String, num_voice_mail: Double,
35 | total_day_mins: Double, total_day_calls: Double, total_day_charge: Double,
36 | total_evening_mins: Double, total_evening_calls: Double, total_evening_charge: Double,
37 | total_night_mins: Double, total_night_calls: Double, total_night_charge: Double,
38 | total_international_mins: Double, total_international_calls: Double, total_international_charge: Double,
39 | total_international_num_calls: Double, churn: String)
40 |
41 | val schema = StructType(Array(
42 | StructField("state_code", StringType, true),
43 | StructField("account_length", IntegerType, true),
44 | StructField("area_code", StringType, true),
45 | StructField("international_plan", StringType, true),
46 | StructField("voice_mail_plan", StringType, true),
47 | StructField("num_voice_mail", DoubleType, true),
48 | StructField("total_day_mins", DoubleType, true),
49 | StructField("total_day_calls", DoubleType, true),
50 | StructField("total_day_charge", DoubleType, true),
51 | StructField("total_evening_mins", DoubleType, true),
52 | StructField("total_evening_calls", DoubleType, true),
53 | StructField("total_evening_charge", DoubleType, true),
54 | StructField("total_night_mins", DoubleType, true),
55 | StructField("total_night_calls", DoubleType, true),
56 | StructField("total_night_charge", DoubleType, true),
57 | StructField("total_international_mins", DoubleType, true),
58 | StructField("total_international_calls", DoubleType, true),
59 | StructField("total_international_charge", DoubleType, true),
60 | StructField("total_international_num_calls", DoubleType, true),
61 | StructField("churn", StringType, true)))
62 |
63 | val spark = SparkSession
64 | .builder
65 | .master("local[*]")
66 | .config("spark.sql.warehouse.dir", "E:/Exp/")
67 | .appName("ChurnPrediction")
68 | .getOrCreate()
69 | import spark.implicits._
70 |
71 | val trainSet: Dataset[CustomerAccount] = spark.read.
72 | option("inferSchema", "false")
73 | .format("com.databricks.spark.csv")
74 | .schema(schema)
75 | .load("data/churn-bigml-80.csv")
76 | .as[CustomerAccount]
77 |
78 | val statsDF = trainSet.describe()
79 | statsDF.show()
80 | trainSet.cache()
81 |
82 | trainSet.groupBy("churn").sum("total_international_num_calls").show()
83 | trainSet.groupBy("churn").sum("total_international_charge").show()
84 |
85 | val testSet: Dataset[CustomerAccount] = spark.read.
86 | option("inferSchema", "false")
87 | .format("com.databricks.spark.csv")
88 | .schema(schema)
89 | .load("data/churn-bigml-20.csv")
90 | .as[CustomerAccount]
91 |
92 | testSet.describe()
93 | testSet.cache()
94 |
95 | trainSet.printSchema()
96 | trainSet.show()
97 |
98 | trainSet.createOrReplaceTempView("UserAccount")
99 | spark.catalog.cacheTable("UserAccount")
100 |
101 | /////////////// Feature engineering
102 | spark.sqlContext.sql("SELECT churn, SUM(total_day_mins) + SUM(total_evening_mins) + SUM(total_night_mins) + SUM(total_international_mins) as Total_minutes FROM UserAccount GROUP BY churn").show()
103 | spark.sqlContext.sql("SELECT churn, SUM(total_day_charge) as TDC, SUM(total_evening_charge) as TEC, SUM(total_night_charge) as TNC, SUM(total_international_charge) as TIC, SUM(total_day_charge) + SUM(total_evening_charge) + SUM(total_night_charge) + SUM(total_international_charge) as Total_charge FROM UserAccount GROUP BY churn ORDER BY Total_charge DESC").show()
104 | trainSet.groupBy("churn").count.show()
105 | spark.sqlContext.sql("SELECT churn,SUM(total_international_num_calls) as Total_intl_call FROM UserAccount GROUP BY churn").show()
106 |
107 | val fractions = Map("False" -> 0.1675, "True" -> 1.0)
108 |
109 | //Here we're keeping all instances of the Churn=True class, but downsampling the Churn=False class to a fraction of 388/2278.
110 | val churnDF = trainSet.stat.sampleBy("churn", fractions, 123456L)
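// With roughly 2278 False and 388 True rows, sampling False at 0.1675 keeps about 382 of them,
// so churnDF ends up approximately balanced between the two classes.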
111 |
112 | churnDF.groupBy("churn").count.show()
113 |
114 | val trainDF = churnDF
115 | .drop("state_code")
116 | .drop("area_code")
117 | .drop("voice_mail_plan")
118 | .drop("total_day_charge")
119 | .drop("total_evening_charge")
120 |
121 | println(trainDF.count)
122 | trainDF.select("account_length", "international_plan", "num_voice_mail", "total_day_calls", "total_international_num_calls", "churn").show(10)
123 | }
--------------------------------------------------------------------------------
/Chapter04/pom.xml:
--------------------------------------------------------------------------------
1 |
3 | 4.0.0
4 |
5 | com.packt.AnomalyDetection
6 | RandomForest
7 | 0.0.1-SNAPSHOT
8 | jar
9 |
10 | ScalaMLQuickStartGuide
11 | http://maven.apache.org
12 |
13 |
14 | UTF-8
15 | 1.8
16 | 2.2.0
17 |
18 |
19 |
20 |
21 | jdk.tools
22 | jdk.tools
23 | 1.8.0_171
24 | system
25 | C:/Program Files/Java/jdk1.8.0_171/lib/tools.jar
26 |
27 |
28 | org.apache.directory.studio
29 | org.apache.commons.io
30 | 2.4
31 |
32 |
33 | org.apache.spark
34 | spark-core_2.11
35 | ${spark.version}
36 |
37 |
38 | com.github.tototoshi
39 | scala-csv_2.10
40 | 1.3.5
41 |
42 |
43 | org.apache.spark
44 | spark-sql_2.11
45 | ${spark.version}
46 |
47 |
48 | com.github.scopt
49 | scopt_2.11
50 | 3.3.0
51 |
52 |
53 | com.typesafe
54 | config
55 | 1.2.1
56 |
57 |
58 | org.apache.directory.api
59 | api-util
60 | 1.0.0
61 |
62 |
63 | commons-io
64 | commons-io
65 | 2.6
66 |
67 |
68 | com.esotericsoftware.kryo
69 | kryo
70 | 2.10
71 |
72 |
73 | edu.stanford.nlp
74 | stanford-corenlp
75 | 3.6.0
76 |
77 |
78 | edu.stanford.nlp
79 | stanford-corenlp
80 | 3.6.0
81 | models
82 |
83 |
84 | org.apache.hadoop
85 | hadoop-common
86 | 2.6.0
87 |
88 |
89 | org.sameersingh.scalaplot
90 | scalaplot
91 | 0.0.4
92 |
93 |
94 | org.apache.spark
95 | spark-mllib_2.11
96 | ${spark.version}
97 |
98 |
99 | org.apache.spark
100 | spark-graphx_2.11
101 | ${spark.version}
102 |
103 |
104 | org.apache.spark
105 | spark-yarn_2.11
106 | ${spark.version}
107 |
108 |
109 | org.apache.spark
110 | spark-network-shuffle_2.11
111 | ${spark.version}
112 |
113 |
114 | com.databricks
115 | spark-csv_2.11
116 | 1.3.0
117 |
118 |
119 | com.holdenkarau
120 | spark-testing-base_2.10
121 | 2.0.0_0.6.0
122 |
123 |
124 | com.databricks
125 | spark-avro_2.11
126 | 4.0.0
127 |
128 |
129 | org.apache.commons
130 | commons-math3
131 | 3.2
132 |
133 |
134 | org.apache.hive
135 | hive-exec
136 | 2.3.2
137 |
138 |
139 | junit
140 | junit
141 | 3.8.1
142 | test
143 |
144 |
145 |
146 |
147 |
148 |
149 | org.apache.maven.plugins
150 | maven-eclipse-plugin
151 | 2.9
152 |
153 | true
154 | false
155 |
156 |
157 |
158 |
159 | org.apache.maven.plugins
160 | maven-compiler-plugin
161 | 3.5.1
162 |
163 | ${jdk.version}
164 | ${jdk.version}
165 |
166 |
167 |
168 | maven-shade-plugin
169 | 2.4.3
170 |
171 |
172 | package
173 |
174 | shade
175 |
176 |
177 | false
178 |
179 |
180 |
181 | *:*
182 |
183 | META-INF/*.SF
184 | META-INF/*.DSA
185 | META-INF/*.RSA
186 |
187 |
188 |
189 |
190 |
192 |
193 |
194 |
195 |
196 |
197 |
198 |
199 | org.apache.maven.plugins
200 | maven-assembly-plugin
201 | 2.4.1
202 |
203 |
204 |
205 | jar-with-dependencies
206 |
207 |
208 |
209 |
210 | com.packt.ScalaML.ProductionEngineering.BoschProductionLinePerformance2
211 |
212 |
213 |
214 |
215 | oozie.launcher.mapreduce.job.user.classpath.first
216 | true
217 |
218 |
219 |
220 |
221 |
222 | make-assembly
223 |
224 | package
225 |
226 | single
227 |
228 |
229 |
230 |
231 |
232 |
233 |
234 |
--------------------------------------------------------------------------------
/Chapter04/src/main/scala/ScalaTreeEnsimbles/ChurnPredictionDT.scala:
--------------------------------------------------------------------------------
1 | package ScalaTreeEnsimbles
2 |
3 | import org.apache.spark._
4 | import org.apache.spark.sql.SparkSession
5 | import org.apache.spark.sql.functions._
6 | import org.apache.spark.sql.types._
7 | import org.apache.spark.sql.SQLContext
8 | import org.apache.spark.sql.SQLImplicits
9 | import org.apache.spark.sql._
10 | import org.apache.spark.sql.Dataset
11 | import org.apache.spark.ml.Pipeline
12 | import org.apache.spark.ml.classification.{ DecisionTreeClassifier, DecisionTreeClassificationModel }
13 | import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
14 | import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
15 | import org.apache.spark.ml.tuning.{ ParamGridBuilder, CrossValidator }
16 |
17 | object ChurnPredictionDT {
18 | def main(args: Array[String]) {
19 | val spark = SparkSession
20 | .builder
21 | .master("local[*]")
22 | .config("spark.sql.warehouse.dir", "E:/Exp/")
23 | .appName("ChurnPrediction")
24 | .getOrCreate()
25 |
26 | import spark.implicits._
27 |
28 | val dTree = new DecisionTreeClassifier()
29 | .setLabelCol("label")
30 | .setFeaturesCol("features")
31 | .setSeed(12357L)
32 |
33 | // Chain indexers and tree in a Pipeline.
34 | val pipeline = new Pipeline()
35 | .setStages(Array(ScalaClassification.PipelineConstruction.ipindexer,
36 | ScalaClassification.PipelineConstruction.labelindexer,
37 | ScalaClassification.PipelineConstruction.assembler,
38 | dTree))
39 |
40 | // Search through decision tree's maxDepth parameter for best model
41 | val paramGrid = new ParamGridBuilder()
42 | .addGrid(dTree.impurity, "gini" :: "entropy" :: Nil)
43 | .addGrid(dTree.maxBins, 3 :: 5 :: 9 :: 10 :: Nil)
44 | .addGrid(dTree.maxDepth, 5 :: 10 :: 15 :: Nil)
45 | .build()
46 |
47 | val evaluator = new BinaryClassificationEvaluator()
48 | .setLabelCol("label")
49 | .setRawPredictionCol("prediction")
50 |
51 | // Set up 10-fold cross validation
52 | val numFolds = 10
53 | val crossval = new CrossValidator()
54 | .setEstimator(pipeline)
55 | .setEvaluator(evaluator)
56 | .setEstimatorParamMaps(paramGrid)
57 | .setNumFolds(numFolds)
58 |
59 | val cvModel = crossval.fit(ScalaClassification.Preprocessing.trainDF)
60 |
61 | val bestModel = cvModel.bestModel
62 | println("The Best Model and Parameters:\n--------------------")
63 | println(bestModel.asInstanceOf[org.apache.spark.ml.PipelineModel].stages(3))
64 |
65 | bestModel.asInstanceOf[org.apache.spark.ml.PipelineModel]
66 | .stages(3)
67 | .extractParamMap
68 |
69 | val treeModel = bestModel.asInstanceOf[org.apache.spark.ml.PipelineModel]
70 | .stages(3)
71 | .asInstanceOf[DecisionTreeClassificationModel]
72 |
73 | println("Learned classification tree model:\n" + treeModel.toDebugString)
74 | println("Feature 11:" + ScalaClassification.Preprocessing.trainDF.select(ScalaClassification.PipelineConstruction.featureCols(11)))
75 | println("Feature 3:" + ScalaClassification.Preprocessing.trainDF.select(ScalaClassification.PipelineConstruction.featureCols(3)))
76 |
77 | val predDF = cvModel.transform(ScalaClassification.Preprocessing.testSet)
78 | val result = predDF.select("label", "prediction", "probability")
79 | val resultDF = result.withColumnRenamed("prediction", "Predicted_label")
80 | resultDF.show(10)
81 |
82 | val accuracy = evaluator.evaluate(predDF)
83 | println("Classification accuracy: " + accuracy)
84 |
85 | // Compute other performance metrics
86 | val predictionAndLabels = predDF
87 | .select("prediction", "label")
88 | .rdd.map(x => (x(0).asInstanceOf[Double], x(1)
89 | .asInstanceOf[Double]))
90 |
91 | val metrics = new BinaryClassificationMetrics(predictionAndLabels)
92 | val areaUnderPR = metrics.areaUnderPR
93 | println("Area under the precision-recall curve: " + areaUnderPR)
94 |
95 | val areaUnderROC = metrics.areaUnderROC
96 | println("Area under the receiver operating characteristic (ROC) curve: " + areaUnderROC)
97 |
98 | val tVSpDF = predDF.select("label", "prediction") // True vs predicted labels
99 | val TC = predDF.count() //Total count
100 |
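// Note: tp, tn, fp, and fn below are expressed as fractions of the total test count (TC),
// not as the classical per-class rates such as TP / (TP + FN); the printed "rate" values follow the same convention.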
101 | val tp = tVSpDF.filter($"prediction" === 0.0).filter($"label" === $"prediction").count() / TC.toDouble
102 | val tn = tVSpDF.filter($"prediction" === 1.0).filter($"label" === $"prediction").count() / TC.toDouble
103 | val fp = tVSpDF.filter($"prediction" === 1.0).filter(not($"label" === $"prediction")).count() / TC.toDouble
104 | val fn = tVSpDF.filter($"prediction" === 0.0).filter(not($"label" === $"prediction")).count() / TC.toDouble
105 |
106 | val MCC = (tp * tn - fp * fn) / math.sqrt((tp + fp) * (tp + fn) * (fp + tn) * (tn + fn)) // Calculating Matthews correlation coefficient
107 |
108 | println("True positive rate: " + tp *100 + "%")
109 | println("False positive rate: " + fp * 100 + "%")
110 | println("True negative rate: " + tn * 100 + "%")
111 | println("False negative rate: " + fn * 100 + "%")
112 | println("Matthews correlation coefficient: " + MCC)
113 | }
114 | }
--------------------------------------------------------------------------------
/Chapter04/src/main/scala/ScalaTreeEnsimbles/ChurnPredictionGBT.scala:
--------------------------------------------------------------------------------
1 | package ScalaTreeEnsimbles
2 |
3 | import org.apache.spark._
4 | import org.apache.spark.sql.SparkSession
5 | import org.apache.spark.sql.functions._
6 | import org.apache.spark.sql.types._
7 | import org.apache.spark.sql._
8 | import org.apache.spark.ml.Pipeline
9 | import org.apache.spark.ml.classification.{GBTClassifier, GBTClassificationModel}
10 | import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
11 | import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
12 | import org.apache.spark.ml.tuning.{ ParamGridBuilder, CrossValidator }
13 |
14 | object ChurnPredictionGBT {
15 | def main(args: Array[String]) {
16 | val spark = SparkSession
17 | .builder
18 | .master("local[*]")
19 | .config("spark.sql.warehouse.dir", "E:/Exp/")
20 | .appName("ChurnPrediction")
21 | .getOrCreate()
22 |
23 | import spark.implicits._
24 |
25 | val gbt = new GBTClassifier()
26 | .setLabelCol("label")
27 | .setFeaturesCol("features")
28 | .setSeed(1234567L)
29 |
30 | // Chain indexers and tree in a Pipeline.
31 | val pipeline = new Pipeline()
32 | .setStages(Array(ScalaClassification.PipelineConstruction.ipindexer,
33 | ScalaClassification.PipelineConstruction.labelindexer,
34 | ScalaClassification.PipelineConstruction.assembler,
35 | gbt))
36 |
37 | // Search through decision tree's maxDepth parameter for best model
38 | val paramGrid = new ParamGridBuilder()
39 | .addGrid(gbt.maxDepth, 3 :: 5 :: 10 :: Nil) // :: 15 :: 20 :: 25 :: 30 :: Nil)
40 | .addGrid(gbt.impurity, "gini" :: "entropy" :: Nil)
41 | .addGrid(gbt.maxBins, 5 :: 10 :: 20 :: Nil) //10 :: 15 :: 25 :: 35 :: 45 :: Nil)
42 | .build()
43 |
44 | val evaluator = new BinaryClassificationEvaluator()
45 | .setLabelCol("label")
46 | .setRawPredictionCol("prediction")
47 |
48 | // Set up 10-fold cross validation
49 | val numFolds = 10
50 | val crossval = new CrossValidator()
51 | .setEstimator(pipeline)
52 | .setEvaluator(evaluator)
53 | .setEstimatorParamMaps(paramGrid)
54 | .setNumFolds(numFolds)
55 |
56 | val cvModel = crossval.fit(ScalaClassification.Preprocessing.trainDF)
57 |
58 | // Save the cross-validated workflow
59 | cvModel.write.overwrite().save("model/GBT_model_churn")
60 |
61 | val bestModel = cvModel.bestModel
62 | println("The Best Model and Parameters:\n--------------------")
63 | println(bestModel.asInstanceOf[org.apache.spark.ml.PipelineModel].stages(3))
64 |
65 | bestModel.asInstanceOf[org.apache.spark.ml.PipelineModel]
66 | .stages(3)
67 | .extractParamMap
68 |
69 | val treeModel = bestModel.asInstanceOf[org.apache.spark.ml.PipelineModel]
70 | .stages(3)
71 | .asInstanceOf[GBTClassificationModel]
72 |
73 | println("Learned classification tree model:\n" + treeModel.toDebugString)
74 | println("Feature 11:" + ScalaClassification.Preprocessing.trainDF.select(ScalaClassification.PipelineConstruction.featureCols(11)))
75 | println("Feature 3:" + ScalaClassification.Preprocessing.trainDF.select(ScalaClassification.PipelineConstruction.featureCols(3)))
76 |
77 | val predDF = cvModel.transform(ScalaClassification.Preprocessing.testSet)
78 | val result = predDF.select("label", "prediction", "probability")
79 | val resutDF = result.withColumnRenamed("prediction", "Predicted_label")
80 | resutDF.show(10)
81 |
82 | val accuracy = evaluator.evaluate(predDF)
83 | println("Classification accuracy: " + accuracy)
84 |
85 | // Compute other performance metrics
86 | val predictionAndLabels = predDF
87 | .select("prediction", "label")
88 | .rdd.map(x => (x(0).asInstanceOf[Double], x(1)
89 | .asInstanceOf[Double]))
90 |
91 | val metrics = new BinaryClassificationMetrics(predictionAndLabels)
92 | val areaUnderPR = metrics.areaUnderPR
93 | println("Area under the precision-recall curve: " + areaUnderPR)
94 |
95 | val areaUnderROC = metrics.areaUnderROC
96 | println("Area under the receiver operating characteristic (ROC) curve: " + areaUnderROC)
97 |
98 | val tVSpDF = predDF.select("label", "prediction") // True vs predicted labels
99 | val TC = predDF.count() //Total count
100 |
101 | val tp = tVSpDF.filter($"prediction" === 0.0).filter($"label" === $"prediction").count() / TC.toDouble
102 | val tn = tVSpDF.filter($"prediction" === 1.0).filter($"label" === $"prediction").count() / TC.toDouble
103 | val fp = tVSpDF.filter($"prediction" === 1.0).filter(not($"label" === $"prediction")).count() / TC.toDouble
104 | val fn = tVSpDF.filter($"prediction" === 0.0).filter(not($"label" === $"prediction")).count() / TC.toDouble
105 |
106 | val MCC = (tp * tn - fp * fn) / math.sqrt((tp + fp) * (tp + fn) * (fp + tn) * (tn + fn)) // Calculating Matthews correlation coefficient
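// MCC = (TP*TN - FP*FN) / sqrt((TP+FP)(TP+FN)(TN+FP)(TN+FN)); it ranges from -1 to +1,
// and dividing all four counts by the same total TC (as done above) leaves the value unchanged.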
107 |
108 | println("True positive rate: " + tp *100 + "%")
109 | println("False positive rate: " + fp * 100 + "%")
110 | println("True negative rate: " + tn * 100 + "%")
111 | println("False negative rate: " + fn * 100 + "%")
112 | println("Matthews correlation coefficient: " + MCC)
113 | }
114 | }
--------------------------------------------------------------------------------
/Chapter04/src/main/scala/ScalaTreeEnsimbles/ChurnPredictionRF.scala:
--------------------------------------------------------------------------------
1 | package ScalaTreeEnsimbles
2 |
3 | import org.apache.spark._
4 | import org.apache.spark.sql.SparkSession
5 | import org.apache.spark.sql.functions._
6 | import org.apache.spark.sql.types._
7 | import org.apache.spark.sql._
8 | import org.apache.spark.ml.Pipeline
9 | import org.apache.spark.ml.classification.{ RandomForestClassifier, RandomForestClassificationModel }
10 | import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
11 | import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
12 | import org.apache.spark.ml.tuning.{ ParamGridBuilder, CrossValidator }
13 |
14 | object ChurnPredictionRF {
15 | def main(args: Array[String]) {
16 | val spark = SparkSession
17 | .builder
18 | .master("local[*]")
19 | .config("spark.sql.warehouse.dir", "E:/Exp/")
20 | .appName("ChurnPrediction")
21 | .getOrCreate()
22 |
23 | import spark.implicits._
24 |
25 | val rf = new RandomForestClassifier()
26 | .setLabelCol("label")
27 | .setFeaturesCol("features")
28 | .setSeed(1234567L)
29 |
30 | // Chain indexers and tree in a Pipeline.
31 | val pipeline = new Pipeline()
32 | .setStages(Array(ScalaClassification.PipelineConstruction.ipindexer,
33 | ScalaClassification.PipelineConstruction.labelindexer,
34 | ScalaClassification.PipelineConstruction.assembler,
35 | rf))
36 |
37 | // Search through decision tree's maxDepth parameter for best model
38 | val paramGrid = new ParamGridBuilder()
39 | .addGrid(rf.maxDepth, 3 :: 5 :: 10 :: Nil) // :: 15 :: 20 :: 25 :: 30 :: Nil)
40 | .addGrid(rf.featureSubsetStrategy, "auto" :: "all" :: Nil)
41 | .addGrid(rf.impurity, "gini" :: "entropy" :: Nil)
42 | .addGrid(rf.maxBins, 5 :: 10 :: 20 :: Nil) //10 :: 15 :: 25 :: 35 :: 45 :: Nil)
43 | .addGrid(rf.numTrees, 5 :: 10 :: 20 :: Nil) // :: 100 :: Nil)
44 | .build()
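// This grid covers 3 x 2 x 2 x 3 x 3 = 108 parameter combinations; with 10-fold cross-validation
// that means roughly 1,080 random forest fits, so expect the search to take a while on a single machine.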
45 |
46 | val evaluator = new BinaryClassificationEvaluator()
47 | .setLabelCol("label")
48 | .setRawPredictionCol("prediction")
49 |
50 | // Set up 10-fold cross validation
51 | val numFolds = 10
52 | val crossval = new CrossValidator()
53 | .setEstimator(pipeline)
54 | .setEvaluator(evaluator)
55 | .setEstimatorParamMaps(paramGrid)
56 | .setNumFolds(numFolds)
57 |
58 | val cvModel = crossval.fit(ScalaClassification.Preprocessing.trainDF)
59 |
60 | // Save the workflow
61 | cvModel.write.overwrite().save("model/RF_model_churn")
62 |
63 | val bestModel = cvModel.bestModel
64 | println("The Best Model and Parameters:\n--------------------")
65 | println(bestModel.asInstanceOf[org.apache.spark.ml.PipelineModel].stages(3))
66 |
67 | bestModel.asInstanceOf[org.apache.spark.ml.PipelineModel]
68 | .stages(3)
69 | .extractParamMap
70 |
71 | val treeModel = bestModel.asInstanceOf[org.apache.spark.ml.PipelineModel]
72 | .stages(3)
73 | .asInstanceOf[RandomForestClassificationModel]
74 |
75 | println("Learned classification tree model:\n" + treeModel.toDebugString)
76 | println("Feature 11:" + ScalaClassification.Preprocessing.trainDF.select(ScalaClassification.PipelineConstruction.featureCols(11)))
77 | println("Feature 3:" + ScalaClassification.Preprocessing.trainDF.select(ScalaClassification.PipelineConstruction.featureCols(3)))
78 |
79 | val predDF = cvModel.transform(ScalaClassification.Preprocessing.testSet)
80 | val result = predDF.select("label", "prediction", "probability")
81 | val resultDF = result.withColumnRenamed("prediction", "Predicted_label")
82 | resultDF.show(10)
83 |
84 | val accuracy = evaluator.evaluate(predDF)
85 | println("Classification accuracy: " + accuracy)
86 |
87 | // Compute other performance metrics
88 | val predictionAndLabels = predDF
89 | .select("prediction", "label")
90 | .rdd.map(x => (x(0).asInstanceOf[Double], x(1)
91 | .asInstanceOf[Double]))
92 |
93 | val metrics = new BinaryClassificationMetrics(predictionAndLabels)
94 | val areaUnderPR = metrics.areaUnderPR
95 | println("Area under the precision-recall curve: " + areaUnderPR)
96 |
97 | val areaUnderROC = metrics.areaUnderROC
98 | println("Area under the receiver operating characteristic (ROC) curve: " + areaUnderROC)
99 |
100 | val tVSpDF = predDF.select("label", "prediction") // True vs predicted labels
101 | val TC = predDF.count() //Total count
102 |
103 | val tp = tVSpDF.filter($"prediction" === 0.0).filter($"label" === $"prediction").count() / TC.toDouble
104 | val tn = tVSpDF.filter($"prediction" === 1.0).filter($"label" === $"prediction").count() / TC.toDouble
105 | val fp = tVSpDF.filter($"prediction" === 1.0).filter(not($"label" === $"prediction")).count() / TC.toDouble
106 | val fn = tVSpDF.filter($"prediction" === 0.0).filter(not($"label" === $"prediction")).count() / TC.toDouble
107 |
108 | val MCC = (tp * tn - fp * fn) / math.sqrt((tp + fp) * (tp + fn) * (fp + tn) * (tn + fn)) // Calculating Matthews correlation coefficient
109 |
110 | println("True positive rate: " + tp *100 + "%")
111 | println("False positive rate: " + fp * 100 + "%")
112 | println("True negative rate: " + tn * 100 + "%")
113 | println("False negative rate: " + fn * 100 + "%")
114 | println("Matthews correlation coefficient: " + MCC)
115 | }
116 | }
--------------------------------------------------------------------------------
/Chapter04/src/main/scala/ScalaTreeEnsimbles/Preproessing.scala:
--------------------------------------------------------------------------------
1 | package ScalaTreeEnsimbles
2 |
3 | import org.apache.spark.ml.feature.{ StringIndexer, StringIndexerModel }
4 | import org.apache.spark.ml.feature.VectorAssembler
5 | import org.apache.spark.sql._
6 | import org.apache.spark.sql.functions._
7 |
8 | object Preproessing {
9 | var trainSample = 1.0
10 | var testSample = 1.0
11 | val train = "data/insurance_train.csv"
12 | val test = "data/insurance_test.csv"
13 |
14 | val spark = SparkSession
15 | .builder
16 | .master("local[*]")
17 | .config("spark.sql.warehouse.dir", "E:/Exp/")
18 | .appName(s"OneVsRestExample")
19 | .getOrCreate()
20 |
21 | import spark.implicits._
22 | println("Reading data from " + train + " file")
23 |
24 | val trainInput = spark.read
25 | .option("header", "true")
26 | .option("inferSchema", "true")
27 | .format("com.databricks.spark.csv")
28 | .load(train)
29 | .cache
30 |
31 | val testInput = spark.read
32 | .option("header", "true")
33 | .option("inferSchema", "true")
34 | .format("com.databricks.spark.csv")
35 | .load(test)
36 | .cache
37 |
38 | println("Preparing data for training model")
39 | var data = trainInput.withColumnRenamed("loss", "label").sample(false, trainSample)
40 | var DF = data.na.drop()
41 |
42 | // Null check: na.drop() returns a new DataFrame, so compare row counts rather than object references
43 | if (data.count() == DF.count())
44 | println("No null values in the DataFrame")
45 |
46 | else {
47 | println("Null values exist in the DataFrame")
48 | data = DF
49 | }
50 |
51 | val seed = 23579L
52 | val splits = data.randomSplit(Array(0.80, 0.20), seed)
53 | val (trainingData, validData) = (splits(0), splits(1))
54 |
55 | trainingData.cache
56 | validData.cache
57 |
58 | val testData = testInput.sample(false, testSample).cache
59 |
60 | def isCateg(c: String): Boolean = c.startsWith("cat")
61 | def categNewCol(c: String): String = if (isCateg(c)) s"idx_${c}" else c
62 |
63 | // Function to remove categorical columns with too many categories
64 | def removeTooManyCategs(c: String): Boolean = !(c matches "cat(109$|110$|112$|113$|116$)")
65 |
66 | // Function to select only feature columns (omit id and label)
67 | def onlyFeatureCols(c: String): Boolean = !(c matches "id|label")
68 |
69 | // Definitive set of feature columns
70 | val featureCols = trainingData.columns
71 | .filter(removeTooManyCategs)
72 | .filter(onlyFeatureCols)
73 | .map(categNewCol)
74 |
75 | // StringIndexer for categorical columns (OneHotEncoder should be evaluated as well)
76 | val stringIndexerStages = trainingData.columns.filter(isCateg)
77 | .map(c => new StringIndexer()
78 | .setInputCol(c)
79 | .setOutputCol(categNewCol(c))
80 | .fit(trainInput.select(c).union(testInput.select(c))))
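// Fitting each StringIndexer on the union of the train and test columns ensures that categories
// appearing only in the test data still receive an index instead of failing at transform time.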
81 |
82 | // VectorAssembler for training features
83 | val assembler = new VectorAssembler()
84 | .setInputCols(featureCols)
85 | .setOutputCol("features")
86 | }
--------------------------------------------------------------------------------
/Chapter04/src/main/scala/ScalaTreeEnsimbles/UrbanTrafficDTRegressor.scala:
--------------------------------------------------------------------------------
1 | package ScalaTreeEnsimbles
2 |
3 | import org.apache.spark.ml.regression.{DecisionTreeRegressor, DecisionTreeRegressionModel}
4 | import org.apache.spark.ml.{ Pipeline, PipelineModel }
5 | import org.apache.spark.ml.evaluation.RegressionEvaluator
6 | import org.apache.spark.ml.tuning.ParamGridBuilder
7 | import org.apache.spark.ml.tuning.CrossValidator
8 | import org.apache.spark.sql._
9 | import org.apache.spark.sql.functions._
10 | import org.apache.spark.mllib.evaluation.RegressionMetrics
11 | import org.apache.log4j.LogManager
12 | import org.apache.spark.ml.feature.VectorAssembler
13 |
14 | object UrbanTrafficDTRegressor {
15 | def main(args: Array[String]) {
16 | val spark = SparkSession
17 | .builder
18 | .master("local[*]")
19 | .config("spark.sql.warehouse.dir", "E:/Exp/")
20 | .appName(s"DecisionTreeRegressor")
21 | .getOrCreate()
22 | import spark.implicits._
23 |
24 | val rawTrafficDF = spark.read
25 | .option("header", "true")
26 | .option("inferSchema", "true")
27 | .option("delimiter", ";")
28 | .format("com.databricks.spark.csv")
29 | .load("data/Behavior of the urban traffic of the city of Sao Paulo in Brazil.csv")
30 | .cache
31 |
32 | val newTrafficDF = rawTrafficDF.withColumnRenamed("Slowness in traffic (%)", "label")
33 | val colNames = newTrafficDF.columns.dropRight(1)
34 |
35 | // VectorAssembler for training features
36 | val assembler = new VectorAssembler()
37 | .setInputCols(colNames)
38 | .setOutputCol("features")
39 |
40 | val assembleDF = assembler.transform(newTrafficDF).select("features", "label")
41 | assembleDF.printSchema()
42 |
43 | val seed = 12345
44 | val splits = assembleDF.randomSplit(Array(0.60, 0.40), seed)
45 | val (trainingData, testData) = (splits(0), splits(1))
46 |
47 | trainingData.cache
48 | testData.cache
49 |
50 | // Estimator algorithm
51 | val dtModel = new DecisionTreeRegressor().setFeaturesCol("features").setLabelCol("label")
52 |
53 | // ***********************************************************
54 | println("Preparing K-fold Cross Validation and Grid Search")
55 | // ***********************************************************
56 |
57 | // Search through decision tree's maxDepth parameter for best model
58 | val paramGrid = new ParamGridBuilder()
59 | .addGrid(dtModel.impurity, "variance" :: Nil) // variance is the only supported impurity for regression trees
60 | .addGrid(dtModel.maxBins, 25 :: 30 :: 35 :: Nil)
61 | .addGrid(dtModel.maxDepth, 5 :: 10 :: 15 :: Nil)
62 | .build()
63 |
64 | val numFolds = 10
65 | val cv = new CrossValidator()
66 | .setEstimator(dtModel)
67 | .setEvaluator(new RegressionEvaluator)
68 | .setEstimatorParamMaps(paramGrid)
69 | .setNumFolds(numFolds)
70 |
71 | // ************************************************************
72 | println("Training model with GradientBoostedTrees algorithm")
73 | // ************************************************************
74 | val cvModel = cv.fit(trainingData)
75 |
76 | // **********************************************************************
77 | println("Evaluating the model on the test set and calculating the regression metrics")
78 | // **********************************************************************
79 | val trainPredictionsAndLabels = cvModel.transform(testData).select("label", "prediction")
80 | .map { case Row(label: Double, prediction: Double) => (label, prediction) }.rdd
81 |
82 | val testRegressionMetrics = new RegressionMetrics(trainPredictionsAndLabels)
83 |
84 | val results = "\n=====================================================================\n" +
85 | s"TrainingData count: ${trainingData.count}\n" +
86 | s"TestData count: ${testData.count}\n" +
87 | "=====================================================================\n" +
88 | s"TestData MSE = ${testRegressionMetrics.meanSquaredError}\n" +
89 | s"TestData RMSE = ${testRegressionMetrics.rootMeanSquaredError}\n" +
90 | s"TestData R-squared = ${testRegressionMetrics.r2}\n" +
91 | s"TestData MAE = ${testRegressionMetrics.meanAbsoluteError}\n" +
92 | s"TestData explained variance = ${testRegressionMetrics.explainedVariance}\n" +
93 | "=====================================================================\n"
94 | println(results)
95 |
96 | val bestModel = cvModel.bestModel.asInstanceOf[DecisionTreeRegressionModel]
97 |
98 | println("Decison tree from best cross-validated model: " + bestModel.toDebugString)
99 |
100 | val featureImportances = bestModel.featureImportances.toArray
101 |
102 | val FI_to_List_sorted = featureImportances.toList.sorted.toArray
103 | println("Feature importance generated by the best model: ")
104 | for(x <- FI_to_List_sorted) println(x)
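// The sorted list above drops the feature-name mapping. A minimal sketch that keeps it, assuming the
// VectorAssembler input order (colNames) matches the order of the featureImportances entries:
colNames.zip(featureImportances)
  .sortBy { case (_, importance) => -importance }
  .foreach { case (name, importance) => println(s"$name -> $importance") }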
105 | }
106 | }
--------------------------------------------------------------------------------
/Chapter04/src/main/scala/ScalaTreeEnsimbles/UrbanTrafficGBTRegressor.scala:
--------------------------------------------------------------------------------
1 | package ScalaTreeEnsimbles
2 |
3 | import org.apache.spark.ml.regression.{ GBTRegressor, GBTRegressionModel }
4 | import org.apache.spark.ml.{ Pipeline, PipelineModel }
5 | import org.apache.spark.ml.evaluation.RegressionEvaluator
6 | import org.apache.spark.sql._
7 | import org.apache.spark.sql.functions._
8 | import org.apache.spark.mllib.evaluation.RegressionMetrics
9 | import org.apache.log4j.LogManager
10 | import org.apache.spark.ml.tuning.{ CrossValidator, ParamGridBuilder }
11 | import org.apache.spark.ml.feature.VectorAssembler
12 |
13 |
14 | object UrbanTrafficGBTRegressor {
15 | def main(args: Array[String]) {
16 | val spark = SparkSession
17 | .builder
18 | .master("local[*]")
19 | .config("spark.sql.warehouse.dir", "E:/Exp/")
20 | .appName("GBTRegressor")
21 | .getOrCreate()
22 |
23 | import spark.implicits._
24 |
25 | val rawTrafficDF = spark.read
26 | .option("header", "true")
27 | .option("inferSchema", "true")
28 | .option("delimiter", ";")
29 | .format("com.databricks.spark.csv")
30 | .load("data/Behavior of the urban traffic of the city of Sao Paulo in Brazil.csv")
31 | .cache
32 |
33 | val newTrafficDF = rawTrafficDF.withColumnRenamed("Slowness in traffic (%)", "label")
34 | val colNames = newTrafficDF.columns.dropRight(1)
35 |
36 | // VectorAssembler for training features
37 | val assembler = new VectorAssembler()
38 | .setInputCols(colNames)
39 | .setOutputCol("features")
40 |
41 | val assembleDF = assembler.transform(newTrafficDF).select("features", "label")
42 | assembleDF.printSchema()
43 |
44 | val seed = 12345
45 | val splits = assembleDF.randomSplit(Array(0.60, 0.40), seed)
46 | val (trainingData, testData) = (splits(0), splits(1))
47 |
48 | trainingData.cache
49 | testData.cache
50 |
51 | // Estimator algorithm
52 | val gbtModel = new GBTRegressor().setFeaturesCol("features").setLabelCol("label")
53 |
54 | // ***********************************************************
55 | println("Preparing K-fold Cross Validation and Grid Search")
56 | // ***********************************************************
57 |
58 | // Search through decision tree's maxDepth parameter for best model
59 | val paramGrid = new ParamGridBuilder()
60 | .addGrid(gbtModel.impurity, "variance" :: Nil)// variance for regression
61 | .addGrid(gbtModel.maxBins, 3 :: 5 :: 10 :: Nil)
62 | .addGrid(gbtModel.maxDepth, 2 :: 5 :: 10 :: Nil)
63 | .build()
64 |
65 | val numFolds = 10
66 | val cv = new CrossValidator()
67 | .setEstimator(gbtModel)
68 | .setEvaluator(new RegressionEvaluator)
69 | .setEstimatorParamMaps(paramGrid)
70 | .setNumFolds(numFolds)
71 |
72 | // ************************************************************
73 | println("Training model with GradientBoostedTrees algorithm")
74 | // ************************************************************
75 | val cvModel = cv.fit(trainingData)
76 |
77 | // **********************************************************************
78 | println("Evaluating the model on the test set and calculating the regression metrics")
79 | // **********************************************************************
80 | val trainPredictionsAndLabels = cvModel.transform(testData).select("label", "prediction")
81 | .map { case Row(label: Double, prediction: Double) => (label, prediction) }.rdd
82 |
83 | val testRegressionMetrics = new RegressionMetrics(trainPredictionsAndLabels)
84 |
85 | val results = "\n=====================================================================\n" +
86 | s"TrainingData count: ${trainingData.count}\n" +
87 | s"TestData count: ${testData.count}\n" +
88 | "=====================================================================\n" +
89 | s"TestData MSE = ${testRegressionMetrics.meanSquaredError}\n" +
90 | s"TestData RMSE = ${testRegressionMetrics.rootMeanSquaredError}\n" +
91 | s"TestData R-squared = ${testRegressionMetrics.r2}\n" +
92 | s"TestData MAE = ${testRegressionMetrics.meanAbsoluteError}\n" +
93 | s"TestData explained variance = ${testRegressionMetrics.explainedVariance}\n" +
94 | "=====================================================================\n"
95 | println(results)
96 |
97 | val bestModel = cvModel.bestModel.asInstanceOf[GBTRegressionModel]
98 |
99 | println("Decison tree from best cross-validated model" + bestModel.toDebugString)
100 |
101 | val featureImportances = bestModel.featureImportances.toArray
102 |
103 | val FI_to_List_sorted = featureImportances.toList.sorted.toArray
104 | println("Feature importance generated by the best model: ")
105 | for(x <- FI_to_List_sorted) println(x)
106 | }
107 | }
--------------------------------------------------------------------------------
/Chapter04/src/main/scala/ScalaTreeEnsimbles/UrbanTrafficRFRegressor.scala:
--------------------------------------------------------------------------------
1 | package ScalaTreeEnsimbles
2 |
3 | import org.apache.spark.ml.regression.{RandomForestRegressor, RandomForestRegressionModel}
4 | import org.apache.spark.ml.{Pipeline, PipelineModel}
5 | import org.apache.spark.ml.evaluation.RegressionEvaluator
6 | import org.apache.spark.sql._
7 | import org.apache.spark.sql.functions._
8 | import org.apache.spark.mllib.evaluation.RegressionMetrics
9 | import org.apache.log4j.LogManager
10 | import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder, CrossValidatorModel}
11 | import org.apache.spark.ml.feature.VectorAssembler
12 |
13 | object UrbanTrafficRFRegressor {
14 | def main(args: Array[String]) {
15 | val spark = SparkSession
16 | .builder
17 | .master("local[*]")
18 | .config("spark.sql.warehouse.dir", "E:/Exp/")
19 | .appName(s"RandomForestRegression")
20 | .getOrCreate()
21 | import spark.implicits._
22 |
23 | val rawTrafficDF = spark.read
24 | .option("header", "true")
25 | .option("inferSchema", "true")
26 | .option("delimiter", ";")
27 | .format("com.databricks.spark.csv")
28 | .load("data/Behavior of the urban traffic of the city of Sao Paulo in Brazil.csv")
29 | .cache
30 |
31 | val newTrafficDF = rawTrafficDF.withColumnRenamed("Slowness in traffic (%)", "label")
32 | val colNames = newTrafficDF.columns.dropRight(1)
33 |
34 | // VectorAssembler for training features
35 | val assembler = new VectorAssembler()
36 | .setInputCols(colNames)
37 | .setOutputCol("features")
38 |
39 | val assembleDF = assembler.transform(newTrafficDF).select("features", "label")
40 | assembleDF.printSchema()
41 |
42 | val seed = 12345
43 | val splits = assembleDF.randomSplit(Array(0.60, 0.40), seed)
44 | val (trainingData, testData) = (splits(0), splits(1))
45 |
46 | trainingData.cache
47 | testData.cache
48 |
49 | // Estimator algorithm
50 | val rfModel = new RandomForestRegressor().setFeaturesCol("features").setLabelCol("label")
51 |
52 | // ***********************************************************
53 | println("Preparing K-fold Cross Validation and Grid Search")
54 | // ***********************************************************
55 |
56 | // Search through decision tree's maxDepth parameter for best model
57 | val paramGrid = new ParamGridBuilder()
58 | .addGrid(rfModel.impurity, "variance" :: Nil)// variance for regression
59 | .addGrid(rfModel.maxBins, 25 :: 30 :: 35 :: Nil)
60 | .addGrid(rfModel.maxDepth, 5 :: 10 :: 15 :: Nil)
61 | .addGrid(rfModel.numTrees, 3 :: 5 :: 10 :: 15 :: Nil)
62 | .build()
63 |
64 | val numFolds = 10
65 | val cv = new CrossValidator()
66 | .setEstimator(rfModel)
67 | .setEvaluator(new RegressionEvaluator)
68 | .setEstimatorParamMaps(paramGrid)
69 | .setNumFolds(numFolds)
70 |
71 | // ************************************************************
72 | println("Training model with RandomForestRegressor algorithm")
73 | // ************************************************************
74 | val cvModel = cv.fit(trainingData)
75 |
76 | // **********************************************************************
77 | println("Evaluating the model on the test set and calculating the regression metrics")
78 | // **********************************************************************
79 | val trainPredictionsAndLabels = cvModel.transform(testData).select("label", "prediction")
80 | .map { case Row(label: Double, prediction: Double) => (label, prediction) }.rdd
81 |
82 | val testRegressionMetrics = new RegressionMetrics(trainPredictionsAndLabels)
83 |
84 | val results = "\n=====================================================================\n" +
85 | s"TrainingData count: ${trainingData.count}\n" +
86 | s"TestData count: ${testData.count}\n" +
87 | "=====================================================================\n" +
88 | s"TestData MSE = ${testRegressionMetrics.meanSquaredError}\n" +
89 | s"TestData RMSE = ${testRegressionMetrics.rootMeanSquaredError}\n" +
90 | s"TestData R-squared = ${testRegressionMetrics.r2}\n" +
91 | s"TestData MAE = ${testRegressionMetrics.meanAbsoluteError}\n" +
92 | s"TestData explained variance = ${testRegressionMetrics.explainedVariance}\n" +
93 | "=====================================================================\n"
94 | println(results)
95 |
96 | val bestModel = cvModel.bestModel.asInstanceOf[RandomForestRegressionModel]
97 |
98 | println("Decison tree from best cross-validated model: " + bestModel.toDebugString)
99 |
100 | val featureImportances = bestModel.featureImportances.toArray
101 |
102 | val FI_to_List_sorted = featureImportances.toList.sorted.toArray
103 | println("Feature importance generated by the best model: ")
104 | for(x <- FI_to_List_sorted) println(x)
105 |
106 | spark.stop()
107 | }
108 | }
--------------------------------------------------------------------------------
/Chapter05/pom.xml:
--------------------------------------------------------------------------------
1 |
3 | 4.0.0
4 | com.deri.sels
5 | PopulationClustering_v2
6 | 0.1-SNAPSHOT
7 |
8 | 2.4.0
9 | 2.11.8
10 | 3.22.1.1
11 | 2.4.1
12 | 0.23.0
13 |
14 |
15 |
16 |
17 | scala-tools.org
18 | Scala-tools Maven2 Repository
19 | http://scala-tools.org/repo-releases
20 |
21 |
22 |
23 |
24 | org.bdgenomics.adam
25 | adam-core_2.11
26 | ${adam.version}
27 |
28 |
29 |
30 | ai.h2o
31 | sparkling-water-core_2.11
32 | ${sparklingwater.version}
33 |
34 |
35 | ai.h2o
36 | sparkling-water-examples_2.11
37 | ${sparklingwater.version}
38 |
39 |
40 | org.apache.directory.studio
41 | org.apache.commons.io
42 | 2.4
43 |
44 |
45 | org.apache.spark
46 | spark-core_2.11
47 | ${spark.version}
48 |
49 |
50 |
51 | ai.h2o
52 | h2o-core
53 | ${h2o.version}
54 |
55 |
56 | ai.h2o
57 | h2o-scala_2.11
58 | ${h2o.version}
59 |
60 |
61 | ai.h2o
62 | h2o-algos
63 | ${h2o.version}
64 |
65 |
66 | ai.h2o
67 | h2o-app
68 | ${h2o.version}
69 |
70 |
71 | ai.h2o
72 | h2o-persist-hdfs
73 | ${h2o.version}
74 |
75 |
76 | scala-library
77 | org.scala-lang
78 | ${scala.version}
79 |
80 |
81 | ai.h2o
82 | google-analytics-java
83 | 1.1.2-H2O-CUSTOM
84 |
85 |
86 | joda-time
87 | joda-time
88 | 2.9.9
89 |
90 |
91 |
92 |
93 | snapshots-repo
94 | https://oss.sonatype.org/content/repositories/snapshots
95 |
96 | false
97 |
98 |
99 | true
100 | daily
101 |
102 |
103 |
104 |
105 |
106 |
107 |
108 | org.apache.maven.plugins
109 | maven-eclipse-plugin
110 | 2.9
111 |
112 | true
113 | false
114 |
115 |
116 |
117 |
118 | org.apache.maven.plugins
119 | maven-compiler-plugin
120 | 3.5.1
121 |
122 | ${jdk.version}
123 | ${jdk.version}
124 |
125 |
126 |
127 | org.apache.maven.plugins
128 | maven-shade-plugin
129 | 2.4.3
130 |
131 | true
132 |
133 |
134 |
135 |
136 | org.apache.maven.plugins
137 | maven-assembly-plugin
138 | 2.4.1
139 |
140 |
141 |
142 | jar-with-dependencies
143 |
144 |
145 |
146 |
147 | org.fit.genomics.PopStratClassification
148 |
149 |
150 |
151 |
152 | oozie.launcher.mapreduce.job.user.classpath.first
153 | true
154 |
155 |
156 |
157 |
158 |
159 | make-assembly
160 |
161 | package
162 |
163 | single
164 |
165 |
166 |
167 |
168 |
169 |
170 |
171 |
--------------------------------------------------------------------------------
/Chapter05/src/main/scala/org/fit/genomics/PCA.scala:
--------------------------------------------------------------------------------
1 | package org.fit.genomics
2 |
3 | import org.apache.spark.sql._
4 | import org.apache.spark.sql.functions._
5 | import org.apache.spark.ml.feature.PCA
6 | import org.apache.spark.ml.linalg.Vectors
7 |
8 | object PCAExample {
9 | def main(args: Array[String]): Unit = {
10 | val spark: SparkSession = SparkSession.builder.appName("PopStrat").master("local[*]").getOrCreate()
11 |
12 | val data = Array(
13 | Vectors.dense(3.5, 2.0, 5.0, 6.3, 5.60, 2.4),
14 | Vectors.dense(4.40, 0.10, 3.0, 9.0, 7.0, 8.75),
15 | Vectors.dense(3.20, 2.40, 0.0, 6.0, 7.4, 3.34))
16 |
17 | val df = spark.createDataFrame(data.map(Tuple1.apply)).toDF("features")
18 | df.show(false)
19 |
20 | val pca = new PCA()
21 | .setInputCol("features")
22 | .setOutputCol("pcaFeatures")
23 | .setK(4)
24 | .fit(df)
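// Note: PCA requires k to be at most the number of input features (6 in this toy example).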
25 |
26 | val result = pca.transform(df).select("pcaFeatures")
27 | result.show(false)
28 |
29 | }
30 | }
--------------------------------------------------------------------------------
/Chapter05/src/main/scala/org/fit/genomics/PopStratClustering.scala:
--------------------------------------------------------------------------------
1 | package org.fit.genomics
2 |
3 | import org.apache.spark.rdd.RDD
4 | import org.apache.spark.sql._
5 | import org.apache.spark.{ SparkConf, SparkContext }
6 | import org.apache.spark._
7 | import org.apache.spark.rdd.RDD
8 | import org.apache.spark.mllib.linalg.{ Vectors, Vector }
9 | import org.apache.spark.ml.clustering.KMeans
10 | import org.apache.spark.ml.evaluation.ClusteringEvaluator
11 | import org.apache.spark.SparkContext
12 | import org.apache.spark.sql.types.{ IntegerType, StringType, StructField, StructType }
13 | import org.apache.spark.ml.feature.{ VectorAssembler, Normalizer }
14 | import org.apache.spark.ml.Pipeline
15 | import org.apache.spark.ml.feature.VectorIndexer
16 | import org.apache.spark.ml.feature.PCA
17 |
18 | import water._
19 | import water.fvec.Frame
20 | import water.{ Job, Key }
21 | import water.fvec.Frame
22 | import hex.FrameSplitter
23 | import org.apache.spark.h2o._
24 | import org.apache.spark.h2o.H2OContext
25 |
26 | import org.bdgenomics.adam.rdd.ADAMContext._
27 | import org.bdgenomics.formats.avro.{ Genotype, GenotypeAllele }
28 |
29 | import java.io.File
30 | import java.io._
31 | import scala.collection.JavaConverters._
32 | import scala.collection.immutable.Range.inclusive
33 | import scala.io.Source
34 |
35 | object PopStratClusterings {
36 | def main(args: Array[String]): Unit = {
37 | val genotypeFile = "C:/Users/admin-karim/Downloads/1.vcf"
38 | val panelFile = "C:/Users/admin-karim/Downloads/genotypes.panel"
39 |
40 | val sparkSession: SparkSession = SparkSession.builder.appName("PopStrat").master("local[*]").getOrCreate()
41 | val sc: SparkContext = sparkSession.sparkContext
42 |
43 | val populations = Set("GBR", "MXL", "ASW", "CHB", "CLM")
44 | def extract(file: String, filter: (String, String) => Boolean): Map[String, String] = {
45 | Source
46 | .fromFile(file)
47 | .getLines()
48 | .map(line => {
49 | val tokens = line.split(Array('\t', ' ')).toList
50 | tokens(0) -> tokens(1)
51 | })
52 | .toMap
53 | .filter(tuple => filter(tuple._1, tuple._2))
54 | }
55 |
56 | val panel: Map[String, String] = extract(
57 | panelFile,
58 | (sampleID: String, pop: String) => populations.contains(pop))
59 | val allGenotypes: RDD[Genotype] = sc.loadGenotypes(genotypeFile).rdd
60 | val genotypes: RDD[Genotype] = allGenotypes.filter(genotype => {
61 | panel.contains(genotype.getSampleId)
62 | })
63 |
64 | // Convert the Genotype objects to our own SampleVariant objects to try and conserve memory
65 | case class SampleVariant(sampleId: String,
66 | variantId: Int,
67 | alternateCount: Int)
68 |
69 | def variantId(genotype: Genotype): String = {
70 | val name = genotype.getVariant.getContigName
71 | val start = genotype.getVariant.getStart
72 | val end = genotype.getVariant.getEnd
73 | s"$name:$start:$end"
74 | }
75 |
76 | def alternateCount(genotype: Genotype): Int = {
77 | genotype.getAlleles.asScala.count(_ != GenotypeAllele.REF)
78 | }
79 |
80 | def toVariant(genotype: Genotype): SampleVariant = {
81 | // Intern sample IDs as they will be repeated a lot
82 | new SampleVariant(genotype.getSampleId.intern(),
83 | variantId(genotype).hashCode(),
84 | alternateCount(genotype))
85 | }
86 |
87 | val variantsRDD: RDD[SampleVariant] = genotypes.map(toVariant)
88 | val variantsBySampleId: RDD[(String, Iterable[SampleVariant])] =
89 | variantsRDD.groupBy(_.sampleId)
90 | val sampleCount: Long = variantsBySampleId.count()
91 | println("Found " + sampleCount + " samples")
92 |
93 | val variantsByVariantId: RDD[(Int, Iterable[SampleVariant])] =
94 | variantsRDD.groupBy(_.variantId).filter {
95 | case (_, sampleVariants) => sampleVariants.size == sampleCount
96 | }
97 |
98 | val variantFrequencies: collection.Map[Int, Int] = variantsByVariantId
99 | .map {
100 | case (variantId, sampleVariants) =>
101 | (variantId, sampleVariants.count(_.alternateCount > 0))
102 | }
103 | .collectAsMap()
104 |
105 | val permittedRange = inclusive(11, 11)
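// Keep only variants whose alternate allele appears in exactly 11 samples (inclusive(11, 11)),
// a simple frequency filter that shrinks the feature space before PCA and clustering.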
106 | val filteredVariantsBySampleId: RDD[(String, Iterable[SampleVariant])] =
107 | variantsBySampleId.map {
108 | case (sampleId, sampleVariants) =>
109 | val filteredSampleVariants = sampleVariants.filter(
110 | variant =>
111 | permittedRange.contains(
112 | variantFrequencies.getOrElse(variant.variantId, -1)))
113 | (sampleId, filteredSampleVariants)
114 | }
115 |
116 | val sortedVariantsBySampleId: RDD[(String, Array[SampleVariant])] =
117 | filteredVariantsBySampleId.map {
118 | case (sampleId, variants) =>
119 | (sampleId, variants.toArray.sortBy(_.variantId))
120 | }
121 |
122 | println(s"Sorted by Sample ID RDD: " + sortedVariantsBySampleId.first())
123 |
124 | val header = StructType(
125 | Array(StructField("Region", StringType)) ++
126 | sortedVariantsBySampleId
127 | .first()
128 | ._2
129 | .map(variant => {
130 | StructField(variant.variantId.toString, IntegerType)
131 | }))
132 |
133 | val rowRDD: RDD[Row] = sortedVariantsBySampleId.map {
134 | case (sampleId, sortedVariants) =>
135 | val region: Array[String] = Array(panel.getOrElse(sampleId, "Unknown"))
136 | val alternateCounts: Array[Int] = sortedVariants.map(_.alternateCount)
137 | Row.fromSeq(region ++ alternateCounts)
138 | }
139 |
140 | // Create the SchemaRDD from the header and rows and convert the SchemaRDD into a Spark dataframe
141 | val sqlContext = sparkSession.sqlContext
142 | val schemaDF = sqlContext.createDataFrame(rowRDD, header).drop("Region")
143 | schemaDF.printSchema()
144 | schemaDF.show(10)
145 |
146 | println(schemaDF.columns.length)
147 |
148 | // Using vector assembler to create feature vector
149 | val featureCols = schemaDF.columns
150 | val assembler = new VectorAssembler()
151 | .setInputCols(featureCols)
152 | .setOutputCol("features")
153 |
154 | val assembleDF = assembler.transform(schemaDF).select("features")
155 | assembleDF.show()
156 |
157 | // Elbow method with reduced dimension
158 | val pca = new PCA()
159 | .setInputCol("features")
160 | .setOutputCol("pcaFeatures")
161 | .setK(5)
162 | .fit(assembleDF)
163 |
164 | val pcaDF = pca.transform(assembleDF).select("pcaFeatures").withColumnRenamed("pcaFeatures", "features")
165 | pcaDF.show()
166 |
167 | val iterations = 20
168 | for (i <- 2 to iterations) {
169 | // Trains a k-means model.
170 | val kmeans = new KMeans().setK(i).setSeed(12345L)
171 | val model = kmeans.fit(pcaDF)
172 |
173 | // Evaluate clustering by computing Within Set Sum of Squared Errors.
174 | val WCSS = model.computeCost(pcaDF)
175 | println("Within Set Sum of Squared Errors for k = " + i + " is " + WCSS)
176 | }
177 | /*
178 | Within Set Sum of Squared Errors for k = 2 is 135.0048361804504
179 | Within Set Sum of Squared Errors for k = 3 is 90.95271589232344
180 | Within Set Sum of Squared Errors for k = 4 is 73.03991105363087
181 | Within Set Sum of Squared Errors for k = 5 is 52.712937492025276
182 | Within Set Sum of Squared Errors for k = 6 is 35.0048649663809
183 | Within Set Sum of Squared Errors for k = 7 is 33.11707134428616
184 | Within Set Sum of Squared Errors for k = 8 is 30.546631341918243
185 | Within Set Sum of Squared Errors for k = 9 is 28.453155497711535
186 | Within Set Sum of Squared Errors for k = 10 is 24.93179715697327
187 | Within Set Sum of Squared Errors for k = 11 is 25.56839205985354
188 | Within Set Sum of Squared Errors for k = 12 is 18.76755804955161
189 | Within Set Sum of Squared Errors for k = 13 is 18.55123407031501
190 | Within Set Sum of Squared Errors for k = 14 is 16.140301237245204
191 | Within Set Sum of Squared Errors for k = 15 is 14.143806816130821
192 | Within Set Sum of Squared Errors for k = 16 is 15.017971347008297
193 | Within Set Sum of Squared Errors for k = 17 is 12.266417893931926
194 | Within Set Sum of Squared Errors for k = 18 is 11.108546956133177
195 | Within Set Sum of Squared Errors for k = 19 is 11.505990055606803
196 | Within Set Sum of Squared Errors for k = 20 is 12.26634441065655
197 | */
198 |
199 | // Evaluate clustering by computing Silhouette score
200 | val evaluator = new ClusteringEvaluator()
201 |
202 | for (k <- 2 to 20 by 1) {
203 | val kmeans = new KMeans().setK(k).setSeed(12345L)
204 | val model = kmeans.fit(pcaDF)
205 | val transformedDF = model.transform(pcaDF)
206 | val score = evaluator.evaluate(transformedDF)
207 | println("Silhouette with squared euclidean distance for k = " + k + " is " + score)
208 | }
209 | /*
210 | * Silhouette with squared euclidean distance for k = 2 is 0.9175803927739566
211 | Silhouette with squared euclidean distance for k = 3 is 0.8288633816548874
212 | Silhouette with squared euclidean distance for k = 4 is 0.6376477607336495
213 | Silhouette with squared euclidean distance for k = 5 is 0.6731472765720269
214 | Silhouette with squared euclidean distance for k = 6 is 0.6641908680884869
215 | Silhouette with squared euclidean distance for k = 7 is 0.5758081075880451
216 | Silhouette with squared euclidean distance for k = 8 is 0.588881352222969
217 | Silhouette with squared euclidean distance for k = 9 is 0.6485153435398991
218 | Silhouette with squared euclidean distance for k = 10 is 0.48949118556376964
219 | Silhouette with squared euclidean distance for k = 11 is 0.5371218728964895
220 | Silhouette with squared euclidean distance for k = 12 is 0.5569086502410784
221 | Silhouette with squared euclidean distance for k = 13 is 0.3990728491364654
222 | Silhouette with squared euclidean distance for k = 14 is 0.5311155969749914
223 | Silhouette with squared euclidean distance for k = 15 is 0.5457021641983345
224 | Silhouette with squared euclidean distance for k = 16 is 0.4891629883332554
225 | Silhouette with squared euclidean distance for k = 17 is 0.5452872742013583
226 | Silhouette with squared euclidean distance for k = 18 is 0.5304994251201304
227 | Silhouette with squared euclidean distance for k = 19 is 0.5327466913746908
228 | Silhouette with squared euclidean distance for k = 20 is 0.45336547054142284
229 | */
230 |
231 | val kmeansOptimal = new KMeans().setK(2).setSeed(12345L)
232 | val modelOptimal = kmeansOptimal.fit(pcaDF)
233 |
234 | // Making predictions
235 | val predictionsOptimalDF = modelOptimal.transform(pcaDF)
236 | predictionsOptimalDF.show()
237 |
238 | // Evaluate clustering by computing Silhouette score
239 | val evaluatorOptimal = new ClusteringEvaluator()
240 |
241 | val silhouette = evaluatorOptimal.evaluate(predictionsOptimalDF)
242 | println(s"Silhouette with squared euclidean distance = $silhouette")
243 |
244 | sparkSession.stop()
245 | }
246 | }
247 |
--------------------------------------------------------------------------------
/Chapter06/pom.xml:
--------------------------------------------------------------------------------
1 |
3 | 4.0.0
4 |
5 | com.packt.AnomalyDetection
6 | RandomForest
7 | 0.0.1-SNAPSHOT
8 | jar
9 |
10 | ScalaMLQuickStartGuide
11 | http://maven.apache.org
12 |
13 |
14 | UTF-8
15 | 1.8
16 | 2.2.0
17 |
18 |
19 |
20 |
21 | jdk.tools
22 | jdk.tools
23 | 1.8.0_171
24 | system
25 | C:/Program Files/Java/jdk1.8.0_171/lib/tools.jar
26 |
27 |
28 | org.apache.directory.studio
29 | org.apache.commons.io
30 | 2.4
31 |
32 |
33 | org.apache.spark
34 | spark-core_2.11
35 | ${spark.version}
36 |
37 |
38 | com.github.tototoshi
39 | scala-csv_2.10
40 | 1.3.5
41 |
42 |
43 | org.apache.spark
44 | spark-sql_2.11
45 | ${spark.version}
46 |
47 |
48 | com.github.scopt
49 | scopt_2.11
50 | 3.3.0
51 |
52 |
53 | com.typesafe
54 | config
55 | 1.2.1
56 |
57 |
58 | org.apache.directory.api
59 | api-util
60 | 1.0.0
61 |
62 |
63 | commons-io
64 | commons-io
65 | 2.6
66 |
67 |
68 | com.esotericsoftware.kryo
69 | kryo
70 | 2.10
71 |
72 |
73 | edu.stanford.nlp
74 | stanford-corenlp
75 | 3.6.0
76 |
77 |
78 | edu.stanford.nlp
79 | stanford-corenlp
80 | 3.6.0
81 | models
82 |
83 |
84 | org.apache.hadoop
85 | hadoop-common
86 | 2.6.0
87 |
88 |
89 | org.sameersingh.scalaplot
90 | scalaplot
91 | 0.0.4
92 |
93 |
94 | org.apache.spark
95 | spark-mllib_2.11
96 | ${spark.version}
97 |
98 |
99 | org.apache.spark
100 | spark-graphx_2.11
101 | ${spark.version}
102 |
103 |
104 | org.apache.spark
105 | spark-yarn_2.11
106 | ${spark.version}
107 |
108 |
109 | org.apache.spark
110 | spark-network-shuffle_2.11
111 | ${spark.version}
112 |
113 |
114 | com.databricks
115 | spark-csv_2.11
116 | 1.3.0
117 |
118 |
119 | com.holdenkarau
120 | spark-testing-base_2.10
121 | 2.0.0_0.6.0
122 |
123 |
124 | com.databricks
125 | spark-avro_2.11
126 | 4.0.0
127 |
128 |
129 | org.apache.commons
130 | commons-math3
131 | 3.2
132 |
133 |
134 | org.apache.hive
135 | hive-exec
136 | 2.3.2
137 |
138 |
139 | junit
140 | junit
141 | 3.8.1
142 | test
143 |
144 |
145 |
146 |
147 |
148 |
149 | org.apache.maven.plugins
150 | maven-eclipse-plugin
151 | 2.9
152 |
153 | true
154 | false
155 |
156 |
157 |
158 |
159 | org.apache.maven.plugins
160 | maven-compiler-plugin
161 | 3.5.1
162 |
163 | ${jdk.version}
164 | ${jdk.version}
165 |
166 |
167 |
168 | maven-shade-plugin
169 | 2.4.3
170 |
171 |
172 | package
173 |
174 | shade
175 |
176 |
177 | false
178 |
179 |
180 |
181 | *:*
182 |
183 | META-INF/*.SF
184 | META-INF/*.DSA
185 | META-INF/*.RSA
186 |
187 |
188 |
189 |
190 |
192 |
193 |
194 |
195 |
196 |
197 |
198 |
199 | org.apache.maven.plugins
200 | maven-assembly-plugin
201 | 2.4.1
202 |
203 |
204 |
205 | jar-with-dependencies
206 |
207 |
208 |
209 |
210 | com.packt.ScalaML.ProductionEngineering.BoschProductionLinePerformance2
211 |
212 |
213 |
214 |
215 | oozie.launcher.mapreduce.job.user.classpath.first
216 | true
217 |
218 |
219 |
220 |
221 |
222 | make-assembly
223 |
224 | package
225 |
226 | single
227 |
228 |
229 |
230 |
231 |
232 |
233 |
234 |
--------------------------------------------------------------------------------
/Chapter06/src/main/scala/ScalaBookRecommendation/BookRecommendation.scala:
--------------------------------------------------------------------------------
1 | package ScalaBookRecommendation
2 |
3 | import org.apache.spark.sql.SparkSession
4 | import org.apache.spark.sql.functions._
5 | import org.apache.spark.sql.types._
6 | import org.apache.spark.sql.SQLContext
7 | import org.apache.spark.sql.SQLImplicits
8 | import org.apache.spark.sql._
9 | import org.apache.spark.sql.Dataset
10 | import org.apache.spark.mllib.recommendation.ALS
11 | import org.apache.spark.mllib.recommendation.MatrixFactorizationModel
12 | import org.apache.spark.mllib.recommendation.Rating
13 | import scala.Tuple2
14 |
15 | import org.apache.spark.rdd.RDD
16 |
17 | object BookRecommendation {
18 | // Compute the RMSE to evaluate the model. The lower the RMSE, the better the model and its prediction capability.
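// RMSE = sqrt( mean( (predicted rating - actual rating)^2 ) ), computed by joining the predictions
// with the held-out ratings on the (user, product) key.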
19 | def computeRmse(model: MatrixFactorizationModel, data: RDD[Rating], implicitPrefs: Boolean): Double = {
20 | val predictions: RDD[Rating] = model.predict(data.map(x => (x.user, x.product)))
21 | val predictionsAndRatings = predictions.map { x => ((x.user, x.product), x.rating)
22 | }.join(data.map(x => ((x.user, x.product), x.rating))).values
23 | math.sqrt(predictionsAndRatings.map(x => (x._1 - x._2) * (x._1 - x._2)).mean())
24 | }
25 |
26 | def main(args: Array[String]) {
27 | val spark = SparkSession
28 | .builder
29 | .master("local[*]")
30 | .config("spark.sql.warehouse.dir", "E:/Exp/")
31 | .appName("BookRecommendation")
32 | .getOrCreate()
33 |
34 | import spark.implicits._
35 |
36 | println("Loading Ratings data...")
37 |
38 | val ratingsFile = "data/BX-Book-Ratings.csv"
39 | var ratingDF = spark.read.format("com.databricks.spark.csv")
40 | .option("delimiter", ";")
41 | .option("header", true)
42 | .load(ratingsFile)
43 |
44 | ratingDF = ratingDF.withColumnRenamed("User-ID", "UserID").withColumnRenamed("Book-Rating", "Rating")
45 | ratingDF.printSchema()
46 |
47 | /* Explore and Query with Spark DataFrames */
48 | val numRatings = ratingDF.count()
49 | val numUsers = ratingDF.select(ratingDF.col("UserID")).distinct().count()
50 | val numBooks = ratingDF.select(ratingDF.col("ISBN")).distinct().count()
51 | println("Got " + numRatings + " ratings from " + numUsers + " users on " + numBooks + " books") /* Got 1149780 ratings from 105283 users on 340556 books */
52 |
53 | val booksFile = "data/BX-Books.csv"
54 | var bookDF = spark.read.format("com.databricks.spark.csv").option("header", "true").option("delimiter", ";").load(booksFile)
55 | bookDF.show()
56 |
57 | bookDF = bookDF.select(bookDF.col("ISBN"), bookDF.col("Book-Title"), bookDF.col("Book-Author"), bookDF.col("Year-Of-Publication"))
58 | bookDF = bookDF.withColumnRenamed("Book-Title", "Title").withColumnRenamed("Book-Author", "Author").withColumnRenamed("Year-Of-Publication", "Year")
59 | bookDF.show(10)
60 | /*
61 | * +----------+--------------------+--------------------+----+
62 | | ISBN| Title| Author|Year|
63 | +----------+--------------------+--------------------+----+
64 | |0195153448| Classical Mythology| Mark P. O. Morford|2002|
65 | |0002005018| Clara Callan|Richard Bruce Wright|2001|
66 | |0060973129|Decision in Normandy| Carlo D'Este|1991|
67 | |0374157065|Flu: The Story of...| Gina Bari Kolata|1999|
68 | |0393045218|The Mummies of Ur...| E. J. W. Barber|1999|
69 | |0399135782|The Kitchen God's...| Amy Tan|1991|
70 | |0425176428|What If?: The Wor...| Robert Cowley|2000|
71 | |0671870432| PLEADING GUILTY| Scott Turow|1993|
72 | |0679425608|Under the Black F...| David Cordingly|1996|
73 | |074322678X|Where You'll Find...| Ann Beattie|2002|
74 | +----------+--------------------+--------------------+----+
75 | only showing top 10 rows
76 | */
77 |
78 | ratingDF.createOrReplaceTempView("ratings")
79 | bookDF.createOrReplaceTempView("books")
80 |
81 | spark.sql("SELECT max(Rating) FROM ratings").show()
82 |
83 | // Get the max, min ratings along with the count of users who have rated a book.
84 | val statDF = spark.sql("select books.Title, bookrates.maxr, bookrates.minr, bookrates.cntu "
85 | + "from(SELECT ratings.ISBN,max(ratings.Rating) as maxr,"
86 | + "min(ratings.Rating) as minr,count(distinct UserID) as cntu "
87 | + "FROM ratings group by ratings.ISBN) bookrates "
88 | + "join books on bookrates.ISBN=books.ISBN " + "order by bookrates.cntu desc")
89 |
90 | statDF.show(10)
91 | /*
92 | * +--------------------+----+----+----+
93 | | Title|maxr|minr|cntu|
94 | +--------------------+----+----+----+
95 | | Wild Animus| 9| 0|2502|
96 | |The Lovely Bones:...| 9| 0|1295|
97 | | The Da Vinci Code| 9| 0| 883|
98 | |Divine Secrets of...| 9| 0| 732|
99 | |The Red Tent (Bes...| 9| 0| 723|
100 | | A Painted House| 9| 0| 647|
101 | |The Secret Life o...| 9| 0| 615|
102 | |Snow Falling on C...| 9| 0| 614|
103 | | Angels & Demons| 9| 0| 586|
104 | |Where the Heart I...| 9| 0| 585|
105 | +--------------------+----+----+----+
106 | only showing top 10 rows
107 | */
108 |
109 | // Show the top 10 most-active users and how many times they rated a book
110 | val mostActiveReaders = spark.sql("SELECT ratings.UserID, count(*) as CT from ratings "
111 | + "group by ratings.UserID order by CT desc limit 10")
112 | mostActiveReaders.show()
113 | /*
114 | * +------+-----+
115 | |UserID| CT|
116 | +------+-----+
117 | | 11676|13602|
118 | |198711| 7550|
119 | |153662| 6109|
120 | | 98391| 5891|
121 | | 35859| 5850|
122 | |212898| 4785|
123 | |278418| 4533|
124 | | 76352| 3367|
125 | |110973| 3100|
126 | |235105| 3067|
127 | +------+-----+
128 | */
129 |
130 | // Find the books that user 276744 rated higher than 4
131 | val ratingBySpecificReader = spark.sql(
132 | "SELECT ratings.UserID, ratings.ISBN,"
133 | + "ratings.Rating, books.Title FROM ratings JOIN books "
134 | + "ON books.ISBN=ratings.ISBN "
135 | + "where ratings.UserID=276744 and ratings.Rating > 4")
136 |
137 | ratingBySpecificReader.show(false)
138 |
139 | /*
140 | * +------+----------+------+---------------+
141 | |UserID|ISBN |Rating|Title |
142 | +------+----------+------+---------------+
143 | |276744|038550120X|7 |A Painted House|
144 | +------+----------+------+---------------+
145 | */
146 |
147 | // Feature engineering: ALS needs integer product IDs, so hash the alphanumeric ISBN strings and keep the absolute value
148 | ratingDF = ratingDF.withColumn("ISBN_1", hash($"ISBN"))
149 | ratingDF = ratingDF.select("UserID", "ISBN_1", "Rating")
150 | ratingDF = ratingDF.withColumn("ISBN", abs($"ISBN_1"))
151 | ratingDF = ratingDF.select("UserID", "ISBN", "Rating")
152 |
153 | ratingDF.printSchema()
154 | /*
155 | * root
156 | |-- UserID: string (nullable = true)
157 | |-- ISBN: integer (nullable = false)
158 | |-- Rating: string (nullable = true)
159 | */
160 |
161 | val seed = 12345
162 | val splits = ratingDF.randomSplit(Array(0.60, 0.40), seed)
163 | val (trainingData, testData) = (splits(0), splits(1))
164 |
165 | trainingData.cache
166 | testData.cache
167 |
168 | val numTrainingSample = trainingData.count()
169 | val numTestSample = testData.count()
170 | println("Training: " + numTrainingSample + " test: " + numTestSample) // Training: 689144 test: 345774
171 |
172 | val trainRatingsRDD = trainingData.rdd.map(row => {
173 | val userID = row.getString(0)
174 | val ISBN = row.getInt(1)
175 | val ratings = row.getString(2)
176 | Rating(userID.toInt, ISBN, ratings.toDouble)
177 | })
178 |
179 | val testRatingsRDD = testData.rdd.map(row => {
180 | val userID = row.getString(0)
181 | val ISBN = row.getInt(1)
182 | val ratings = row.getString(2)
183 | Rating(userID.toInt, ISBN, ratings.toDouble)
184 | })
185 |
186 | val model: MatrixFactorizationModel = new ALS()
187 | .setIterations(10) // number of ALS iterations
188 | .setBlocks(-1) // -1 lets Spark configure the number of blocks automatically
189 | .setAlpha(1.0) // confidence parameter; only used for implicit feedback
190 | .setLambda(0.01) // regularization parameter
191 | .setRank(25) // number of latent factors
192 | .setSeed(1234579L)
193 | .setImplicitPrefs(false) // ratings are treated as explicit feedback
194 | .run(trainRatingsRDD)
195 |
196 | // Saving the model for future use
197 | //val savedALSModel = model.save(spark.sparkContext, "model/BookRecomModel")
198 |
199 | // Load the workflow back
200 | //val same_model = MatrixFactorizationModel.load(spark.sparkContext, "model/BookRecomModel/")
201 |
202 | //Book recommendation for a specific user. Get the top 10 book predictions for reader 276747
203 | println("Recommendations: (ISBN, Rating)")
204 | println("----------------------------------")
205 | val recommendationsUser = model.recommendProducts(276747, 10)
206 | recommendationsUser.map(rating => (rating.product, rating.rating)).foreach(println)
207 | println("----------------------------------")
208 |
209 | /*
210 | Recommendations: (ISBN => Rating)
211 | (1051401851,15.127044702142243)
212 | (2056910662,15.11531283195148)
213 | (1013412890,14.75898119158678)
214 | (603241602,14.53024153450836)
215 | (1868529062,14.180262929540024)
216 | (746990712,14.121654522195225)
217 | (1630827789,13.741728003481194)
218 | (1179316963,13.571754513473993)
219 | (505970947,13.506755847456258)
220 | (632523982,13.46591014905454)
221 | ----------------------------------
222 | */
223 |
224 | // Evaluating the model: the smaller the calculated error (RMSE), the better the model
225 | val rmseTest = computeRmse(model, testRatingsRDD, false)
226 | println("Test RMSE: = " + rmseTest) // Lower is better // Test RMSE: = 1.6867585251053991
227 |
228 | val new_user_ID = 300000 // new user ID randomly chosen
229 |
230 | //The format of each line is (UserID, ISBN, Rating)
231 | val new_user_ratings = Seq(
232 | (new_user_ID, 817930596, 15.127044702142243),
233 | (new_user_ID, 1149373895, 15.11531283195148),
234 | (new_user_ID, 1885291767, 14.75898119158678),
235 | (new_user_ID, 459716613, 14.53024153450836),
236 | (new_user_ID, 3362860, 14.180262929540024),
237 | (new_user_ID, 1178102612, 14.121654522195225),
238 | (new_user_ID, 158895996, 13.741728003481194),
239 | (new_user_ID, 1007741925, 13.571754513473993),
240 | (new_user_ID, 1033268461, 13.506755847456258),
241 | (new_user_ID, 651677816, 13.46591014905454))
242 |
243 | val new_user_ratings_RDD = spark.sparkContext.parallelize(new_user_ratings)
244 | val new_user_ratings_DF = spark.createDataFrame(new_user_ratings_RDD).toDF("UserID", "ISBN", "Rating")
245 |
246 | val newRatingsRDD = new_user_ratings_DF.rdd.map(row => {
247 | val userId = row.getInt(0)
248 | val isbn = row.getInt(1)
249 | val ratings = row.getDouble(2)
250 | Rating(userId, isbn, ratings)
251 | })
252 |
253 | val complete_data_with_new_ratings_RDD = trainRatingsRDD.union(newRatingsRDD)
254 |
255 | val newModel : MatrixFactorizationModel = new ALS()
256 | .setIterations(10)
257 | .setBlocks(-1)
258 | .setAlpha(1.0)
259 | .setLambda(0.01)
260 | .setRank(25)
261 | .setSeed(123457L)
262 | .setImplicitPrefs(false)
263 | .run(complete_data_with_new_ratings_RDD)
264 |
265 | // Making predictions with the updated model:
266 | // get the top 10 book recommendations for reader 276747
267 | println("Recommendations: (ISBN, Rating)")
268 | println("----------------------------------")
269 | val newPredictions = newModel.recommendProducts(276747, 10)
270 | newPredictions.map(rating => (rating.product, rating.rating)).foreach(println)
271 | println("----------------------------------")
272 |
273 | val newRmseTest = computeRmse(newModel, testRatingsRDD, false)
274 | println("Test RMSE: = " + newRmseTest) // Lower is better
275 | }
276 | }
--------------------------------------------------------------------------------
/Chapter07/pom.xml:
--------------------------------------------------------------------------------
1 |
3 | 4.0.0
4 |
5 | com.packt.AnomalyDetection
6 | RandomForest
7 | 0.0.1-SNAPSHOT
8 | jar
9 |
10 | ScalaMLQuickStartGuide
11 | http://maven.apache.org
12 |
13 |
14 | UTF-8
15 | 1.8
16 | 2.2.0
17 | 1.0.0-alpha
18 | 1.0.0-alpha
19 | 1.0.0-alpha
20 | 1.0.0-alpha
21 | 1.2.3
22 |
23 |
24 |
25 |
26 | jdk.tools
27 | jdk.tools
28 | 1.8.0_171
29 | system
30 | C:/Program Files/Java/jdk1.8.0_171/lib/tools.jar
31 |
32 |
33 | org.apache.directory.studio
34 | org.apache.commons.io
35 | 2.4
36 |
37 |
38 | org.deeplearning4j
39 | scalnet_2.11
40 | 1.0.0-alpha
41 |
42 |
43 | org.apache.spark
44 | spark-core_2.11
45 | ${spark.version}
46 |
47 |
48 | com.github.tototoshi
49 | scala-csv_2.10
50 | 1.3.5
51 |
52 |
53 | org.apache.spark
54 | spark-sql_2.11
55 | ${spark.version}
56 |
57 |
58 | com.github.scopt
59 | scopt_2.11
60 | 3.3.0
61 |
62 |
63 | com.typesafe
64 | config
65 | 1.2.1
66 |
67 |
68 | org.apache.directory.api
69 | api-util
70 | 1.0.0
71 |
72 |
73 | commons-io
74 | commons-io
75 | 2.6
76 |
77 |
78 | com.esotericsoftware.kryo
79 | kryo
80 | 2.10
81 |
82 |
83 | edu.stanford.nlp
84 | stanford-corenlp
85 | 3.6.0
86 |
87 |
88 | edu.stanford.nlp
89 | stanford-corenlp
90 | 3.6.0
91 | models
92 |
93 |
94 | org.apache.hadoop
95 | hadoop-common
96 | 2.6.0
97 |
98 |
99 | org.sameersingh.scalaplot
100 | scalaplot
101 | 0.0.4
102 |
103 |
104 | org.apache.spark
105 | spark-mllib_2.11
106 | ${spark.version}
107 |
108 |
109 | org.apache.spark
110 | spark-graphx_2.11
111 | ${spark.version}
112 |
113 |
114 | org.apache.spark
115 | spark-yarn_2.11
116 | ${spark.version}
117 |
118 |
119 | org.apache.spark
120 | spark-network-shuffle_2.11
121 | ${spark.version}
122 |
123 |
124 | com.databricks
125 | spark-csv_2.11
126 | 1.3.0
127 |
128 |
129 | com.holdenkarau
130 | spark-testing-base_2.10
131 | 2.0.0_0.6.0
132 |
133 |
134 | com.databricks
135 | spark-avro_2.11
136 | 4.0.0
137 |
138 |
139 | org.apache.commons
140 | commons-math3
141 | 3.2
142 |
143 |
144 | org.apache.hive
145 | hive-exec
146 | 2.3.2
147 |
148 |
149 | junit
150 | junit
151 | 3.8.1
152 | test
153 |
154 |
155 | org.nd4j
156 | nd4j-native
157 | ${nd4j.version}
158 |
159 |
160 | org.deeplearning4j
161 | deeplearning4j-ui_2.11
162 | ${dl4j.version}
163 |
164 |
165 | org.deeplearning4j
166 | deeplearning4j-core
167 | ${dl4j.version}
168 |
169 |
170 | org.deeplearning4j
171 | deeplearning4j-nlp
172 | ${dl4j.version}
173 |
174 |
175 | org.deeplearning4j
176 | deeplearning4j-zoo
177 | ${dl4j.version}
178 |
179 |
180 | org.deeplearning4j
181 | arbiter-deeplearning4j
182 | ${arbiter.version}
183 |
184 |
185 | org.deeplearning4j
186 | arbiter-ui_2.11
187 | ${arbiter.version}
188 |
189 |
190 | datavec-data-codec
191 | org.datavec
192 | ${datavec.version}
193 |
194 |
195 | org.apache.httpcomponents
196 | httpclient
197 | 4.3.5
198 |
199 |
200 | ch.qos.logback
201 | logback-classic
202 | ${logback.version}
203 |
204 |
205 | org.datavec
206 | datavec-data-image
207 | ${dl4j.version}
208 |
209 |
210 | org.bytedeco
211 | javacv-platform
212 | 1.4.1
213 |
214 |
215 | org.datavec
216 | datavec-hadoop
217 | ${datavec.version}
218 |
219 |
220 |
221 | org.deeplearning4j
222 | arbiter-deeplearning4j
223 | ${arbiter.version}
224 |
225 |
226 | org.deeplearning4j
227 | arbiter-ui_2.11
228 | ${arbiter.version}
229 |
230 |
231 | org.apache.httpcomponents
232 | httpclient
233 | 4.3.5
234 |
235 |
236 | ch.qos.logback
237 | logback-classic
238 | ${logback.version}
239 |
240 |
241 |
242 | jfree
243 | jfreechart
244 | 1.0.13
245 |
246 |
247 | org.jcodec
248 | jcodec
249 | 0.2.3
250 |
251 |
252 |
253 |
254 |
255 |
256 | org.apache.maven.plugins
257 | maven-eclipse-plugin
258 | 2.9
259 |
260 | true
261 | false
262 |
263 |
264 |
265 |
266 | org.apache.maven.plugins
267 | maven-compiler-plugin
268 | 3.5.1
269 |
270 | ${jdk.version}
271 | ${jdk.version}
272 |
273 |
274 |
275 | maven-shade-plugin
276 | 2.4.3
277 |
278 |
279 | package
280 |
281 | shade
282 |
283 |
284 | false
285 |
286 |
287 |
288 | *:*
289 |
290 | META-INF/*.SF
291 | META-INF/*.DSA
292 | META-INF/*.RSA
293 |
294 |
295 |
296 |
297 |
299 |
300 |
301 |
302 |
303 |
304 |
305 |
306 | org.apache.maven.plugins
307 | maven-assembly-plugin
308 | 2.4.1
309 |
310 |
311 |
312 | jar-with-dependencies
313 |
314 |
315 |
316 |
317 | com.packt.ScalaML.ProductionEngineering.BoschProductionLinePerformance2
318 |
319 |
320 |
321 |
322 | oozie.launcher.mapreduce.job.user.classpath.first
323 | true
324 |
325 |
326 |
327 |
328 |
329 | make-assembly
330 |
331 | package
332 |
333 | single
334 |
335 |
336 |
337 |
338 |
339 |
340 |
341 |
--------------------------------------------------------------------------------
/Chapter07/src/main/scala/GettingStartedDL/CancerDataPreprocessor.scala:
--------------------------------------------------------------------------------
1 | package GettingStartedDL
2 |
3 | import org.apache.spark._
4 | import org.apache.spark.sql.SparkSession
5 | import org.apache.spark.sql.functions._
6 | import org.apache.spark.sql.types._
7 | import org.apache.spark.sql._
8 | import org.apache.spark.sql.Dataset
9 | import org.apache.spark.ml.Pipeline
10 | import org.apache.spark.ml.classification.RandomForestClassifier
11 | import org.apache.spark.ml.classification.RandomForestClassificationModel
12 | import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
13 | import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
14 | import org.apache.spark.ml.feature.StringIndexer
15 | import org.apache.spark.ml.tuning.ParamGridBuilder
16 | import org.apache.spark.ml.tuning.CrossValidator
17 | import org.apache.spark.ml.feature.VectorAssembler
18 |
19 | object CancerDataPreprocessor {
20 | def main(args: Array[String]) = {
21 | val spark: SparkSession = SparkSession.builder().
22 | appName("churn")
23 | .master("local[*]")
24 | .config("spark.sql.warehouse.dir", "E:/Exp/") // set your warehouse directory accordingly
25 | .config("spark.sql.crossJoin.enabled", "true")
26 | .getOrCreate()
27 |
28 | val data = spark.read.option("maxColumns", 25000).format("com.databricks.spark.csv")
29 | .option("header", "true") // Use first line of all files as header
30 | .option("inferSchema", "true") // Automatically infer data types
31 | .load("C:/Users/admin-karim/Desktop/old2/TCGA-PANCAN/TCGA-PANCAN-HiSeq-801x20531/data.csv"); // set your path accordingly
32 |
33 | val numFeatures = data.columns.length
34 | val numSamples = data.count()
35 | println("Number of features: " + numFeatures)
36 | println("Number of samples: " + numSamples)
37 |
38 | val numericDF = data.drop("id") // now 20531 features left
39 |
40 | val labels = spark.read.format("com.databricks.spark.csv").option("header", "true") // Use first line of all files as header
41 | .option("inferSchema", "true") // Automatically infer data types
42 | .load("C:/Users/admin-karim/Desktop/old2/TCGA-PANCAN/TCGA-PANCAN-HiSeq-801x20531/labels.csv")
43 |
44 | labels.show(10)
45 |
46 | val indexer = new StringIndexer().setInputCol("Class").setOutputCol("label").setHandleInvalid("skip"); // skip null/invalid values
47 | val indexedDF = indexer.fit(labels).transform(labels).select(col("label").cast(DataTypes.IntegerType)); // casting data types to integer
48 |
49 | indexedDF.show()
50 |
51 | val combinedDF = numericDF.join(indexedDF)
52 |
53 | val splits = combinedDF.randomSplit(Array(0.7, 0.3), 12345L) //70% for training, 30% for testing
54 | val trainingData = splits(0)
55 | val testData = splits(1)
56 |
57 | println(trainingData.count()); // number of samples in training set
58 | println(testData.count()); // number of samples in test set
59 |
60 | trainingData.coalesce(1).write
61 | .format("com.databricks.spark.csv")
62 | .option("header", "false")
63 | .option("delimiter", ",")
64 | .save("output/TCGA_train.csv")
65 |
66 | testData.coalesce(1).write
67 | .format("com.databricks.spark.csv")
68 | .option("header", "false")
69 | .option("delimiter", ",")
70 | .save("output/TCGA_test.csv")
71 |
72 | }
73 | }
--------------------------------------------------------------------------------
/Chapter07/src/main/scala/GettingStartedDL/CancerTypePrediction.scala:
--------------------------------------------------------------------------------
1 | package GettingStartedDL
2 |
3 | import java.io.File
4 | import java.io.IOException
5 | import org.datavec.api.records.reader.RecordReader
6 | import org.datavec.api.records.reader.impl.csv.CSVRecordReader
7 | import org.datavec.api.split.FileSplit
8 | import org.deeplearning4j.datasets.datavec.RecordReaderDataSetIterator
9 | import org.deeplearning4j.eval.Evaluation
10 | import org.deeplearning4j.nn.api.Layer
11 | import org.deeplearning4j.nn.api.OptimizationAlgorithm
12 | import org.deeplearning4j.nn.conf.MultiLayerConfiguration
13 | import org.deeplearning4j.nn.conf.NeuralNetConfiguration
14 | import org.deeplearning4j.nn.conf.layers.LSTM
15 | import org.deeplearning4j.nn.conf.layers.RnnOutputLayer
16 | import org.deeplearning4j.nn.multilayer.MultiLayerNetwork
17 | import org.deeplearning4j.nn.weights.WeightInit
18 | import org.deeplearning4j.optimize.listeners.ScoreIterationListener
19 | import org.nd4j.linalg.activations.Activation
20 | import org.nd4j.linalg.api.ndarray.INDArray
21 | import org.nd4j.linalg.dataset.DataSet
22 | import org.nd4j.linalg.dataset.api.iterator.DataSetIterator
23 | import org.nd4j.linalg.learning.config.Adam
24 | import org.nd4j.linalg.lossfunctions.LossFunctions.LossFunction
25 |
26 | object CancerTypePrediction {
27 | def readCSVDataset(csvFileClasspath:String, batchSize:Int, labelIndex:Int, numClasses:Int) : DataSetIterator = {
28 | val rr:RecordReader = new CSVRecordReader()
29 | val input:File = new File(csvFileClasspath)
30 | rr.initialize(new FileSplit(input))
31 | val iterator:DataSetIterator = new RecordReaderDataSetIterator(rr, batchSize, labelIndex, numClasses)
32 | return iterator
33 | }
34 |
35 | def main(args: Array[String]): Unit = {
36 | val numEpochs = 10
37 | // Paths to the preprocessed training and test CSV files; set them according to your environment
38 | val trainPath = "C:/Users/admin-karim/Desktop/old2/TCGA-PANCAN/TCGA_train.csv"
39 | val testPath = "C:/Users/admin-karim/Desktop/old2/TCGA-PANCAN/TCGA_test.csv"
40 |
41 | // ----------------------------------
42 | // Preparing training and test set.
43 | val labelIndex = 20531
44 | val numClasses = 5
45 | val batchSize = 128
46 |
47 | // This dataset is used for training
48 | val trainingDataIt: DataSetIterator = readCSVDataset(trainPath, batchSize, labelIndex, numClasses)
49 |
50 | // This is the data we want to classify
51 | val testDataIt:DataSetIterator = readCSVDataset(testPath, batchSize, labelIndex, numClasses)
52 |
53 | // ----------------------------------
54 | // Network hyperparameters
55 | val seed = 12345
56 | val numInputs = labelIndex
57 | val numOutputs = numClasses
58 | val numHiddenNodes = 5000
59 |
60 | //First LSTM layer
61 | val layer_0 = new LSTM.Builder()
62 | .nIn(numInputs)
63 | .nOut(numHiddenNodes)
64 | .activation(Activation.RELU)
65 | .build()
66 |
67 | //Second LSTM layer
68 | val layer_1 = new LSTM.Builder()
69 | .nIn(numHiddenNodes)
70 | .nOut(numHiddenNodes)
71 | .activation(Activation.RELU)
72 | .build()
73 |
74 | //Third LSTM layer
75 | val layer_2 = new LSTM.Builder()
76 | .nIn(numHiddenNodes)
77 | .nOut(numHiddenNodes)
78 | .activation(Activation.RELU)
79 | .build()
80 |
81 | //RNN output layer
82 | val layer_3 = new RnnOutputLayer.Builder()
83 | .activation(Activation.SOFTMAX)
84 | .lossFunction(LossFunction.MCXENT)
85 | .nIn(numHiddenNodes)
86 | .nOut(numOutputs)
87 | .build()
88 |
89 | // Create network configuration and conduct network training
90 | val LSTMconf: MultiLayerConfiguration = new NeuralNetConfiguration.Builder()
91 | .seed(seed) //Random number generator seed for improved repeatability. Optional.
92 | .optimizationAlgo(OptimizationAlgorithm.STOCHASTIC_GRADIENT_DESCENT)
93 | .weightInit(WeightInit.XAVIER)
94 | .updater(new Adam(5e-3))
95 | .l2(1e-5)
96 | .list()
97 | .layer(0, layer_0)
98 | .layer(1, layer_1)
99 | .layer(2, layer_2)
100 | .layer(3, layer_3)
101 | .pretrain(false).backprop(true).build()
102 |
103 | // Create and initialize multilayer network
104 | val model: MultiLayerNetwork = new MultiLayerNetwork(LSTMconf)
105 | model.init()
106 |
107 | // Print the training score after every iteration
108 | model.setListeners(new ScoreIterationListener(1));
109 |
110 | //Print the number of parameters in the network (and for each layer)
111 | val layers = model.getLayers()
112 | var totalNumParams = 0
114 |
115 | for (i <- 0 to layers.length-1) {
116 | val nParams = layers(i).numParams()
117 | println("Number of parameters in layer " + i + ": " + nParams)
118 | totalNumParams = totalNumParams + nParams
119 | }
120 |
121 | println("Total number of network parameters: " + totalNumParams)
122 |
124 | println("Train model....")
125 | for (j <- 0 to numEpochs-1) {
126 | model.fit(trainingDataIt)
127 | }
128 |
129 | println("Evaluate model....")
130 | val eval: Evaluation = new Evaluation(5) // create an evaluation object with 5 possible classes (one per cancer type)
131 |
132 | while (testDataIt.hasNext()) {
133 | val next:DataSet = testDataIt.next()
134 | val output:INDArray = model.output(next.getFeatureMatrix()) //get the networks prediction
135 | eval.eval(next.getLabels(), output) //check the prediction against the true class
136 | }
137 |
138 | println(eval.stats())
139 | println("****************Example finished********************")
140 | }
141 | }
142 |
--------------------------------------------------------------------------------
/Chapter07/src/test/scala/com/packt/ScalaMLQuickStartGuide/AppTest.java:
--------------------------------------------------------------------------------
1 | package com.packt.ScalaMLQuickStartGuide;
2 |
3 | import junit.framework.Test;
4 | import junit.framework.TestCase;
5 | import junit.framework.TestSuite;
6 |
7 | /**
8 | * Unit test for simple App.
9 | */
10 | public class AppTest
11 | extends TestCase
12 | {
13 | /**
14 | * Create the test case
15 | *
16 | * @param testName name of the test case
17 | */
18 | public AppTest( String testName )
19 | {
20 | super( testName );
21 | }
22 |
23 | /**
24 | * @return the suite of tests being tested
25 | */
26 | public static Test suite()
27 | {
28 | return new TestSuite( AppTest.class );
29 | }
30 |
31 | /**
32 | * Rigorous Test :-)
33 | */
34 | public void testApp()
35 | {
36 | assertTrue( true );
37 | }
38 | }
39 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2018 Packt
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | # Machine Learning with Scala Quick Start Guide
5 |
6 |
7 | This is the code repository for [Machine Learning with Scala Quick Start Guide](https://www.packtpub.com/big-data-and-business-intelligence/machine-learning-scala-quick-start-guide), published by Packt.
8 |
9 | **Leverage popular machine learning algorithms and techniques and implement them in Scala**
10 |
11 | ## What is this book about?
12 | Scala is a highly scalable language that blends object-oriented and functional programming concepts, making it easy to build scalable and complex big data applications. This book is a handy guide for machine learning developers and data scientists who want to develop and train effective machine learning models in Scala.
13 |
14 | This book covers the following exciting features:
15 | * Get acquainted with JVM-based machine learning libraries for Scala such as Spark ML and Deeplearning4j
16 | * Learn RDDs, DataFrames, and Spark SQL for analyzing structured and unstructured data
17 | * Understand supervised and unsupervised learning techniques with best practices and pitfalls
18 | * Learn classification and regression analysis with linear regression, logistic regression, Naïve Bayes, support vector machines, and tree-based ensemble techniques
19 | * Learn effective ways of clustering analysis with dimensionality reduction techniques
20 |
21 | If you feel this book is for you, get your [copy](https://www.amazon.com/dp/1789345073) today!
22 |
23 |
25 |
26 |
27 | ## Instructions and Navigations
28 | All of the code is organized into folders. For example, Chapter02.
29 |
30 | The code will look like the following:
31 | ```
32 | rawTrafficDF.select("Hour (Coded)", "Immobilized bus", "Broken Truck",
33 | "Vehicle excess", "Fire", "Slowness in traffic (%)").show(5)
34 | ```
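
Most chapter programs follow the same startup pattern: create a local `SparkSession` and read a CSV file into a DataFrame. The snippet below is a minimal sketch of that pattern; the application name, warehouse directory, and input path are placeholders to adapt to your own environment:

```
import org.apache.spark.sql.SparkSession

// Minimal sketch of the SparkSession setup used throughout the chapters.
// The app name, warehouse directory, and input path below are placeholders.
val spark = SparkSession
  .builder
  .master("local[*]")                               // run locally, using all available cores
  .config("spark.sql.warehouse.dir", "tmp/spark")   // placeholder warehouse directory
  .appName("ChapterExample")
  .getOrCreate()

// Most examples then read a CSV file into a DataFrame:
val df = spark.read
  .option("header", "true")        // first line contains the column names
  .option("inferSchema", "true")   // let Spark infer the column types
  .csv("data/input.csv")           // placeholder path to a dataset

df.printSchema()
df.show(5)
```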
35 |
36 | **Following is what you need for this book:**
37 | This book is for machine learning developers looking to train machine learning models in Scala without spending too much time and effort. Some fundamental knowledge of Scala programming, along with the basics of statistics and linear algebra, is all you need to get started with this book.
38 |
39 | With the following software and hardware list, you can run all the code files present in the book (Chapters 1-7).
40 |
41 | ### Software and Hardware List
42 |
43 | | Chapter | Software required | OS required |
44 | | -------- | ------------------------------------| -----------------------------------|
45 | | 1-3,6 | Spark: 2.3.0 (or higher), Hadoop: 2.7 (or higher), Java (JDK and JRE): 1.8+, Scala: 2.11.x (or higher), Eclipse Mars/Luna: latest, Maven Eclipse plugin: 2.9 or higher, Maven compiler plugin for Eclipse: 2.3.2 or higher, Maven assembly plugin for Eclipse: 2.4.1 or higher. Importantly, reuse the provided pom.xml file from the Packt supplementary material and update the versions and APIs mentioned above; Maven will then manage the dependencies accordingly.| Windows, Mac OS X, and Linux (Any) |
46 | | 5 | Same as above, plus the following: H2O version: 3.22.1.1, Sparkling Water version: 2.4.1, ADAM version: 0.23.0 | Windows, Mac OS X, and Linux (Any) |
47 | | 7 | Same as above, plus the following: spark-csv_2.11 version: 1.3.0, ND4J backend: nd4j-cuda-9.0-platform if a GPU is configured, otherwise nd4j-native, ND4J version: 1.0.0-alpha, DL4J version: 1.0.0-alpha, DataVec version: 1.0.0-alpha, Arbiter version: 1.0.0-alpha, Logback version: 1.2.3 | Windows, Mac OS X, and Linux (Any) |
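
The Chapter 7 examples additionally rely on Deeplearning4j: the preprocessed CSV files are read through a DataVec `CSVRecordReader` wrapped in a `RecordReaderDataSetIterator`. The following is a minimal sketch of that loading step; the helper name is hypothetical, and the path, batch size, label index, and class count are placeholders mirroring the values used in `CancerTypePrediction.scala`:

```
import java.io.File
import org.datavec.api.records.reader.impl.csv.CSVRecordReader
import org.datavec.api.split.FileSplit
import org.deeplearning4j.datasets.datavec.RecordReaderDataSetIterator
import org.nd4j.linalg.dataset.api.iterator.DataSetIterator

// Minimal sketch: turn a header-less CSV file into a DL4J DataSetIterator.
// The path, batch size, label column index, and class count are placeholders.
def csvIterator(path: String, batchSize: Int, labelIndex: Int, numClasses: Int): DataSetIterator = {
  val reader = new CSVRecordReader()
  reader.initialize(new FileSplit(new File(path)))  // read all records from the file
  new RecordReaderDataSetIterator(reader, batchSize, labelIndex, numClasses)
}

// Example usage with the Chapter 7 values (20,531 features, 5 cancer types):
// val trainIt = csvIterator("output/TCGA_train.csv", 128, 20531, 5)
```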
48 |
49 |
50 | ## Code in Action
51 |
52 | Click on the following link to see the Code in Action:
53 |
54 | [http://bit.ly/2WhQf2i](http://bit.ly/2WhQf2i)
55 |
56 | ### Related products
57 | * Scala Machine Learning Projects [[Packt]](https://prod.packtpub.com/in/big-data-and-business-intelligence/scala-machine-learning-projects?utm_source=github&utm_medium=repository&utm_campaign=9781788479042) [[Amazon]](https://www.amazon.com/dp/1788479041)
58 |
59 | * Scala and Spark for Big Data Analytics [[Packt]](https://prod.packtpub.com/in/big-data-and-business-intelligence/scala-and-spark-big-data-analytics?utm_source=github&utm_medium=repository&utm_campaign=9781785280849) [[Amazon]](https://www.amazon.com/dp/1785280848)
60 |
61 | ## Get to Know the Author
62 | **Md. Rezaul Karim**
63 | Md. Rezaul Karim is a researcher, author, and data science enthusiast with a strong computer science background and 10 years of R&D experience in machine learning, deep learning, and data mining algorithms, which he applies to emerging bioinformatics research problems with a focus on making them explainable. He is passionate about applied machine learning, knowledge graphs, and explainable artificial intelligence (XAI).
64 | Currently, he is working as a research scientist at Fraunhofer FIT, Germany. He is also a Ph.D. candidate at RWTH Aachen University, Germany. Before joining FIT, he worked as a researcher at the Insight Centre for Data Analytics, Ireland. Previously, he worked as a lead software engineer at Samsung Electronics, Korea.
65 |
66 | ### Suggestions and Feedback
67 | [Click here](https://docs.google.com/forms/d/e/1FAIpQLSdy7dATC6QmEL81FIUuymZ0Wy9vH1jHkvpY57OiMeKGqib_Ow/viewform) if you have any feedback or suggestions.
68 | ### Download a free PDF
69 |
70 | If you have already purchased a print or Kindle version of this book, you can get a DRM-free PDF version at no cost.
Simply click on the link to claim your free PDF.
71 |
https://packt.link/free-ebook/9781789345070
--------------------------------------------------------------------------------