├── .gitignore ├── README.md ├── build.sbt ├── output ├── AllstateClaimsSeverityRandomForestRegressor-log.txt └── AllstateClaimsSeverityRandomForestRegressor-submission.csv ├── project └── assembly.sbt └── src └── main └── scala └── com └── adornes └── spark └── kaggle ├── AllstateClaimsSeverityGBTRegressor.scala └── AllstateClaimsSeverityRandomForestRegressor.scala /.gitignore: --------------------------------------------------------------------------------
1 | # general
2 | *~
3 | *.log
4 | tmp
5 | dump
6 | *.DS_Store
7 |
8 | # Scala IDE
9 | .idea
10 | .idea_modules
11 | .settings
12 | .cache*
13 | .project
14 | .classpath
15 | .scala_dependencies
16 | bin
17 |
18 | # IntelliJ
19 | spark_scala_ml_examples.iml
20 | metastore_db
21 |
22 | # SBT
23 | dist
24 | project/boot
25 | project/project
26 | project/target
27 | project/plugins/target
28 | project/.sbtserver*
29 | project/build.properties
30 | target
31 | .target
32 | .tmpBin
33 | .sbtserver*
34 | -------------------------------------------------------------------------------- /README.md: --------------------------------------------------------------------------------
1 | Spark Scala Machine Learning Examples
2 | =====================================
3 |
4 | This repository is part of a series on Apache Spark examples, aimed at demonstrating the implementation of Machine Learning solutions in different programming languages supported by Spark. Java is the only supported language not covered here, since its verbosity offers no practical advantage over the other languages for this purpose. Check the other repositories:
5 |
6 | * **Scala** - You are here!
7 | * **Python** - [github.com/adornes/spark_python_ml_examples](https://github.com/adornes/spark_python_ml_examples)
8 | * **R** - [github.com/adornes/spark_r_ml_examples](https://github.com/adornes/spark_r_ml_examples)
9 |
10 | This repository aims at demonstrating how to build a [Spark 2.4](https://spark.apache.org/docs/latest/) application with [Scala](http://www.scala-lang.org/) for solving Machine Learning problems, packaged with [SBT](http://www.scala-sbt.org/) and ready to be run locally or on any cloud platform such as [AWS Elastic MapReduce (EMR)](https://aws.amazon.com/emr/).
11 |
12 | Each Scala script in the package can be run as an individual application, as described in the next sections.
13 |
14 | ### Why Spark?
15 |
16 | Since almost all personal computers nowadays have many gigabytes of RAM (and the amount keeps growing) as well as powerful CPUs and GPUs, many real-world machine learning problems can be solved on a single computer with frameworks such as [scikit-learn](http://scikit-learn.org/), with no need for a distributed system, that is, a cluster of many computers. Sometimes, though, data grows and keeps growing. Who has never heard the term "Big Data"? When that happens, a non-distributed solution may hold up for a short time, but sooner or later it will need to be reviewed and possibly redesigned to scale out.
17 |
18 | Spark started as a research project at [UC Berkeley](http://www.berkeley.edu/) in the [AMPLab](https://amplab.cs.berkeley.edu/), a research group that focuses on big data analytics. Since then, it became an [Apache](https://www.apache.org/) project and has delivered many new releases, reaching a consistent maturity with a wide range of functionalities.
Best of all, Spark can perform data processing over a few gigabytes or hundreds of petabytes with essentially the same code, only requiring a proper cluster of machines in the background (check [this link](https://databricks.com/blog/2014/10/10/spark-petabyte-sort.html)). In some very specific cases the developer may need to tune the process by changing the granularity of data distribution and other related aspects, but in general there are plenty of providers that automate all of this cluster configuration for the developer. For instance, the scripts in this repository used [AWS Elastic MapReduce (EMR)](https://aws.amazon.com/emr/), which plays exactly this role.
19 |
20 |
21 | ### Why Scala?
22 |
23 | In my humble opinion, [Scala](https://www.scala-lang.org/) is a beautiful and very well-devised programming language, with a strong scientific background from professor [Martin Odersky's research team](https://scala.epfl.ch/) at [Ecole Polytechnique Fédérale de Lausanne](https://www.epfl.ch).
24 |
25 | In more technical terms, Scala was created with a strong functional paradigm, but it is also fully compatible with the imperative, object-oriented paradigm of the JVM platform, taking advantage of the JVM's decades of evolution and maturity. In summary, everything one does in Java can be done in Scala, and much more, with much shorter and cleaner code.
26 |
27 | It is no surprise that Spark itself is written in Scala, although it also provides programming interfaces for [Python](https://www.python.org/), [R](https://www.r-project.org/) and, naturally, Java.
28 |
29 |
30 | ### Scripts: AllstateClaimsSeverityGBTRegressor and AllstateClaimsSeverityRandomForestRegressor
31 |
32 | [Allstate Corporation](https://www.allstate.com), the second largest insurance company in the United States, founded in 1931, recently launched a Machine Learning recruitment challenge in partnership with [Kaggle](https://www.kaggle.com/c/allstate-claims-severity), asking competitors, Data Science professionals and enthusiasts, to predict the cost, and hence the severity, of claims.
33 |
34 | The competition organizers provide the competitors with more than 300,000 examples of masked and anonymized data consisting of more than 100 categorical and numerical attributes, thus complying with confidentiality constraints while still being more than enough for building and evaluating a variety of Machine Learning techniques.
35 |
36 | These two Scala scripts obtain the training and test input datasets, from the local filesystem or from [S3](https://aws.amazon.com/s3/details/), and train [Gradient Boosting](https://en.wikipedia.org/wiki/Gradient_boosting) and [Random Forest](https://en.wikipedia.org/wiki/Random_forest) models over them, respectively.
37 | The objective is to demonstrate the use of [Spark 2.4](https://spark.apache.org/docs/latest/) Machine Learning pipelines with [Scala language](http://www.scala-lang.org/), [S3](https://aws.amazon.com/s3/details/) integration and some general good practices for building Machine Learning models. In order to keep the focus on this main objective, more sophisticated techniques (such as a thorough exploratory data analysis and feature engineering) are intentionally omitted.
38 |
39 |
40 | #### Flow of Execution and Overall Learnings
41 |
42 | Although not especially elaborate in terms of Machine Learning techniques, these scripts provide many important learnings about building ML applications with Spark 2.4, Scala and SBT, and about actually running them.
Some learnings are detailed as follows:
43 |
44 | * Both scripts provide a sophisticated command line interface with [scopt](https://github.com/scopt/scopt), through which the runtime can be configured with specific named parameters. It is detailed in the section [Running the Scripts Locally](#running-the-scripts-locally). You must add this to your `build.sbt` file:
45 |
46 | ```scala
47 | libraryDependencies += "com.github.scopt" %% "scopt" % "3.5.0"
48 | ```
49 |
50 | And your script code will include something like this:
51 |
52 | ```scala
53 | val parser = new OptionParser[Params]("AllstateClaimsSeverityRandomForestRegressor") {
54 | head("AllstateClaimsSeverityRandomForestRegressor", "1.0")
55 |
56 | opt[String]("s3AccessKey").required().action((x, c) =>
57 | c.copy(s3AccessKey = x)).text("The access key for S3")
58 |
59 | opt[String]("s3SecretKey").required().action((x, c) =>
60 | c.copy(s3SecretKey = x)).text("The secret key for S3")
61 | ...
62 | ```
63 |
64 | ```scala
65 | parser.parse(args, Params()) match {
66 | case Some(params) =>
67 | process(params)
68 | case None =>
69 | throw new IllegalArgumentException("One or more parameters are invalid or missing")
70 | }
71 | ```
72 |
73 | * In order for SBT to package a jar file containing this and other third-party libraries, you need to use the command `sbt assembly` instead of `sbt package`. For that, you need the [sbt-assembly](https://github.com/sbt/sbt-assembly) plugin, configured by creating a file `project/assembly.sbt` with the following content:
74 |
75 | ```scala
76 | resolvers += Resolver.url("artifactory", url("http://scalasbt.artifactoryonline.com/scalasbt/sbt-plugin-releases"))(Resolver.ivyStylePatterns)
77 |
78 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.6")
79 | ```
80 |
81 | * The method `process` is called with a *case class* instance which encapsulates the parameters provided at the command line.
82 |
83 | ```scala
84 | case class Params(s3AccessKey: String = "", s3SecretKey: String = "",
85 | trainInput: String = "", testInput: String = "",
86 | outputFile: String = "",
87 | algoNumTrees: Seq[Int] = Seq(3),
88 | algoMaxDepth: Seq[Int] = Seq(4),
89 | algoMaxBins: Seq[Int] = Seq(32),
90 | numFolds: Int = 10,
91 | trainSample: Double = 1.0,
92 | testSample: Double = 1.0)
93 | ```
94 |
95 | ```scala
96 | def process(params: Params) {
97 | ...
98 | ```
99 |
100 | * *SparkSession.builder* is used for building a *Spark session*. It was introduced in Spark 2.0 and is recommended in place of the old *SparkConf* and *SparkContext* combination. [This link](https://databricks.com/blog/2016/08/15/how-to-use-sparksession-in-apache-spark-2-0.html) provides a good description of this new strategy and its equivalence with the old one.
101 |
102 | ```scala
103 | val sparkSession = SparkSession.builder.
104 | appName("AllstateClaimsSeverityRandomForestRegressor")
105 | .getOrCreate()
106 | ```
107 |
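* Not shown in the scripts (they rely on `spark-submit` or EMR to define the master), but useful for quick local experiments from an IDE or the Scala REPL: a master can be set explicitly on the builder. A minimal sketch, assuming you want to use all local cores:

```scala
// Local-only variant for experimentation; "local[*]" runs Spark on all cores of this machine.
val sparkSession = SparkSession.builder
  .master("local[*]")
  .appName("AllstateClaimsSeverityRandomForestRegressor")
  .getOrCreate()
```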
108 | * The access to S3 is configured with **s3a** support, which, compared to its predecessor **s3n**, improves support for large files (no more 5GB limit) and provides higher performance. For more information, check [this wiki page](https://wiki.apache.org/hadoop/AmazonS3), [this AWS article](https://aws.amazon.com/premiumsupport/knowledge-center/emr-file-system-s3/) and [this Stack Overflow question](http://stackoverflow.com/questions/30385981/how-to-access-s3a-files-from-apache-spark).
109 |
110 | ```scala
111 | sparkSession.conf.set("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
112 | sparkSession.conf.set("spark.hadoop.fs.s3a.access.key", params.s3AccessKey)
113 | sparkSession.conf.set("spark.hadoop.fs.s3a.secret.key", params.s3SecretKey)
114 | ```
115 |
116 | * Besides using the new **sparkSession.read.csv** method, the reading process also includes important settings: it reads the header of the CSV file, which is directly applied to the column names of the resulting dataframe; and the **inferSchema** property is set to *true*. Without the **inferSchema** configuration, the float values would be read as *strings*, which would later cause the **VectorAssembler** to raise an ugly error: `java.lang.IllegalArgumentException: Data type StringType is not supported`. Finally, both raw dataframes are *cached*, since they are used again later in the code for *fitting* the **StringIndexer** transformations and re-reading the CSV files from the filesystem or S3 would be wasteful.
117 |
118 | ```scala
119 | val trainInput = sparkSession.read
120 | .option("header", "true")
121 | .option("inferSchema", "true")
122 | .csv(params.trainInput)
123 | .cache
124 |
125 | val testInput = sparkSession.read
126 | .option("header", "true")
127 | .option("inferSchema", "true")
128 | .csv(params.testInput)
129 | .cache
130 | ```
131 |
132 | * The column "loss" is renamed to "label". Even with *setLabelCol* called on the regression model, the **RegressionEvaluator** used by the **CrossValidator** still defaults to a column named "label", so without the rename it raises an ugly error: `org.apache.spark.sql.AnalysisException: cannot resolve 'label' given input columns`. Renaming the column (or calling *setLabelCol* on the evaluator as well) avoids the problem.
133 |
134 | * The content of *train.csv* is split into *training* and *validation* data, 70% and 30%, respectively. The content of *test.csv* is reserved for building the final CSV file for submission on Kaggle. Both original dataframes are sampled according to command line parameters, which is particularly useful for running fast executions on your local machine.
135 |
136 | ```scala
137 | val data = trainInput.withColumnRenamed("loss", "label")
138 | .sample(false, params.trainSample)
139 |
140 | val splits = data.randomSplit(Array(0.7, 0.3))
141 | val (trainingData, validationData) = (splits(0), splits(1))
142 |
143 | trainingData.cache
144 | validationData.cache
145 |
146 | val testData = testInput.sample(false, params.testSample).cache
147 | ```
148 |
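* A reproducibility note, not in the original code: both `sample` and `randomSplit` accept a seed, so the exact same train/validation split can be reproduced across runs. A minimal sketch under that assumption:

```scala
// Hypothetical variant with fixed seeds so the sampling and the 70/30 split
// are repeatable between executions (useful when comparing hyperparameters).
val seededData = trainInput.withColumnRenamed("loss", "label")
  .sample(false, params.trainSample, 12345L)

val Array(seededTraining, seededValidation) = seededData.randomSplit(Array(0.7, 0.3), 12345L)
```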
149 | * Using the custom function *isCateg*, the column names are filtered and a [StringIndexer](http://spark.apache.org/docs/latest/ml-features.html#stringindexer) is created for each categorical column, producing a new numerical column whose name is given by the custom function *categNewCol*. Note: this is weak feature engineering, since it is wrong for a learning model to assume that the categories have an order among them (that one is greater or less than another). Whenever categories are confirmed to be unordered, it is better to use some other technique such as [OneHotEncoder](http://spark.apache.org/docs/latest/ml-features.html#onehotencoder), which yields a new column per category holding a boolean (0/1) value.
150 |
151 | ```scala
152 | def isCateg(c: String): Boolean = c.startsWith("cat")
153 | def categNewCol(c: String): String = if (isCateg(c)) s"idx_${c}" else c
154 |
155 | val stringIndexerStages = trainingData.columns.filter(isCateg)
156 | .map(c => new StringIndexer()
157 | .setInputCol(c)
158 | .setOutputCol(categNewCol(c))
159 | .fit(trainInput.select(c).union(testInput.select(c))))
160 | ```
161 |
162 | * There are some very important aspects to be considered when building a feature transformation such as StringIndexer or OneHotEncoder. Such transformations need to be *fitted* before being included in the pipeline, and the *fit* process needs to be done over a dataset that contains all possible categories. For instance, if you fit a StringIndexer over the training dataset only and the pipeline later faces an unseen category while predicting over another dataset (validation, test, etc.), it will fail and raise the error: `org.apache.spark.SparkException: Failed to execute user defined function($anonfun$4: (string) => double) ... Caused by: org.apache.spark.SparkException: Unseen label: XYZ ... at org.apache.spark.ml.feature.StringIndexerModel`. This is the reason why the scripts fit the StringIndexer transformations over a union of the original data from `train.csv` and `test.csv`, bypassing the sampling and split parts.
163 |
164 | * After the sequence of StringIndexer transformations, the next transformation in the pipeline is the [VectorAssembler](http://spark.apache.org/docs/latest/ml-features.html#vectorassembler), which groups a set of columns into a new "features" column to be consumed by the regression model. The filter for feature columns only is performed with the custom function *onlyFeatureCols*. Additionally, the custom function *removeTooManyCategs* filters out a few columns whose number of distinct categories is much higher than what is supported by the default *maxBins* parameter (for RandomForest). In a seriously competitive scenario, it would be better to perform some exploratory analysis to understand these features, their impact on the outcome variable and which feature engineering techniques could be applied.
165 |
166 | ```scala
167 | def removeTooManyCategs(c: String): Boolean = !(c matches "cat(109$|110$|112$|113$|116$)")
168 |
169 | def onlyFeatureCols(c: String): Boolean = !(c matches "id|label")
170 |
171 | val featureCols = trainingData.columns
172 | .filter(removeTooManyCategs)
173 | .filter(onlyFeatureCols)
174 | .map(categNewCol)
175 |
176 | val assembler = new VectorAssembler()
177 | .setInputCols(featureCols)
178 | .setOutputCol("features")
179 | ```
180 |
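* An alternative to the union-fit strategy described two bullets above, not used in these scripts: since Spark 2.2 the StringIndexer supports `handleInvalid = "keep"`, which assigns categories unseen during *fit* to an extra index instead of throwing the "Unseen label" error, so fitting on the training data alone becomes viable. A minimal sketch:

```scala
// Hypothetical variant: let unseen categories fall into an extra bucket at prediction time
// instead of fitting each StringIndexer over the union of train and test data.
val indexerStagesAlt = trainingData.columns.filter(isCateg)
  .map(c => new StringIndexer()
    .setInputCol(c)
    .setOutputCol(categNewCol(c))
    .setHandleInvalid("keep") // requires Spark 2.2+; "skip" would drop such rows instead
    .fit(trainingData.select(c)))
```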
181 | * The very last stage in the pipeline is the regression model, which in these scripts is [GBTRegressor](http://spark.apache.org/docs/2.4.4/api/java/org/apache/spark/ml/regression/GBTRegressor.html) and [RandomForestRegressor](http://spark.apache.org/docs/2.4.4/api/java/org/apache/spark/ml/regression/RandomForestRegressor.html), respectively.
182 |
183 | ```scala
184 | val algo = new RandomForestRegressor().setFeaturesCol("features").setLabelCol("label")
185 |
186 | val pipeline = new Pipeline().setStages((stringIndexerStages :+ assembler) :+ algo)
187 | ```
188 |
189 | * It is interesting to run the pipeline several times with different *hyperparameters* for the transformations and the learning algorithm in order to find the combination that best fits the data (see [Hyperparameter optimization](https://en.wikipedia.org/wiki/Hyperparameter_optimization)). It is also important to evaluate each combination against a separate slice of the data (see [K-fold Cross Validation](https://en.wikipedia.org/wiki/Cross-validation_(statistics))). For accomplishing such objectives, a [CrossValidator](http://spark.apache.org/docs/latest/api/java/org/apache/spark/ml/tuning/CrossValidator.html) is used in conjunction with a [ParamGridBuilder](http://spark.apache.org/docs/latest/api/java/org/apache/spark/ml/tuning/ParamGridBuilder.html) (more documentation at [this link](http://spark.apache.org/docs/latest/ml-tuning.html)), queueing executions with the distinct combinations of *hyperparameters* specified on the command line.
190 |
191 | ```scala
192 | val paramGrid = new ParamGridBuilder()
193 | .addGrid(algo.numTrees, params.algoNumTrees)
194 | .addGrid(algo.maxDepth, params.algoMaxDepth)
195 | .addGrid(algo.maxBins, params.algoMaxBins)
196 | .build()
197 |
198 | val cv = new CrossValidator()
199 | .setEstimator(pipeline)
200 | .setEvaluator(new RegressionEvaluator)
201 | .setEstimatorParamMaps(paramGrid)
202 | .setNumFolds(params.numFolds)
203 |
204 | val cvModel = cv.fit(trainingData)
205 | ```
206 |
207 | * Note: As observed in [this post](https://databricks.com/blog/2015/01/21/random-forests-and-boosting-in-mllib.html), the Random Forest model trains much faster than GBT on Spark. I experienced executions about 20 times slower with GBT compared to Random Forest with equivalent *hyperparameters*.
208 |
209 | * With an instance of [CrossValidatorModel](http://spark.apache.org/docs/latest/api/java/org/apache/spark/ml/tuning/CrossValidatorModel.html) already trained, it is time to evaluate the model over both the whole training dataset and the validation dataset. From the resulting predictions it is easy to obtain evaluation metrics with [RegressionMetrics](http://spark.apache.org/docs/latest/api/java/org/apache/spark/mllib/evaluation/RegressionMetrics.html). Additionally, the best model instance can be obtained, thus providing access to other interesting attributes, such as *featureImportances*.
210 |
211 | ```scala
212 | val trainPredictionsAndLabels = cvModel.transform(trainingData).select("label", "prediction")
213 | .map { case Row(label: Double, prediction: Double) => (label, prediction) }.rdd
214 |
215 | val validPredictionsAndLabels = cvModel.transform(validationData).select("label", "prediction")
216 | .map { case Row(label: Double, prediction: Double) => (label, prediction) }.rdd
217 |
218 | val trainRegressionMetrics = new RegressionMetrics(trainPredictionsAndLabels)
219 | val validRegressionMetrics = new RegressionMetrics(validPredictionsAndLabels)
220 |
221 | val bestModel = cvModel.bestModel.asInstanceOf[PipelineModel]
222 | val featureImportances = bestModel.stages.last.asInstanceOf[RandomForestRegressionModel].featureImportances.toArray
223 | ```
224 |
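* Not in the original scripts, but often handy: the trained *CrossValidatorModel* also exposes the average evaluation metric obtained for each hyperparameter combination, which makes it easy to inspect how the grid search behaved. A minimal sketch, assuming the `cvModel` defined above:

```scala
// Pair each hyperparameter combination with its average cross-validation metric
// (RMSE by default for RegressionEvaluator, so lower is better) and print the best first.
cvModel.getEstimatorParamMaps
  .zip(cvModel.avgMetrics)
  .sortBy(_._2)
  .foreach { case (paramMap, metric) => println(s"$metric <- $paramMap") }
```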
225 | * Finally, the model can be used to predict the answer for the *test* dataset and save a CSV file ready to be submitted on Kaggle! Here again the DataFrame API keeps things simple: the `coalesce(1)` call merges all partitions into a single one, so one output file is written instead of many.
226 |
227 | ```scala
228 | cvModel.transform(testData)
229 | .select("id", "prediction")
230 | .withColumnRenamed("prediction", "loss")
231 | .coalesce(1)
232 | .write.format("csv")
233 | .option("header", "true")
234 | .save(params.outputFile)
235 | ```
236 |
237 |
238 | #### Running the Scripts Locally
239 |
240 | Assuming you have your local environment set up with Java 8, Scala 2.11.x, Spark 2.4 and [SBT](https://www.scala-sbt.org/), you can build the project with *sbt assembly*, which creates a *fat* JAR of your project with all of its dependencies (this is needed for running on AWS later).
241 | You will be able to find the JAR file in this path:
242 |
243 | ```
244 | target/scala-2.11/Spark Scala Machine Learning Examples-assembly-1.0.jar
245 | ```
246 |
247 | Having successfully built the JAR file, you can run the desired script (here, AllstateClaimsSeverityRandomForestRegressor) with the following command structure (note the quotes around the JAR path, required because the project name contains spaces):
248 |
249 | ```
250 | spark-submit --class com.adornes.spark.kaggle.AllstateClaimsSeverityRandomForestRegressor "target/scala-2.11/Spark Scala Machine Learning Examples-assembly-1.0.jar" --s3AccessKey YOUR_AWS_ACCESS_KEY_HERE --s3SecretKey YOUR_AWS_SECRET_KEY_HERE --trainInput "file:///path/to/the/train.csv" --testInput "file:///path/to/the/test.csv" --outputFile "file:///path/to/any/name/for/submission.csv" --algoNumTrees 3 --algoMaxDepth 3 --algoMaxBins 32 --numFolds 5 --trainSample 0.01 --testSample 0.01
251 | ```
252 |
253 | As previously mentioned, [scopt](https://github.com/scopt/scopt) is the library that enables the nicely named parameters on the command line. If you type something wrong, it will output the sample usage as follows:
254 |
255 | ```
256 | AllstateClaimsSeverityRandomForestRegressor 1.0
257 | Usage: AllstateClaimsSeverityRandomForestRegressor [options]
258 |
259 | --s3AccessKey The access key for S3
260 | --s3SecretKey The secret key for S3
261 | --trainInput Path to file/directory for training data
262 | --testInput Path to file/directory for test data
263 | --outputFile Path to output file
264 | --algoNumTrees
265 | One or more options for number of trees for RandomForest model. Default: 3
266 | --algoMaxDepth
267 | One or more values for depth limit. Default: 4
268 | --algoMaxBins
269 | One or more values for the maximum number of bins. Default: 32
270 | --numFolds Number of folds for K-fold Cross Validation. Default: 10
271 | --trainSample Sample fraction from 0.0 to 1.0 for train data
272 | --testSample Sample fraction from 0.0 to 1.0 for test data
273 | ```
274 |
275 | #### Running the Scripts on AWS Elastic MapReduce (EMR)
276 |
277 | **EMR** plays the role of abstracting most of the background setup for a cluster with the Spark/Hadoop ecosystem. You can build as many clusters as you want (and can afford). The EMR charge added on top of the regular EC2 instance cost is relatively small (pricing is detailed [here](https://aws.amazon.com/emr/pricing)).
278 |
279 | Although it abstracts away most of the cluster configuration, EMR lets the user customize almost any background detail through the *advanced* options of the cluster-creation wizard. For instance, for these Spark scripts you'll need to customize the Java version, according to [this link](http://docs.aws.amazon.com/ElasticMapReduce/latest/ReleaseGuide/emr-configure-apps.html#configuring-java8). Besides that, everything is created using the options provided.
So, going step by step: log in to your AWS console, look for *EMR* in the *Services* tab, select the option to create a cluster, choose *Go to advanced options* at the top of the screen and fill in the options as follows:
280 |
281 | * **Vendor** - Leave it as *Amazon*
282 |
283 | * **Release** - Choose a recent emr-5.x release that ships Spark 2.4. Select *Hadoop* and *Spark*. I'd also recommend selecting *Zeppelin* (for working with notebooks) and *Ganglia* (for detailed monitoring of your cluster).
284 |
285 | * **Edit software settings (optional)** - Ensure the option *Enter configuration* is selected and paste in the configuration from [the aforementioned link](http://docs.aws.amazon.com/ElasticMapReduce/latest/ReleaseGuide/emr-configure-apps.html#configuring-java8)
286 |
287 | * **Add steps** - You don't need to add any step at this point. I prefer to do it later, once the cluster is started and ready for processing. Click Next for *Hardware* settings.
288 |
289 | * **Hardware** - You can leave it as default (and can also resize it later), but you may want to increase the 2 core instances to 4 or more. Don't forget that your choice affects the cost. Click Next for *General Cluster Settings*.
290 |
291 | * **Cluster name** - Give your cluster a name. Feel free to leave all other options with the default values. Click Next for *Security*.
292 |
293 | * **EC2 Key Pair** - It is useful if you want to log in to your EC2 instances via SSH. You can either create a key pair or choose an existing one. Leave the remaining options with the default values and click on *Create Cluster*.
294 |
295 | Now you'll have an overview of your cluster's basic data, including the state of your instances. When they are shown as ready for processing steps, go to the **Steps** tab, click on **Add step** and fill in the options as follows:
296 |
297 | * **Step type** - Select *Spark application*
298 |
299 | * **Application location** - Navigate through your S3 buckets and select the JAR file there. You'll need to have already uploaded it to S3.
300 |
301 | * **Spark-submit options** - Type `--class com.adornes.spark.kaggle.AllstateClaimsSeverityRandomForestRegressor` here, indicating the class that holds the code you want to run.
302 |
303 | * **Arguments** - Here you type the rest of the command arguments as demonstrated before, but this time indicating S3 paths as follows:
304 |
305 | ```
306 | --s3AccessKey YOUR_AWS_ACCESS_KEY_HERE --s3SecretKey YOUR_AWS_SECRET_KEY_HERE
307 | --trainInput "s3://path/to/the/train.csv" --testInput "s3://path/to/the/test.csv"
308 | --outputFile "s3://path/to/any/name/for/submission.csv"
309 | --algoNumTrees 20,40,60 --algoMaxDepth 5,7,9 --algoMaxBins 32 --numFolds 10
310 | --trainSample 1.0 --testSample 1.0
311 | ```
312 |
313 | That's it! In the list of steps you will see your step running and will also have access to system logs. Detailed logs will be saved to the path defined in your cluster configuration. Additionally, EMR allows the user to clone both steps and clusters, so you don't have to type everything again.
314 |
315 |
316 | #### Submission on Kaggle
317 |
318 | As mentioned throughout the explanations, many improvements could and should be made in terms of exploratory data analysis, feature engineering and evaluation of other models (starting with the simplest ones, such as Linear Regression), thereby decreasing the prediction error. A minimal sketch of such a swap is shown below.
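For illustration only (this is not part of the scripts), replacing the last pipeline stage with a Linear Regression baseline could look roughly like this; `stringIndexerStages`, `assembler` and the training data are assumed to be the same ones built earlier, and a one-hot encoding step would be advisable for a linear model:

```scala
// Hypothetical baseline: the same pipeline with a LinearRegression stage
// instead of the RandomForestRegressor; hyperparameter values are placeholders.
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.regression.LinearRegression

val linReg = new LinearRegression()
  .setFeaturesCol("features")
  .setLabelCol("label")
  .setMaxIter(100)   // assumed value, tune as needed
  .setRegParam(0.1)  // assumed value, tune as needed

val linPipeline = new Pipeline().setStages((stringIndexerStages :+ assembler) :+ linReg)
val linModel = linPipeline.fit(trainingData)
```

Comparing its validation MAE against the Random Forest's would show whether the extra model complexity is paying off.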
319 | 320 | For being over-simplistic, this model achieved a Mean Absolute Error (MAE) of 1286 in the [public leaderboard](https://www.kaggle.com/c/allstate-claims-severity/leaderboard), far from the top positions. 321 | 322 | The submission file and the detailed metrics of the model evaluation can be found under the `output` directory. 323 | 324 | 325 | ### Corrections/Suggestions or just a Hello! 326 | 327 | Don't hesitate to contact me directly or create *pull requests* here if you have any correction or suggestion for the code or for this documentation! Thanks! 328 | 329 | * [Github](https://www.github.com/adornes) 330 | * [Twitter](https://twitter.com/daniel_adornes) 331 | * [LinkedIn](https://www.linkedin.com/in/adornes) 332 | -------------------------------------------------------------------------------- /build.sbt: -------------------------------------------------------------------------------- 1 | name := "Spark Scala Machine Learning Examples" 2 | 3 | version := "1.0" 4 | 5 | scalaVersion := "2.11.12" 6 | 7 | libraryDependencies ++= Seq( 8 | "org.apache.spark" %% "spark-core" % "2.4.4" % "provided", 9 | "org.apache.spark" %% "spark-sql" % "2.4.4" % "provided", 10 | "org.apache.spark" %% "spark-streaming" % "2.4.4" % "provided", 11 | "org.apache.spark" %% "spark-mllib" % "2.4.4" % "provided", 12 | "com.github.nscala-time" %% "nscala-time" % "1.8.0", 13 | "com.github.scopt" %% "scopt" % "3.5.0" 14 | ) -------------------------------------------------------------------------------- /output/AllstateClaimsSeverityRandomForestRegressor-log.txt: -------------------------------------------------------------------------------- 1 | ===================================================================== 2 | Param sample: 1.0 3 | TrainingData count: 132171 4 | ValidationData count: 56147 5 | TestData count: 125546 6 | ===================================================================== 7 | Param algoNumTrees = 20,40,60 8 | Param algoMaxDepth = 5,7,9 9 | Param algoMaxBins = 32 10 | Param numFolds = 10 11 | ===================================================================== 12 | Training data MSE = 3681815.1485177097 13 | Training data RMSE = 1918.805656786979 14 | Training data R-squared = 0.013440897216495884 15 | Training data MAE = 1271.3400430944828 16 | Training data Explained variance = 8556790.405602483 17 | ===================================================================== 18 | Validation data MSE = 4062304.6811123574 19 | Validation data RMSE = 2015.5159838394627 20 | Validation data R-squared = -0.12222960855493237 21 | Validation data MAE = 1296.2964023458937 22 | Validation data Explained variance = 8144098.777486679 23 | ===================================================================== 24 | RandomForest features importances: 25 | idx_cat1 = 0.005028745039598049 26 | idx_cat2 = 0.0021552609815733434 27 | idx_cat3 = 3.0048559050388534E-4 28 | idx_cat4 = 0.0016932466039807567 29 | idx_cat5 = 5.901278104981875E-4 30 | idx_cat6 = 8.681729331579526E-4 31 | idx_cat7 = 0.01537119341424913 32 | idx_cat8 = 1.5562522400310087E-4 33 | idx_cat9 = 0.0027689655083528737 34 | idx_cat10 = 0.007155596990720177 35 | idx_cat11 = 6.36553722115706E-4 36 | idx_cat12 = 0.030652297306660227 37 | idx_cat13 = 5.183044605123435E-4 38 | idx_cat14 = 2.1849405111811035E-4 39 | idx_cat15 = 0.0 40 | idx_cat16 = 1.9471234518384843E-4 41 | idx_cat17 = 1.0138978708103575E-4 42 | idx_cat18 = 4.4170721855415664E-5 43 | idx_cat19 = 9.158210025616964E-5 44 | idx_cat20 = 1.4935995658907235E-5 45 | idx_cat21 = 
4.119020589966203E-6 46 | idx_cat22 = 0.0 47 | idx_cat23 = 5.341214984360372E-4 48 | idx_cat24 = 9.087766666711655E-5 49 | idx_cat25 = 6.873184702609509E-4 50 | idx_cat26 = 0.0010248119033011054 51 | idx_cat27 = 5.424487043126234E-4 52 | idx_cat28 = 1.8149534642085313E-4 53 | idx_cat29 = 1.7122301305868374E-4 54 | idx_cat30 = 8.5380153892165E-5 55 | idx_cat31 = 8.685238535346974E-4 56 | idx_cat32 = 8.32831710687748E-5 57 | idx_cat33 = 5.041809127997581E-5 58 | idx_cat34 = 1.0012490753213105E-4 59 | idx_cat35 = 1.0594890081895466E-4 60 | idx_cat36 = 3.394012251782487E-4 61 | idx_cat37 = 6.804772905045125E-4 62 | idx_cat38 = 0.0014270614890801209 63 | idx_cat39 = 2.0062333283960747E-4 64 | idx_cat40 = 1.3353224637358663E-4 65 | idx_cat41 = 1.1729616183993322E-4 66 | idx_cat42 = 9.357089165678633E-5 67 | idx_cat43 = 6.661970948343636E-5 68 | idx_cat44 = 0.0023358013213218916 69 | idx_cat45 = 1.3536973829642932E-4 70 | idx_cat46 = 1.170100621650221E-4 71 | idx_cat47 = 6.969811757521523E-5 72 | idx_cat48 = 1.2425339650636349E-5 73 | idx_cat49 = 5.063781292197735E-4 74 | idx_cat50 = 6.230892327181806E-4 75 | idx_cat51 = 1.2661758798943284E-4 76 | idx_cat52 = 6.302401411714715E-4 77 | idx_cat53 = 0.0035539363029276337 78 | idx_cat54 = 9.181059248331235E-5 79 | idx_cat55 = 4.6858024695073275E-6 80 | idx_cat56 = 6.48715279660446E-7 81 | idx_cat57 = 0.06019822381483131 82 | idx_cat58 = 1.2200330753011028E-4 83 | idx_cat59 = 1.6060536091916817E-4 84 | idx_cat60 = 7.869143790943007E-5 85 | idx_cat61 = 7.396009192911049E-4 86 | idx_cat62 = 1.8079351020614657E-4 87 | idx_cat63 = 3.3217296411480813E-5 88 | idx_cat64 = 4.834053701340416E-6 89 | idx_cat65 = 9.664508946602006E-5 90 | idx_cat66 = 3.399231342432101E-4 91 | idx_cat67 = 7.309263398999398E-5 92 | idx_cat68 = 1.064876132722343E-5 93 | idx_cat69 = 5.337556951869969E-5 94 | idx_cat70 = 1.1220342269549823E-6 95 | idx_cat71 = 0.0011939270664671263 96 | idx_cat72 = 0.0071364712841564815 97 | idx_cat73 = 3.6270302363713573E-4 98 | idx_cat74 = 1.1475649018451946E-4 99 | idx_cat75 = 6.659970507315859E-4 100 | idx_cat76 = 3.0895149348635515E-4 101 | idx_cat77 = 3.597245107197787E-5 102 | idx_cat78 = 7.164318166749448E-4 103 | idx_cat79 = 0.18170463248743407 104 | idx_cat80 = 0.23668169735844358 105 | idx_cat81 = 0.024070106350853 106 | idx_cat82 = 8.193314957616162E-4 107 | idx_cat83 = 4.3023345769190507E-4 108 | idx_cat84 = 0.0011768880278218994 109 | idx_cat85 = 9.688497827365217E-4 110 | idx_cat86 = 6.865188919078531E-4 111 | idx_cat87 = 0.03179515460389307 112 | idx_cat88 = 1.0205717060317864E-4 113 | idx_cat89 = 0.017098398654764528 114 | idx_cat90 = 3.556885357704126E-4 115 | idx_cat91 = 0.002612301931937975 116 | idx_cat92 = 4.948617396897984E-4 117 | idx_cat93 = 3.807829869309961E-4 118 | idx_cat94 = 0.006359926218632514 119 | idx_cat95 = 6.717451149633222E-4 120 | idx_cat96 = 6.570057578617208E-4 121 | idx_cat97 = 7.078649043782743E-4 122 | idx_cat98 = 2.3478723190687074E-4 123 | idx_cat99 = 0.004135859129994216 124 | idx_cat100 = 0.03179210320504898 125 | idx_cat101 = 0.04782290936078323 126 | idx_cat102 = 6.772011726028159E-4 127 | idx_cat103 = 0.009148078873089847 128 | idx_cat104 = 0.006046207394349975 129 | idx_cat105 = 0.018650867261714832 130 | idx_cat106 = 0.014275096582530055 131 | idx_cat107 = 0.004989442761452682 132 | idx_cat108 = 0.01472352265843266 133 | idx_cat111 = 0.006260738380104533 134 | idx_cat114 = 0.01104325098823224 135 | idx_cat115 = 0.03199186041329475 136 | cont1 = 0.002350462359494991 137 | cont2 = 0.04375495663224423 
138 | cont3 = 0.0060374435529630495 139 | cont4 = 0.003304488953686153 140 | cont5 = 0.001490359624130676 141 | cont6 = 0.003355965065476908 142 | cont7 = 0.03236892648274379 143 | cont8 = 0.0022601992717203824 144 | cont9 = 0.0021073289350355415 145 | cont10 = 0.001487150540405543 146 | cont11 = 0.015041708991505781 147 | cont12 = 0.01426341511387624 148 | cont13 = 0.002113235959298971 149 | cont14 = 0.003514151313343825 150 | ===================================================================== -------------------------------------------------------------------------------- /project/assembly.sbt: -------------------------------------------------------------------------------- 1 | resolvers += Resolver.url("artifactory", url("http://scalasbt.artifactoryonline.com/scalasbt/sbt-plugin-releases"))(Resolver.ivyStylePatterns) 2 | 3 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.6") -------------------------------------------------------------------------------- /src/main/scala/com/adornes/spark/kaggle/AllstateClaimsSeverityGBTRegressor.scala: -------------------------------------------------------------------------------- 1 | package com.adornes.spark.kaggle 2 | 3 | import org.apache.spark.ml.feature.StringIndexer 4 | import org.apache.spark.ml.feature.VectorAssembler 5 | import org.apache.spark.ml.regression.{GBTRegressor, GBTRegressionModel} 6 | import org.apache.spark.ml.{Pipeline, PipelineModel} 7 | import org.apache.spark.ml.evaluation.RegressionEvaluator 8 | import org.apache.spark.ml.tuning.ParamGridBuilder 9 | import org.apache.spark.ml.tuning.CrossValidator 10 | 11 | import org.apache.spark.sql._ 12 | import org.apache.spark.sql.functions._ 13 | 14 | import org.apache.spark.mllib.evaluation.RegressionMetrics 15 | 16 | import scopt.OptionParser 17 | 18 | import org.apache.log4j.LogManager 19 | 20 | /** 21 | * Simple and silly solution for the "Allstate Claims Severity" competition on Kaggle 22 | * Competition page: https://www.kaggle.com/c/allstate-claims-severity 23 | */ 24 | object AllstateClaimsSeverityGBTRegressor { 25 | 26 | /* 27 | * case class for parsing command line params 28 | */ 29 | 30 | case class Params(s3AccessKey: String = "", s3SecretKey: String = "", 31 | trainInput: String = "", testInput: String = "", 32 | outputFile: String = "", 33 | algoMaxIter: Seq[Int] = Seq(30), 34 | algoMaxDepth: Seq[Int] = Seq(3), 35 | numFolds: Int = 10, 36 | trainSample: Double = 1.0, 37 | testSample: Double = 1.0) 38 | 39 | /* 40 | * Computation logic 41 | */ 42 | def process(params: Params) { 43 | 44 | /* 45 | * Initializing Spark session and logging 46 | */ 47 | 48 | val sparkSession = SparkSession.builder. 
49 | appName("AllstateClaimsSeverityGBTRegressor") 50 | .getOrCreate() 51 | 52 | import sparkSession.implicits._ 53 | 54 | val log = LogManager.getRootLogger 55 | 56 | // **************************** 57 | log.info("Loading input data") 58 | // **************************** 59 | 60 | if (params.trainInput.startsWith("s3://")) { 61 | sparkSession.conf.set("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") 62 | sparkSession.conf.set("spark.hadoop.fs.s3a.access.key", params.s3AccessKey) 63 | sparkSession.conf.set("spark.hadoop.fs.s3a.secret.key", params.s3SecretKey) 64 | } 65 | 66 | // ************************************************* 67 | log.info("Reading data from train.csv file") 68 | // ************************************************* 69 | 70 | val trainInput = sparkSession.read 71 | .option("header", "true") 72 | .option("inferSchema", "true") 73 | .csv(params.trainInput) 74 | .cache 75 | 76 | val testInput = sparkSession.read 77 | .option("header", "true") 78 | .option("inferSchema", "true") 79 | .csv(params.testInput) 80 | .cache 81 | 82 | // ******************************************* 83 | log.info("Preparing data for training model") 84 | // ******************************************* 85 | 86 | val data = trainInput.withColumnRenamed("loss", "label") 87 | .sample(false, params.trainSample) 88 | 89 | val splits = data.randomSplit(Array(0.7, 0.3)) 90 | val (trainingData, validationData) = (splits(0), splits(1)) 91 | 92 | trainingData.cache 93 | validationData.cache 94 | 95 | val testData = testInput.sample(false, params.testSample).cache 96 | 97 | // ************************************************** 98 | log.info("Building Machine Learning pipeline") 99 | // ************************************************** 100 | 101 | // StringIndexer for categorical columns (OneHotEncoder should be evaluated as well) 102 | def isCateg(c: String): Boolean = c.startsWith("cat") 103 | def categNewCol(c: String): String = if (isCateg(c)) s"idx_${c}" else c 104 | 105 | val stringIndexerStages = trainingData.columns.filter(isCateg) 106 | .map(c => new StringIndexer() 107 | .setInputCol(c) 108 | .setOutputCol(categNewCol(c)) 109 | .fit(trainInput.select(c).union(testInput.select(c)))) 110 | 111 | // Function to remove categorical columns with too many categories 112 | def removeTooManyCategs(c: String): Boolean = !(c matches "cat(109$|110$|112$|113$|116$)") 113 | 114 | // Function to select only feature columns (omit id and label) 115 | def onlyFeatureCols(c: String): Boolean = !(c matches "id|label") 116 | 117 | // Definitive set of feature columns 118 | val featureCols = trainingData.columns 119 | .filter(removeTooManyCategs) 120 | .filter(onlyFeatureCols) 121 | .map(categNewCol) 122 | 123 | // VectorAssembler for training features 124 | val assembler = new VectorAssembler() 125 | .setInputCols(featureCols) 126 | .setOutputCol("features") 127 | 128 | // Estimator algorithm 129 | val algo = new GBTRegressor().setFeaturesCol("features").setLabelCol("label") 130 | 131 | // Building the Pipeline for transformations and predictor 132 | val pipeline = new Pipeline().setStages((stringIndexerStages :+ assembler) :+ algo) 133 | 134 | 135 | // *********************************************************** 136 | log.info("Preparing K-fold Cross Validation and Grid Search") 137 | // *********************************************************** 138 | 139 | val paramGrid = new ParamGridBuilder() 140 | .addGrid(algo.maxIter, params.algoMaxIter) 141 | .addGrid(algo.maxDepth, params.algoMaxDepth) 142 | 
.build() 143 | 144 | val cv = new CrossValidator() 145 | .setEstimator(pipeline) 146 | .setEvaluator(new RegressionEvaluator) 147 | .setEstimatorParamMaps(paramGrid) 148 | .setNumFolds(params.numFolds) 149 | 150 | 151 | // ************************************************************ 152 | log.info("Training model with GradientBoostedTrees algorithm") 153 | // ************************************************************ 154 | 155 | val cvModel = cv.fit(trainingData) 156 | 157 | 158 | // ********************************************************************** 159 | log.info("Evaluating model on train and test data and calculating RMSE") 160 | // ********************************************************************** 161 | 162 | val trainPredictionsAndLabels = cvModel.transform(trainingData).select("label", "prediction") 163 | .map { case Row(label: Double, prediction: Double) => (label, prediction) }.rdd 164 | 165 | val validPredictionsAndLabels = cvModel.transform(validationData).select("label", "prediction") 166 | .map { case Row(label: Double, prediction: Double) => (label, prediction) }.rdd 167 | 168 | val trainRegressionMetrics = new RegressionMetrics(trainPredictionsAndLabels) 169 | val validRegressionMetrics = new RegressionMetrics(validPredictionsAndLabels) 170 | 171 | val bestModel = cvModel.bestModel.asInstanceOf[PipelineModel] 172 | val featureImportances = bestModel.stages.last.asInstanceOf[GBTRegressionModel].featureImportances.toArray 173 | 174 | val output = "\n=====================================================================\n" + 175 | s"Param trainSample: ${params.trainSample}\n" + 176 | s"Param testSample: ${params.testSample}\n" + 177 | s"TrainingData count: ${trainingData.count}\n" + 178 | s"ValidationData count: ${validationData.count}\n" + 179 | s"TestData count: ${testData.count}\n" + 180 | "=====================================================================\n" + 181 | s"Param maxIter = ${params.algoMaxIter.mkString(",")}\n" + 182 | s"Param maxDepth = ${params.algoMaxDepth.mkString(",")}\n" + 183 | s"Param numFolds = ${params.numFolds}\n" + 184 | "=====================================================================\n" + 185 | s"Training data MSE = ${trainRegressionMetrics.meanSquaredError}\n" + 186 | s"Training data RMSE = ${trainRegressionMetrics.rootMeanSquaredError}\n" + 187 | s"Training data R-squared = ${trainRegressionMetrics.r2}\n" + 188 | s"Training data MAE = ${trainRegressionMetrics.meanAbsoluteError}\n" + 189 | s"Training data Explained variance = ${trainRegressionMetrics.explainedVariance}\n" + 190 | "=====================================================================\n" + 191 | s"Validation data MSE = ${validRegressionMetrics.meanSquaredError}\n" + 192 | s"Validation data RMSE = ${validRegressionMetrics.rootMeanSquaredError}\n" + 193 | s"Validation data R-squared = ${validRegressionMetrics.r2}\n" + 194 | s"Validation data MAE = ${validRegressionMetrics.meanAbsoluteError}\n" + 195 | s"Validation data Explained variance = ${validRegressionMetrics.explainedVariance}\n" + 196 | "=====================================================================\n" + 197 | // s"CV params explained: ${cvModel.explainParams}\n" + 198 | // s"GBT params explained: ${bestModel.stages.last.asInstanceOf[GBTRegressionModel].explainParams}\n" + 199 | s"GBT features importances:\n ${featureCols.zip(featureImportances).map(t => s"\t${t._1} = ${t._2}").mkString("\n")}\n" + 200 | "=====================================================================\n" 201 | 202 | log.info(output) 
203 | 204 | 205 | // ***************************************** 206 | log.info("Run prediction over test dataset") 207 | // ***************************************** 208 | 209 | // Predicts and saves file ready for Kaggle! 210 | if(!params.outputFile.isEmpty){ 211 | cvModel.transform(testData) 212 | .select("id", "prediction") 213 | .withColumnRenamed("prediction", "loss") 214 | .coalesce(1) 215 | .write.format("csv") 216 | .option("header", "true") 217 | .save(params.outputFile) 218 | } 219 | } 220 | 221 | 222 | /* 223 | * entry point - main method 224 | */ 225 | def main(args: Array[String]) { 226 | 227 | /* 228 | * Reading command line parameters 229 | */ 230 | 231 | val parser = new OptionParser[Params]("AllstateClaimsSeverityGBTRegressor") { 232 | head("AllstateClaimsSeverityGBTRegressor", "1.0") 233 | 234 | opt[String]("s3AccessKey").required().action((x, c) => 235 | c.copy(s3AccessKey = x)).text("The access key for S3") 236 | 237 | opt[String]("s3SecretKey").required().action((x, c) => 238 | c.copy(s3SecretKey = x)).text("The secret key for S3") 239 | 240 | opt[String]("trainInput").required().valueName("").action((x, c) => 241 | c.copy(trainInput = x)).text("Path to file/directory for training data") 242 | 243 | opt[String]("testInput").required().valueName("").action((x, c) => 244 | c.copy(testInput = x)).text("Path to file/directory for test data") 245 | 246 | opt[String]("outputFile").valueName("").action((x, c) => 247 | c.copy(outputFile = x)).text("Path to output file") 248 | 249 | opt[Seq[Int]]("algoMaxIter").valueName("").action((x, c) => 250 | c.copy(algoMaxIter = x)).text("One or more values for limit of iterations. Default: 30") 251 | 252 | opt[Seq[Int]]("algoMaxDepth").valueName("").action((x, c) => 253 | c.copy(algoMaxDepth = x)).text("One or more values for depth limit. Default: 3") 254 | 255 | opt[Int]("numFolds").action((x, c) => 256 | c.copy(numFolds = x)).text("Number of folds for K-fold Cross Validation. 
Default: 10") 257 | 258 | opt[Double]("trainSample").action((x, c) => 259 | c.copy(trainSample = x)).text("Sample fraction from 0.0 to 1.0 for train data") 260 | 261 | opt[Double]("testSample").action((x, c) => 262 | c.copy(testSample = x)).text("Sample fraction from 0.0 to 1.0 for test data") 263 | 264 | } 265 | 266 | parser.parse(args, Params()) match { 267 | case Some(params) => 268 | process(params) 269 | case None => 270 | throw new IllegalArgumentException("One or more parameters are invalid or missing") 271 | } 272 | } 273 | } -------------------------------------------------------------------------------- /src/main/scala/com/adornes/spark/kaggle/AllstateClaimsSeverityRandomForestRegressor.scala: -------------------------------------------------------------------------------- 1 | package com.adornes.spark.kaggle 2 | 3 | import org.apache.spark.ml.feature.StringIndexer 4 | import org.apache.spark.ml.feature.VectorAssembler 5 | import org.apache.spark.ml.regression.{RandomForestRegressor, RandomForestRegressionModel} 6 | import org.apache.spark.ml.{Pipeline, PipelineModel} 7 | import org.apache.spark.ml.evaluation.RegressionEvaluator 8 | import org.apache.spark.ml.tuning.ParamGridBuilder 9 | import org.apache.spark.ml.tuning.CrossValidator 10 | 11 | import org.apache.spark.sql._ 12 | import org.apache.spark.sql.functions._ 13 | 14 | import org.apache.spark.mllib.evaluation.RegressionMetrics 15 | 16 | import scopt.OptionParser 17 | 18 | import org.apache.log4j.LogManager 19 | 20 | /** 21 | * Simple and silly solution for the "Allstate Claims Severity" competition on Kaggle 22 | * Competition page: https://www.kaggle.com/c/allstate-claims-severity 23 | */ 24 | object AllstateClaimsSeverityRandomForestRegressor { 25 | 26 | /* 27 | * case class for parsing command line params 28 | */ 29 | 30 | case class Params(s3AccessKey: String = "", s3SecretKey: String = "", 31 | trainInput: String = "", testInput: String = "", 32 | outputFile: String = "", 33 | algoNumTrees: Seq[Int] = Seq(3), 34 | algoMaxDepth: Seq[Int] = Seq(4), 35 | algoMaxBins: Seq[Int] = Seq(32), 36 | numFolds: Int = 10, 37 | trainSample: Double = 1.0, 38 | testSample: Double = 1.0) 39 | 40 | /* 41 | * Computation logic 42 | */ 43 | def process(params: Params) { 44 | 45 | /* 46 | * Initializing Spark session and logging 47 | */ 48 | 49 | val sparkSession = SparkSession.builder. 
50 | appName("AllstateClaimsSeverityRandomForestRegressor") 51 | .getOrCreate() 52 | 53 | import sparkSession.implicits._ 54 | 55 | val log = LogManager.getRootLogger 56 | 57 | 58 | // **************************** 59 | log.info("Loading input data") 60 | // **************************** 61 | 62 | if (params.trainInput.startsWith("s3://")) { 63 | sparkSession.conf.set("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") 64 | sparkSession.conf.set("spark.hadoop.fs.s3a.access.key", params.s3AccessKey) 65 | sparkSession.conf.set("spark.hadoop.fs.s3a.secret.key", params.s3SecretKey) 66 | } 67 | 68 | // ************************************************* 69 | log.info("Reading data from train.csv file") 70 | // ************************************************* 71 | 72 | val trainInput = sparkSession.read 73 | .option("header", "true") 74 | .option("inferSchema", "true") 75 | .csv(params.trainInput) 76 | .cache 77 | 78 | val testInput = sparkSession.read 79 | .option("header", "true") 80 | .option("inferSchema", "true") 81 | .csv(params.testInput) 82 | .cache 83 | 84 | // ******************************************* 85 | log.info("Preparing data for training model") 86 | // ******************************************* 87 | 88 | val data = trainInput.withColumnRenamed("loss", "label") 89 | .sample(false, params.trainSample) 90 | 91 | val splits = data.randomSplit(Array(0.7, 0.3)) 92 | val (trainingData, validationData) = (splits(0), splits(1)) 93 | 94 | trainingData.cache 95 | validationData.cache 96 | 97 | val testData = testInput.sample(false, params.testSample).cache 98 | 99 | // ************************************************** 100 | log.info("Building Machine Learning pipeline") 101 | // ************************************************** 102 | 103 | // StringIndexer for categorical columns (OneHotEncoder should be evaluated as well) 104 | def isCateg(c: String): Boolean = c.startsWith("cat") 105 | def categNewCol(c: String): String = if (isCateg(c)) s"idx_${c}" else c 106 | 107 | val stringIndexerStages = trainingData.columns.filter(isCateg) 108 | .map(c => new StringIndexer() 109 | .setInputCol(c) 110 | .setOutputCol(categNewCol(c)) 111 | .fit(trainInput.select(c).union(testInput.select(c)))) 112 | 113 | // Function to remove categorical columns with too many categories 114 | def removeTooManyCategs(c: String): Boolean = !(c matches "cat(109$|110$|112$|113$|116$)") 115 | 116 | // Function to select only feature columns (omit id and label) 117 | def onlyFeatureCols(c: String): Boolean = !(c matches "id|label") 118 | 119 | // Definitive set of feature columns 120 | val featureCols = trainingData.columns 121 | .filter(removeTooManyCategs) 122 | .filter(onlyFeatureCols) 123 | .map(categNewCol) 124 | 125 | // VectorAssembler for training features 126 | val assembler = new VectorAssembler() 127 | .setInputCols(featureCols) 128 | .setOutputCol("features") 129 | 130 | // Estimator algorithm 131 | val algo = new RandomForestRegressor().setFeaturesCol("features").setLabelCol("label") 132 | 133 | // Building the Pipeline for transformations and predictor 134 | val pipeline = new Pipeline().setStages((stringIndexerStages :+ assembler) :+ algo) 135 | 136 | 137 | // *********************************************************** 138 | log.info("Preparing K-fold Cross Validation and Grid Search") 139 | // *********************************************************** 140 | 141 | val paramGrid = new ParamGridBuilder() 142 | .addGrid(algo.numTrees, params.algoNumTrees) 143 | .addGrid(algo.maxDepth, 
params.algoMaxDepth) 144 | .addGrid(algo.maxBins, params.algoMaxBins) 145 | .build() 146 | 147 | val cv = new CrossValidator() 148 | .setEstimator(pipeline) 149 | .setEvaluator(new RegressionEvaluator) 150 | .setEstimatorParamMaps(paramGrid) 151 | .setNumFolds(params.numFolds) 152 | 153 | 154 | // ************************************************************ 155 | log.info("Training model with RandomForest algorithm") 156 | // ************************************************************ 157 | 158 | val cvModel = cv.fit(trainingData) 159 | 160 | 161 | // ********************************************************************** 162 | log.info("Evaluating model on train and test data and calculating RMSE") 163 | // ********************************************************************** 164 | 165 | val trainPredictionsAndLabels = cvModel.transform(trainingData).select("label", "prediction") 166 | .map { case Row(label: Double, prediction: Double) => (label, prediction) }.rdd 167 | 168 | val validPredictionsAndLabels = cvModel.transform(validationData).select("label", "prediction") 169 | .map { case Row(label: Double, prediction: Double) => (label, prediction) }.rdd 170 | 171 | val trainRegressionMetrics = new RegressionMetrics(trainPredictionsAndLabels) 172 | val validRegressionMetrics = new RegressionMetrics(validPredictionsAndLabels) 173 | 174 | val bestModel = cvModel.bestModel.asInstanceOf[PipelineModel] 175 | val featureImportances = bestModel.stages.last.asInstanceOf[RandomForestRegressionModel].featureImportances.toArray 176 | 177 | val output = "\n=====================================================================\n" + 178 | s"Param trainSample: ${params.trainSample}\n" + 179 | s"Param testSample: ${params.testSample}\n" + 180 | s"TrainingData count: ${trainingData.count}\n" + 181 | s"ValidationData count: ${validationData.count}\n" + 182 | s"TestData count: ${testData.count}\n" + 183 | "=====================================================================\n" + 184 | s"Param algoNumTrees = ${params.algoNumTrees.mkString(",")}\n" + 185 | s"Param algoMaxDepth = ${params.algoMaxDepth.mkString(",")}\n" + 186 | s"Param algoMaxBins = ${params.algoMaxBins.mkString(",")}\n" + 187 | s"Param numFolds = ${params.numFolds}\n" + 188 | "=====================================================================\n" + 189 | s"Training data MSE = ${trainRegressionMetrics.meanSquaredError}\n" + 190 | s"Training data RMSE = ${trainRegressionMetrics.rootMeanSquaredError}\n" + 191 | s"Training data R-squared = ${trainRegressionMetrics.r2}\n" + 192 | s"Training data MAE = ${trainRegressionMetrics.meanAbsoluteError}\n" + 193 | s"Training data Explained variance = ${trainRegressionMetrics.explainedVariance}\n" + 194 | "=====================================================================\n" + 195 | s"Validation data MSE = ${validRegressionMetrics.meanSquaredError}\n" + 196 | s"Validation data RMSE = ${validRegressionMetrics.rootMeanSquaredError}\n" + 197 | s"Validation data R-squared = ${validRegressionMetrics.r2}\n" + 198 | s"Validation data MAE = ${validRegressionMetrics.meanAbsoluteError}\n" + 199 | s"Validation data Explained variance = ${validRegressionMetrics.explainedVariance}\n" + 200 | "=====================================================================\n" + 201 | // s"CV params explained: ${cvModel.explainParams}\n" + 202 | // s"RandomForest params explained: ${bestModel.stages.last.asInstanceOf[RandomForestRegressionModel].explainParams}\n" + 203 | s"RandomForest features importances:\n 
${featureCols.zip(featureImportances).map(t => s"\t${t._1} = ${t._2}").mkString("\n")}\n" +
204 | "=====================================================================\n"
205 |
206 | log.info(output)
207 |
208 |
209 | // *****************************************
210 | log.info("Run prediction over test dataset")
211 | // *****************************************
212 |
213 | // Predicts and saves file ready for Kaggle!
214 | if(!params.outputFile.isEmpty){
215 | cvModel.transform(testData)
216 | .select("id", "prediction")
217 | .withColumnRenamed("prediction", "loss")
218 | .coalesce(1)
219 | .write.format("csv")
220 | .option("header", "true")
221 | .save(params.outputFile)
222 | }
223 | }
224 |
225 |
226 | /*
227 | * entry point - main method
228 | */
229 | def main(args: Array[String]) {
230 |
231 | /*
232 | * Reading command line parameters
233 | */
234 |
235 | val parser = new OptionParser[Params]("AllstateClaimsSeverityRandomForestRegressor") {
236 | head("AllstateClaimsSeverityRandomForestRegressor", "1.0")
237 |
238 | opt[String]("s3AccessKey").required().action((x, c) =>
239 | c.copy(s3AccessKey = x)).text("The access key for S3")
240 |
241 | opt[String]("s3SecretKey").required().action((x, c) =>
242 | c.copy(s3SecretKey = x)).text("The secret key for S3")
243 |
244 | opt[String]("trainInput").required().valueName("").action((x, c) =>
245 | c.copy(trainInput = x)).text("Path to file/directory for training data")
246 |
247 | opt[String]("testInput").required().valueName("").action((x, c) =>
248 | c.copy(testInput = x)).text("Path to file/directory for test data")
249 |
250 | opt[String]("outputFile").valueName("").action((x, c) =>
251 | c.copy(outputFile = x)).text("Path to output file")
252 |
253 | opt[Seq[Int]]("algoNumTrees").valueName("").action((x, c) =>
254 | c.copy(algoNumTrees = x)).text("One or more options for number of trees for RandomForest model. Default: 3")
255 |
256 | opt[Seq[Int]]("algoMaxDepth").valueName("").action((x, c) =>
257 | c.copy(algoMaxDepth = x)).text("One or more values for depth limit. Default: 4")
258 |
259 | opt[Seq[Int]]("algoMaxBins").valueName("").action((x, c) =>
260 | c.copy(algoMaxBins = x)).text("One or more values for the maximum number of bins. Default: 32")
261 |
262 | opt[Int]("numFolds").action((x, c) =>
263 | c.copy(numFolds = x)).text("Number of folds for K-fold Cross Validation. Default: 10")
264 |
265 | opt[Double]("trainSample").action((x, c) =>
266 | c.copy(trainSample = x)).text("Sample fraction from 0.0 to 1.0 for train data")
267 |
268 | opt[Double]("testSample").action((x, c) =>
269 | c.copy(testSample = x)).text("Sample fraction from 0.0 to 1.0 for test data")
270 |
271 | }
272 |
273 | parser.parse(args, Params()) match {
274 | case Some(params) =>
275 | process(params)
276 | case None =>
277 | throw new IllegalArgumentException("One or more parameters are invalid or missing")
278 | }
279 | }
280 | } --------------------------------------------------------------------------------