├── .gitignore ├── LICENSE ├── README.md └── labs ├── BagOfWordsMeetsBagsOfPopcorn ├── BagOfWordsMeetsBagsOfPopcorn.snb ├── README.md └── images │ ├── recallPrecision.png │ └── roc.png ├── DLFramework ├── DLFramework.snb └── README.md ├── DataAnalysisToolbox ├── DataAnalysisToolbox.snb ├── README.md ├── images │ ├── ageHist.png │ ├── ageHistPerClass.png │ ├── ageHistPerClassStacked.png │ └── plotFunction.png └── titanic.csv ├── IntroToMLandSparkMLPipelines ├── Intro To Machine Learning and SparkML Pipelines.snb ├── README.md └── data │ └── data.adult.csv ├── IntroToMachineLearning ├── IntroToMachineLearning.snb ├── README.md ├── data.adult.csv └── images │ ├── ageHistData.png │ ├── cgainHistData.png │ ├── fnlwgtHistData.png │ ├── lrAvgMetrics.png │ ├── rfAvgMetrics.png │ ├── rfAvgMetrics2.png │ └── treeAvgMetrics.png └── TitanicSurvivalExploration ├── README.md ├── TitanicSurvivalExploration.snb └── data └── titanic_train.csv /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | local_settings.py 55 | 56 | # Flask instance folder 57 | instance/ 58 | 59 | # Scrapy stuff: 60 | .scrapy 61 | 62 | # Sphinx documentation 63 | docs/_build/ 64 | 65 | # PyBuilder 66 | target/ 67 | 68 | # IPython Notebook 69 | .ipynb_checkpoints 70 | 71 | # pyenv 72 | .python-version 73 | 74 | # celery beat schedule file 75 | celerybeat-schedule 76 | 77 | # dotenv 78 | .env 79 | 80 | # virtualenv 81 | venv/ 82 | ENV/ 83 | 84 | # Spyder project settings 85 | .spyderproject 86 | 87 | 88 | *.class 89 | *.log 90 | 91 | # sbt specific 92 | .cache 93 | .history 94 | .lib/ 95 | dist/* 96 | target/ 97 | lib_managed/ 98 | src_managed/ 99 | project/boot/ 100 | project/plugins/project/ 101 | 102 | # Scala-IDE specific 103 | .scala_dependencies 104 | .worksheet -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2016 Andrey Romanov 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the 
Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # spark-notebook-ml-labs 2 | All labs are implemented in [Spark Notebook](https://github.com/andypetrella/spark-notebook). In particular spark-notebook-0.6.3 with scala-2.10.5 and spark-1.6.1 was used for the most of the labs. 3 | In these labs we are going to get familiar with tools for data analysis and machine learning: 4 | * [breeze](https://github.com/scalanlp/breeze) 5 | * [spark dataframes](http://spark.apache.org/docs/latest/sql-programming-guide) 6 | * [spark.ml](http://spark.apache.org/docs/latest/ml-guide.html) 7 | * spark-notebook visualization capabilities 8 | 9 | Available labs: 10 | * [Data Analysis Toolbox](https://github.com/drewnoff/spark-notebook-ml-labs/tree/master/labs/DataAnalysisToolbox) 11 | * [Titanic Survival Exploration](https://github.com/drewnoff/spark-notebook-ml-labs/tree/master/labs/TitanicSurvivalExploration) 12 | * [Introduction To Machine Learning and Spark ML Pipelines](https://github.com/drewnoff/spark-notebook-ml-labs/tree/master/labs/IntroToMLandSparkMLPipelines) 13 | * [Bag of Words Meets Bags of Popcorn](https://github.com/drewnoff/spark-notebook-ml-labs/tree/master/labs/BagOfWordsMeetsBagsOfPopcorn) 14 | * [Neural Networks & Backpropagation with ND4J](https://github.com/drewnoff/spark-notebook-ml-labs/tree/master/labs/DLFramework) 15 | -------------------------------------------------------------------------------- /labs/BagOfWordsMeetsBagsOfPopcorn/images/recallPrecision.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/drewnoff/spark-notebook-ml-labs/26f80824cece2c3b78c050937ccb62e843e0de65/labs/BagOfWordsMeetsBagsOfPopcorn/images/recallPrecision.png -------------------------------------------------------------------------------- /labs/BagOfWordsMeetsBagsOfPopcorn/images/roc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/drewnoff/spark-notebook-ml-labs/26f80824cece2c3b78c050937ccb62e843e0de65/labs/BagOfWordsMeetsBagsOfPopcorn/images/roc.png -------------------------------------------------------------------------------- /labs/DLFramework/README.md: -------------------------------------------------------------------------------- 1 | # Neural Networks & Backpropagation with ND4J 2 | 3 | In this lab we're going to implement a small framework for training neural networks for classification tasks using [ND4J](http://nd4j.org/) numerical computing library . 4 | 5 | This lab is not intended to provide full explanation of underlying theory. Recommended materials: [deeplearningbook.org](http://www.deeplearningbook.org/), [Introduction to Deep Learning leacture slides](https://m2dsupsdlclass.github.io/lectures-labs/). 6 | 7 | Our framework will support following neural network layers. 
8 | 9 | 10 | 11 | 12 | - **Fully-connected layer (or dense layer)**. Neurons in a fully connected layer have full connections to all activations in the previous layer. Their activations can hence be computed with a matrix multiplication followed by a bias offset.* 13 | 14 | \mathrm{Dense} \equiv f\left(\textbf{x}\right)=\textbf{W}\textbf{x}+\textbf{b}, 15 | 16 | where 17 | \textbf{W}\in\mathbb{R}^{(k,n)} - weight matrix, 18 | \textbf{b}\in\mathbb{R}^k - bias offset. 19 | 20 | 21 | - **Sigmoid activation layer**. 22 | 23 | \mathrm{Sigmoid} \equiv f\left(\textbf{x}\right)=\frac{1}{1+\exp^{\textbf{-x}}} 24 | 25 | 26 | - **[Dropout layer](https://www.cs.toronto.edu/~hinton/absps/JMLRdropout.pdf)**. It's introduced to prevent overfitting. 27 | It takes parameter $d$ which is equal to probability of individual neuron being "dropped out" during the *training stage* independently for each training example. The removed nodes are then reinserted into the network with their original weights. At *testing stage* we're using the full network with each neuron's output weighted by a factor of $1-d$, so the expected value of the output of any neuron is the same as in the training stages. 28 | 29 |  $$\mathrm{Dropout_{train}} \equiv f\left(\textbf{x}\right)=\textbf{m}\odot\textbf{x}$$    $$\textbf{m} \in \left\{0,1\right\}^{n}$$    $$p\left(m_{i}=0\right)=d$$        $$\mathrm{Dropout_{test}}\equiv f\left(\textbf{x}\right)=\left(1-d\right)\textbf{x}$$ 30 | 31 | 32 | - **Softmax classifier layer**. It's a generalization of binary Logistic Regression classifier to multiple classes. The Softmax classifier gives normalized class probabilities as its output. 33 | 34 |  $$\mathrm{Softmax}_{i} \equiv p_{i}\left(\textbf{x}\right)=\frac{e^{x_{i}}}{\sum_{j}{e^{x_{j}}}}$$ 35 | 36 | We will use the Softmax classifier together with **cross-entropy loss** which is a generalization of binary log loss for multiple classes. 37 | The cross-entropy between a “true” distribution $p$ and an estimated distribution $q$ is defined as: 38 | 39 | $$\mathcal{L}=-\sum_{i}{p_{i}\log{q_{i}}}$$ 40 | 41 | The Softmax classifier is hence minimizing the cross-entropy between the estimated class probabilities and the “true” distribution, where "true" distribution $\textbf{p}=\left[p_{1}...p_{i}...\right]$ with only one element is equal to $1$ (true class) and all the other are equal to $0$. 42 | 43 | ## Install ND4J 44 | 45 | ### Prerequisites 46 | 47 | - [JavaCPP](http://nd4j.org/getstarted#javacpp) 48 | - [BLAS (ATLAS, MKL, or OpenBLAS)](http://nd4j.org/getstarted#blas) 49 | 50 | These will vary depending on whether you’re running on CPUs or GPUs. 51 | The default backend for CPUs is `nd4j-native-platform`, and for CUDA it is `nd4j-cuda-7.5-platform`. 52 | 53 | Assuming the default backend for CPUs is used, `customDeps` section of Spark Notebook metadata (`Edit` -> `Edit Notebook Metadata`) should look like following: 54 | 55 | ``` 56 | "customDeps": [ 57 | "org.bytedeco % javacpp % 1.3.2", 58 | "org.nd4j % nd4j-native-platform % 0.8.0", 59 | "org.nd4j %% nd4s % 0.8.0", 60 | "org.deeplearning4j % deeplearning4j-core % 0.8.0" 61 | ] 62 | ``` 63 | 64 | **[ND4J user guide](http://nd4j.org/userguide)** might be of the great help to track neural network components implementation. 
65 | 66 | ```scala 67 | import org.nd4j.linalg.factory.Nd4j 68 | import org.nd4j.linalg.api.ndarray.INDArray 69 | import org.nd4j.linalg.ops.transforms.Transforms 70 | import org.nd4s.Implicits._ 71 | 72 | import org.nd4j.linalg.cpu.nativecpu.rng.CpuNativeRandom 73 | ``` 74 | 75 | ```scala 76 | val rngSEED = 181 77 | val RNG = new CpuNativeRandom(rngSEED) 78 | ``` 79 | 80 | ## Sigmoid & Softmax functions 81 | 82 | First let's implement **`sigmoid`** and **`sigmoidGrad`** functions: 83 | 84 | - **`sigmoid`** function applies sigmoid transformation in an element-wise manner to each row of the input; 85 | - **`sigmoidGrad`** computes the gradient for the sigmoid function. It takes sigmoid function value as an input. 86 | 87 | ```scala 88 | def sigmoid(x: INDArray): INDArray = { 89 | Transforms.pow(Transforms.exp(-x) + 1, -1) 90 | } 91 | 92 | 93 | def sigmoidGrad(f: INDArray): INDArray = { 94 | f * (-f + 1) 95 | } 96 | ``` 97 | 98 | We used [`Transform ops`](http://nd4j.org/userguide#opstransform) to apply element-wise `exp` and `pow`. 99 | 100 | **`softmax`** computes the softmax function for each row of the input. 101 | 102 | ```scala 103 | def softmax(x: INDArray): INDArray = { 104 | val exps = Transforms.exp(x.addColumnVector(-x.max(1))) 105 | exps.divColumnVector(exps.sum(1)) 106 | } 107 | ``` 108 | 109 | In addition to previously seen `Transforms ops` we also used [`Vector ops`](http://nd4j.org/userguide#opsbroadcast) here to subtract from each row its max element and divide each row by the sum of its elements. 110 | 111 | ```scala 112 | def sigmoidTest(): Unit = { 113 | val x = Array(Array(1, 2), Array(-1, -2)).toNDArray 114 | val f = sigmoid(x) 115 | val g = sigmoidGrad(f) 116 | val sigmoidVals = Array(Array(0.73105858, 0.88079708), 117 | Array(0.26894142, 0.11920292)).toNDArray 118 | val gradVals = Array(Array(0.19661193, 0.10499359), 119 | Array(0.19661193, 0.10499359)).toNDArray 120 | assert((f - Transforms.abs(sigmoidVals)).max(1) < 1e-6) 121 | assert((g - Transforms.abs(gradVals)).max(1) < 1e-6) 122 | println("sigmoid tests passed") 123 | } 124 | 125 | 126 | def softmaxTest(): Unit = { 127 | val x = Array(Array(1001, 1002), 128 | Array(3, 4)).toNDArray 129 | val logits = softmax(x) 130 | val expectedLogits = Array(Array(0.26894142, 0.73105858), 131 | Array(0.26894142, 0.73105858)).toNDArray 132 | assert((logits - Transforms.abs(expectedLogits)).max(1) < 1e-6) 133 | assert( 134 | (softmax(Array(1, 1).toNDArray) - Transforms.abs(Array(0.5, 0.5).toNDArray)).max(1) < 1e-6 135 | ) 136 | println("softmax tests passed") 137 | } 138 | ``` 139 | 140 | ```scala 141 | sigmoidTest 142 | softmaxTest 143 | ``` 144 | 145 | ## Network Layers 146 | 147 | Let's define `NetLayer` trait for building network layers. We need to provide two methods: 148 | - `forwardProp` for forward propagation of input through the neural network in order to generate the network's output. 149 | - `backProp` for delta backpropagation and weights update. 150 | `backProp` takes the weight's output gradients with respect to layer's inputs. The weight's output gradient and input activation are multiplied to find the gradient of the weight. A ratio (gets tuned by `learningRate`) of the weight's gradient is subtracted from the weight. 
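In equation form (a restatement of the above for the dense layer, with input $\textbf{x}$, output $\textbf{y}=\textbf{W}\textbf{x}+\textbf{b}$, loss $\mathcal{L}$ and learning rate $\eta$; gradients are accumulated over the batch):

 $$\frac{\partial \mathcal{L}}{\partial \textbf{W}}=\textbf{x}^{T}\frac{\partial \mathcal{L}}{\partial \textbf{y}},\qquad \frac{\partial \mathcal{L}}{\partial \textbf{b}}=\frac{\partial \mathcal{L}}{\partial \textbf{y}},\qquad \frac{\partial \mathcal{L}}{\partial \textbf{x}}=\frac{\partial \mathcal{L}}{\partial \textbf{y}}\textbf{W}^{T},\qquad \textbf{W}\leftarrow\textbf{W}-\eta\frac{\partial \mathcal{L}}{\partial \textbf{W}},\quad \textbf{b}\leftarrow\textbf{b}-\eta\frac{\partial \mathcal{L}}{\partial \textbf{b}}$$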
151 | 152 | ```scala 153 | trait NetLayer { 154 | def forwardProp(inputs: INDArray, isTrain: Boolean): INDArray 155 | def backProp(outputsGrad: INDArray): INDArray 156 | } 157 | ``` 158 | 159 | ```scala 160 | class Dense(inputDim: Int, outputDim: Int, val learningRate: Double) extends NetLayer { 161 | private val W = Nd4j.rand(Array(inputDim, outputDim), -0.01, 0.01, RNG) 162 | private val b = Nd4j.rand(Array(1, outputDim), -0.01, 0.01, RNG) 163 | private var _inputs = Nd4j.zeros(1, inputDim) 164 | 165 | def forwardProp(inputs: INDArray, isTrain: Boolean): INDArray = { 166 | _inputs = inputs 167 | (inputs mmul W) addRowVector b 168 | } 169 | 170 | def backProp(outputsGrad: INDArray): INDArray = { 171 | val gradW = _inputs.T mmul outputsGrad 172 | val gradb = outputsGrad.sum(0) 173 | val prop = outputsGrad mmul W.T 174 | W -= gradW * learningRate 175 | b -= gradb * learningRate 176 | prop 177 | } 178 | } 179 | ``` 180 | 181 | ```scala 182 | class SigmoidActivation extends NetLayer { 183 | private var _outputs = Nd4j.zeros(1) 184 | 185 | def forwardProp(inputs: INDArray, isTrain: Boolean): INDArray = { 186 | _outputs = sigmoid(inputs) 187 | _outputs 188 | } 189 | 190 | def backProp(outputsGrad: INDArray): INDArray = { 191 | outputsGrad * sigmoidGrad(_outputs) 192 | } 193 | } 194 | ``` 195 | 196 | ```scala 197 | class Dropout(val dropRate: Double = 0.0) extends NetLayer { 198 | var mask: INDArray = Nd4j.zeros(1) 199 | def forwardProp(inputs: INDArray, isTrain: Boolean): INDArray = { 200 | if (isTrain) { 201 | mask = Nd4j.zeros(1, inputs.shape()(1)) 202 | Nd4j.choice(Array(0, 1).toNDArray, Array(dropRate, 1 - dropRate).toNDArray, mask) 203 | inputs.mulRowVector(mask) 204 | } else { 205 | inputs * (1 - dropRate) 206 | } 207 | } 208 | 209 | def backProp(outputsGrad: INDArray): INDArray = { 210 | outputsGrad.mulRowVector(mask) 211 | } 212 | } 213 | ``` 214 | 215 | We assume that the **Softmax** is always the last layer of the network. 216 | 217 | Also it can be shown that the gradient of cross-entropy loss of the outputs of softmax layer with respect to softmax layer's input has a simple form: 218 | 219 |  $$\frac{\partial \mathcal{L}}{\partial x_{i}}=g_{i}-p_{i}$$ 220 | 221 | So to start backpropagation stage let's take the `Softmax` output probabilities alongside with true labels as an input for `backProp` method of the `Softmax` layer. 
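(In the notation used earlier for the cross-entropy loss — "true" distribution $\textbf{p}$ and estimated distribution $\textbf{q}=\mathrm{Softmax}\left(\textbf{x}\right)$ — this gradient reads $\frac{\partial \mathcal{L}}{\partial x_{i}}=q_{i}-p_{i}$: predicted probability minus true label.)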
222 | 223 | 224 | ```scala 225 | import org.nd4s.Implicits._ 226 | 227 | class Softmax extends NetLayer { 228 | def forwardProp(inputs: INDArray, isTrain: Boolean): INDArray = { 229 | softmax(inputs) 230 | } 231 | 232 | def backProp(outputsGrad: INDArray): INDArray = { 233 | val predictions = outputsGrad(0, ->) 234 | val labels = outputsGrad(1, ->) 235 | predictions - labels 236 | } 237 | } 238 | ``` 239 | 240 | ```scala 241 | def crossEntropy(predictions: INDArray, labels: INDArray): Double = { 242 | val cost = - (Transforms.log(predictions) * labels).sumNumber.asInstanceOf[Double] 243 | cost / labels.shape()(0) 244 | } 245 | ``` 246 | 247 | ```scala 248 | def accuracy(predictions: INDArray, labels: INDArray): Double = { 249 | val samplesNum = labels.shape()(0) 250 | val matchesNum = (Nd4j.argMax(predictions, 1) eq Nd4j.argMax(labels, 1)).sumNumber.asInstanceOf[Double] 251 | 100.0 * matchesNum / samplesNum 252 | } 253 | ``` 254 | 255 | ## Neural Network 256 | 257 | ```scala 258 | import org.nd4j.linalg.dataset.api.iterator.DataSetIterator 259 | import org.nd4j.linalg.dataset.DataSet 260 | ``` 261 | 262 | ```scala 263 | case class Metric(epoch: Int, acc: Double, loss: Double) 264 | ``` 265 | 266 | We will use the class called `DataSetIterator` to fetch `DataSet`s. 267 | 268 | ```scala 269 | import scala.collection.JavaConverters._ 270 | 271 | 272 | case class NeuralNet(layers: Vector[NetLayer] = Vector()) { 273 | 274 | def addLayer(layer: NetLayer): NeuralNet = { 275 | this.copy(layers :+ layer) 276 | } 277 | 278 | def fit(trainData: DataSetIterator, numEpochs: Int, validationData: DataSet): Seq[Metric] = { 279 | val history = (1 to numEpochs).foldLeft(List[Metric]()){ (history, epoch) => 280 | trainData.reset() 281 | trainData.asScala.foreach ( ds => trainBatch(ds.getFeatures, ds.getLabels) ) 282 | 283 | // validate on validation Dataset 284 | val prediction = this.predict(validationData.getFeatures) 285 | val loss = crossEntropy(prediction, validationData.getLabels) 286 | val acc = accuracy(prediction, validationData.getLabels) 287 | 288 | println(s"Epoch: $epoch/$numEpochs - loss: $loss - acc: $acc") 289 | 290 | Metric(epoch, acc, loss) :: history 291 | } 292 | history.reverse 293 | } 294 | 295 | def predict(X: INDArray): INDArray = { 296 | layers.foldLeft(X){ 297 | (input, layer) => layer.forwardProp(input, isTrain=false) 298 | } 299 | } 300 | 301 | private def trainBatch(X: INDArray, Y: INDArray): Unit = { 302 | val YPredict = layers.foldLeft(X){ 303 | (input, layer) => layer.forwardProp(input, isTrain=true) 304 | } 305 | val shape = Y.shape 306 | layers.reverse.foldLeft( 307 | Nd4j.vstack(YPredict, Y).reshape(2, shape(0), shape(1)) 308 | ){ 309 | (deriv, layer) => layer.backProp(deriv) 310 | } 311 | } 312 | } 313 | ``` 314 | 315 | ## MNIST 316 | 317 | Now let's apply our framework to build neural network for MNIST dataset classification. 318 | The `DatasetIterator` implementation called `MnistDataSetIterator` is available in `deeplearning4j` to iterate over MNIST dataset. 
319 | 320 | ```scala 321 | import org.deeplearning4j.datasets.iterator.impl.MnistDataSetIterator 322 | ``` 323 | 324 | ```scala 325 | val learningRate = 0.01 326 | val batchSize = 128 327 | 328 | val mnistTrain = new MnistDataSetIterator(batchSize, true, rngSEED) 329 | val mnistTest = new MnistDataSetIterator(batchSize, false, rngSEED) 330 | 331 | val inputDim = mnistTest.next.getFeatures.shape()(1) 332 | val totalTestExamples = mnistTest.numExamples() 333 | ``` 334 | 335 | ```scala 336 | val model = NeuralNet() 337 | .addLayer(new Dense(inputDim=inputDim, outputDim=512, learningRate=learningRate)) 338 | .addLayer(new SigmoidActivation()) 339 | .addLayer(new Dropout(dropRate=0.3)) 340 | .addLayer(new Dense(512, 512, learningRate)) 341 | .addLayer(new SigmoidActivation()) 342 | .addLayer(new Dropout(0.3)) 343 | .addLayer(new Dense(512, 10, learningRate)) 344 | .addLayer(new Softmax()) 345 | ``` 346 | 347 | ```scala 348 | val history = model.fit(mnistTrain, 40, (new MnistDataSetIterator(totalTestExamples, false, rngSEED)).next) 349 | ``` 350 | > Epoch: 1/40 - loss: 0.57162705078125 - acc: 81.94 351 | Epoch: 2/40 - loss: 0.348628173828125 - acc: 89.34 352 | Epoch: 3/40 - loss: 0.273960546875 - acc: 91.83 353 | Epoch: 4/40 - loss: 0.2305306396484375 - acc: 92.76 354 | Epoch: 5/40 - loss: 0.20194395751953126 - acc: 93.76 355 | Epoch: 6/40 - loss: 0.17214320068359376 - acc: 94.87 356 | Epoch: 7/40 - loss: 0.15777041015625 - acc: 95.29 357 | Epoch: 8/40 - loss: 0.1411923583984375 - acc: 95.75 358 | Epoch: 9/40 - loss: 0.1371442138671875 - acc: 95.65 359 | Epoch: 10/40 - loss: 0.1223932373046875 - acc: 96.2 360 | Epoch: 11/40 - loss: 0.11889525146484375 - acc: 96.35 361 | Epoch: 12/40 - loss: 0.11355523681640625 - acc: 96.5 362 | Epoch: 13/40 - loss: 0.10255557861328125 - acc: 96.63 363 | Epoch: 14/40 - loss: 0.10248739013671875 - acc: 96.67 364 | Epoch: 15/40 - loss: 0.10121082153320313 - acc: 96.76 365 | Epoch: 16/40 - loss: 0.09314661254882813 - acc: 97.05 366 | Epoch: 17/40 - loss: 0.0908234619140625 - acc: 97.09 367 | Epoch: 18/40 - loss: 0.08782809448242188 - acc: 97.21 368 | Epoch: 19/40 - loss: 0.084460498046875 - acc: 97.25 369 | Epoch: 20/40 - loss: 0.08508148803710938 - acc: 97.32 370 | Epoch: 21/40 - loss: 0.08242890625 - acc: 97.49 371 | Epoch: 22/40 - loss: 0.07931015014648438 - acc: 97.55 372 | Epoch: 23/40 - loss: 0.07825602416992188 - acc: 97.6 373 | Epoch: 24/40 - loss: 0.07847127685546874 - acc: 97.47 374 | Epoch: 25/40 - loss: 0.07547276611328126 - acc: 97.6 375 | Epoch: 26/40 - loss: 0.074110009765625 - acc: 97.64 376 | Epoch: 27/40 - loss: 0.07486264038085938 - acc: 97.69 377 | Epoch: 28/40 - loss: 0.07151276245117187 - acc: 97.73 378 | Epoch: 29/40 - loss: 0.07469411010742187 - acc: 97.76 379 | Epoch: 30/40 - loss: 0.06966272583007813 - acc: 97.88 380 | Epoch: 31/40 - loss: 0.066982666015625 - acc: 97.84 381 | Epoch: 32/40 - loss: 0.06796741333007812 - acc: 97.87 382 | Epoch: 33/40 - loss: 0.06789564208984375 - acc: 97.95 383 | Epoch: 34/40 - loss: 0.065538916015625 - acc: 98.03 384 | Epoch: 35/40 - loss: 0.066549365234375 - acc: 97.88 385 | Epoch: 36/40 - loss: 0.06736263427734375 - acc: 97.83 386 | Epoch: 37/40 - loss: 0.0646685302734375 - acc: 97.98 387 | Epoch: 38/40 - loss: 0.0628564208984375 - acc: 97.97 388 | Epoch: 39/40 - loss: 0.0657330322265625 - acc: 98.0 389 | Epoch: 40/40 - loss: 0.063365771484375 - acc: 97.98 390 | 391 | ```scala 392 | CustomPlotlyChart(history, 393 | layout="{title: 'Accuracy on validation set', xaxis: {title: 'epoch'}, yaxis: {title: '%'}}", 
394 | dataOptions="{mode: 'lines'}", 395 | dataSources="{x: 'epoch', y: 'acc'}") 396 | ``` 397 | 398 | 399 | 400 | 401 | ```scala 402 | CustomPlotlyChart(history, 403 | layout="{title: 'Cross entropy on validation set', xaxis: {title: 'epoch'}, yaxis: {title: 'loss'}}", 404 | dataOptions="""{ 405 | mode: 'lines', 406 | line: { 407 | color: 'green', 408 | width: 3 409 | } 410 | }""", 411 | dataSources="{x: 'epoch', y: 'loss'}") 412 | ``` 413 | 414 | 415 | 416 | 417 | ## On your own: 418 | - Implement [rectified linear unit (ReLU)](https://en.wikipedia.org/wiki/Rectifier_(neural_networks)) activation. 419 | - Add support for *L2* regularization on `Dense` layer weights. 420 | - Train similar neural network with `relu` activation instead of `sigmoid` activation and added support for `L2` regularization. Compare obtained results. 421 | -------------------------------------------------------------------------------- /labs/DataAnalysisToolbox/README.md: -------------------------------------------------------------------------------- 1 | ## Data Analysis Toolbox 2 | In this lab we are going to get familiar with **Breeze** numerical processing library, Spark **DataFrames** (distributed collections of data organized into named columns) and **C3 Charts** library in a way of solving little challenges. At the beginning of each section are reference materials necessary for solving the problems. 3 | ### Breeze 4 | * [Quick start tutorial](https://github.com/scalanlp/breeze/wiki/Quickstart) 5 | 6 | ```scala 7 | import breeze.linalg._ 8 | import breeze.stats.{mean, stddev} 9 | import breeze.stats.distributions._ 10 | ``` 11 | 12 | 13 | >
 14 | > import breeze.linalg._
 15 | > import breeze.stats.{mean, stddev}
 16 | > import breeze.stats.distributions._
 17 | >
18 | 19 | 20 | 21 | ** Problem 1.** Implement a method that takes Matrix X and two sequences ii and jj of equal size as an input and produces breeze.linalg.DenseVector[Double] of elements [X[ii[0], jj[0]], X[ii[1], jj[1]], ..., X[ii[N-1], jj[N-1]]]. 22 | 23 | ```scala 24 | def constructVector(X: Matrix[Double], ii: Seq[Int], jj: Seq[Int]): DenseVector[Double] = ??? 25 | ``` 26 | 27 | 28 | >
 29 | > constructVector: (X: breeze.linalg.Matrix[Double], ii: Seq[Int], jj: Seq[Int])breeze.linalg.DenseVector[Double]
 30 | >
31 | 32 | 33 | 34 | 35 | ```scala 36 | // Solution for problem 1 37 | def constructVector(X: Matrix[Double], ii: Seq[Int], jj: Seq[Int]): DenseVector[Double] = 38 | DenseVector(ii.zip(jj).map(ix => X(ix._1, ix._2)).toArray) 39 | 40 | constructVector(DenseMatrix((1.0,2.0,3.0), 41 | (4.0,5.0,6.0), 42 | (7.0, 8.0, 9.0)), 43 | List(0, 1, 2), List(0, 1, 2)) 44 | ``` 45 | 46 | 47 | >
 48 | > constructVector: (X: breeze.linalg.Matrix[Double], ii: Seq[Int], jj: Seq[Int])breeze.linalg.DenseVector[Double]
 49 | > res4: breeze.linalg.DenseVector[Double] = DenseVector(1.0, 5.0, 9.0)
 50 | >
51 | 52 | > DenseVector(1.0, 5.0, 9.0) 53 | 54 | ** Problem 2. ** Write a method to calculate the product of nonzero elements on the diagonal of a rectangular matrix. For example, for X = Matrix((1.0, 0.0, 1.0), (2.0, 0.0, 2.0), (3.0, 0.0, 3.0), (4.0, 4.0, 4.0)) the answer is Some(3). If there are no nonzero elements, the method should return None. 55 | 56 | ```scala 57 | def nonzeroProduct(X: Matrix[Double]): Option[Double] = ??? 58 | ``` 59 | 60 | 61 | >
 62 | > nonzeroProduct: (X: breeze.linalg.Matrix[Double])Option[Double]
 63 | >
64 | 65 | 66 | 67 | 68 | ```scala 69 | // Solution for problem 2 70 | def nonzeroProduct(X: Matrix[Double]): Option[Double] = 71 | (0 until min(X.rows, X.cols)).map(i => X(i, i)).filter(_ != 0) match { 72 | case Seq() => None 73 | case xs => Some(xs.reduce(_ * _)) 74 | } 75 | 76 | nonzeroProduct(Matrix((1.0, 0.0, 1.0), (2.0, 0.0, 2.0), (3.0, 0.0, 3.0), (4.0, 4.0, 4.0))) 77 | ``` 78 | 79 | 80 | >
 81 | > nonzeroProduct: (X: breeze.linalg.Matrix[Double])Option[Double]
 82 | > res7: Option[Double] = Some(3.0)
 83 | >
84 | 85 | > Some(3.0) 86 | 87 | ** Problem 3. ** Write a method that finds the maximum among the elements that are immediately preceded by a zero element. For example, for Vector(6, 2, 0, 3, 0, 0, 5, 7, 0) the answer is Some(5). If there are no such elements, the method should return None. 88 | 89 | ```scala 90 | def maxAfterZeroElement(vec: Vector[Double]): Option[Double] = ??? 91 | ``` 92 | 93 | 94 | >
 95 | > maxAfterZeroElement: (vec: breeze.linalg.Vector[Double])Option[Double]
 96 | >
97 | 98 | 99 | 100 | 101 | ```scala 102 | def maxAfterZeroElement(vec: Vector[Double]): Option[Double] = 103 | vec.toArray.foldLeft((None, false): (Option[Double], Boolean))( 104 | (prev: (Option[Double], Boolean), el: Double) => 105 | if (el == 0) { 106 | (prev._1, true) 107 | } else { 108 | prev match { 109 | case (p, false) => (p, false) 110 | case (None, true) => (Some(el), false) 111 | case (Some(m), true) => ({if (el > m) Some(el) else Some(m)}, false) 112 | } 113 | } 114 | )._1 115 | ``` 116 | 117 | 118 | >
119 | > maxAfterZeroElement: (vec: breeze.linalg.Vector[Double])Option[Double]
120 | >
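A quick check on the example from the problem statement (the second call is a made-up input without zero elements, so it should give `None`):

```scala
maxAfterZeroElement(DenseVector(6.0, 2.0, 0.0, 3.0, 0.0, 0.0, 5.0, 7.0, 0.0))  // Some(5.0)
maxAfterZeroElement(DenseVector(1.0, 2.0, 3.0))                                // None
```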
121 | 122 | 123 | 124 | ** Problem 4. ** Write a method that takes Matrix X and some number Double v and returns closest matrix element to given number v. For example: for X = new DenseMatrix(2, 5, DenseVector.range(0, 10).mapValues(_.toDouble).toArray) and v = 3.6 the answer would be 4.0. 125 | 126 | ```scala 127 | def closestValue(X: DenseMatrix[Double], v: Double): Double = ??? 128 | ``` 129 | 130 | 131 | >
132 | > closestValue: (X: breeze.linalg.DenseMatrix[Double], v: Double)Double
133 | >
134 | 135 | 136 | 137 | 138 | ```scala 139 | // Solution for problem 4 140 | import scala.math.abs 141 | 142 | def closestValue(X: DenseMatrix[Double], v: Double): Double = 143 | X(argmin(X.map(e => abs(e - v)))) 144 | ``` 145 | 146 | 147 | >
148 | > import scala.math.abs
149 | > closestValue: (X: breeze.linalg.DenseMatrix[Double], v: Double)Double
150 | >
151 | 152 | 153 | 154 | 155 | ```scala 156 | // Another solution for problem 4 157 | import breeze.numerics.abs 158 | 159 | def closestValue(X: DenseMatrix[Double], v: Double): Double = 160 | X(argmin(abs(X - v))) 161 | ``` 162 | 163 | 164 | >
165 | > import breeze.numerics.abs
166 | > closestValue: (X: breeze.linalg.DenseMatrix[Double], v: Double)Double
167 | >
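A quick check using the example from the problem statement (expected answer: 4.0):

```scala
val testM = new DenseMatrix(2, 5, DenseVector.range(0, 10).mapValues(_.toDouble).toArray)
closestValue(testM, 3.6)  // 4.0
```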
168 | 169 | 170 | 171 | ** Problem 5. ** Write a method that takes Matrix X and scales each column of this matrix by subtracting mean value and dividing by standard deviation of the column. For testing one can generate random matrix. Avoid division by zero. 172 | 173 | ```scala 174 | def scale(X: DenseMatrix[Double]): Unit = ??? 175 | ``` 176 | 177 | 178 | >
179 | > scale: (X: breeze.linalg.DenseMatrix[Double])Unit
180 | >
181 | 182 | 183 | 184 | 185 | ```scala 186 | // Solution for problem 5 187 | def scale(X: DenseMatrix[Double]): Unit = { 188 | val mm = mean(X(::, *)) // using broadcasting 189 | val std = stddev(X(::, *)) // https://github.com/scalanlp/breeze/wiki/Quickstart#broadcasting 190 | (0 until X.cols).foreach{i => 191 | if (std(0, i) == 0.0) { 192 | X(::, i) := 0.0 193 | } else { 194 | X(::, i) := (X(::, i) - mm(0, i)) :/ std(0, i) 195 | } 196 | } 197 | } 198 | ``` 199 | 200 | 201 | >
202 | > scale: (X: breeze.linalg.DenseMatrix[Double])Unit
203 | >
204 | 205 | 206 | 207 | 208 | ```scala 209 | // Another solution for problem 5 210 | def scale(X: DenseMatrix[Double]): Unit = 211 | (0 until X.cols).map{i => 212 | val col = X(::, i) 213 | val std = stddev(col) 214 | if (std != 0.0) { 215 | X(::, i) := (col - mean(col)) / std 216 | } else { 217 | X(::, i) := DenseVector.zeros[Double](col.size) 218 | } 219 | } 220 | ``` 221 | 222 | 223 | >
224 | > scale: (X: breeze.linalg.DenseMatrix[Double])Unit
225 | >
226 | 227 | 228 | 229 | 230 | ```scala 231 | // Let's test our scale method on random data 232 | val nd = new Gaussian(12, 20) 233 | val m = DenseMatrix.rand(10, 3, nd) 234 | println(m) 235 | println("============") 236 | scale(m) 237 | println(m) 238 | ``` 239 | 240 | 241 | >
242 | > 15.590452840444563  26.751701453651677   -3.87442957211206    
243 | > 20.327157147052404  4.872835405186789    -1.723076564770194   
244 | > 8.623837647458954   -12.515032706820008  17.23652514034355    
245 | > -22.6959606971933   -3.5252869052855402  -28.569802562830404  
246 | > 5.084148521366598   6.537587281421278    1.27947368109675     
247 | > 45.550604542120766  33.63584014298664    14.398835562651708   
248 | > 28.39067989774948   21.884251067827837   26.21188242480804    
249 | > 35.760270426060366  33.15913097645061    43.652905311745315   
250 | > -6.957271573704126  30.631777233387844   4.858850308567796    
251 | > 32.17744687777203   8.983683803901943    4.909365750891229    
252 | > ============
253 | > -0.02858428109928919  0.714489638531793    -0.6056134391326071   
254 | > 0.1990918470152323    -0.6204508202172598  -0.4943741445319128   
255 | > -0.36344410568028807  -1.6813727428933674  0.48596367654601474   
256 | > -1.8688727663822855   -1.132862808340346   -1.882528878864535    
257 | > -0.5335840727948753   -0.5188758744023809  -0.3391222773517981   
258 | > 1.411491116397139     1.1345258158947291   0.33923620511713787   
259 | > 0.5866760236928795    0.41750183136681246  0.9500494912429874    
260 | > 0.9409054052767336    1.1054393745747901   1.8518666623146085    
261 | > -1.1123712899791023   0.9512327188133928   -0.1540446402595702   
262 | > 0.7686921235538567    -0.3696271333281644  -0.15143265508032536  
263 | > nd: breeze.stats.distributions.Gaussian = Gaussian(12.0, 20.0)
264 | > m: breeze.linalg.DenseMatrix[Double] = 
265 | > -0.02858428109928919  0.714489638531793    -0.6056134391326071   
266 | > 0.1990918470152323    -0.6204508202172598  -0.4943741445319128   
267 | > -0.36344410568028807  -1.6813727428933674  0.48596367654601474   
268 | > -1.8688727663822855   -1.132862808340346   -1.882528878864535    
269 | > -0.5335840727948753   -0.5188758744023809  -0.3391222773517981   
270 | > 1.411491116397139     1.1345258158947291   0.33923620511713787   
271 | > 0.5866760236928795    0.41750183136681246  0.9500494912429874    
272 | > 0.9409054052767336    1.1054393745747901   1.8518666623146085    
273 | > -1.1123712899791023   0.9512327188133928   -0.1540446402595702   
274 | > 0.7686921235538567    -0.3696271333281644  -0.15143265508032536  
275 | >
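As an extra sanity check (a small sketch reusing the broadcasting expressions from the first solution), after scaling each column should have mean close to 0 and standard deviation close to 1:

```scala
println(mean(m(::, *)))    // each entry should be ~ 0.0
println(stddev(m(::, *)))  // each entry should be ~ 1.0
```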
276 | 277 | 278 | 279 | ** Problem 6. ** Implement a method that for given matrix X finds: 280 | * the determinant 281 | * the trace 282 | * max and min elements 283 | * Frobenius Norm 284 | * eigenvalues 285 | * inverse matrix 286 | 287 | For testing one can generate random matrix from normal distribution $N(10, 1)$. 288 | 289 | ```scala 290 | def getStats(X: Matrix[Double]): Unit = ??? 291 | ``` 292 | 293 | 294 | >
295 | > getStats: (X: breeze.linalg.Matrix[Double])Unit
296 | >
297 | 298 | 299 | 300 | 301 | ```scala 302 | // Solution for problem 6 303 | def getStats(X: DenseMatrix[Double]): String = { 304 | val dt = det(X) 305 | val tr = trace(X) 306 | val minE = min(X) 307 | val maxE = max(X) 308 | val frob = breeze.linalg.norm(X.toDenseVector) 309 | val ev = eig(X).eigenvalues 310 | val invM = inv(X) 311 | 312 | s"""Stats: 313 | determinant: $dt 314 | trace: $tr 315 | min element: $minE 316 | max element: $maxE 317 | Frobenius Norm: $frob 318 | eigenvalues: $ev 319 | inverse matrix:\n$invM""".stripMargin 320 | } 321 | ``` 322 | 323 | 324 | >
325 | > getStats: (X: breeze.linalg.DenseMatrix[Double])String
326 | >
327 | 328 | 329 | 330 | 331 | ```scala 332 | // Let's test our scale method on random data 333 | val nd = new Gaussian(10, 1) 334 | val X = DenseMatrix.rand(4, 4, nd) 335 | ``` 336 | 337 | 338 | >
339 | > nd: breeze.stats.distributions.Gaussian = Gaussian(10.0, 1.0)
340 | > X: breeze.linalg.DenseMatrix[Double] = 
341 | > 10.15867550081024   10.713391519035639  10.18898336794234   11.633517053992334  
342 | > 9.077895190590993   10.687077605375258  9.75691251834008    10.289451974113568  
343 | > 12.419948133142773  8.799359381094582   12.333412584337028  9.616047767507087   
344 | > 9.018762639197664   11.122058811926983  9.603119538562519   10.441697550864596  
345 | >
346 | 347 | 348 | 349 | 350 | ```scala 351 | println(getStats(X)) 352 | ``` 353 | 354 | 355 | >
356 | > Stats:
357 | > determinant: -14.64894396592202
358 | > trace: 43.62086324138712
359 | > min element: 8.799359381094582
360 | > max element: 12.419948133142773
361 | > Frobenius Norm: 41.681818838737364
362 | > eigenvalues: DenseVector(41.461632636433905, 1.182643130384728, 1.182643130384728, -0.20605565581625584)
363 | > inverse matrix:
364 | > 0.37634342430946144  -4.699111409373191   0.45067158561397047   3.796260506021671    
365 | > -0.3874775018168392  -1.7712409918032728  0.09247520887399419   2.0919567065524114   
366 | > -0.6039672881460412  4.807753751137877    -0.20653804947039545  -3.8745431604779714  
367 | > 0.6431296809327107   1.523754431265583    -0.297806479947489    -1.8480459447576396  
368 | >
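A small worked check on this output: the sum of the eigenvalues, $41.46 + 1.18 + 1.18 - 0.21 \approx 43.62$, matches the trace, as it should. The repeated eigenvalue suggests a complex-conjugate pair (breeze's `eig` returns the real and imaginary parts in separate vectors), which is also why the product of these real parts does not reproduce the determinant.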
369 | 370 | 371 | 372 | ### DataFrames 373 | * https://databricks.com/blog/2015/02/17/introducing-dataframes-in-spark-for-large-scale-data-science.html 374 | * http://spark.apache.org/docs/latest/sql-programming-guide.html 375 | 376 | In this lab we will be using [data](https://www.kaggle.com/c/titanic/download/train.csv) from [Titanic dataset](https://www.kaggle.com/c/titanic/data). 377 | To load data from csv file direct to Spark's Dataframe we will use [spark-csv](http://spark-packages.org/package/databricks/spark-csv) package. 378 | To add spark-csv package to spark notebook one could add "com.databricks:spark-csv_2.10:1.4.0" (or "com.databricks:spark-csv_2.11:1.4.0" for Scala 2.11) dependency into customDeps conf section. Alternatively one could specify this dependency in `--packages` command line option while submiting spark application to a cluster (`spark-submit`) or launching spark shell (`spark-shell`). 379 | 380 | ```scala 381 | import org.apache.spark.sql.SQLContext 382 | ``` 383 | 384 | 385 | >
386 | > import org.apache.spark.sql.SQLContext
387 | >
388 | 389 | 390 | 391 | 392 | ```scala 393 | val sqlContext = new SQLContext(sc) 394 | 395 | val df = sqlContext.read 396 | .format("com.databricks.spark.csv") 397 | .option("header", "true") 398 | .option("inferSchema", "true") 399 | .load("notebooks/labs/DataAnalysisToolbox/titanic.csv") 400 | ``` 401 | 402 | 403 | >
404 | > sqlContext: org.apache.spark.sql.SQLContext = org.apache.spark.sql.SQLContext@31b5f894
405 | > df: org.apache.spark.sql.DataFrame = [PassengerId: int, Survived: int, Pclass: int, Name: string, Sex: string, Age: double, SibSp: int, Parch: int, Ticket: string, Fare: double, Cabin: string, Embarked: string]
406 | >
407 | 408 | 409 | 410 | 411 | ```scala 412 | // df.show() 413 | df.limit(5) 414 | ``` 415 | 416 | 417 | >
418 | > res26: org.apache.spark.sql.DataFrame = [PassengerId: int, Survived: int, Pclass: int, Name: string, Sex: string, Age: double, SibSp: int, Parch: int, Ticket: string, Fare: double, Cabin: string, Embarked: string]
419 | >
420 | 421 | 422 | 423 | **Problem 1.** Describe given dataset by answering following questions. How many women and men were on board? How many passengers were in each class? What is the average/minimum/maximum age of passengers? What can you say about the number of the surviving passengers? 424 | 425 | ```scala 426 | // Solution for problem 1 427 | import org.apache.spark.sql.functions.{min, max, mean} 428 | 429 | df.groupBy("Sex").count().show() 430 | df.groupBy("Pclass").count().show() 431 | df.select(mean("Age").alias("Average Age"), min("Age"), max("Age")).show() 432 | 433 | val totalPassengers = df.count() 434 | val survived = df.groupBy("Survived").count() 435 | survived.withColumn("%", (survived("count") / totalPassengers) * 100).show() 436 | ``` 437 | 438 | 439 | >
440 | > +------+-----+
441 | > |   Sex|count|
442 | > +------+-----+
443 | > |female|  314|
444 | > |  male|  577|
445 | > +------+-----+
446 | > 
447 | > +------+-----+
448 | > |Pclass|count|
449 | > +------+-----+
450 | > |     1|  216|
451 | > |     2|  184|
452 | > |     3|  491|
453 | > +------+-----+
454 | > 
455 | > +-----------------+--------+--------+
456 | > |      Average Age|min(Age)|max(Age)|
457 | > +-----------------+--------+--------+
458 | > |29.69911764705882|    0.42|    80.0|
459 | > +-----------------+--------+--------+
460 | > 
461 | > +--------+-----+-----------------+
462 | > |Survived|count|                %|
463 | > +--------+-----+-----------------+
464 | > |       0|  549|61.61616161616161|
465 | > |       1|  342|38.38383838383838|
466 | > +--------+-----+-----------------+
467 | > 
468 | > import org.apache.spark.sql.functions.{min, max, mean}
469 | > totalPassengers: Long = 891
470 | > survived: org.apache.spark.sql.DataFrame = [Survived: int, count: bigint]
471 | >
472 | 473 | 474 | 475 | **Problem 2.** Is it true that women were more likely to survive than men? Who had more chances to survive: the passenger with a cheap ticket or the passenger with an expensive one? Is that true that youngest passengers had more chances to survive? 476 | 477 | ```scala 478 | import org.apache.spark.sql.functions.{sum, count} 479 | import org.apache.spark.sql.types.IntegerType 480 | ``` 481 | 482 | 483 | >
484 | > import org.apache.spark.sql.functions.{sum, count}
485 | > import org.apache.spark.sql.types.IntegerType
486 | >
487 | 488 | 489 | 490 | 491 | ```scala 492 | // Answer for q1 493 | df.groupBy("Sex") 494 | .agg((sum("Survived") / count("Survived")) 495 | .alias("survived part")) 496 | .show() 497 | ``` 498 | 499 | 500 | >
501 | > +------+-------------------+
502 | > |   Sex|      survived part|
503 | > +------+-------------------+
504 | > |female| 0.7420382165605095|
505 | > |  male|0.18890814558058924|
506 | > +------+-------------------+
507 | >
508 | 509 | 510 | 511 | Women were more likely to survive. 512 | 513 | ```scala 514 | // Answer for q2 515 | val survivedByFareRange = df.select(df("Survived"), 516 | ((df("Fare") / (df("SibSp") + df("Parch") + 1) / 5).cast(IntegerType) 517 | ).alias("fareRange")) 518 | 519 | survivedByFareRange.groupBy("fareRange") 520 | .agg((sum("Survived") / count("Survived")).alias("Survived part"), 521 | count("Survived").alias("passengers num")) 522 | .sort("fareRange") 523 | .show() 524 | ``` 525 | 526 | 527 | >
528 | > +---------+-------------------+--------------+
529 | > |fareRange|      Survived part|passengers num|
530 | > +---------+-------------------+--------------+
531 | > |        0|0.26744186046511625|            86|
532 | > |        1|0.27058823529411763|           425|
533 | > |        2| 0.4122137404580153|           131|
534 | > |        3| 0.5652173913043478|            23|
535 | > |        4| 0.2222222222222222|             9|
536 | > |        5| 0.5714285714285714|            70|
537 | > |        6|             0.5625|            32|
538 | > |        7|               0.56|            25|
539 | > |        8|                0.6|            15|
540 | > |        9|               0.75|             8|
541 | > |       10| 0.4166666666666667|            12|
542 | > |       11|                0.8|            10|
543 | > |       13|                1.0|             3|
544 | > |       14|               0.25|             4|
545 | > |       15| 0.6666666666666666|             9|
546 | > |       16|                1.0|             3|
547 | > |       17|                1.0|             3|
548 | > |       18|                1.0|             1|
549 | > |       21|                1.0|             3|
550 | > |       22|                1.0|             2|
551 | > +---------+-------------------+--------------+
552 | > only showing top 20 rows
553 | > 
554 | > survivedByFareRange: org.apache.spark.sql.DataFrame = [Survived: int, fareRange: int]
555 | >
556 | 557 | 558 | 559 | We can see that passengers with cheapest tickets had lowest chances to survive. To obtain ticket cost per passenger we had to divide ticket fare by number of persons (one person itself + number of Siblings/Spouses aboard + number of parents/children aboard) included in fare. 560 | 561 | ```scala 562 | // Answer for q3 563 | val survivedByAgeDecade = df.select(df("Survived"), 564 | ((df("Age") / 10).cast(IntegerType)).alias("decade")) 565 | survivedByAgeDecade.filter(survivedByAgeDecade("decade").isNotNull). 566 | groupBy("decade") 567 | .agg((sum("Survived") / count("Survived")).alias("Survived part"), 568 | count("Survived").alias("passengers num")) 569 | .sort("decade") 570 | .show() 571 | ``` 572 | 573 | 574 | >
575 | > +------+-------------------+--------------+
576 | > |decade|      Survived part|passengers num|
577 | > +------+-------------------+--------------+
578 | > |     0| 0.6129032258064516|            62|
579 | > |     1| 0.4019607843137255|           102|
580 | > |     2|               0.35|           220|
581 | > |     3|  0.437125748502994|           167|
582 | > |     4|0.38202247191011235|            89|
583 | > |     5| 0.4166666666666667|            48|
584 | > |     6| 0.3157894736842105|            19|
585 | > |     7|                0.0|             6|
586 | > |     8|                1.0|             1|
587 | > +------+-------------------+--------------+
588 | > 
589 | > survivedByAgeDecade: org.apache.spark.sql.DataFrame = [Survived: int, decade: int]
590 | >
591 | 592 | 593 | 594 | Here we can see that the youngest passengers had more chances to survive. 595 | **Problem 3.** Find all features with missing values. Suggest ways of handling features with missing values and specify their advantages and disadvantages. Apply these methods to the given dataset. 596 | **A.** Missing values can be replaced by the mean, the median or the most frequent value. The mean is not a robust statistic since it is strongly influenced by outliers and is better suited for normally distributed features. The median is a more robust estimator for data with high-magnitude values and is generally used for skewed distributions. The most frequent value is better suited for categorical features. 597 | 598 | ```scala 599 | df.columns.filter(col => df.filter(df(col).isNull).count > 0) 600 | ``` 601 | 602 | 603 | >
604 | > res37: Array[String] = Array(Age)
605 | >
606 | 607 | 608 | 609 | 610 | ```scala 611 | // using mean value 612 | val meanAge = df.select(mean("Age")).first.getDouble(0) 613 | df.select("Age").na.fill(meanAge).limit(10) 614 | ``` 615 | 616 | 617 | >
618 | > meanAge: Double = 29.69911764705882
619 | > res39: org.apache.spark.sql.DataFrame = [Age: double]
620 | >
621 | 622 | 623 | 624 | 625 | ```scala 626 | // using median value 627 | import org.apache.spark.SparkContext._ 628 | 629 | def getMedian(rdd: RDD[Double]): Double = { 630 | val sorted = rdd.sortBy(identity).zipWithIndex().map { 631 | case (v, idx) => (idx, v) 632 | } 633 | 634 | val count = sorted.count() 635 | 636 | if (count % 2 == 0) { 637 | val l = count / 2 - 1 638 | val r = l + 1 639 | (sorted.lookup(l).head + sorted.lookup(r).head).toDouble / 2 640 | } else sorted.lookup(count / 2).head.toDouble 641 | } 642 | val ageRDD = df.filter(df("Age").isNotNull).select("Age").map(row => row.getDouble(0)) 643 | val medianAge = getMedian(ageRDD) 644 | 645 | df.select("Age").na.fill(medianAge).limit(10) 646 | ``` 647 | 648 | 649 | >
650 | > import org.apache.spark.SparkContext._
651 | > getMedian: (rdd: org.apache.spark.rdd.RDD[Double])Double
652 | > ageRDD: org.apache.spark.rdd.RDD[Double] = MapPartitionsRDD[282] at map at :91
653 | > medianAge: Double = 28.0
654 | > res41: org.apache.spark.sql.DataFrame = [Age: double]
655 | >
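The most-frequent-value strategy mentioned above fits categorical features. `Age` is the only column with missing values here, so the following is purely a hypothetical sketch: it assumes a categorical column such as `Embarked` containing nulls and fills them with the modal value.

```scala
// Hypothetical sketch: fill a categorical column with its most frequent value
import org.apache.spark.sql.functions.desc

val mostFrequent = df.filter(df("Embarked").isNotNull)
                     .groupBy("Embarked").count()
                     .orderBy(desc("count"))
                     .first.getString(0)

df.na.fill(Map("Embarked" -> mostFrequent))
```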
656 | 657 | 658 | 659 | ### C3 Charts 660 | * http://c3js.org/examples.html 661 | * also have a look at `viz/Simple & Flexible Custom C3 Charts` notebook supplied with spark-notebook distribution. 662 | 663 | ```scala 664 | import notebook.front.widgets.CustomC3Chart 665 | ``` 666 | 667 | 668 | >
669 | > import notebook.front.widgets.CustomC3Chart
670 | >
671 | 672 | 673 | 674 | ** Problem 1. ** Plot the function y(x) in blue and its confidence interval as a green shaded area, using data generated by the following function. 675 | 676 | ```scala 677 | import breeze.linalg._ 678 | import breeze.numerics._ 679 | import breeze.stats.distributions._ 680 | import math.{Pi=>pi} 681 | 682 | val genData = () => { 683 | val x = linspace(0, 30, 100) 684 | val y = sin(x*pi/6.0) + DenseVector.rand(x.size, new Gaussian(0, 0.02)) 685 | val error = DenseVector.rand(y.size, new Gaussian(0.1, 0.02)) 686 | (x, y, error) 687 | } 688 | ``` 689 | 690 | 691 | >
692 | > import breeze.linalg._
693 | > import breeze.numerics._
694 | > import breeze.stats.distributions._
695 | > import math.{Pi=>pi}
696 | > genData: () => (breeze.linalg.DenseVector[Double], breeze.linalg.DenseVector[Double], breeze.linalg.DenseVector[Double]) = 
697 | >
698 | 699 | 700 | 701 | 702 | ```scala 703 | // Incomplete solution (follow the issue https://github.com/c3js/c3/issues/402) 704 | 705 | val (x, y, error) = genData() 706 | 707 | case class Point(x: Double, y: Double, plusError: Double, minusError: Double) 708 | 709 | val plotData = x.toArray.zip(y.toArray).zip(error.toArray).map(pp => Point(pp._1._1, 710 | pp._1._2, 711 | pp._1._2 + pp._2, 712 | pp._1._2 - pp._2)) 713 | CustomC3Chart(plotData, 714 | """{ data: { x: 'x', 715 | types: {y: 'line', plusError: 'line', minusError: 'line'}, 716 | colors: {y: 'blue', 717 | plusError: 'green', 718 | minusError: 'green'} 719 | }, 720 | point: { 721 | show: false 722 | } 723 | }""") 724 | ``` 725 | 726 | 727 | >
728 | > x: breeze.linalg.DenseVector[Double] = DenseVector(0.0, 0.30303030303030304, 0.6060606060606061, 0.9090909090909092, 1.2121212121212122, 1.5151515151515151, 1.8181818181818183, 2.121212121212121, 2.4242424242424243, 2.7272727272727275, 3.0303030303030303, 3.3333333333333335, 3.6363636363636367, 3.9393939393939394, 4.242424242424242, 4.545454545454546, 4.848484848484849, 5.151515151515151, 5.454545454545455, 5.757575757575758, 6.0606060606060606, 6.363636363636364, 6.666666666666667, 6.96969696969697, 7.272727272727273, 7.575757575757576, 7.878787878787879, 8.181818181818182, 8.484848484848484, 8.787878787878789, 9.090909090909092, 9.393939393939394, 9.696969696969697, 10.0, 10.303030303030303, 10.606060606060606, 10.90909090909091, 11.212121212121213, 11.515151515151516, 11.818181818181...
729 | >
730 | 731 | plot 732 | 733 | 734 | ** Problem 2. ** Plot histogram of ages for each passenger class (use data from Titanic dataset). 735 | 736 | ```scala 737 | // Let's start with histogram of ages of all passengers. 738 | val ageRdd = df.select("Age").rdd.map(r => r.getAs[Double](0)) 739 | val ageHist = ageRdd.histogram(10) 740 | 741 | case class AgeHistPoint(ageBucket: Double, age: Long) 742 | 743 | val ageHistData = ageHist._1.zip(ageHist._2).map(pp => AgeHistPoint(pp._1, pp._2)) 744 | 745 | CustomC3Chart(ageHistData, 746 | chartOptions = """ 747 | { data: { x: 'ageBucket', 748 | type: 'bar'}, 749 | bar: { 750 | width: {ratio: 0.9} 751 | }, 752 | axis: { 753 | y: { 754 | label: 'Count' 755 | } 756 | } 757 | } 758 | """) 759 | ``` 760 | 761 | 762 | >
763 | > ageRdd: org.apache.spark.rdd.RDD[Double] = MapPartitionsRDD[312] at map at :36
764 | > ageHist: (Array[Double], Array[Long]) = (Array(0.0, 8.0, 16.0, 24.0, 32.0, 40.0, 48.0, 56.0, 64.0, 72.0, 80.0),Array(227, 33, 164, 181, 123, 74, 50, 26, 11, 2))
765 | > defined class AgeHistPoint
766 | > ageHistData: Array[AgeHistPoint] = Array(AgeHistPoint(0.0,227), AgeHistPoint(8.0,33), AgeHistPoint(16.0,164), AgeHistPoint(24.0,181), AgeHistPoint(32.0,123), AgeHistPoint(40.0,74), AgeHistPoint(48.0,50), AgeHistPoint(56.0,26), AgeHistPoint(64.0,11), AgeHistPoint(72.0,2))
767 | > res47: notebook.front.widgets.CustomC3Chart[Array[AgeHistPoint]] = 
768 | >
769 | 770 | hist 771 | 772 | 773 | ```scala 774 | // Now let's expand our solution. 775 | val buckets = linspace(0, 100, 11).toArray 776 | val p1AgesHist = df.filter(df("Pclass")===1) 777 | .select("Age") 778 | .rdd 779 | .map(r => r.getAs[Double](0)) 780 | .histogram(buckets) 781 | val p2AgesHist = df.filter(df("Pclass")===2) 782 | .select("Age") 783 | .rdd 784 | .map(r => r.getAs[Double](0)) 785 | .histogram(buckets) 786 | val p3AgesHist = df.filter(df("Pclass")===3) 787 | .select("Age") 788 | .rdd 789 | .map(r => r.getAs[Double](0)) 790 | .histogram(buckets) 791 | ``` 792 | 793 | 794 | >
795 | > buckets: Array[Double] = Array(0.0, 10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0, 90.0, 100.0)
796 | > p1AgesHist: Array[Long] = Array(33, 18, 34, 50, 37, 27, 13, 3, 1, 0)
797 | > p2AgesHist: Array[Long] = Array(28, 18, 53, 48, 18, 15, 3, 1, 0, 0)
798 | > p3AgesHist: Array[Long] = Array(178, 66, 133, 69, 34, 6, 3, 2, 0, 0)
799 | >
800 | 801 | 802 | 803 | 804 | ```scala 805 | case class AgeHistPoint(ageBucket: Double, c1: Long, c2: Long, c3: Long) 806 | 807 | val ageHistData = (0 until buckets.length - 1).map(i => AgeHistPoint(buckets(i), p1AgesHist(i), p2AgesHist(i), p3AgesHist(i))).toArray 808 | ``` 809 | 810 | 811 | >
812 | > defined class AgeHistPoint
813 | > ageHistData: Array[AgeHistPoint] = Array(AgeHistPoint(0.0,33,28,178), AgeHistPoint(10.0,18,18,66), AgeHistPoint(20.0,34,53,133), AgeHistPoint(30.0,50,48,69), AgeHistPoint(40.0,37,18,34), AgeHistPoint(50.0,27,15,6), AgeHistPoint(60.0,13,3,3), AgeHistPoint(70.0,3,1,2), AgeHistPoint(80.0,1,0,0), AgeHistPoint(90.0,0,0,0))
814 | >
815 | 816 | 817 | 818 | 819 | ```scala 820 | CustomC3Chart(ageHistData, 821 | chartOptions = """ 822 | { data: { x: 'ageBucket', 823 | type: 'bar'}, 824 | bar: { 825 | width: {ratio: 0.9} 826 | }, 827 | axis: { 828 | y: {label: 'Count'} 829 | } 830 | } 831 | """) 832 | ``` 833 | 834 | 835 | >
836 | > res51: notebook.front.widgets.CustomC3Chart[Array[AgeHistPoint]] = 
837 | >
838 | 839 | ageHistPerClassStacked 840 | 841 | 842 | ```scala 843 | // Using stacked bar chart 844 | CustomC3Chart(ageHistData, 845 | chartOptions = """ 846 | { data: { x: 'ageBucket', 847 | type: 'bar', 848 | groups: [['c1', 'c2', 'c3']]}, 849 | bar: { 850 | width: {ratio: 0.9} 851 | }, 852 | axis: { 853 | y: {label: 'Count'} 854 | } 855 | } 856 | """) 857 | ``` 858 | 859 | 860 | >
861 | > res53: notebook.front.widgets.CustomC3Chart[Array[AgeHistPoint]] = 
862 | >
863 | 864 | ageHistPerClassStacked 865 | -------------------------------------------------------------------------------- /labs/DataAnalysisToolbox/images/ageHist.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/drewnoff/spark-notebook-ml-labs/26f80824cece2c3b78c050937ccb62e843e0de65/labs/DataAnalysisToolbox/images/ageHist.png -------------------------------------------------------------------------------- /labs/DataAnalysisToolbox/images/ageHistPerClass.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/drewnoff/spark-notebook-ml-labs/26f80824cece2c3b78c050937ccb62e843e0de65/labs/DataAnalysisToolbox/images/ageHistPerClass.png -------------------------------------------------------------------------------- /labs/DataAnalysisToolbox/images/ageHistPerClassStacked.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/drewnoff/spark-notebook-ml-labs/26f80824cece2c3b78c050937ccb62e843e0de65/labs/DataAnalysisToolbox/images/ageHistPerClassStacked.png -------------------------------------------------------------------------------- /labs/DataAnalysisToolbox/images/plotFunction.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/drewnoff/spark-notebook-ml-labs/26f80824cece2c3b78c050937ccb62e843e0de65/labs/DataAnalysisToolbox/images/plotFunction.png -------------------------------------------------------------------------------- /labs/IntroToMLandSparkMLPipelines/Intro To Machine Learning and SparkML Pipelines.snb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata" : { 3 | "name" : "Intro To Machine Learning and SparkML Pipelines", 4 | "user_save_timestamp" : "1199-01-01T03:00:00.000Z", 5 | "auto_save_timestamp" : "1970-01-01T03:00:00.000Z", 6 | "language_info" : { 7 | "name" : "scala", 8 | "file_extension" : "scala", 9 | "codemirror_mode" : "text/x-scala" 10 | }, 11 | "trusted" : true, 12 | "customLocalRepo" : null, 13 | "customRepos" : null, 14 | "customDeps" : null, 15 | "customImports" : null, 16 | "customArgs" : null, 17 | "customSparkConf" : { 18 | "spark.app.name" : "ScalaIO Machine Learning Pipeline", 19 | "spark.master" : "local[4]", 20 | "spark.executor.memory" : "2G" 21 | } 22 | }, 23 | "cells" : [ { 24 | "metadata" : { 25 | "id" : "2DD07D009297418F8AD85CE169ABCD6F" 26 | }, 27 | "cell_type" : "markdown", 28 | "source" : "# Introduction to Machine Learning and Spark ML Pipelines" 29 | }, { 30 | "metadata" : { 31 | "id" : "3300198B3B0943B080DC7DBE9884D190" 32 | }, 33 | "cell_type" : "markdown", 34 | "source" : "
\n \n \n
" 35 | }, { 36 | "metadata" : { 37 | "id" : "C426E89F077D4458812ADBD3017E7300" 38 | }, 39 | "cell_type" : "markdown", 40 | "source" : "# Machine learning Pipeline" 41 | }, { 42 | "metadata" : { 43 | "id" : "3D7218E315774439978B9F859CCC5CE1" 44 | }, 45 | "cell_type" : "markdown", 46 | "source" : "In this lab we are going to learn how to teach machine learning models, how to correctly set up an experiment, how to tune model hyperparameters and how to compare models. Also we'are going to get familiar with spark.ml package as soon as all of the work we'are going to get done using this package." 47 | }, { 48 | "metadata" : { 49 | "id" : "5EEC312DD9A34425884B50E36008151E" 50 | }, 51 | "cell_type" : "markdown", 52 | "source" : "* http://spark.apache.org/docs/latest/ml-guide.html\n* http://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.ml.package" 53 | }, { 54 | "metadata" : { 55 | "id" : "F1CC9FFD55F045598503E6FDF35276E8" 56 | }, 57 | "cell_type" : "markdown", 58 | "source" : "## Evaluation Metrics\nModel training and model quality assessment is performed on independent sets of examples. As a rule, the available examples are divided into two subsets: training (train) and control (test). The choice of the proportions of the split is a compromise. Indeed, the large size of the training leads to better quality of algorithms, but more noisy estimation of the model on the control. Conversely, the large size of the test sample leads to a less noisy assessment of the quality, however, models are less accurate.\n\nMany classification models produce estimation of belonging to the class $\\tilde{h}(x) \\in R$ (for example, the probability of belonging to the class 1). They then make a decision about the class of the object by comparing the estimates with a certain threshold $\\theta$:\n\n$h(x) = +1$, if $\\tilde{h}(x) \\geq \\theta$, $h(x) = -1$, if $\\tilde{h}(x) < \\theta$\n\nIn this case, we can consider metrics that are able to work with estimates of belonging to a class.\nIn this lab, we will work with [AUC-ROC](https://en.wikipedia.org/wiki/Receiver_operating_characteristic#Area_under_the_curve) metric. Detailed understanding of the operating principle of AUC-ROC metric is not required to perform the lab.\n## Model Hyperparameter Tuning\nIn machine learning problems it is necessary to distinguish the parameters of the model and hyperparameters (structural parameters). The model parameters are adjusted during the training (e.g., weights in the linear model or the structure of the decision tree), while hyperparameters are set in advance (for example, the regularization in linear model or maximum depth of the decision tree). Each model usually has many hyperparameters, and there is no universal set of hyperparameters optimal working in all tasks, for each task one should choose a different set of hyperparameters. _Grid search_ is commonly used to optimize model hyperparameters: for each parameter several values are selected and combination of parameter values where the model shows the best quality (in terms of the metric that is being optimized) is selected. However, in this case, it is necessary to correctly assess the constructed model, namely to do the split into training and test sample. There are several ways how it can be implemented:\n\n - Split the available samples into training and test samples. In this case, the comparison of a large number of models in the search of parameters leads to a situation when the best model on test data does not maintain its quality on new data. 
We can say that there is overfitting on the test data.\n - To eliminate the problem described above, it is possible to split data into 3 disjoint sub-samples: `train`, `validation` and `test`. The `validation` set is used for models comparison, and `test` set is used for the final quality assessment and comparison of families of models with selected parameters.\n - Another way to compare models is [cross-validation](https://en.wikipedia.org/wiki/Cross-validation_(statistics). There are different schemes of cross-validation:\n - Leave-one-out cross-validation\n - K-fold cross-validation\n - Repeated random sub-sampling validation\n \nCross-validation is computationally expensive operation, especially if you are doing a grid search with a very large number of combinations. So there are a number of compromises:\n - the grid can be made more sparse, touching fewer values for each parameter, however, we must not forget that in such case one can skip a good combination of parameters;\n - cross-validation can be done with a smaller number of partitions or folds, but in this case the quality assessment of cross-validation becomes more noisy and increases the risk to choose a suboptimal set of parameters due to the random nature of the split;\n - the parameters can be optimized sequentially (greedy) — one after another, and not to iterate over all combinations; this strategy does not always lead to the optimal set;\n - enumerate only small number of randomly selected combinations of values of hyperparameters." 59 | }, { 60 | "metadata" : { 61 | "id" : "681A5B7696EE4E75941D30477B87D473" 62 | }, 63 | "cell_type" : "markdown", 64 | "source" : "## Data\n\nWe'are going to solve binary classification problem by building the algorithm which determines whether a person makes over 50K a year. Following variables are available:\n* age\n* workclass\n* fnlwgt\n* education\n* education-num\n* marital-status\n* occupation\n* relationship\n* race\n* sex\n* capital-gain\n* capital-loss\n* hours-per-week\n\nMore on this data one can read in [UCI Machine Learning Repository](http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.names)" 65 | }, { 66 | "metadata" : { 67 | "trusted" : true, 68 | "input_collapsed" : false, 69 | "collapsed" : false, 70 | "id" : "3986C01E03884C09B551F434EAB5DA89" 71 | }, 72 | "cell_type" : "code", 73 | "source" : "val spark = sparkSession", 74 | "outputs" : [ ] 75 | }, { 76 | "metadata" : { 77 | "trusted" : true, 78 | "input_collapsed" : false, 79 | "collapsed" : false, 80 | "id" : "80AD7CC4C72645D78E62C11F9F1C838D" 81 | }, 82 | "cell_type" : "code", 83 | "source" : "val df = spark.read\n .option(\"header\", \"true\")\n .option(\"inferSchema\", \"true\")\n .csv(\"notebooks/spark-notebook-ml-labs/labs/IntroToMLandSparkMLPipelines/data/data.adult.csv\") ", 84 | "outputs" : [ ] 85 | }, { 86 | "metadata" : { 87 | "trusted" : true, 88 | "input_collapsed" : false, 89 | "collapsed" : false, 90 | "id" : "5CC83488592D4FFBA9D63A226935D96D" 91 | }, 92 | "cell_type" : "code", 93 | "source" : "df.limit(5)", 94 | "outputs" : [ ] 95 | }, { 96 | "metadata" : { 97 | "id" : "44EA054CC3514338B1F53B9B9722F0BF" 98 | }, 99 | "cell_type" : "markdown", 100 | "source" : "Sometimes there are missing values in the data. Sometimes, in the description of the dataset one can found the description of format of missing values. Particularly in the given dataset missing values are identified by '?' sign.\n\n**Problem** Find all the variables with missing values. 
Remove from the dataset all objects with missing values in any variable." 101 | }, { 102 | "metadata" : { 103 | "trusted" : true, 104 | "input_collapsed" : false, 105 | "collapsed" : false, 106 | "id" : "D22FE57836E541998BA905B0252FB07B" 107 | }, 108 | "cell_type" : "code", 109 | "source" : "val missingValsFeatures = df.columns.filter(column => df.filter(df(column) === \"?\").count > 0)\n\nprintln(\"Features with missing values: \" + missingValsFeatures.mkString(\", \"))\n\nval data = missingValsFeatures.foldLeft(df)((dfstage, column) => dfstage.filter(!dfstage(column).equalTo(\"?\")))", 110 | "outputs" : [ ] 111 | }, { 112 | "metadata" : { 113 | "id" : "D8D5DBE2B17641939EB490F5A2E80C9C" 114 | }, 115 | "cell_type" : "markdown", 116 | "source" : "Split on training and test datasets." 117 | }, { 118 | "metadata" : { 119 | "trusted" : true, 120 | "input_collapsed" : false, 121 | "collapsed" : false, 122 | "id" : "043DA92BF95A46D9B1CE7171EB007DA2" 123 | }, 124 | "cell_type" : "code", 125 | "source" : "val Array(training, test) = data.randomSplit(Array(0.8, 0.2), seed = 1234)", 126 | "outputs" : [ ] 127 | }, { 128 | "metadata" : { 129 | "id" : "7AE65B7403934817B40C0C18D3AFE774" 130 | }, 131 | "cell_type" : "markdown", 132 | "source" : "### MLlib Transformers and Estimators" 133 | }, { 134 | "metadata" : { 135 | "id" : "429E47695E694CCF9F3EDDA47C1B1343" 136 | }, 137 | "cell_type" : "markdown", 138 | "source" : "`Transformer` transforms one `DataFrame` into another `DataFrame`." 139 | }, { 140 | "metadata" : { 141 | "id" : "B3D8D8D439CA4AA583BF354AE382B80B" 142 | }, 143 | "cell_type" : "markdown", 144 | "source" : "
\n \n
" 145 | }, { 146 | "metadata" : { 147 | "id" : "BC347194CA8E49E0A8FD0430038614D6" 148 | }, 149 | "cell_type" : "markdown", 150 | "source" : "`Estimator` fits on a `DataFrame` to produce a `Transformer`." 151 | }, { 152 | "metadata" : { 153 | "id" : "2943774D4879475382D91BCB027CFCD7" 154 | }, 155 | "cell_type" : "markdown", 156 | "source" : "
\n \n
" 157 | }, { 158 | "metadata" : { 159 | "id" : "FDC6151EF3B3469481C85B1CE0B103B3" 160 | }, 161 | "cell_type" : "markdown", 162 | "source" : "## Training classifiers on numeric features" 163 | }, { 164 | "metadata" : { 165 | "id" : "AABD089AF368408F81FAC58D9C7D0DFC" 166 | }, 167 | "cell_type" : "markdown", 168 | "source" : "Some preprocessing steps are usually required after loading and cleaning dataset. In this case, these steps will include the following:\n\n - At first we will work only with numeric features. So let's select them separately in the feature vector \"numFeatures\" using [VectorAssembler](http://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.ml.feature.VectorAssembler).\n - Select the target variable (the one we want to predict, string column of labels) and map it to an ML column of label indices using [StringIndexer](http://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.ml.feature.StringIndexer), give the name \"labelIndex\" to a new variable." 169 | }, { 170 | "metadata" : { 171 | "trusted" : true, 172 | "input_collapsed" : false, 173 | "collapsed" : false, 174 | "id" : "295D6680F4B2436384642C3EAA6E218B" 175 | }, 176 | "cell_type" : "code", 177 | "source" : "import org.apache.spark.ml.feature.VectorAssembler\nimport org.apache.spark.ml.feature.StringIndexer\n\nval assembler = new VectorAssembler()\n .setInputCols(Array(\"age\",\n \"fnlwgt\", \n \"education-num\", \n \"capital-gain\", \n \"capital-loss\",\n \"hours-per-week\"))\n .setOutputCol(\"numFeatures\")\n\nval labelIndexer = new StringIndexer()\n .setInputCol(\">50K,<=50K\")\n .setOutputCol(\"label\")\n .fit(training)", 178 | "outputs" : [ ] 179 | }, { 180 | "metadata" : { 181 | "trusted" : true, 182 | "input_collapsed" : false, 183 | "collapsed" : false, 184 | "id" : "995D5E14FD31463D8C57B7B1485CBC9B" 185 | }, 186 | "cell_type" : "code", 187 | "source" : "labelIndexer.transform(training).select(\">50K,<=50K\", \"label\").show(8)", 188 | "outputs" : [ ] 189 | }, { 190 | "metadata" : { 191 | "trusted" : true, 192 | "input_collapsed" : false, 193 | "collapsed" : false, 194 | "id" : "00D124318ED3471F8D0E9464E0991432" 195 | }, 196 | "cell_type" : "code", 197 | "source" : "assembler.transform(training)\n .select(\"age\", \"fnlwgt\", \"education-num\", \"capital-gain\", \"capital-loss\", \"hours-per-week\", \"numFeatures\")\n .limit(5)", 198 | "outputs" : [ ] 199 | }, { 200 | "metadata" : { 201 | "trusted" : true, 202 | "input_collapsed" : false, 203 | "collapsed" : false, 204 | "id" : "524AD8781A2C40E78FB103D1A215730E" 205 | }, 206 | "cell_type" : "code", 207 | "source" : "val trainData = assembler.transform{\n labelIndexer.transform(training)\n }.select(\"label\", \"numFeatures\")\ntrainData.show(5, truncate=false)", 208 | "outputs" : [ ] 209 | }, { 210 | "metadata" : { 211 | "trusted" : true, 212 | "input_collapsed" : false, 213 | "collapsed" : false, 214 | "id" : "F5C8FDC16C4D482F9D876C00BEA165B5" 215 | }, 216 | "cell_type" : "code", 217 | "source" : "import org.apache.spark.ml.classification.LogisticRegression\nimport org.apache.spark.ml.evaluation.BinaryClassificationEvaluator\n\n\nval lr = new LogisticRegression()\n .setFeaturesCol(\"numFeatures\")\n .setLabelCol(\"label\")\n .setRegParam(0.1)\n\nval lrModel = lr.fit(trainData)", 218 | "outputs" : [ ] 219 | }, { 220 | "metadata" : { 221 | "trusted" : true, 222 | "input_collapsed" : false, 223 | "collapsed" : false, 224 | "id" : "6B1E03D882D84D698E81C3B798FD7E3B" 225 | }, 226 | "cell_type" : "code", 227 | "source" : "val 
testData = assembler.transform{\n labelIndexer.transform(test)\n }", 228 | "outputs" : [ ] 229 | }, { 230 | "metadata" : { 231 | "trusted" : true, 232 | "input_collapsed" : false, 233 | "collapsed" : false, 234 | "id" : "BC029CEEF51E4A8596F87D2B78A77F94" 235 | }, 236 | "cell_type" : "code", 237 | "source" : "val eval = new BinaryClassificationEvaluator()\n .setMetricName(\"areaUnderROC\")\n\nprintln(eval.evaluate(lrModel.transform(testData)))", 238 | "outputs" : [ ] 239 | }, { 240 | "metadata" : { 241 | "id" : "E1F6FC29B90A428D800EF818579F58D8" 242 | }, 243 | "cell_type" : "markdown", 244 | "source" : "## Model selection with MLlib\nApache Spark MLlib supports model hyperparameter tuning using tools such as `CrossValidator` and `TrainValidationSplit`. These tools require the following items:\n\n - Estimator: algorithm or Pipeline to tune\n - Set of ParamMaps: parameters to choose from, sometimes called a “parameter grid” to search over\n - Evaluator: metric to measure how well a fitted Model does on held-out test data" 245 | }, { 246 | "metadata" : { 247 | "id" : "2797280B7D8E490D8C69DE421E730265" 248 | }, 249 | "cell_type" : "markdown", 250 | "source" : "In this section we will need to work only with numeric features and a target variable.\nAt the beginning let's have a look at grid search in action.\nWe will consider 2 algorithms:\n - [LogisticRegression](http://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.ml.classification.LogisticRegression)\n - [DecisionTreeClassifier](http://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.ml.classification.DecisionTreeClassifier)\n \nTo start with, let's choose one parameter to optimize for each algorithm:\n - LogisticRegression — regularization parameter (*regParam*)\n - DecisonTreeClassifier — maximum depth of the tree (*maxDepth*)\n \nThe remaining parameters we will leave at their default values. \nTo implement grid search procedure one can use\n[CrossValidator](http://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.ml.tuning.CrossValidator) class\ncombining with [ParamGridBuilder](http://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.ml.tuning.ParamGridBuilder) class. \nAlso we need to specify appropriate evaluator for this task, in our case we should use [BinaryClassificationEvaluator](http://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.ml.evaluation.BinaryClassificationEvaluator)\n(note that its default metric is areaUnderROC, so we don't neet to specify metric via `setMetricName` method call).\nSet up 5-fold cross validation scheme." 251 | }, { 252 | "metadata" : { 253 | "id" : "FF01A8CDB111416ABE957E877A793ABB" 254 | }, 255 | "cell_type" : "markdown", 256 | "source" : "
K-fold cross-validation
\n
\n \n
\n
\n By Fabian Flöck (Own work) [CC BY-SA 3.0 (http://creativecommons.org/licenses/by-sa/3.0)], via Wikimedia Commons\n
" 257 | }, { 258 | "metadata" : { 259 | "id" : "EF27CAA566E04232AEC9CFC49A2F2B81" 260 | }, 261 | "cell_type" : "markdown", 262 | "source" : "**Problem** Try to find the optimal values of these hyperparameters for each algorithm. Plot the average cross-validation metrics for a given value of hyperparameter for each algorithm (hint: use `avgMetrics` field of resulting `CrossValidatorModel`)." 263 | }, { 264 | "metadata" : { 265 | "trusted" : true, 266 | "input_collapsed" : false, 267 | "collapsed" : false, 268 | "id" : "7BFE57FA1AC340E28BBA55FE9513BD60" 269 | }, 270 | "cell_type" : "code", 271 | "source" : "import org.apache.spark.ml.classification.{LogisticRegression, DecisionTreeClassifier, RandomForestClassifier}\nimport org.apache.spark.ml.evaluation.BinaryClassificationEvaluator\nimport org.apache.spark.ml.tuning.{ParamGridBuilder, CrossValidator}\n\n\nval lr = new LogisticRegression()\n .setFeaturesCol(\"numFeatures\")\n .setLabelCol(\"label\")\n\nval lrParamGrid = new ParamGridBuilder()\n .addGrid(lr.regParam, Array(1e-2, 5e-3, 1e-3, 5e-4, 1e-4))\n .build()\n\nval lrCV = new CrossValidator()\n .setEstimator(lr)\n .setEvaluator(new BinaryClassificationEvaluator)\n .setEstimatorParamMaps(lrParamGrid)\n .setNumFolds(5)\n\nval lrCVModel = lrCV.fit(trainData)", 272 | "outputs" : [ ] 273 | }, { 274 | "metadata" : { 275 | "trusted" : true, 276 | "input_collapsed" : false, 277 | "collapsed" : false, 278 | "id" : "8E42871F7DF2428F8C7901A12A063319" 279 | }, 280 | "cell_type" : "code", 281 | "source" : "println(\"cross-validated areaUnderROC: \" + lrCVModel.avgMetrics.max)\nprintln(\"test areaUnderROC: \" + eval.evaluate(lrCVModel.transform(testData)))", 282 | "outputs" : [ ] 283 | }, { 284 | "metadata" : { 285 | "trusted" : true, 286 | "input_collapsed" : false, 287 | "collapsed" : false, 288 | "id" : "D2CB230B21174A878C811CE6F13F21A3" 289 | }, 290 | "cell_type" : "code", 291 | "source" : "val tree = new DecisionTreeClassifier()\n .setFeaturesCol(\"numFeatures\")\n .setLabelCol(\"label\")\n\nval treeParamGrid = new ParamGridBuilder()\n .addGrid(tree.maxDepth, Array(5, 10, 20, 25, 30))\n .build()\n\nval treeCV = new CrossValidator()\n .setEstimator(tree)\n .setEvaluator(new BinaryClassificationEvaluator)\n .setEstimatorParamMaps(treeParamGrid)\n .setNumFolds(5)\n\nval treeCVModel = treeCV.fit(trainData)", 292 | "outputs" : [ ] 293 | }, { 294 | "metadata" : { 295 | "trusted" : true, 296 | "input_collapsed" : false, 297 | "collapsed" : false, 298 | "id" : "2185D3BB0F144FC284AF94D49B1F3F57" 299 | }, 300 | "cell_type" : "code", 301 | "source" : "println(\"cross-validated areaUnderROC: \" + treeCVModel.avgMetrics.max)\nprintln(\"test areaUnderROC: \" + eval.evaluate(treeCVModel.transform(testData)))", 302 | "outputs" : [ ] 303 | }, { 304 | "metadata" : { 305 | "trusted" : true, 306 | "input_collapsed" : false, 307 | "collapsed" : false, 308 | "presentation" : { 309 | "tabs_state" : "{\n \"tab_id\": \"#tab1099791619-2\"\n}", 310 | "pivot_chart_state" : "{\n \"hiddenAttributes\": [],\n \"menuLimit\": 200,\n \"cols\": [],\n \"rows\": [],\n \"vals\": [],\n \"exclusions\": {},\n \"inclusions\": {},\n \"unusedAttrsVertical\": 85,\n \"autoSortUnusedAttrs\": false,\n \"inclusionsInfo\": {},\n \"aggregatorName\": \"Count\",\n \"rendererName\": \"Table\"\n}" 311 | }, 312 | "id" : "7D0EFB1819E742B19D2276280F94B5E0" 313 | }, 314 | "cell_type" : "code", 315 | "source" : "lrCVModel.getEstimatorParamMaps\n .map(paramMap => paramMap(lr.regParam))\n .zip(lrCVModel.avgMetrics)\n .toSeq.toDF(\"regParam\", 
\"AUC-ROC\")\n .collect", 316 | "outputs" : [ ] 317 | }, { 318 | "metadata" : { 319 | "trusted" : true, 320 | "input_collapsed" : false, 321 | "collapsed" : false, 322 | "presentation" : { 323 | "tabs_state" : "{\n \"tab_id\": \"#tab1136416947-2\"\n}", 324 | "pivot_chart_state" : "{\n \"hiddenAttributes\": [],\n \"menuLimit\": 200,\n \"cols\": [],\n \"rows\": [],\n \"vals\": [],\n \"exclusions\": {},\n \"inclusions\": {},\n \"unusedAttrsVertical\": 85,\n \"autoSortUnusedAttrs\": false,\n \"inclusionsInfo\": {},\n \"aggregatorName\": \"Count\",\n \"rendererName\": \"Table\"\n}" 325 | }, 326 | "id" : "D576959DAFBC443E80CFDA84833A5BBE" 327 | }, 328 | "cell_type" : "code", 329 | "source" : "treeCVModel.getEstimatorParamMaps\n .map(paramMap => paramMap(tree.maxDepth))\n .zip(treeCVModel.avgMetrics)\n .toSeq.toDF(\"maxDepth\", \"AUC-ROC\")\n .collect", 330 | "outputs" : [ ] 331 | }, { 332 | "metadata" : { 333 | "id" : "5796CB09E6A04C868B06834372440656" 334 | }, 335 | "cell_type" : "markdown", 336 | "source" : "## Adding categorical features" 337 | }, { 338 | "metadata" : { 339 | "id" : "F329C50DFF5F4439BDABB14D308E1632" 340 | }, 341 | "cell_type" : "markdown", 342 | "source" : "Up to this point we did not use categorical features from the dataset. Let's see how additional categorical features will affect the quality of the classification. A common technique to convert categorical feature into numerical ones is [one-hot](https://en.wikipedia.org/wiki/One-hot) encoding. This can be done using [StringIndexer](http://spark.apache.org/docs/1.6.1/api/scala/index.html#org.apache.spark.ml.feature.StringIndexer) transformation followed by [OneHotEncoder](http://spark.apache.org/docs/1.6.1/api/scala/index.html#org.apache.spark.ml.feature.OneHotEncoder) transformation.\n\n*Let's start with encoding just one new feature `occupation` and after that generalize encoding step for all categorical features and combine all processing steps using [pipeline](http://spark.apache.org/docs/1.6.1/ml-guide.html#pipeline)*" 343 | }, { 344 | "metadata" : { 345 | "trusted" : true, 346 | "input_collapsed" : false, 347 | "collapsed" : false, 348 | "id" : "1EF8BB8D58BE403AADE26E26E14DEEEE" 349 | }, 350 | "cell_type" : "code", 351 | "source" : "data.groupBy(\"occupation\").count.show(truncate=false)\nprintln(data.select(\"occupation\").distinct.count)", 352 | "outputs" : [ ] 353 | }, { 354 | "metadata" : { 355 | "trusted" : true, 356 | "input_collapsed" : false, 357 | "collapsed" : false, 358 | "id" : "80CD66B45BE146998C04BFF9D13FCE6A" 359 | }, 360 | "cell_type" : "code", 361 | "source" : "import org.apache.spark.ml.feature.OneHotEncoder\n\nval occupationIndexer = new StringIndexer()\n .setInputCol(\"occupation\")\n .setOutputCol(\"occupationIndex\")\n .fit(training)\n\nval indexedTrainData = occupationIndexer.transform(training)\n\nval occupationEncoder = new OneHotEncoder()\n .setInputCol(\"occupationIndex\")\n .setOutputCol(\"occupationVec\")\n\nval oheEncodedTrainData = occupationEncoder.transform(indexedTrainData)\n\noheEncodedTrainData.select(\"occupation\", \"occupationVec\").limit(5)", 362 | "outputs" : [ ] 363 | }, { 364 | "metadata" : { 365 | "trusted" : true, 366 | "input_collapsed" : false, 367 | "collapsed" : false, 368 | "id" : "2E841C874D65405188408597B0FC8F50" 369 | }, 370 | "cell_type" : "code", 371 | "source" : "val assembler = new VectorAssembler()\n .setInputCols(Array(\"age\",\n \"fnlwgt\", \n \"education-num\", \n \"capital-gain\", \n \"capital-loss\",\n \"hours-per-week\",\n \"occupationVec\"))\n 
.setOutputCol(\"features\")\n\n\nval trainDataWithOccupation = assembler.transform{\n labelIndexer.transform(oheEncodedTrainData)\n }.select(\"label\", \"features\")", 372 | "outputs" : [ ] 373 | }, { 374 | "metadata" : { 375 | "id" : "A165BD5B117349CA91CEF187CEAFF007" 376 | }, 377 | "cell_type" : "markdown", 378 | "source" : "*For the sake of brevity, from now let's use only LogisticRegression model.*" 379 | }, { 380 | "metadata" : { 381 | "trusted" : true, 382 | "input_collapsed" : false, 383 | "collapsed" : false, 384 | "id" : "F16D7F76854D452B8E5133CD8A5EBC0F" 385 | }, 386 | "cell_type" : "code", 387 | "source" : "val lr = new LogisticRegression()\n .setFeaturesCol(\"features\")\n\nval lrParamGrid = new ParamGridBuilder()\n .addGrid(lr.regParam, Array(1e-2, 5e-3, 1e-3, 5e-4, 1e-4))\n .build()\n\nval lrCV = new CrossValidator()\n .setEstimator(lr)\n .setEvaluator(new BinaryClassificationEvaluator)\n .setEstimatorParamMaps(lrParamGrid)\n .setNumFolds(5)\n\nval lrCVModel = lrCV.fit(trainDataWithOccupation)", 388 | "outputs" : [ ] 389 | }, { 390 | "metadata" : { 391 | "trusted" : true, 392 | "input_collapsed" : false, 393 | "collapsed" : false, 394 | "id" : "CF24B52C5DA141CA865C45735CE4551C" 395 | }, 396 | "cell_type" : "code", 397 | "source" : "val testDataWithOccupation = assembler.transform{\n labelIndexer.transform(occupationEncoder.transform(occupationIndexer.transform(test)))\n }.select(\"label\", \"features\")", 398 | "outputs" : [ ] 399 | }, { 400 | "metadata" : { 401 | "trusted" : true, 402 | "input_collapsed" : false, 403 | "collapsed" : false, 404 | "id" : "E7902B1AC34C4B8A8FA321B5951987E0" 405 | }, 406 | "cell_type" : "code", 407 | "source" : "println(\"cross-validated areaUnderROC: \" + lrCVModel.avgMetrics.max)\nprintln(\"test areaUnderROC: \" + eval.evaluate(lrCVModel.transform(testDataWithOccupation)))", 408 | "outputs" : [ ] 409 | }, { 410 | "metadata" : { 411 | "id" : "04CDB18D4A9E4A6384B950358A54675C" 412 | }, 413 | "cell_type" : "markdown", 414 | "source" : "Adding `occupation` categorical variable yielded an increase in quality." 415 | }, { 416 | "metadata" : { 417 | "id" : "D893CE81ECB84308A552138EB876433A" 418 | }, 419 | "cell_type" : "markdown", 420 | "source" : "## Pipelines" 421 | }, { 422 | "metadata" : { 423 | "id" : "D279C9B3198946D48A1A9C9E40A368EE" 424 | }, 425 | "cell_type" : "markdown", 426 | "source" : "Using [pipelines](http://spark.apache.org/docs/1.6.1/ml-guide.html#pipeline) one can combine all the processing stages into one pipeline and perform grid search against hyperparameters of all stages included in the pipeline. Also it's easy to extend given pipeline with new steps.\n\nA Pipeline chains multiple Transformers and Estimators together to specify an ML workflow." 427 | }, { 428 | "metadata" : { 429 | "trusted" : true, 430 | "input_collapsed" : false, 431 | "collapsed" : true, 432 | "id" : "8FBBD6915E364B3BBE564CC6DD77A0E6" 433 | }, 434 | "cell_type" : "markdown", 435 | "source" : "
\n \n
" 436 | }, { 437 | "metadata" : { 438 | "id" : "F33CFE5BAB1C4B278FB9A7C28016ABB8" 439 | }, 440 | "cell_type" : "markdown", 441 | "source" : " Let's see how we can combine all the preprocessing steps made so far into one pipeline." 442 | }, { 443 | "metadata" : { 444 | "trusted" : true, 445 | "input_collapsed" : false, 446 | "collapsed" : false, 447 | "id" : "8CBA081604CC4D11A7B370250DB6A335" 448 | }, 449 | "cell_type" : "code", 450 | "source" : "import org.apache.spark.ml.Pipeline\n\n\n// Chain indexers, encoders and assembler in a Pipeline\nval featurePipelineModel = new Pipeline()\n .setStages(Array(occupationIndexer, \n occupationEncoder,\n assembler,\n labelIndexer))\n .fit(training)", 451 | "outputs" : [ ] 452 | }, { 453 | "metadata" : { 454 | "trusted" : true, 455 | "input_collapsed" : false, 456 | "collapsed" : false, 457 | "id" : "C6B076016C2F406F808218B6E4A6D859" 458 | }, 459 | "cell_type" : "code", 460 | "source" : "featurePipelineModel.transform(test).select(\"features\", \"label\").limit(3)", 461 | "outputs" : [ ] 462 | }, { 463 | "metadata" : { 464 | "trusted" : true, 465 | "input_collapsed" : false, 466 | "collapsed" : false, 467 | "id" : "56F99D760F4D46CC8F0366495D55842B" 468 | }, 469 | "cell_type" : "code", 470 | "source" : "eval.evaluate(lrCVModel.transform(labelIndexer.transform(assembler.transform(occupationEncoder.transform(occupationIndexer.transform(test))))))", 471 | "outputs" : [ ] 472 | }, { 473 | "metadata" : { 474 | "trusted" : true, 475 | "input_collapsed" : false, 476 | "collapsed" : false, 477 | "id" : "5A6C6F21D60340D4BCF548C4D3C6AA42" 478 | }, 479 | "cell_type" : "code", 480 | "source" : "eval.evaluate(lrCVModel.transform(featurePipelineModel.transform(test)))", 481 | "outputs" : [ ] 482 | }, { 483 | "metadata" : { 484 | "id" : "547273F4614F4E8E95E374D2F9759053" 485 | }, 486 | "cell_type" : "markdown", 487 | "source" : "Now let's extend our pipeline by adding one-hot encoding step for each categorical feature." 
488 | }, { 489 | "metadata" : { 490 | "trusted" : true, 491 | "input_collapsed" : false, 492 | "collapsed" : false, 493 | "id" : "E62240E6A06B42CC8AA49AD8A7272910" 494 | }, 495 | "cell_type" : "code", 496 | "source" : "val categCols = Array(\"workclass\", \"education\", \"marital-status\", \"occupation\", \"relationship\", \"race\", \"sex\")\n\nval featureIndexers: Array[org.apache.spark.ml.PipelineStage] = categCols.map(\n cname => new StringIndexer()\n .setInputCol(cname)\n .setOutputCol(s\"${cname}_index\")\n)\n\nval oneHotEncoders = categCols.map(\n cname => new OneHotEncoder()\n .setInputCol(s\"${cname}_index\")\n .setOutputCol(s\"${cname}_vec\")\n)\n\nval assembler = new VectorAssembler()\n .setInputCols(Array(\"age\",\n \"fnlwgt\", \n \"education-num\", \n \"capital-gain\", \n \"capital-loss\",\n \"hours-per-week\") ++\n categCols.map(cname => s\"${cname}_vec\"))\n .setOutputCol(\"features\")\n\nval rawDataProcessor = new Pipeline()\n .setStages(featureIndexers ++\n oneHotEncoders ++\n Array(assembler, labelIndexer))\n .fit(training)", 497 | "outputs" : [ ] 498 | }, { 499 | "metadata" : { 500 | "trusted" : true, 501 | "input_collapsed" : false, 502 | "collapsed" : false, 503 | "id" : "8DD31D3BA4054DF08AE02D82489228EB" 504 | }, 505 | "cell_type" : "code", 506 | "source" : "rawDataProcessor.transform(test).limit(3).select(\"features\", \"label\")", 507 | "outputs" : [ ] 508 | }, { 509 | "metadata" : { 510 | "trusted" : true, 511 | "input_collapsed" : false, 512 | "collapsed" : false, 513 | "id" : "BF3105D54D0646488BB1BD7168A2E95E" 514 | }, 515 | "cell_type" : "code", 516 | "source" : "val lr = new LogisticRegression()\n .setFeaturesCol(\"features\")\n\nval lrParamGrid = new ParamGridBuilder()\n .addGrid(lr.regParam, Array(1e-2, 5e-3, 1e-3, 5e-4, 1e-4))\n .build()\n\nval lrCV = new CrossValidator()\n .setEstimator(lr)\n .setEvaluator(new BinaryClassificationEvaluator)\n .setEstimatorParamMaps(lrParamGrid)\n .setNumFolds(5)\n\nval lrCVModel = lrCV.fit(rawDataProcessor.transform(training))", 517 | "outputs" : [ ] 518 | }, { 519 | "metadata" : { 520 | "trusted" : true, 521 | "input_collapsed" : false, 522 | "collapsed" : false, 523 | "id" : "E6CE1231B7914499BE7F01C498F38ED1" 524 | }, 525 | "cell_type" : "code", 526 | "source" : "println(\"cross-validated areaUnderROC: \" + lrCVModel.avgMetrics.max)\nprintln(\"test areaUnderROC: \" + eval.evaluate(lrCVModel.transform(rawDataProcessor.transform(test))))", 527 | "outputs" : [ ] 528 | }, { 529 | "metadata" : { 530 | "id" : "544E5F9A75484F238B8E03A4A32CC949" 531 | }, 532 | "cell_type" : "markdown", 533 | "source" : "Adding one-hot encoding for each categorical variable yielded a significant increase in quality." 534 | }, { 535 | "metadata" : { 536 | "id" : "381BE3ADF02346668B5C2F242089ED97" 537 | }, 538 | "cell_type" : "markdown", 539 | "source" : "We also can combine several stages with LogisticRegression stage into one pipeline and perform grid search against hyperparameters of several stages included in the pipeline." 540 | }, { 541 | "metadata" : { 542 | "trusted" : true, 543 | "input_collapsed" : false, 544 | "collapsed" : true, 545 | "id" : "85ECF5E33F154845A038EC9F71F2F5A3" 546 | }, 547 | "cell_type" : "markdown", 548 | "source" : "For example, let's try to add [Buketizer](http://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.ml.feature.Bucketizer)\ntransformation applied to `age` column and add `splits` parameter values\nto pipeline parameters grid and see how it will affect metric score." 
549 | }, { 550 | "metadata" : { 551 | "trusted" : true, 552 | "input_collapsed" : false, 553 | "collapsed" : false, 554 | "id" : "BEF557AD59414C568B55E45B1CC38606" 555 | }, 556 | "cell_type" : "code", 557 | "source" : "data.select(min(\"age\"), max(\"age\"))", 558 | "outputs" : [ ] 559 | }, { 560 | "metadata" : { 561 | "trusted" : true, 562 | "input_collapsed" : false, 563 | "collapsed" : false, 564 | "id" : "0C7C0F51A07C436A823D55D5E7319155" 565 | }, 566 | "cell_type" : "code", 567 | "source" : "// We need to cast age column to DoubleType to apply Bucketizer transformation.\nimport org.apache.spark.sql.types.DoubleType\n\nval castData = data.withColumn(\"age\", data(\"age\").cast(DoubleType))\n\nval Array(castTraining, castTest) = castData.randomSplit(Array(0.8, 0.2), seed = 12345)", 568 | "outputs" : [ ] 569 | }, { 570 | "metadata" : { 571 | "trusted" : true, 572 | "input_collapsed" : false, 573 | "collapsed" : false, 574 | "id" : "9601D696BA2B44C193B42429F0A96AA9" 575 | }, 576 | "cell_type" : "code", 577 | "source" : "import org.apache.spark.ml.feature.Bucketizer\n\nval ageBucketizer = new Bucketizer()\n .setInputCol(\"age\")\n .setOutputCol(\"age-buckets\")\n\nval lr = new LogisticRegression()\n .setFeaturesCol(\"features\")\n\nval pipelineParamGrid = new ParamGridBuilder()\n .addGrid(lr.regParam, Array(1e-3, 5e-4, 1e-4, 5e-5, 1e-5))\n .addGrid(ageBucketizer.splits, Array(Array(15.0, 30.0, 40.0, 50.0, 100.0),\n Array(15.0, 21.0, 25.0, 30.0, 40.0, 50.0, 70.0, 100.0)))\n .build()\n\nval assembler = new VectorAssembler()\n .setInputCols(Array(\"age-buckets\",\n \"fnlwgt\", \n \"education-num\", \n \"capital-gain\", \n \"capital-loss\",\n \"hours-per-week\") ++\n categCols.map(cname => s\"${cname}_vec\"))\n .setOutputCol(\"features\")\n\nval mlPipeline = new Pipeline()\n .setStages(Array(ageBucketizer) ++\n featureIndexers ++\n oneHotEncoders ++\n Array(assembler, labelIndexer, lr))\n\nval pipelineCV = new CrossValidator()\n .setEstimator(mlPipeline)\n .setEvaluator(new BinaryClassificationEvaluator)\n .setEstimatorParamMaps(pipelineParamGrid)\n .setNumFolds(5)\n\nval pipelineCVModel = pipelineCV.fit(castTraining)", 578 | "outputs" : [ ] 579 | }, { 580 | "metadata" : { 581 | "trusted" : true, 582 | "input_collapsed" : false, 583 | "collapsed" : false, 584 | "id" : "7F129445DA8F4AEC8B4DFC8720451AA8" 585 | }, 586 | "cell_type" : "code", 587 | "source" : "println(\"cross-validated areaUnderROC: \" + pipelineCVModel.avgMetrics.max)\nprintln(\"test areaUnderROC: \" + eval.evaluate(pipelineCVModel.transform(castTest)))", 588 | "outputs" : [ ] 589 | }, { 590 | "metadata" : { 591 | "trusted" : true, 592 | "input_collapsed" : false, 593 | "collapsed" : true, 594 | "id" : "981D044870FA41F8B6B927580837FA53" 595 | }, 596 | "cell_type" : "markdown", 597 | "source" : "We can see what adding `Bucketizer` step into pipeline combained with simultanious param grid search over several stages (`Bucketizer` and `LogisticRegression`) boosted the quality of our ml pipeline.\n\nYou can continue to modify and expand the pipeline by adding new stages of data transformation and add new parameters into parameter grid for cross-validation." 598 | } ], 599 | "nbformat" : 4 600 | } -------------------------------------------------------------------------------- /labs/IntroToMLandSparkMLPipelines/README.md: -------------------------------------------------------------------------------- 1 | # Introduction to Machine Learning and Spark ML Pipelines 2 | 3 |
4 | 5 | 6 |
7 | 8 | # Machine learning Pipeline 9 | 10 | In this lab we are going to learn how to train machine learning models, how to correctly set up an experiment, how to tune model hyperparameters and how to compare models. We are also going to get familiar with the spark.ml package, since all of the work will be done with this package. 11 | 12 | * http://spark.apache.org/docs/latest/ml-guide.html 13 | * http://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.ml.package 14 | 15 | ## Evaluation Metrics 16 | Model training and model quality assessment are performed on independent sets of examples. As a rule, the available examples are divided into two subsets: training (train) and control (test). The choice of the split proportions is a compromise: a larger training set leads to better models but a noisier estimate of the model on the control set, while a larger test sample gives a less noisy quality assessment but less accurate models. 17 | 18 | Many classification models produce an estimate of class membership $\tilde{h}(x) \in R$ (for example, the probability of belonging to class 1). They then make a decision about the class of the object by comparing the estimate with a certain threshold $\theta$: 19 | 20 | $h(x) = +1$ if $\tilde{h}(x) \geq \theta$, and $h(x) = -1$ if $\tilde{h}(x) < \theta$ 21 | 22 | 23 | In this case, we can consider metrics that are able to work with estimates of belonging to a class. 24 | In this lab, we will work with the [AUC-ROC](https://en.wikipedia.org/wiki/Receiver_operating_characteristic#Area_under_the_curve) metric. A detailed understanding of how the AUC-ROC metric works is not required to complete the lab. 25 | ## Model Hyperparameter Tuning 26 | In machine learning problems it is necessary to distinguish the parameters of the model from its hyperparameters (structural parameters). The model parameters are adjusted during training (e.g., the weights in a linear model or the structure of a decision tree), while hyperparameters are set in advance (for example, the regularization strength in a linear model or the maximum depth of a decision tree). Each model usually has many hyperparameters, and there is no universal set of hyperparameters that works optimally in all tasks; for each task one should choose a different set. _Grid search_ is commonly used to optimize model hyperparameters: several values are selected for each parameter, and the combination of parameter values on which the model shows the best quality (in terms of the metric being optimized) is selected. However, in this case it is necessary to assess the constructed model correctly, namely to split the data into training and test samples. There are several ways this can be implemented: 27 | 28 | - Split the available samples into training and test samples. In this case, comparing a large number of models during the parameter search leads to a situation where the best model on the test data does not maintain its quality on new data. We can say that there is overfitting on the test data. 29 | - To eliminate the problem described above, it is possible to split the data into 3 disjoint sub-samples: `train`, `validation` and `test`. The `validation` set is used for model comparison, and the `test` set is used for the final quality assessment and comparison of families of models with selected parameters. 30 | - Another way to compare models is [cross-validation](https://en.wikipedia.org/wiki/Cross-validation_(statistics)). 
There are different schemes of cross-validation: 31 | - Leave-one-out cross-validation 32 | - K-fold cross-validation 33 | - Repeated random sub-sampling validation 34 | 35 | Cross-validation is computationally expensive operation, especially if you are doing a grid search with a very large number of combinations. So there are a number of compromises: 36 | - the grid can be made more sparse, touching fewer values for each parameter, however, we must not forget that in such case one can skip a good combination of parameters; 37 | - cross-validation can be done with a smaller number of partitions or folds, but in this case the quality assessment of cross-validation becomes more noisy and increases the risk to choose a suboptimal set of parameters due to the random nature of the split; 38 | - the parameters can be optimized sequentially (greedy) — one after another, and not to iterate over all combinations; this strategy does not always lead to the optimal set; 39 | - enumerate only small number of randomly selected combinations of values of hyperparameters. 40 | 41 | ## Data 42 | 43 | We'are going to solve binary classification problem by building the algorithm which determines whether a person makes over 50K a year. Following variables are available: 44 | * age 45 | * workclass 46 | * fnlwgt 47 | * education 48 | * education-num 49 | * marital-status 50 | * occupation 51 | * relationship 52 | * race 53 | * sex 54 | * capital-gain 55 | * capital-loss 56 | * hours-per-week 57 | 58 | More on this data one can read in [UCI Machine Learning Repository](http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.names) 59 | 60 | ```scala 61 | val spark = sparkSession 62 | 63 | val df = spark.read 64 | .option("header", "true") 65 | .option("inferSchema", "true") 66 | .csv("notebooks/spark-notebook-ml-labs/labs/IntroToMLandSparkMLPipelines/data/data.adult.csv") 67 | 68 | df.show(5) 69 | ``` 70 | 71 | ``` 72 | +---+---------+------+------------+-------------+------------------+---------------+-------------+-----+------+------------+------------+--------------+----------+ 73 | |age|workclass|fnlwgt| education|education-num| marital-status| occupation| relationship| race| sex|capital-gain|capital-loss|hours-per-week|>50K,<=50K| 74 | +---+---------+------+------------+-------------+------------------+---------------+-------------+-----+------+------------+------------+--------------+----------+ 75 | | 34|Local-gov|284843| HS-grad| 9| Never-married|Farming-fishing|Not-in-family|Black| Male| 594| 0| 60| <=50K| 76 | | 40| Private|190290|Some-college| 10| Divorced| Sales|Not-in-family|White| Male| 0| 0| 40| <=50K| 77 | | 36|Local-gov|177858| Bachelors| 13|Married-civ-spouse| Prof-specialty| Own-child|White| Male| 0| 0| 40| <=50K| 78 | | 22| Private|184756|Some-college| 10| Never-married| Sales| Own-child|White|Female| 0| 0| 30| <=50K| 79 | | 47| Private|149700| Bachelors| 13|Married-civ-spouse| Tech-support| Husband|White| Male| 15024| 0| 40| >50K| 80 | +---+---------+------+------------+-------------+------------------+---------------+-------------+-----+------+------------+------------+--------------+----------+ 81 | only showing top 5 rows 82 | ``` 83 | 84 | Sometimes there are missing values in the data. Sometimes, in the description of the dataset one can found the description of format of missing values. Particularly in the given dataset missing values are identified by '?' sign. 85 | 86 | **Problem** Find all the variables with missing values. 
Remove from the dataset all objects with missing values in any variable. 87 | 88 | ```scala 89 | val missingValsFeatures = df.columns.filter(column => df.filter(df(column) === "?").count > 0) 90 | 91 | println("Features with missing values: " + missingValsFeatures.mkString(", ")) 92 | 93 | val data = missingValsFeatures.foldLeft(df)((dfstage, column) => dfstage.filter(!dfstage(column).equalTo("?"))) 94 | ``` 95 | 96 | Split the data into training and test datasets. 97 | 98 | ```scala 99 | val Array(training, test) = data.randomSplit(Array(0.8, 0.2), seed = 1234) 100 | ``` 101 | 102 | ### MLlib Transformers and Estimators 103 | 104 | `Transformer` transforms one `DataFrame` into another `DataFrame`. 105 | 106 |
107 | 108 |
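As a quick illustration (a sketch, not part of the original lab), `VectorAssembler`, which we will use below, is a plain `Transformer`: it needs no training and simply appends a new column to the `DataFrame` it is given. The column names here come from the dataset described above.

```scala
import org.apache.spark.ml.feature.VectorAssembler

// A Transformer: packs the chosen numeric columns into a single vector column.
// transform() returns a new DataFrame; the input `training` DataFrame is left unchanged.
val toVector = new VectorAssembler()
  .setInputCols(Array("age", "hours-per-week"))
  .setOutputCol("ageAndHours")

toVector.transform(training).select("age", "hours-per-week", "ageAndHours").show(3)
```
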
109 | 110 | `Estimator` fits on a `DataFrame` to produce a `Transformer`. 111 | 112 |
113 | 114 |
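And a minimal sketch of an `Estimator` (again an illustration, not part of the original lab): `StringIndexer` has to see the data first, so `fit()` learns the mapping from string values to indices and returns a fitted model, which is itself a `Transformer`.

```scala
import org.apache.spark.ml.feature.StringIndexer

// An Estimator: fit() learns the value-to-index mapping from the training data...
val sexIndexer = new StringIndexer()
  .setInputCol("sex")
  .setOutputCol("sexIndex")

// ...and produces a Transformer (a StringIndexerModel) that applies this mapping.
val sexIndexerModel = sexIndexer.fit(training)
sexIndexerModel.transform(training).select("sex", "sexIndex").show(3)
```
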
115 | 116 | ## Training classifiers on numeric features 117 | 118 | Some preprocessing steps are usually required after loading and cleaning dataset. In this case, these steps will include the following: 119 | 120 | - At first we will work only with numeric features. So let's select them separately in the feature vector "numFeatures" using [VectorAssembler](http://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.ml.feature.VectorAssembler). 121 | - Select the target variable (the one we want to predict, string column of labels) and map it to an ML column of label indices using [StringIndexer](http://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.ml.feature.StringIndexer), give the name "labelIndex" to a new variable. 122 | 123 | ```scala 124 | import org.apache.spark.ml.feature.VectorAssembler 125 | import org.apache.spark.ml.feature.StringIndexer 126 | 127 | val assembler = new VectorAssembler() 128 | .setInputCols(Array("age", 129 | "fnlwgt", 130 | "education-num", 131 | "capital-gain", 132 | "capital-loss", 133 | "hours-per-week")) 134 | .setOutputCol("numFeatures") 135 | 136 | val labelIndexer = new StringIndexer() 137 | .setInputCol(">50K,<=50K") 138 | .setOutputCol("label") 139 | .fit(training) 140 | ``` 141 | 142 | ```scala 143 | labelIndexer.transform(training).select(">50K,<=50K", "label").show(8) 144 | ``` 145 | ``` 146 | +----------+-----+ 147 | |>50K,<=50K|label| 148 | +----------+-----+ 149 | | <=50K| 0.0| 150 | | <=50K| 0.0| 151 | | <=50K| 0.0| 152 | | <=50K| 0.0| 153 | | <=50K| 0.0| 154 | | <=50K| 0.0| 155 | | <=50K| 0.0| 156 | | <=50K| 0.0| 157 | +----------+-----+ 158 | only showing top 8 rows 159 | ``` 160 | 161 | ```scala 162 | assembler.transform(training) 163 | .select("age", "fnlwgt", "education-num", "capital-gain", "capital-loss", "hours-per-week", "numFeatures") 164 | .show(5, truncate=false) 165 | ``` 166 | ``` 167 | +-----+--------------------------------+ 168 | |label|numFeatures | 169 | +-----+--------------------------------+ 170 | |0.0 |[17.0,192387.0,5.0,0.0,0.0,45.0]| 171 | |0.0 |[17.0,340043.0,8.0,0.0,0.0,12.0]| 172 | |0.0 |[17.0,24090.0,9.0,0.0,0.0,35.0] | 173 | |0.0 |[17.0,25690.0,6.0,0.0,0.0,10.0] | 174 | |0.0 |[17.0,28031.0,5.0,0.0,0.0,16.0] | 175 | +-----+--------------------------------+ 176 | only showing top 5 rows 177 | ``` 178 | 179 | ```scala 180 | import org.apache.spark.ml.classification.LogisticRegression 181 | import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator 182 | 183 | 184 | val lr = new LogisticRegression() 185 | .setFeaturesCol("numFeatures") 186 | .setLabelCol("label") 187 | .setRegParam(0.1) 188 | 189 | val lrModel = lr.fit(trainData) 190 | 191 | val testData = assembler.transform{ 192 | labelIndexer.transform(test) 193 | } 194 | 195 | val eval = new BinaryClassificationEvaluator() 196 | .setMetricName("areaUnderROC") 197 | 198 | println(eval.evaluate(lrModel.transform(testData))) 199 | ``` 200 | ``` 201 | 0.7937381854879748 202 | ``` 203 | 204 | ## Model selection with MLlib 205 | Apache Spark MLlib supports model hyperparameter tuning using tools such as `CrossValidator` and `TrainValidationSplit`. These tools require the following items: 206 | 207 | - Estimator: algorithm or Pipeline to tune 208 | - Set of ParamMaps: parameters to choose from, sometimes called a “parameter grid” to search over 209 | - Evaluator: metric to measure how well a fitted Model does on held-out test data 210 | 211 | In this section we will need to work only with numeric features and a target variable. 
212 | To begin with, let's have a look at grid search in action. 213 | We will consider 2 algorithms: 214 | - [LogisticRegression](http://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.ml.classification.LogisticRegression) 215 | - [DecisionTreeClassifier](http://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.ml.classification.DecisionTreeClassifier) 216 | 217 | To start with, let's choose one parameter to optimize for each algorithm: 218 | - LogisticRegression — regularization parameter (*regParam*) 219 | - DecisionTreeClassifier — maximum depth of the tree (*maxDepth*) 220 | 221 | The remaining parameters we will leave at their default values. 222 | To implement the grid search procedure one can use the 223 | [CrossValidator](http://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.ml.tuning.CrossValidator) class 224 | combined with the [ParamGridBuilder](http://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.ml.tuning.ParamGridBuilder) class. 225 | We also need to specify an appropriate evaluator for this task; in our case we should use [BinaryClassificationEvaluator](http://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.ml.evaluation.BinaryClassificationEvaluator) 226 | (note that its default metric is areaUnderROC, so we don't need to specify the metric via a `setMetricName` method call). 227 | Set up a 5-fold cross-validation scheme. 228 | 229 |
*Figure: K-fold cross-validation. By Fabian Flöck (Own work) [CC BY-SA 3.0 (http://creativecommons.org/licenses/by-sa/3.0)], via Wikimedia Commons*
236 | 237 | **Problem** Try to find the optimal values of these hyperparameters for each algorithm. Plot the average cross-validation metrics for a given value of hyperparameter for each algorithm (hint: use `avgMetrics` field of resulting `CrossValidatorModel`). 238 | 239 | ```scala 240 | import org.apache.spark.ml.classification.{LogisticRegression, DecisionTreeClassifier, RandomForestClassifier} 241 | import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator 242 | import org.apache.spark.ml.tuning.{ParamGridBuilder, CrossValidator} 243 | 244 | 245 | val lr = new LogisticRegression() 246 | .setFeaturesCol("numFeatures") 247 | .setLabelCol("label") 248 | 249 | val lrParamGrid = new ParamGridBuilder() 250 | .addGrid(lr.regParam, Array(1e-2, 5e-3, 1e-3, 5e-4, 1e-4)) 251 | .build() 252 | 253 | val lrCV = new CrossValidator() 254 | .setEstimator(lr) 255 | .setEvaluator(new BinaryClassificationEvaluator) 256 | .setEstimatorParamMaps(lrParamGrid) 257 | .setNumFolds(5) 258 | 259 | val lrCVModel = lrCV.fit(trainData) 260 | 261 | println("cross-validated areaUnderROC: " + lrCVModel.avgMetrics.max) 262 | println("test areaUnderROC: " + eval.evaluate(lrCVModel.transform(testData))) 263 | ``` 264 | ``` 265 | cross-validated areaUnderROC: 0.8297755442702006 266 | test areaUnderROC: 0.8068812315222861 267 | ``` 268 | 269 | ```scala 270 | val tree = new DecisionTreeClassifier() 271 | .setFeaturesCol("numFeatures") 272 | .setLabelCol("label") 273 | 274 | val treeParamGrid = new ParamGridBuilder() 275 | .addGrid(tree.maxDepth, Array(5, 10, 20, 25, 30)) 276 | .build() 277 | 278 | val treeCV = new CrossValidator() 279 | .setEstimator(tree) 280 | .setEvaluator(new BinaryClassificationEvaluator) 281 | .setEstimatorParamMaps(treeParamGrid) 282 | .setNumFolds(5) 283 | 284 | val treeCVModel = treeCV.fit(trainData) 285 | 286 | println("cross-validated areaUnderROC: " + treeCVModel.avgMetrics.max) 287 | println("test areaUnderROC: " + eval.evaluate(treeCVModel.transform(testData))) 288 | ``` 289 | ``` 290 | cross-validated areaUnderROC: 0.7105377328054816 291 | test areaUnderROC: 0.6934402983359256 292 | ``` 293 | 294 | ```scala 295 | lrCVModel.getEstimatorParamMaps 296 | .map(paramMap => paramMap(lr.regParam)) 297 | .zip(lrCVModel.avgMetrics) 298 | .toSeq.toDF("regParam", "AUC-ROC") 299 | .collect 300 | ``` 301 | 302 | 303 | 304 | 305 | ```scala 306 | treeCVModel.getEstimatorParamMaps 307 | .map(paramMap => paramMap(tree.maxDepth)) 308 | .zip(treeCVModel.avgMetrics) 309 | .toSeq.toDF("maxDepth", "AUC-ROC") 310 | .collect 311 | ``` 312 | 313 | 314 | 315 | 316 | ## Adding categorical features 317 | 318 | Up to this point we did not use categorical features from the dataset. Let's see how additional categorical features will affect the quality of the classification. A common technique to convert categorical feature into numerical ones is [one-hot](https://en.wikipedia.org/wiki/One-hot) encoding. This can be done using [StringIndexer](http://spark.apache.org/docs/1.6.1/api/scala/index.html#org.apache.spark.ml.feature.StringIndexer) transformation followed by [OneHotEncoder](http://spark.apache.org/docs/1.6.1/api/scala/index.html#org.apache.spark.ml.feature.OneHotEncoder) transformation. 
319 | 320 | *Let's start with encoding just one new feature `occupation` and after that generalize encoding step for all categorical features and combine all processing steps using [pipeline](http://spark.apache.org/docs/1.6.1/ml-guide.html#pipeline)* 321 | 322 | ```scala 323 | data.groupBy("occupation").count.show(truncate=false) 324 | println(data.select("occupation").distinct.count) 325 | ``` 326 | ``` 327 | +-----------------+-----+ 328 | |occupation |count| 329 | +-----------------+-----+ 330 | |Sales |1840 | 331 | |Exec-managerial |2017 | 332 | |Prof-specialty |2095 | 333 | |Handlers-cleaners|674 | 334 | |Farming-fishing |481 | 335 | |Craft-repair |2057 | 336 | |Transport-moving |799 | 337 | |Priv-house-serv |90 | 338 | |Protective-serv |343 | 339 | |Other-service |1617 | 340 | |Tech-support |464 | 341 | |Machine-op-inspct|1023 | 342 | |Armed-Forces |3 | 343 | |Adm-clerical |1844 | 344 | +-----------------+-----+ 345 | 346 | 14 347 | ``` 348 | 349 | ```scala 350 | import org.apache.spark.ml.feature.OneHotEncoder 351 | 352 | val occupationIndexer = new StringIndexer() 353 | .setInputCol("occupation") 354 | .setOutputCol("occupationIndex") 355 | .fit(training) 356 | 357 | val indexedTrainData = occupationIndexer.transform(training) 358 | 359 | val occupationEncoder = new OneHotEncoder() 360 | .setInputCol("occupationIndex") 361 | .setOutputCol("occupationVec") 362 | 363 | val oheEncodedTrainData = occupationEncoder.transform(indexedTrainData) 364 | 365 | oheEncodedTrainData.select("occupation", "occupationVec").show(5, truncate=false) 366 | ``` 367 | ``` 368 | +---------------+--------------+ 369 | |occupation |occupationVec | 370 | +---------------+--------------+ 371 | |Other-service |(13,[5],[1.0])| 372 | |Adm-clerical |(13,[4],[1.0])| 373 | |Exec-managerial|(13,[2],[1.0])| 374 | |Other-service |(13,[5],[1.0])| 375 | |Other-service |(13,[5],[1.0])| 376 | +---------------+--------------+ 377 | only showing top 5 rows 378 | ``` 379 | 380 | ```scala 381 | val assembler = new VectorAssembler() 382 | .setInputCols(Array("age", 383 | "fnlwgt", 384 | "education-num", 385 | "capital-gain", 386 | "capital-loss", 387 | "hours-per-week", 388 | "occupationVec")) 389 | .setOutputCol("features") 390 | 391 | 392 | val trainDataWithOccupation = assembler.transform{ 393 | labelIndexer.transform(oheEncodedTrainData) 394 | }.select("label", "features") 395 | ``` 396 | 397 | *For the sake of brevity, from now let's use only LogisticRegression model.* 398 | 399 | ```scala 400 | val lr = new LogisticRegression() 401 | .setFeaturesCol("features") 402 | 403 | val lrParamGrid = new ParamGridBuilder() 404 | .addGrid(lr.regParam, Array(1e-2, 5e-3, 1e-3, 5e-4, 1e-4)) 405 | .build() 406 | 407 | val lrCV = new CrossValidator() 408 | .setEstimator(lr) 409 | .setEvaluator(new BinaryClassificationEvaluator) 410 | .setEstimatorParamMaps(lrParamGrid) 411 | .setNumFolds(5) 412 | 413 | val lrCVModel = lrCV.fit(trainDataWithOccupation) 414 | 415 | val testDataWithOccupation = assembler.transform{ 416 | labelIndexer.transform(occupationEncoder.transform(occupationIndexer.transform(test))) 417 | }.select("label", "features") 418 | 419 | println("cross-validated areaUnderROC: " + lrCVModel.avgMetrics.max) 420 | println("test areaUnderROC: " + eval.evaluate(lrCVModel.transform(testDataWithOccupation))) 421 | ``` 422 | ``` 423 | cross-validated areaUnderROC: 0.8447936545404254 424 | test areaUnderROC: 0.823490779891881 425 | ``` 426 | 427 | Adding `occupation` categorical variable yielded an increase in quality. 
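A side note (an illustration, not part of the original lab): the encoded `occupationVec` vector above has 13 slots even though there are 14 distinct occupations, because `OneHotEncoder` drops the last category by default; this behaviour is controlled by its `dropLast` parameter.

```scala
// Sketch: keep all 14 categories instead of dropping the last one (the default is dropLast = true).
val fullOccupationEncoder = new OneHotEncoder()
  .setInputCol("occupationIndex")
  .setOutputCol("occupationVecFull")
  .setDropLast(false)

fullOccupationEncoder.transform(indexedTrainData)
  .select("occupation", "occupationVecFull")
  .show(3, truncate = false)
```
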
428 | 429 | ## Pipelines 430 | 431 | Using [pipelines](http://spark.apache.org/docs/1.6.1/ml-guide.html#pipeline) one can combine all the processing stages into a single pipeline and perform grid search over the hyperparameters of all stages included in the pipeline. It is also easy to extend a given pipeline with new steps. 432 | 433 | A Pipeline chains multiple Transformers and Estimators together to specify an ML workflow. 434 | 435 |
436 | 437 |
438 | 439 | Let's see how we can combine all the preprocessing steps made so far into one pipeline. 440 | 441 | ```scala 442 | import org.apache.spark.ml.Pipeline 443 | 444 | 445 | // Chain indexers, encoders and assembler in a Pipeline 446 | val featurePipelineModel = new Pipeline() 447 | .setStages(Array(occupationIndexer, 448 | occupationEncoder, 449 | assembler, 450 | labelIndexer)) 451 | .fit(training) 452 | 453 | featurePipelineModel.transform(test).select("features", "label").show(3, truncate=false) 454 | ``` 455 | ``` 456 | +----------------------------------------------+-----+ 457 | |features |label| 458 | +----------------------------------------------+-----+ 459 | |(19,[0,1,2,5,11],[17.0,39815.0,6.0,25.0,1.0]) |0.0 | 460 | |(19,[0,1,2,5,17],[17.0,175587.0,7.0,30.0,1.0])|0.0 | 461 | |(19,[0,1,2,5,9],[17.0,191910.0,7.0,20.0,1.0]) |0.0 | 462 | +----------------------------------------------+-----+ 463 | only showing top 3 rows 464 | ``` 465 | 466 | Now compare this 467 | ```scala 468 | eval.evaluate(lrCVModel.transform(labelIndexer.transform(assembler.transform(occupationEncoder.transform(occupationIndexer.transform(test)))))) 469 | ``` 470 | ``` 471 | 0.823490779891881 472 | ``` 473 | 474 | and this 475 | 476 | ```scala 477 | eval.evaluate(lrCVModel.transform(featurePipelineModel.transform(test))) 478 | ``` 479 | ``` 480 | 0.823490779891881 481 | ``` 482 | 483 | Now let's extend our pipeline by adding one-hot encoding step for each categorical feature. 484 | 485 | ```scala 486 | val categCols = Array("workclass", "education", "marital-status", "occupation", "relationship", "race", "sex") 487 | 488 | val featureIndexers: Array[org.apache.spark.ml.PipelineStage] = categCols.map( 489 | cname => new StringIndexer() 490 | .setInputCol(cname) 491 | .setOutputCol(s"${cname}_index") 492 | ) 493 | 494 | val oneHotEncoders = categCols.map( 495 | cname => new OneHotEncoder() 496 | .setInputCol(s"${cname}_index") 497 | .setOutputCol(s"${cname}_vec") 498 | ) 499 | 500 | val assembler = new VectorAssembler() 501 | .setInputCols(Array("age", 502 | "fnlwgt", 503 | "education-num", 504 | "capital-gain", 505 | "capital-loss", 506 | "hours-per-week") ++ 507 | categCols.map(cname => s"${cname}_vec")) 508 | .setOutputCol("features") 509 | 510 | val rawDataProcessor = new Pipeline() 511 | .setStages(featureIndexers ++ 512 | oneHotEncoders ++ 513 | Array(assembler, labelIndexer)) 514 | .fit(training) 515 | 516 | rawDataProcessor.transform(test).limit(3).select("features", "label").show(truncate=false) 517 | ``` 518 | ``` 519 | +---------------------------------------------------------------------------------------+-----+ 520 | |features |label| 521 | +---------------------------------------------------------------------------------------+-----+ 522 | |(56,[0,1,2,5,8,19,28,38,48,51],[17.0,39815.0,6.0,25.0,1.0,1.0,1.0,1.0,1.0,1.0]) |0.0 | 523 | |(56,[0,1,2,5,8,18,28,44,48,51,55],[17.0,175587.0,7.0,30.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0])|0.0 | 524 | |(56,[0,1,2,5,8,18,28,36,48,51,55],[17.0,191910.0,7.0,20.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0])|0.0 | 525 | +---------------------------------------------------------------------------------------+-----+ 526 | ``` 527 | 528 | ```scala 529 | val lr = new LogisticRegression() 530 | .setFeaturesCol("features") 531 | 532 | val lrParamGrid = new ParamGridBuilder() 533 | .addGrid(lr.regParam, Array(1e-2, 5e-3, 1e-3, 5e-4, 1e-4)) 534 | .build() 535 | 536 | val lrCV = new CrossValidator() 537 | .setEstimator(lr) 538 | .setEvaluator(new BinaryClassificationEvaluator) 539 | 
.setEstimatorParamMaps(lrParamGrid) 540 | .setNumFolds(5) 541 | 542 | val lrCVModel = lrCV.fit(rawDataProcessor.transform(training)) 543 | 544 | println("cross-validated areaUnderROC: " + lrCVModel.avgMetrics.max) 545 | println("test areaUnderROC: " + eval.evaluate(lrCVModel.transform(rawDataProcessor.transform(test)))) 546 | ``` 547 | ``` 548 | cross-validated areaUnderROC: 0.9070537976977229 549 | test areaUnderROC: 0.8893619862500176 550 | ``` 551 | 552 | Adding one-hot encoding for each categorical variable yielded a significant increase in quality. 553 | 554 | We can also combine several stages with the LogisticRegression stage into one pipeline and perform grid search over the hyperparameters of several stages included in the pipeline. 555 | 556 | For example, let's try to add a [Bucketizer](http://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.ml.feature.Bucketizer) 557 | transformation applied to the `age` column, add `splits` parameter values 558 | to the pipeline parameter grid, and see how it will affect the metric score. 559 | 560 | ```scala 561 | data.select(min("age"), max("age")).show() 562 | ``` 563 | ``` 564 | +--------+--------+ 565 | |min(age)|max(age)| 566 | +--------+--------+ 567 | | 17| 90| 568 | +--------+--------+ 569 | ``` 570 | 571 | ```scala 572 | // We need to cast the age column to DoubleType to apply the Bucketizer transformation. 573 | import org.apache.spark.sql.types.DoubleType 574 | 575 | val castData = data.withColumn("age", data("age").cast(DoubleType)) 576 | 577 | val Array(castTraining, castTest) = castData.randomSplit(Array(0.8, 0.2), seed = 12345) 578 | ``` 579 | 580 | ```scala 581 | import org.apache.spark.ml.feature.Bucketizer 582 | 583 | val ageBucketizer = new Bucketizer() 584 | .setInputCol("age") 585 | .setOutputCol("age-buckets") 586 | 587 | val lr = new LogisticRegression() 588 | .setFeaturesCol("features") 589 | 590 | val pipelineParamGrid = new ParamGridBuilder() 591 | .addGrid(lr.regParam, Array(1e-3, 5e-4, 1e-4, 5e-5, 1e-5)) 592 | .addGrid(ageBucketizer.splits, Array(Array(15.0, 30.0, 40.0, 50.0, 100.0), 593 | Array(15.0, 21.0, 25.0, 30.0, 40.0, 50.0, 70.0, 100.0))) 594 | .build() 595 | 596 | val assembler = new VectorAssembler() 597 | .setInputCols(Array("age-buckets", 598 | "fnlwgt", 599 | "education-num", 600 | "capital-gain", 601 | "capital-loss", 602 | "hours-per-week") ++ 603 | categCols.map(cname => s"${cname}_vec")) 604 | .setOutputCol("features") 605 | 606 | val mlPipeline = new Pipeline() 607 | .setStages(Array(ageBucketizer) ++ 608 | featureIndexers ++ 609 | oneHotEncoders ++ 610 | Array(assembler, labelIndexer, lr)) 611 | 612 | val pipelineCV = new CrossValidator() 613 | .setEstimator(mlPipeline) 614 | .setEvaluator(new BinaryClassificationEvaluator) 615 | .setEstimatorParamMaps(pipelineParamGrid) 616 | .setNumFolds(5) 617 | 618 | val pipelineCVModel = pipelineCV.fit(castTraining) 619 | 620 | println("cross-validated areaUnderROC: " + pipelineCVModel.avgMetrics.max) 621 | println("test areaUnderROC: " + eval.evaluate(pipelineCVModel.transform(castTest))) 622 | ``` 623 | ``` 624 | cross-validated areaUnderROC: 0.9052412424175416 625 | test areaUnderROC: 0.9033115341268361 626 | ``` 627 | 628 | We can see that adding the `Bucketizer` step to the pipeline, combined with a simultaneous param grid search over several stages (`Bucketizer` and `LogisticRegression`), boosted the quality of our ML pipeline. 
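As a follow-up (a sketch that is not part of the original lab), one way to see which combination of hyperparameters actually won the cross-validation is to pair each `ParamMap` with its average metric, the same way we tabulated `avgMetrics` earlier:

```scala
// Find the ParamMap with the best average cross-validated areaUnderROC.
val (bestParams, bestScore) = pipelineCVModel.getEstimatorParamMaps
  .zip(pipelineCVModel.avgMetrics)
  .maxBy(_._2)

println(s"best cross-validated areaUnderROC: $bestScore")
println(bestParams)  // shows the winning regParam value and age splits

// The fitted stages themselves are available from the best PipelineModel.
import org.apache.spark.ml.PipelineModel
val bestPipeline = pipelineCVModel.bestModel.asInstanceOf[PipelineModel]
```
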
629 | 630 | You can continue to modify and extend the pipeline by adding new data transformation stages and new parameters to the parameter grid for cross-validation. 631 | -------------------------------------------------------------------------------- /labs/IntroToMachineLearning/images/ageHistData.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/drewnoff/spark-notebook-ml-labs/26f80824cece2c3b78c050937ccb62e843e0de65/labs/IntroToMachineLearning/images/ageHistData.png -------------------------------------------------------------------------------- /labs/IntroToMachineLearning/images/cgainHistData.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/drewnoff/spark-notebook-ml-labs/26f80824cece2c3b78c050937ccb62e843e0de65/labs/IntroToMachineLearning/images/cgainHistData.png -------------------------------------------------------------------------------- /labs/IntroToMachineLearning/images/fnlwgtHistData.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/drewnoff/spark-notebook-ml-labs/26f80824cece2c3b78c050937ccb62e843e0de65/labs/IntroToMachineLearning/images/fnlwgtHistData.png -------------------------------------------------------------------------------- /labs/IntroToMachineLearning/images/lrAvgMetrics.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/drewnoff/spark-notebook-ml-labs/26f80824cece2c3b78c050937ccb62e843e0de65/labs/IntroToMachineLearning/images/lrAvgMetrics.png -------------------------------------------------------------------------------- /labs/IntroToMachineLearning/images/rfAvgMetrics.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/drewnoff/spark-notebook-ml-labs/26f80824cece2c3b78c050937ccb62e843e0de65/labs/IntroToMachineLearning/images/rfAvgMetrics.png -------------------------------------------------------------------------------- /labs/IntroToMachineLearning/images/rfAvgMetrics2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/drewnoff/spark-notebook-ml-labs/26f80824cece2c3b78c050937ccb62e843e0de65/labs/IntroToMachineLearning/images/rfAvgMetrics2.png -------------------------------------------------------------------------------- /labs/IntroToMachineLearning/images/treeAvgMetrics.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/drewnoff/spark-notebook-ml-labs/26f80824cece2c3b78c050937ccb62e843e0de65/labs/IntroToMachineLearning/images/treeAvgMetrics.png -------------------------------------------------------------------------------- /labs/TitanicSurvivalExploration/README.md: -------------------------------------------------------------------------------- 1 | # Titanic Survival Exploration 2 | 3 |
4 | 5 | 6 |
7 | 8 | ## Spark quick review 9 | 10 | Spark provides a convenient programming abstraction and a parallel runtime that hide the complexities of distributed computation. 11 | 12 | 13 | 14 | 15 | 16 | In this first lab we will focus on DataFrames and SQL. 17 | In the second lab we will use Spark MLlib to build machine learning pipelines. 18 | 19 | ### Spark Cluster 20 | 21 |
22 | 23 |
24 | 25 | Main entry point for Spark functionality is a `SparkContex`. `SparkContext` tells Spark how to access a cluster. 26 | `Spark Notebook` automatically creates `SparkContext`. 27 | 28 | Examples of `master` parameter configuration for `SparkContext`: 29 | 30 | | Master Parameter | Description | 31 | | ----------------- |----------------------------------------:| 32 | | local[K] | run Spark locally with K worker threads | 33 | | spark://HOST:PORT | connect to Spark Standalone cluster | 34 | | mesos://HOST:PORT | connect to Mesos cluster | 35 | 36 | ```scala 37 | sparkContext 38 | ``` 39 | 40 | ## Spark SQL and DataFrames 41 | 42 | * http://spark.apache.org/docs/latest/sql-programming-guide.html 43 | * http://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.sql.Dataset 44 | 45 | A DataFrame is a distributed collection of data organized into named columns. 46 | It is conceptually equivalent to a table in a relational database or a data frame in R/Python 47 | 48 | The entry point to programming Spark with SQL and DataFrame API in Spark 2.0 is the new `SparkSession` class: 49 | 50 | ```scala 51 | sparkSession 52 | ``` 53 | 54 | ```scala 55 | val spark = sparkSession 56 | 57 | // This import is needed to use the $-notation 58 | import spark.implicits._ 59 | ``` 60 | 61 | With a SparkSession you can create DataFrames from an existing RDD, from files in HDFS or any other storage system, or from Scala collections. 62 | 63 | ```scala 64 | Seq(("Alice", 20, "female"), ("Bob", 31, "male"), ("Eva", 16, "female")).toDF("name", "age", "gender").show() 65 | ``` 66 | 67 | ``` 68 | +-----+---+------+ 69 | | name|age|gender| 70 | +-----+---+------+ 71 | |Alice| 20|female| 72 | | Bob| 31| male| 73 | | Eva| 16|female| 74 | +-----+---+------+ 75 | ``` 76 | 77 | ```scala 78 | case class Person(name: String, age: Int, gender: String) 79 | 80 | val persons = Seq(Person("Alice", 20, "female"), Person("Bob", 31, "male"), Person("Eva", 16, "female")).toDF() 81 | persons.show() 82 | ``` 83 | 84 | ``` 85 | +-----+---+------+ 86 | | name|age|gender| 87 | +-----+---+------+ 88 | |Alice| 20|female| 89 | | Bob| 31| male| 90 | | Eva| 16|female| 91 | +-----+---+------+ 92 | 93 | persons: org.apache.spark.sql.DataFrame = [name: string, age: int ... 1 more field] 94 | ``` 95 | 96 | ```scala 97 | persons.select("name", "age").show() 98 | ``` 99 | 100 | ``` 101 | +-----+---+ 102 | | name|age| 103 | +-----+---+ 104 | |Alice| 20| 105 | | Bob| 31| 106 | | Eva| 16| 107 | +-----+---+ 108 | ``` 109 | 110 | ```scala 111 | val young = persons.filter($"age" < 21) 112 | young.show() 113 | ``` 114 | 115 | ``` 116 | +-----+---+------+ 117 | | name|age|gender| 118 | +-----+---+------+ 119 | |Alice| 20|female| 120 | | Eva| 16|female| 121 | +-----+---+------+ 122 | 123 | young: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [name: string, age: int ... 1 more field] 124 | ``` 125 | 126 | ```scala 127 | young.select(young("name"), ($"age" + 1).alias("incremented age")).show() 128 | ``` 129 | 130 | ``` 131 | +-----+---------------+ 132 | | name|incremented age| 133 | +-----+---------------+ 134 | |Alice| 21| 135 | | Eva| 17| 136 | +-----+---------------+ 137 | ``` 138 | 139 | ```scala 140 | persons.groupBy("gender").count.show 141 | ``` 142 | 143 | ``` 144 | +------+-----+ 145 | |gender|count| 146 | +------+-----+ 147 | |female| 2| 148 | | male| 1| 149 | +------+-----+ 150 | ``` 151 | 152 | # Titanic Dataset 153 | 154 | More on this dataset you can read [here](https://www.kaggle.com/c/titanic/data). 155 | 156 |
157 | 158 |
159 |
160 | By Willy Stöwer, died on 31st May 1931 - Magazine Die Gartenlaube, en:Die Gartenlaube and de:Die Gartenlaube, Public Domain, Link 161 |
162 | 163 | Out of the box, DataFrame supports reading data from the most popular formats, including JSON files, CSV files, Parquet files, Hive tables. 164 | 165 | ```scala 166 | val passengersDF = spark.read 167 | .option("header", "true") 168 | .option("inferSchema", "true") 169 | .csv("notebooks/spark-notebook-ml-labs/labs/TitanicSurvivalExploration/data/titanic_train.csv") 170 | 171 | passengersDF.printSchema 172 | ``` 173 | 174 | ``` 175 | root 176 | |-- PassengerId: integer (nullable = true) 177 | |-- Survived: integer (nullable = true) 178 | |-- Pclass: integer (nullable = true) 179 | |-- Name: string (nullable = true) 180 | |-- Sex: string (nullable = true) 181 | |-- Age: double (nullable = true) 182 | |-- SibSp: integer (nullable = true) 183 | |-- Parch: integer (nullable = true) 184 | |-- Ticket: string (nullable = true) 185 | |-- Fare: double (nullable = true) 186 | |-- Cabin: string (nullable = true) 187 | |-- Embarked: string (nullable = true) 188 | ``` 189 | 190 | Look at 5 records in passengers DataFrame: 191 | 192 | ```scala 193 | passengersDF.show(5, truncate=false) 194 | ``` 195 | 196 | ``` 197 | +-----------+--------+------+---------------------------------------------------+------+----+-----+-----+----------------+-------+-----+--------+ 198 | |PassengerId|Survived|Pclass|Name |Sex |Age |SibSp|Parch|Ticket |Fare |Cabin|Embarked| 199 | +-----------+--------+------+---------------------------------------------------+------+----+-----+-----+----------------+-------+-----+--------+ 200 | |1 |0 |3 |Braund, Mr. Owen Harris |male |22.0|1 |0 |A/5 21171 |7.25 |null |S | 201 | |2 |1 |1 |Cumings, Mrs. John Bradley (Florence Briggs Thayer)|female|38.0|1 |0 |PC 17599 |71.2833|C85 |C | 202 | |3 |1 |3 |Heikkinen, Miss. Laina |female|26.0|0 |0 |STON/O2. 3101282|7.925 |null |S | 203 | |4 |1 |1 |Futrelle, Mrs. Jacques Heath (Lily May Peel) |female|35.0|1 |0 |113803 |53.1 |C123 |S | 204 | |5 |0 |3 |Allen, Mr. William Henry |male |35.0|0 |0 |373450 |8.05 |null |S | 205 | +-----------+--------+------+---------------------------------------------------+------+----+-----+-----+----------------+-------+-----+--------+ 206 | only showing top 5 rows 207 | ``` 208 | 209 | The sql function on a SparkSession enables applications to run SQL queries programmatically and returns the result as a DataFrame. 210 | To do this we need to register the DataFrame as a SQL temporary view 211 | 212 | ```scala 213 | passengersDF.createOrReplaceTempView("passengers") 214 | 215 | spark.sql(""" 216 | SELECT Name, Age, Pclass, Survived FROM passengers 217 | WHERE Age < 30 218 | """).show(3, truncate=false) 219 | ``` 220 | 221 | ``` 222 | +------------------------------+----+------+--------+ 223 | |Name |Age |Pclass|Survived| 224 | +------------------------------+----+------+--------+ 225 | |Braund, Mr. Owen Harris |22.0|3 |0 | 226 | |Heikkinen, Miss. Laina |26.0|3 |1 | 227 | |Palsson, Master. Gosta Leonard|2.0 |3 |0 | 228 | +------------------------------+----+------+--------+ 229 | only showing top 3 rows 230 | ``` 231 | 232 | ### Transformations and Actions 233 | 234 | Spark operations on DataFrames are one of two types. 235 | * Transformations are lazily evaluated and create new Dataframes from existing ones. 236 | * Actions trigger computation and return results or write DataFrames to storage. 237 | 238 | *Computations are only triggered when an action is invoked.* 239 | 240 | Here are some examples. 
241 | 242 | 243 | | Transformations | Actions | 244 | | :-----------------: |:------------:| 245 | | select | count | 246 | | filter | show | 247 | | groupBy | save | 248 | | orderBy | **collect** | 249 | | sample | take | 250 | | limit | reduce | 251 | | withColumn || 252 | | join || 253 | 254 | **Q-1. How many different classes of passengers were aboard the Titanic?** 255 | 256 | ```scala 257 | val pclasses = passengersDF.select("Pclass").distinct 258 | 259 | pclasses.count 260 | ``` 261 | ``` 262 | res141: Long = 3 263 | 3 264 | ``` 265 | 266 | ```scala 267 | pclasses.show 268 | ``` 269 | ``` 270 | +------+ 271 | |Pclass| 272 | +------+ 273 | | 1| 274 | | 3| 275 | | 2| 276 | +------+ 277 | ``` 278 | 279 | ```scala 280 | spark.sql(""" 281 | SELECT DISTINCT Pclass from passengers 282 | """).count 283 | ``` 284 | ``` 285 | res145: Long = 3 286 | 3 287 | ``` 288 | 289 | **Q-2. How many passengers were in each class?** 290 | 291 | ```scala 292 | val numByClass = passengersDF.groupBy("Pclass").count 293 | numByClass.show 294 | ``` 295 | ``` 296 | +------+-----+ 297 | |Pclass|count| 298 | +------+-----+ 299 | | 1| 216| 300 | | 3| 491| 301 | | 2| 184| 302 | +------+-----+ 303 | ``` 304 | 305 | ```scala 306 | spark.sql(""" 307 | SELECT Pclass, count(PassengerID) as class_count FROM passengers 308 | GROUP BY Pclass 309 | ORDER BY class_count DESC 310 | """).show 311 | ``` 312 | ``` 313 | +------+-----------+ 314 | |Pclass|class_count| 315 | +------+-----------+ 316 | | 3| 491| 317 | | 1| 216| 318 | | 2| 184| 319 | +------+-----------+ 320 | ``` 321 | 322 | ```scala 323 | CustomPlotlyChart(numByClass, 324 | layout="{title: 'Passengers per class', xaxis: {title: 'Pclass'}}", 325 | dataOptions="{type: 'bar'}", 326 | dataSources="{x: 'Pclass', y: 'count'}") 327 | ``` 328 | 329 | 330 | 331 | 332 | **Q-3. 
How many women and men were in each class?** 333 | ```scala 334 | val grByGenderAndClass = passengersDF.groupBy("Pclass", "Sex").count 335 | grByGenderAndClass.show() 336 | ``` 337 | ``` 338 | +------+------+-----+ 339 | |Pclass| Sex|count| 340 | +------+------+-----+ 341 | | 2|female| 76| 342 | | 3| male| 347| 343 | | 1| male| 122| 344 | | 3|female| 144| 345 | | 1|female| 94| 346 | | 2| male| 108| 347 | +------+------+-----+ 348 | ``` 349 | 350 | ```scala 351 | CustomPlotlyChart(grByGenderAndClass, 352 | layout="{title: 'Passengers per class', xaxis: {title: 'Pclass'}, barmode: 'group'}", 353 | dataOptions="{type: 'bar', splitBy: 'Sex'}", 354 | dataSources="{x: 'Pclass', y: 'count'}") 355 | ``` 356 | 357 | 358 | 359 | 360 | 361 | ### DataFrame Functions and UDF 362 | 363 | http://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.sql.functions$ 364 | 365 | ```scala 366 | import org.apache.spark.sql.functions.{mean, min, max} 367 | 368 | passengersDF.select(mean("Age").alias("Average Age"), min("Age"), max("Age")).show() 369 | ``` 370 | ``` 371 | +-----------------+--------+--------+ 372 | | Average Age|min(Age)|max(Age)| 373 | +-----------------+--------+--------+ 374 | |29.69911764705882| 0.42| 80.0| 375 | +-----------------+--------+--------+ 376 | ``` 377 | 378 | ```scala 379 | import org.apache.spark.sql.functions.count 380 | 381 | passengersDF.groupBy("Pclass") 382 | .agg(count("Pclass").alias("class_count")) 383 | .orderBy(-$"class_count") 384 | .show 385 | ``` 386 | ``` 387 | +------+-----------+ 388 | |Pclass|class_count| 389 | +------+-----------+ 390 | | 3| 491| 391 | | 1| 216| 392 | | 2| 184| 393 | +------+-----------+ 394 | ``` 395 | 396 | For more specific tasks one can use User Defined Functions. 397 | 398 | Let's say we want to get a column with full names of port of embarkation. 399 | 400 | ```scala 401 | passengersDF.select("Embarked").distinct.show 402 | ``` 403 | ``` 404 | +--------+ 405 | |Embarked| 406 | +--------+ 407 | | Q| 408 | | null| 409 | | C| 410 | | S| 411 | +--------+ 412 | ``` 413 | 414 | From dataset description we know that C = Cherbourg; Q = Queenstown; S = Southampton. 415 | 416 | ```scala 417 | import org.apache.spark.sql.functions.udf 418 | 419 | val embarkedFullName: (String) => String = (embarked: String) => 420 | if (embarked == "Q") 421 | "Queenstown" 422 | else if (embarked == "C") 423 | "Cherbourg" 424 | else 425 | "Southampton" 426 | 427 | 428 | val embarkedFullNameUDF = udf(embarkedFullName) 429 | ``` 430 | 431 | Also we want to get a column with more verbose survival status of passenger: `survived` and `died`. 432 | 433 | ```scala 434 | val survivedStatus: (Integer) => String = (survived: Integer) => 435 | if (survived == 1) 436 | "survived" 437 | else 438 | "died" 439 | 440 | val survivedStatusUDF = udf(survivedStatus) 441 | 442 | val pdf = passengersDF 443 | .withColumn("Embarkation", embarkedFullNameUDF($"Embarked")) 444 | .drop("Embarked") 445 | .withColumn("SurvivedStatus", survivedStatusUDF($"Survived")) 446 | .cache() 447 | 448 | pdf.select("Name", "Embarkation", "SurvivedStatus").show(5, truncate=false) 449 | ``` 450 | ``` 451 | +---------------------------------------------------+-----------+--------------+ 452 | |Name |Embarkation|SurvivedStatus| 453 | +---------------------------------------------------+-----------+--------------+ 454 | |Braund, Mr. Owen Harris |Southampton|died | 455 | |Cumings, Mrs. John Bradley (Florence Briggs Thayer)|Cherbourg |survived | 456 | |Heikkinen, Miss. 
Laina |Southampton|survived | 457 | |Futrelle, Mrs. Jacques Heath (Lily May Peel) |Southampton|survived | 458 | |Allen, Mr. William Henry |Southampton|died | 459 | +---------------------------------------------------+-----------+--------------+ 460 | only showing top 5 rows 461 | ``` 462 | 463 | **Q-5. Count the number and percentage of survivors and dead passengers.** 464 | 465 | ```scala 466 | import org.apache.spark.sql.functions.count 467 | 468 | val numPassengers = pdf.count() 469 | 470 | val grBySurvived = pdf.groupBy("SurvivedStatus") 471 | .agg(count("PassengerId").alias("count"), 472 | ((count("PassengerId") / numPassengers) * 100).alias("%")) 473 | grBySurvived.show 474 | ``` 475 | ``` 476 | +--------------+-----+-----------------+ 477 | |SurvivedStatus|count| %| 478 | +--------------+-----+-----------------+ 479 | | died| 549|61.61616161616161| 480 | | survived| 342|38.38383838383838| 481 | +--------------+-----+-----------------+ 482 | ``` 483 | 484 | **Q-6.** 485 | - **Plot the distribution of dead and surviving passengers.** 486 | - **Plot the distribution of survivors and dead passengers by class.** 487 | - **Plot the distribution of survivors and dead passengers by gender.** 488 | - **Plot the distribution of survivors and dead passengers by port of embarkation.** 489 | - **Plot the % of survivors by port of embarkation.** 490 | - **Plot the distribution of passenger classes by port of embarkation.** 491 | 492 | ```scala 493 | // Distribution of dead and survived passengers 494 | 495 | CustomPlotlyChart(grBySurvived, 496 | layout="{title: 'Passengers by status', xaxis: {title: 'status'}, yaxis: {title: '%'}}", 497 | dataOptions="{type: 'bar'}", 498 | dataSources="{x: 'SurvivedStatus', y: '%'}") 499 | ``` 500 | 501 | 502 | 503 | 504 | ```scala 505 | // Distribution of the number of survivors and dead passengers by class. 506 | 507 | CustomPlotlyChart(pdf.groupBy("SurvivedStatus", "Pclass").count, 508 | layout="{title: 'Number of passengers by survival status per class', xaxis: {title: 'Pclass'}, barmode: 'group'}", 509 | dataOptions="{type: 'bar', splitBy: 'SurvivedStatus'}", 510 | dataSources="{x: 'Pclass', y: 'count'}") 511 | ``` 512 | 513 | 514 | 515 | 516 | ```scala 517 | // Distribution of survivors and dead passengers by gender. 518 | 519 | CustomPlotlyChart(pdf.groupBy("SurvivedStatus", "Sex").count, 520 | layout="{title: 'Number of passengers by status by gender', xaxis: {title: 'Gender'}, barmode: 'group'}", 521 | dataOptions="{type: 'bar', splitBy: 'SurvivedStatus'}", 522 | dataSources="{x: 'Sex', y: 'count'}") 523 | ``` 524 | 525 | 526 | 527 | 528 | ```scala 529 | // Distribution of survivors and dead passengers by port of embarkation. 530 | 531 | CustomPlotlyChart(pdf.groupBy("Embarkation", "SurvivedStatus").count, 532 | layout="{barmode: 'group'}", 533 | dataOptions="{type: 'bar', splitBy: 'SurvivedStatus'}", 534 | dataSources="{x: 'Embarkation', y: 'count'}") 535 | ``` 536 | 537 | 538 | 539 | 540 | ```scala 541 | // % of survivors by port of embarkation. 542 | 543 | CustomPlotlyChart(pdf.groupBy("Embarkation").agg((sum("Survived") / count("Survived") * 100).alias("SurvivalRate")), 544 | layout="{title: '% of survival per embarkation'}", 545 | dataOptions="{type: 'bar'}", 546 | dataSources="{x: 'Embarkation', y: 'SurvivalRate'}") 547 | ``` 548 | 549 | 550 | 551 | 552 | ```scala 553 | // Distribution of passenger classes by port of embarkation. 
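// Note (added comment): barmode 'stack' together with splitBy 'Pclass' stacks the per-class
// counts inside each Embarkation bar, so the full bar height equals the total number of
// passengers embarked at that port.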
554 | 555 | CustomPlotlyChart(pdf.groupBy("Embarkation", "Pclass").count, 556 | layout="{barmode: 'stack', title: 'Pclass distribution by Embarkation'}", 557 | dataOptions="{type: 'bar', splitBy: 'Pclass'}", 558 | dataSources="{x: 'Embarkation', y: 'count'}") 559 | ``` 560 | 561 | 562 | 563 | How to get the % of survived passengers by port of embarkation in this case? 564 | 565 | ```scala 566 | val byEmbark = pdf.groupBy("Embarkation").agg(count("PassengerId").alias("totalCount")) 567 | val byEmbarkByClass = pdf.groupBy("Embarkation", "Pclass").count 568 | 569 | val embarkClassDistr = byEmbarkByClass.join(byEmbark, usingColumn="Embarkation") 570 | .select($"Embarkation", 571 | $"Pclass", 572 | ($"count" / $"totalCount" * 100).alias("%")) 573 | 574 | CustomPlotlyChart(embarkClassDistr, 575 | layout="{barmode: 'stack', title: 'Pclass distribution by Embarkation', yaxis: {title: '%'}}", 576 | dataOptions="{type: 'bar', splitBy: 'Pclass'}", 577 | dataSources="{x: 'Embarkation', y: '%'}") 578 | ``` 579 | 580 | 581 | 582 | 583 | ### Histograms and Box Plots 584 | 585 | **Q-7 Obtain age distributions by passengers survival status.** 586 | 587 | ```scala 588 | CustomPlotlyChart(pdf, 589 | layout="{title: 'Age distribution by status', xaxis: {title: 'Age'}, barmode: 'overlay'}", 590 | dataOptions="{type: 'histogram', opacity: 0.6, splitBy: 'SurvivedStatus'}", 591 | dataSources="{x: 'Age'}") 592 | ``` 593 | 594 | 595 | 596 | ```scala 597 | CustomPlotlyChart(pdf, 598 | layout="{yaxis: {title: 'Age'}}", 599 | dataOptions="{type: 'box', splitBy: 'SurvivedStatus'}", 600 | dataSources="{y: 'Age'}") 601 | ``` 602 | 603 | 604 | 605 | **Q-8. Plot box plots of age distributions by passengers classes.** 606 | 607 | ```scala 608 | CustomPlotlyChart(pdf, 609 | layout="{yaxis: {title: 'Age'}}", 610 | dataOptions="{type: 'box', splitBy: 'Pclass'}", 611 | dataSources="{y: 'Age'}") 612 | ``` 613 | 614 | 615 | 616 | 617 | This scatter plots show the dependences of the chances of survival from the cabin class, age and gender: 618 | 619 | ```scala 620 | val survByClassAndAge = List("male", "female").map{ 621 | gender => 622 | CustomPlotlyChart(pdf.filter($"Sex" === gender), 623 | layout=s"""{ 624 | title: 'Survival by class and age, $gender.', 625 | yaxis: {title: 'class'}, 626 | xaxis: {title: 'age'} 627 | }""", 628 | dataOptions="""{ 629 | splitBy: 'SurvivedStatus', 630 | byTrace: { 631 | 'survived': { 632 | mode: 'markers', 633 | marker: { 634 | size: 20, 635 | opacity: 0.3, 636 | color: 'orange' 637 | } 638 | }, 639 | 'died': { 640 | mode: 'markers', 641 | marker: { 642 | size: 15, 643 | opacity: 0.9, 644 | color: 'rgba(55, 128, 191, 0.6)' 645 | } 646 | } 647 | } 648 | }""", 649 | dataSources = "{x: 'Age', y: 'Pclass'}" 650 | ) 651 | } 652 | 653 | survByClassAndAge(0) 654 | ``` 655 | 656 | 657 | 658 | 659 | ```scala 660 | survByClassAndAge(1) 661 | ``` 662 | 663 | 664 | 665 | 666 | ### More practice with UDF and Box Plots 667 | 668 | The titles of passengers could be useful source of information. Let's explore that. 669 | 670 | **Q-9. Plot box plots of age distributions by title.** 671 | 672 | ```scala 673 | pdf.select("Name").show(3, truncate=false) 674 | ``` 675 | ``` 676 | +---------------------------------------------------+ 677 | |Name | 678 | +---------------------------------------------------+ 679 | |Braund, Mr. Owen Harris | 680 | |Cumings, Mrs. John Bradley (Florence Briggs Thayer)| 681 | |Heikkinen, Miss. 
Laina | 682 | +---------------------------------------------------+ 683 | only showing top 3 rows 684 | ``` 685 | 686 | ```scala 687 | val parseTitle: String => String = (name: String) => 688 | name.split(", ")(1).split("\\.")(0) 689 | 690 | val parseTitleUDF = udf(parseTitle) 691 | 692 | CustomPlotlyChart(pdf.withColumn("Title", parseTitleUDF($"Name")), 693 | layout="{yaxis: {title: 'Age'}}", 694 | dataOptions="{type: 'box', splitBy: 'Title'}", 695 | dataSources="{y: 'Age'}") 696 | ``` 697 | 698 | 699 | 700 | 701 | Often it is good practice to group the values of the categorical feature, especially when there are rare individual feature values such as `Don`, `Lady`, `Capt` in our case. 702 | 703 | **Q-10. Write UDF to group all the titles into five groups according to the following table:** 704 | 705 | | Group | Title | 706 | | :------------:|:------------:| 707 | | Aristocratic | Capt, Col, Don, Dr, Jonkheer, Lady, Major, Rev, Sir, Countess | 708 | | Mrs | Mrs, Ms | 709 | | Miss | Miss, Mlle, Mme | 710 | | Mr | Mr | 711 | | Master | Master | 712 | 713 | ** Create new column called 'TitleGroup' and plot box plots of age distributions by title group.** 714 | 715 | ```scala 716 | val titleGroup: String => String = (title: String) => { 717 | val aristocratic = Set("Capt", "Col", "Don", "Dr", "Jonkheer", "Lady", "Major", "Rev", "Sir", "the Countess") 718 | val mrs = Set("Mrs", "Ms") 719 | val miss = Set("Miss", "Mlle", "Mme") 720 | if (aristocratic.contains(title)) 721 | "Aristocratic" 722 | else if (mrs.contains(title)) 723 | "Mrs" 724 | else if (miss.contains(title)) 725 | "Miss" 726 | else 727 | title 728 | } 729 | 730 | // given column with passenger name obtain column with passenger title group. 731 | val parseTitleGroupUDF = udf(parseTitle andThen titleGroup) 732 | ``` 733 | 734 | ```scala 735 | val withTitleDF = pdf.withColumn("TitleGroup", parseTitleGroupUDF($"Name")) 736 | 737 | CustomPlotlyChart(withTitleDF, 738 | layout="{yaxis: {title: 'Age'}}", 739 | dataOptions="{type: 'box', splitBy: 'TitleGroup'}", 740 | dataSources="{y: 'Age'}") 741 | ``` 742 | 743 | 744 | 745 | 746 | 747 | **Q-11 Plot the distribution of the % of survivors by title group.** 748 | 749 | ```scala 750 | val byTitleGr = withTitleDF 751 | .groupBy("TitleGroup") 752 | .agg((sum("Survived") / count("Survived") * 100).alias("%")) 753 | 754 | CustomPlotlyChart(byTitleGr, 755 | layout="{title: '% of survival by title group'}", 756 | dataOptions="{type: 'bar'}", 757 | dataSources="{x: 'TitleGroup', y: '%'}") 758 | ``` 759 | 760 | 761 | 762 | 763 | ### Handling missing values 764 | 765 | ```scala 766 | import org.apache.spark.sql.functions.isnull 767 | 768 | 100.0 * pdf.filter(isnull($"Age")).count / pdf.count 769 | ``` 770 | ``` 771 | res209: Double = 19.865319865319865 772 | 19.865319865319865 773 | ``` 774 | 775 | ```scala 776 | 100.0 * pdf.filter(isnull($"Cabin")).count / pdf.count 777 | ``` 778 | ``` 779 | res237: Double = 77.10437710437711 780 | 77.10437710437711 781 | ``` 782 | 783 | ```scala 784 | val cabinStatus: (String) => String = (cabin: String) => 785 | if (cabin == null) 786 | "noname" 787 | else 788 | "hasNumber" 789 | 790 | val cabinStatusUDF = udf(cabinStatus) 791 | ``` 792 | 793 | ```scala 794 | val withCabinStatusDF = pdf.withColumn("CabinStatus", cabinStatusUDF($"Cabin")) 795 | ``` 796 | 797 | ```scala 798 | CustomPlotlyChart(withCabinStatusDF.groupBy("CabinStatus", "SurvivedStatus").count, 799 | layout="{title: 'Number of passengers by survival status by cabin type', xaxis: {title: 'Cabin'}}", 800 
| dataOptions="{type: 'bar', splitBy: 'SurvivedStatus'}", 801 | dataSources="{x: 'CabinStatus', y: 'count'}") 802 | ``` 803 | 804 | 805 | 806 | 807 | ### On your own 808 | 809 | Explore family relationships variables (SibSp and Parch). 810 | How does the number of siblings/spouses aboard affect the chances of survival? 811 | How does the number of parents/children aboard affect the chances of survival? 812 | 813 | Invent a new variable called `Family` to represent total number of relatives aboard and explore how does it affect hte chances of survival. 814 | -------------------------------------------------------------------------------- /labs/TitanicSurvivalExploration/TitanicSurvivalExploration.snb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata" : { 3 | "name" : "TitanicSurvivalExploration", 4 | "user_save_timestamp" : "1970-01-01T03:00:00.000Z", 5 | "auto_save_timestamp" : "1970-01-01T03:00:00.000Z", 6 | "language_info" : { 7 | "name" : "scala", 8 | "file_extension" : "scala", 9 | "codemirror_mode" : "text/x-scala" 10 | }, 11 | "trusted" : true, 12 | "customLocalRepo" : null, 13 | "customRepos" : null, 14 | "customDeps" : null, 15 | "customImports" : null, 16 | "customArgs" : null, 17 | "customSparkConf" : { 18 | "spark.app.name" : "ScalaIO Getting Started", 19 | "spark.master" : "local[4]", 20 | "spark.executor.memory" : "2G" 21 | } 22 | }, 23 | "cells" : [ { 24 | "metadata" : { 25 | "id" : "0BA359D1BEC942DB8031E9858B9DD1AA" 26 | }, 27 | "cell_type" : "markdown", 28 | "source" : "# Titanic Survival Exploration" 29 | }, { 30 | "metadata" : { 31 | "id" : "A7F1BB711F984380B18A2940C47EF8E5" 32 | }, 33 | "cell_type" : "markdown", 34 | "source" : "
\n \n \n
" 35 | }, { 36 | "metadata" : { 37 | "id" : "E4266CA3E6134B5E8A554817A707C970" 38 | }, 39 | "cell_type" : "markdown", 40 | "source" : "## Spark quick review" 41 | }, { 42 | "metadata" : { 43 | "id" : "567C1BC18DE446EE98E0CE36009E9831" 44 | }, 45 | "cell_type" : "markdown", 46 | "source" : "Spark provides convenient programming abstraction and parallel runtime to hide distributed computations complexities.\n\n\n\n\n\nIn this first lab we will focus on DataFrames and SQL.\nIn second lab we will use Spark MLlib for building machine learning pipelines." 47 | }, { 48 | "metadata" : { 49 | "id" : "C8DC749EF8F4478DB39C467D73068FF6" 50 | }, 51 | "cell_type" : "markdown", 52 | "source" : "### Spark Cluster" 53 | }, { 54 | "metadata" : { 55 | "id" : "4AD326A983274CEA8B87BE4C98D3CDAD" 56 | }, 57 | "cell_type" : "markdown", 58 | "source" : "
\n \n
" 59 | }, { 60 | "metadata" : { 61 | "id" : "15413540BBFD4F388A44153BB4CF069B" 62 | }, 63 | "cell_type" : "markdown", 64 | "source" : "Main entry point for Spark functionality is a `SparkContex`. `SparkContext` tells Spark how to access a cluster.\n`Spark Notebook` automatically creates `SparkContext`." 65 | }, { 66 | "metadata" : { 67 | "id" : "C825C743E52D43C7BCAEA3F3891995E7" 68 | }, 69 | "cell_type" : "markdown", 70 | "source" : "Examples of `master` parameter configuration for `SparkContext`:\n\n| Master Parameter | Description |\n| ----------------- |----------------------------------------:|\n| local[K] | run Spark locally with K worker threads |\n| spark://HOST:PORT | connect to Spark Standalone cluster |\n| mesos://HOST:PORT | connect to Mesos cluster |" 71 | }, { 72 | "metadata" : { 73 | "trusted" : true, 74 | "input_collapsed" : false, 75 | "collapsed" : false, 76 | "id" : "ABD9BF935E294C88805CD1AEC1E96ADA" 77 | }, 78 | "cell_type" : "code", 79 | "source" : "sparkContext", 80 | "outputs" : [ ] 81 | }, { 82 | "metadata" : { 83 | "trusted" : true, 84 | "input_collapsed" : false, 85 | "collapsed" : true, 86 | "id" : "0A2F5803720C45CA9A889FD74DC877DA" 87 | }, 88 | "cell_type" : "markdown", 89 | "source" : "## Spark SQL and DataFrames" 90 | }, { 91 | "metadata" : { 92 | "id" : "1D4AF5C1BA434D9E829C556E8C2DEE9B" 93 | }, 94 | "cell_type" : "markdown", 95 | "source" : "* http://spark.apache.org/docs/latest/sql-programming-guide.html\n* http://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.sql.Dataset" 96 | }, { 97 | "metadata" : { 98 | "id" : "824436EF882B4EACBAC707DF156DD69F" 99 | }, 100 | "cell_type" : "markdown", 101 | "source" : "A DataFrame is a distributed collection of data organized into named columns.\nIt is conceptually equivalent to a table in a relational database or a data frame in R/Python" 102 | }, { 103 | "metadata" : { 104 | "trusted" : true, 105 | "input_collapsed" : false, 106 | "collapsed" : true, 107 | "id" : "F3CCD76E01224741BFFC12D770595288" 108 | }, 109 | "cell_type" : "markdown", 110 | "source" : "The entry point to programming Spark with SQL and DataFrame API in Spark 2.0 is the new `SparkSession` class:" 111 | }, { 112 | "metadata" : { 113 | "trusted" : true, 114 | "input_collapsed" : false, 115 | "collapsed" : false, 116 | "id" : "2E2F6A73B390466997E51CE0BADA4ECD" 117 | }, 118 | "cell_type" : "code", 119 | "source" : "sparkSession", 120 | "outputs" : [ ] 121 | }, { 122 | "metadata" : { 123 | "trusted" : true, 124 | "input_collapsed" : false, 125 | "collapsed" : false, 126 | "id" : "F0D3F939A3B44F5893ECEC40FA6D1B84" 127 | }, 128 | "cell_type" : "code", 129 | "source" : "val spark = sparkSession", 130 | "outputs" : [ ] 131 | }, { 132 | "metadata" : { 133 | "trusted" : true, 134 | "input_collapsed" : false, 135 | "collapsed" : true, 136 | "id" : "07E8DBC462CA4A7282B9A36CED349356" 137 | }, 138 | "cell_type" : "code", 139 | "source" : "// This import is needed to use the $-notation\nimport spark.implicits._", 140 | "outputs" : [ ] 141 | }, { 142 | "metadata" : { 143 | "id" : "2CDC4126ACA04B7AA763E6D33DD9625A" 144 | }, 145 | "cell_type" : "markdown", 146 | "source" : "With a SparkSession you can create DataFrames from an existing RDD, from files in HDFS or any other storage system, or from Scala collections." 
147 | }, { 148 | "metadata" : { 149 | "trusted" : true, 150 | "input_collapsed" : false, 151 | "collapsed" : false, 152 | "id" : "A1BDE4957E344C76963EEF5F0FA057D7" 153 | }, 154 | "cell_type" : "code", 155 | "source" : "Seq((\"Alice\", 20, \"female\"), (\"Bob\", 31, \"male\"), (\"Eva\", 16, \"female\")).toDF(\"name\", \"age\", \"gender\").show()", 156 | "outputs" : [ ] 157 | }, { 158 | "metadata" : { 159 | "trusted" : true, 160 | "input_collapsed" : false, 161 | "collapsed" : false, 162 | "presentation" : { 163 | "tabs_state" : "{\n \"tab_id\": \"#tab539102187-0\"\n}", 164 | "pivot_chart_state" : "{\n \"hiddenAttributes\": [],\n \"menuLimit\": 200,\n \"cols\": [],\n \"rows\": [],\n \"vals\": [],\n \"exclusions\": {},\n \"inclusions\": {},\n \"unusedAttrsVertical\": 85,\n \"autoSortUnusedAttrs\": false,\n \"inclusionsInfo\": {},\n \"aggregatorName\": \"Count\",\n \"rendererName\": \"Table\"\n}" 165 | }, 166 | "id" : "F953AF369A1D49999B1F547696C898F9" 167 | }, 168 | "cell_type" : "code", 169 | "source" : "case class Person(name: String, age: Int, gender: String)", 170 | "outputs" : [ ] 171 | }, { 172 | "metadata" : { 173 | "trusted" : true, 174 | "input_collapsed" : false, 175 | "collapsed" : false, 176 | "id" : "4D8CE8E8E05340DCB6E4087D9CC583C2" 177 | }, 178 | "cell_type" : "code", 179 | "source" : "val persons = Seq(Person(\"Alice\", 20, \"female\"), Person(\"Bob\", 31, \"male\"), Person(\"Eva\", 16, \"female\")).toDF()\npersons.show()", 180 | "outputs" : [ ] 181 | }, { 182 | "metadata" : { 183 | "trusted" : true, 184 | "input_collapsed" : false, 185 | "collapsed" : false, 186 | "id" : "F4837DEF00DB49518CA0B3E9A79A4B2B" 187 | }, 188 | "cell_type" : "code", 189 | "source" : "persons.select(\"name\", \"age\").show()", 190 | "outputs" : [ ] 191 | }, { 192 | "metadata" : { 193 | "trusted" : true, 194 | "input_collapsed" : false, 195 | "collapsed" : false, 196 | "id" : "12C92D4BA4FE4C35954609339303CDBA" 197 | }, 198 | "cell_type" : "code", 199 | "source" : "val young = persons.filter($\"age\" < 21)\nyoung.show()", 200 | "outputs" : [ ] 201 | }, { 202 | "metadata" : { 203 | "trusted" : true, 204 | "input_collapsed" : false, 205 | "collapsed" : false, 206 | "id" : "153803FA749D4E768373D0FCEABEB951" 207 | }, 208 | "cell_type" : "code", 209 | "source" : "young.select(young(\"name\"), ($\"age\" + 1).alias(\"incremented age\"))", 210 | "outputs" : [ ] 211 | }, { 212 | "metadata" : { 213 | "trusted" : true, 214 | "input_collapsed" : false, 215 | "collapsed" : false, 216 | "id" : "B3D06BA78056468F9359DC1262B02FAA" 217 | }, 218 | "cell_type" : "code", 219 | "source" : "persons.groupBy(\"gender\").count.show", 220 | "outputs" : [ ] 221 | }, { 222 | "metadata" : { 223 | "id" : "E4C4B34A08814E478FB00FB3788F0E36" 224 | }, 225 | "cell_type" : "markdown", 226 | "source" : "## Titanic Dataset" 227 | }, { 228 | "metadata" : { 229 | "id" : "7BF47E9F62104B648B12C00F0B139D3F" 230 | }, 231 | "cell_type" : "markdown", 232 | "source" : "More on this dataset you can read [here](https://www.kaggle.com/c/titanic/data)." 233 | }, { 234 | "metadata" : { 235 | "id" : "ABE465E9052F4B84A8369D2872C159D3" 236 | }, 237 | "cell_type" : "markdown", 238 | "source" : "
\n \n
\n
\n By Willy Stöwer, died on 31st May 1931 - Magazine Die Gartenlaube, en:Die Gartenlaube and de:Die Gartenlaube, Public Domain, Link\n
" 239 | }, { 240 | "metadata" : { 241 | "id" : "6C9768C541FA402583A3991F6F64F981" 242 | }, 243 | "cell_type" : "markdown", 244 | "source" : "Out of the box, DataFrame supports reading data from the most popular formats, including JSON files, CSV files, Parquet files, Hive tables." 245 | }, { 246 | "metadata" : { 247 | "trusted" : true, 248 | "input_collapsed" : false, 249 | "collapsed" : false, 250 | "id" : "39C8B97E811A4F48810AEF1D886F26BF" 251 | }, 252 | "cell_type" : "code", 253 | "source" : "val passengersDF = spark.read\n .option(\"header\", \"true\")\n .option(\"inferSchema\", \"true\")\n .csv(\"notebooks/spark-notebook-ml-labs/labs/TitanicSurvivalExploration/data/titanic_train.csv\") ", 254 | "outputs" : [ ] 255 | }, { 256 | "metadata" : { 257 | "trusted" : true, 258 | "input_collapsed" : false, 259 | "collapsed" : false, 260 | "id" : "1FCA01D29C584C3C8CD81564234B66B8" 261 | }, 262 | "cell_type" : "code", 263 | "source" : "passengersDF.printSchema", 264 | "outputs" : [ ] 265 | }, { 266 | "metadata" : { 267 | "id" : "8BFFFBF5773B431CB9B48760CB5D2E4C" 268 | }, 269 | "cell_type" : "markdown", 270 | "source" : "Look at 5 records in passengers DataFrame:" 271 | }, { 272 | "metadata" : { 273 | "trusted" : true, 274 | "input_collapsed" : false, 275 | "collapsed" : false, 276 | "presentation" : { 277 | "tabs_state" : "{\n \"tab_id\": \"#tab1709273702-0\"\n}", 278 | "pivot_chart_state" : "{\n \"hiddenAttributes\": [],\n \"menuLimit\": 200,\n \"cols\": [],\n \"rows\": [],\n \"vals\": [],\n \"exclusions\": {},\n \"inclusions\": {},\n \"unusedAttrsVertical\": 85,\n \"autoSortUnusedAttrs\": false,\n \"inclusionsInfo\": {},\n \"aggregatorName\": \"Count\",\n \"rendererName\": \"Table\"\n}" 279 | }, 280 | "id" : "0FBC0EB917884F5092DB5870A778C20D" 281 | }, 282 | "cell_type" : "code", 283 | "source" : "passengersDF.limit(5)", 284 | "outputs" : [ ] 285 | }, { 286 | "metadata" : { 287 | "id" : "C7F79F4F8FC44F9FBC0D0007C0CA9D02" 288 | }, 289 | "cell_type" : "markdown", 290 | "source" : "The sql function on a SparkSession enables applications to run SQL queries programmatically and returns the result as a DataFrame.\nTo do this we need to register the DataFrame as a SQL temporary view" 291 | }, { 292 | "metadata" : { 293 | "trusted" : true, 294 | "input_collapsed" : false, 295 | "collapsed" : false, 296 | "id" : "CE6823262FC64FDB9F3F0A561F1A3EA6" 297 | }, 298 | "cell_type" : "code", 299 | "source" : "passengersDF.createOrReplaceTempView(\"passengers\")", 300 | "outputs" : [ ] 301 | }, { 302 | "metadata" : { 303 | "trusted" : true, 304 | "input_collapsed" : false, 305 | "collapsed" : false, 306 | "id" : "1EBC8EA3E63D47418D87F6C9D643344A" 307 | }, 308 | "cell_type" : "code", 309 | "source" : "spark.sql(\"\"\"\n SELECT Name, Age, Pclass, Survived FROM passengers\n WHERE Age < 30\n\"\"\").show(3, truncate=false)", 310 | "outputs" : [ ] 311 | }, { 312 | "metadata" : { 313 | "id" : "4DD3EA91E8F647C285CBF053308ECE28" 314 | }, 315 | "cell_type" : "markdown", 316 | "source" : "### Transformations and Actions" 317 | }, { 318 | "metadata" : { 319 | "id" : "1AD3002411144A64B75E212B903D2D2A" 320 | }, 321 | "cell_type" : "markdown", 322 | "source" : "Spark operations on DataFrames are one of two types. \n* Transformations are lazily evaluated and create new Dataframes from existing ones. 
\n* Actions trigger computation and return results or write DataFrames to storage.\n\n*Computations are only triggered when an action is invoked.*\n\nHere are some examples.\n\n\n| Transformations | Actions |\n| :-----------------: |:------------:|\n| select | count |\n| filter | show |\n| groupBy | save |\n| orderBy | **collect** |\n| sample | take |\n| limit | reduce |\n| withColumn ||\n| join ||" 323 | }, { 324 | "metadata" : { 325 | "trusted" : true, 326 | "input_collapsed" : false, 327 | "collapsed" : true, 328 | "id" : "66265A8E3F3C4682B120E4491E19FE80" 329 | }, 330 | "cell_type" : "markdown", 331 | "source" : "**Q-1. How many different classes of passengers were aboard the Titanic?**" 332 | }, { 333 | "metadata" : { 334 | "trusted" : true, 335 | "input_collapsed" : false, 336 | "collapsed" : false, 337 | "id" : "E06494D18F2A41A988ED237FEE686936" 338 | }, 339 | "cell_type" : "code", 340 | "source" : "val pclasses = passengersDF.select(\"Pclass\").distinct\n\npclasses.count", 341 | "outputs" : [ ] 342 | }, { 343 | "metadata" : { 344 | "trusted" : true, 345 | "input_collapsed" : false, 346 | "collapsed" : false, 347 | "id" : "2D1113AEF17F420C9FE869661E10A7A7" 348 | }, 349 | "cell_type" : "code", 350 | "source" : "pclasses.show", 351 | "outputs" : [ ] 352 | }, { 353 | "metadata" : { 354 | "trusted" : true, 355 | "input_collapsed" : false, 356 | "collapsed" : false, 357 | "id" : "E689CD08EF394D178E415C5F1EB5D164" 358 | }, 359 | "cell_type" : "code", 360 | "source" : "spark.sql(\"\"\"\n SELECT DISTINCT Pclass from passengers\n\"\"\").count", 361 | "outputs" : [ ] 362 | }, { 363 | "metadata" : { 364 | "id" : "ABB5DAB6E64B49F589EC1A3E0EA25756" 365 | }, 366 | "cell_type" : "markdown", 367 | "source" : "**Q-2. How many passengers were in each class?**" 368 | }, { 369 | "metadata" : { 370 | "trusted" : true, 371 | "input_collapsed" : false, 372 | "collapsed" : false, 373 | "id" : "F83908E659D148A29C2B08C20D3F20D3" 374 | }, 375 | "cell_type" : "code", 376 | "source" : "val numByClass = passengersDF.groupBy(\"Pclass\").count\nnumByClass.show", 377 | "outputs" : [ ] 378 | }, { 379 | "metadata" : { 380 | "trusted" : true, 381 | "input_collapsed" : false, 382 | "collapsed" : false, 383 | "id" : "2DE1CB257F9B42979B67818B65835825" 384 | }, 385 | "cell_type" : "code", 386 | "source" : "spark.sql(\"\"\"\n SELECT Pclass, count(PassengerID) as class_count FROM passengers\n GROUP BY Pclass\n ORDER BY class_count DESC\n\"\"\").show", 387 | "outputs" : [ ] 388 | }, { 389 | "metadata" : { 390 | "trusted" : true, 391 | "input_collapsed" : false, 392 | "collapsed" : false, 393 | "presentation" : { 394 | "tabs_state" : "{\n \"tab_id\": \"#tab1686677314-0\"\n}", 395 | "pivot_chart_state" : "{\n \"hiddenAttributes\": [],\n \"menuLimit\": 200,\n \"cols\": [],\n \"rows\": [],\n \"vals\": [],\n \"exclusions\": {},\n \"inclusions\": {},\n \"unusedAttrsVertical\": 85,\n \"autoSortUnusedAttrs\": false,\n \"inclusionsInfo\": {},\n \"aggregatorName\": \"Count\",\n \"rendererName\": \"Table\"\n}" 396 | }, 397 | "id" : "241BF06A77504EE5835B7706DC3D6A80" 398 | }, 399 | "cell_type" : "code", 400 | "source" : "numByClass.collect", 401 | "outputs" : [ ] 402 | }, { 403 | "metadata" : { 404 | "trusted" : true, 405 | "input_collapsed" : false, 406 | "collapsed" : false, 407 | "id" : "539395181C18426A8A92BDFE30B38FF5" 408 | }, 409 | "cell_type" : "code", 410 | "source" : "CustomPlotlyChart(numByClass,\n layout=\"{title: 'Passengers per class', xaxis: {title: 'Pclass'}}\",\n dataOptions=\"{type: 'bar'}\",\n dataSources=\"{x: 
'Pclass', y: 'count'}\")", 411 | "outputs" : [ ] 412 | }, { 413 | "metadata" : { 414 | "trusted" : true, 415 | "input_collapsed" : false, 416 | "collapsed" : true, 417 | "id" : "B22F8FEBB82E49348B5788109E706B07" 418 | }, 419 | "cell_type" : "markdown", 420 | "source" : "**Q-3. How many women and men were in each class?**" 421 | }, { 422 | "metadata" : { 423 | "trusted" : true, 424 | "input_collapsed" : false, 425 | "collapsed" : false, 426 | "id" : "D9767491DA4147ED883CC035089C8C76" 427 | }, 428 | "cell_type" : "code", 429 | "source" : "val grByGenderAndClass = passengersDF.groupBy(\"Pclass\", \"Sex\").count", 430 | "outputs" : [ ] 431 | }, { 432 | "metadata" : { 433 | "trusted" : true, 434 | "input_collapsed" : false, 435 | "collapsed" : false, 436 | "id" : "D6A740F6875146578887A67D62CABE51" 437 | }, 438 | "cell_type" : "code", 439 | "source" : "grByGenderAndClass", 440 | "outputs" : [ ] 441 | }, { 442 | "metadata" : { 443 | "trusted" : true, 444 | "input_collapsed" : false, 445 | "collapsed" : false, 446 | "id" : "B3E3EC5E5ED8444883D27AE4050BAB4D" 447 | }, 448 | "cell_type" : "code", 449 | "source" : "CustomPlotlyChart(grByGenderAndClass,\n layout=\"{title: 'Passengers per class', xaxis: {title: 'Pclass'}, barmode: 'group'}\",\n dataOptions=\"{type: 'bar', splitBy: 'Sex'}\",\n dataSources=\"{x: 'Pclass', y: 'count'}\")", 450 | "outputs" : [ ] 451 | }, { 452 | "metadata" : { 453 | "id" : "8852C6D27A7548678558B726FA4EC0CA" 454 | }, 455 | "cell_type" : "markdown", 456 | "source" : "### DataFrame Functions and UDF" 457 | }, { 458 | "metadata" : { 459 | "id" : "0C12589398E54FDB9CB5303E2ACF6600" 460 | }, 461 | "cell_type" : "markdown", 462 | "source" : "http://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.sql.functions$" 463 | }, { 464 | "metadata" : { 465 | "trusted" : true, 466 | "input_collapsed" : false, 467 | "collapsed" : false, 468 | "id" : "B4A586C3124641D08FD92E291914325C" 469 | }, 470 | "cell_type" : "code", 471 | "source" : "import org.apache.spark.sql.functions.{mean, min, max}\n\npassengersDF.select(mean(\"Age\").alias(\"Average Age\"), min(\"Age\"), max(\"Age\")).show()", 472 | "outputs" : [ ] 473 | }, { 474 | "metadata" : { 475 | "trusted" : true, 476 | "input_collapsed" : false, 477 | "collapsed" : false, 478 | "id" : "36483A5F3D3148FA867F7C00FCBDFE5E" 479 | }, 480 | "cell_type" : "code", 481 | "source" : "import org.apache.spark.sql.functions.count\n\npassengersDF.groupBy(\"Pclass\")\n .agg(count(\"Pclass\").alias(\"class_count\"))\n .orderBy(-$\"class_count\")\n .show", 482 | "outputs" : [ ] 483 | }, { 484 | "metadata" : { 485 | "id" : "857AD3D31D3447388F2EF03F2150EF7E" 486 | }, 487 | "cell_type" : "markdown", 488 | "source" : "For more specific tasks one can use User Defined Functions.\n\nLet's say we want to get a column with full names of port of embarkation." 489 | }, { 490 | "metadata" : { 491 | "trusted" : true, 492 | "input_collapsed" : false, 493 | "collapsed" : false, 494 | "id" : "51DB33FCFB394E358EDE6C6C7D4AEF8B" 495 | }, 496 | "cell_type" : "code", 497 | "source" : "passengersDF.select(\"Embarked\").distinct.show", 498 | "outputs" : [ ] 499 | }, { 500 | "metadata" : { 501 | "id" : "1ACADA9161D94D6393E9639507EF6B77" 502 | }, 503 | "cell_type" : "markdown", 504 | "source" : "From dataset description we know that C = Cherbourg; Q = Queenstown; S = Southampton." 
505 | }, { 506 | "metadata" : { 507 | "trusted" : true, 508 | "input_collapsed" : false, 509 | "collapsed" : false, 510 | "id" : "B2C67EAA4DAD4D048297CCEF98601BD9" 511 | }, 512 | "cell_type" : "code", 513 | "source" : "import org.apache.spark.sql.functions.udf\n\nval embarkedFullName: (String) => String = (embarked: String) =>\n if (embarked == \"Q\")\n \"Queenstown\"\n else if (embarked == \"C\")\n \"Cherbourg\"\n else\n \"Southampton\"\n\n\nval embarkedFullNameUDF = udf(embarkedFullName)", 514 | "outputs" : [ ] 515 | }, { 516 | "metadata" : { 517 | "id" : "CBA7286CBB9B40C08E961A5AFFC562AA" 518 | }, 519 | "cell_type" : "markdown", 520 | "source" : "Also we want to get a column with more verbose survival status of passenger: `survived` and `died`." 521 | }, { 522 | "metadata" : { 523 | "trusted" : true, 524 | "input_collapsed" : false, 525 | "collapsed" : false, 526 | "id" : "E7158FE3637440D29AA7C0611532686D" 527 | }, 528 | "cell_type" : "code", 529 | "source" : "val survivedStatus: (Integer) => String = (survived: Integer) =>\n if (survived == 1)\n \"survived\"\n else\n \"died\"\n\nval survivedStatusUDF = udf(survivedStatus)", 530 | "outputs" : [ ] 531 | }, { 532 | "metadata" : { 533 | "trusted" : true, 534 | "input_collapsed" : false, 535 | "collapsed" : false, 536 | "id" : "6EBB943DEAB247D084A70AD43DAAD151" 537 | }, 538 | "cell_type" : "code", 539 | "source" : "val pdf = passengersDF\n .withColumn(\"Embarkation\", embarkedFullNameUDF($\"Embarked\"))\n .drop(\"Embarked\")\n .withColumn(\"SurvivedStatus\", survivedStatusUDF($\"Survived\"))\n .cache()", 540 | "outputs" : [ ] 541 | }, { 542 | "metadata" : { 543 | "trusted" : true, 544 | "input_collapsed" : false, 545 | "collapsed" : false, 546 | "id" : "9111446574A142638399B4B2FDBFE2E0" 547 | }, 548 | "cell_type" : "code", 549 | "source" : "pdf.select(\"Name\", \"Embarkation\", \"SurvivedStatus\").limit(5)", 550 | "outputs" : [ ] 551 | }, { 552 | "metadata" : { 553 | "id" : "7478E5D34328498FAFAFCE7587733B8B" 554 | }, 555 | "cell_type" : "markdown", 556 | "source" : "### Practice session" 557 | }, { 558 | "metadata" : { 559 | "trusted" : true, 560 | "input_collapsed" : false, 561 | "collapsed" : false, 562 | "id" : "FE0F6FE181A14D70B5BC90E69DC471BE" 563 | }, 564 | "cell_type" : "markdown", 565 | "source" : "**Q-5. 
Count the number and percentage of survivors and dead passengers.**" 566 | }, { 567 | "metadata" : { 568 | "trusted" : true, 569 | "input_collapsed" : false, 570 | "collapsed" : false, 571 | "id" : "D8C3FC886C224BA1B40FC6B96E221410" 572 | }, 573 | "cell_type" : "code", 574 | "source" : "import org.apache.spark.sql.functions.count\n\nval numPassengers = pdf.count()\n\nval grBySurvived = pdf.groupBy(\"SurvivedStatus\")\n .agg(count(\"PassengerId\").alias(\"count\"), \n ((count(\"PassengerId\") / numPassengers) * 100).alias(\"%\"))\ngrBySurvived.show", 575 | "outputs" : [ ] 576 | }, { 577 | "metadata" : { 578 | "id" : "F2E89C82208D4CE1BD756D6147752C26" 579 | }, 580 | "cell_type" : "markdown", 581 | "source" : "**Q-6.** \n- **Plot the distribution of dead and surviving passengers.**\n- **Plot the distribution of survivors and dead passengers by class.**\n- **Plot the distribution of survivors and dead passengers by gender.**\n- **Plot the distribution of survivors and dead passengers by port of embarkation.**\n- **Plot the % of survivors by port of embarkation.**\n- **Plot the distribution of passenger classes by port of embarkation.**" 582 | }, { 583 | "metadata" : { 584 | "trusted" : true, 585 | "input_collapsed" : false, 586 | "collapsed" : false, 587 | "id" : "DCFE583EBC614827B2192913336A270C" 588 | }, 589 | "cell_type" : "code", 590 | "source" : "// Distribution of dead and survived passengers\n\nCustomPlotlyChart(grBySurvived,\n layout=\"{title: 'Passengers by status', xaxis: {title: 'status'}, yaxis: {title: '%'}}\",\n dataOptions=\"{type: 'bar'}\",\n dataSources=\"{x: 'SurvivedStatus', y: '%'}\")", 591 | "outputs" : [ ] 592 | }, { 593 | "metadata" : { 594 | "trusted" : true, 595 | "input_collapsed" : false, 596 | "collapsed" : false, 597 | "id" : "FF53CEB29DED42CCBBEF5061F090D300" 598 | }, 599 | "cell_type" : "code", 600 | "source" : "// Distribution of the number of survivors and dead passengers by class.\n\nCustomPlotlyChart(pdf.groupBy(\"SurvivedStatus\", \"Pclass\").count,\n layout=\"{title: 'Number of passengers by survival status per class', xaxis: {title: 'Pclass'}, barmode: 'group'}\",\n dataOptions=\"{type: 'bar', splitBy: 'SurvivedStatus'}\",\n dataSources=\"{x: 'Pclass', y: 'count'}\")", 601 | "outputs" : [ ] 602 | }, { 603 | "metadata" : { 604 | "trusted" : true, 605 | "input_collapsed" : false, 606 | "collapsed" : false, 607 | "id" : "B0A55CA3DB65485D8EAD5DC01B1EBD43" 608 | }, 609 | "cell_type" : "code", 610 | "source" : "// Distribution of survivors and dead passengers by gender.\n\nCustomPlotlyChart(pdf.groupBy(\"SurvivedStatus\", \"Sex\").count,\n layout=\"{title: 'Number of passengers by status by gender', xaxis: {title: 'Gender'}, barmode: 'group'}\",\n dataOptions=\"{type: 'bar', splitBy: 'SurvivedStatus'}\",\n dataSources=\"{x: 'Sex', y: 'count'}\")", 611 | "outputs" : [ ] 612 | }, { 613 | "metadata" : { 614 | "trusted" : true, 615 | "input_collapsed" : false, 616 | "collapsed" : false, 617 | "id" : "4382CC3408474AC591A917BE71C0D6DD" 618 | }, 619 | "cell_type" : "code", 620 | "source" : "// Distribution of survivors and dead passengers by port of embarkation.\n\nCustomPlotlyChart(pdf.groupBy(\"Embarkation\", \"SurvivedStatus\").count,\n layout=\"{barmode: 'group'}\",\n dataOptions=\"{type: 'bar', splitBy: 'SurvivedStatus'}\",\n dataSources=\"{x: 'Embarkation', y: 'count'}\")", 621 | "outputs" : [ ] 622 | }, { 623 | "metadata" : { 624 | "trusted" : true, 625 | "input_collapsed" : false, 626 | "collapsed" : false, 627 | "presentation" : { 628 | "tabs_state" : "{\n 
\"tab_id\": \"#tab348620047-1\"\n}", 629 | "pivot_chart_state" : "{\n \"hiddenAttributes\": [],\n \"menuLimit\": 200,\n \"cols\": [],\n \"rows\": [],\n \"vals\": [],\n \"exclusions\": {},\n \"inclusions\": {},\n \"unusedAttrsVertical\": 85,\n \"autoSortUnusedAttrs\": false,\n \"inclusionsInfo\": {},\n \"aggregatorName\": \"Count\",\n \"rendererName\": \"Table\"\n}" 630 | }, 631 | "id" : "C92E240948514A769695125D47A6F3E8" 632 | }, 633 | "cell_type" : "code", 634 | "source" : "// % of survivors by port of embarkation.\n\nCustomPlotlyChart(pdf.groupBy(\"Embarkation\").agg((sum(\"Survived\") / count(\"Survived\") * 100).alias(\"SurvivalRate\")),\n layout=\"{title: '% of survival per embarkation'}\",\n dataOptions=\"{type: 'bar'}\",\n dataSources=\"{x: 'Embarkation', y: 'SurvivalRate'}\")", 635 | "outputs" : [ ] 636 | }, { 637 | "metadata" : { 638 | "trusted" : true, 639 | "input_collapsed" : false, 640 | "collapsed" : false, 641 | "id" : "EE39CF0D57B1481A917D276C5D55275D" 642 | }, 643 | "cell_type" : "code", 644 | "source" : "// Distribution of passenger classes by port of embarkation.\n\nCustomPlotlyChart(pdf.groupBy(\"Embarkation\", \"Pclass\").count,\n layout=\"{barmode: 'stack', title: 'Pclass distribution by Embarkation'}\",\n dataOptions=\"{type: 'bar', splitBy: 'Pclass'}\",\n dataSources=\"{x: 'Embarkation', y: 'count'}\")", 645 | "outputs" : [ ] 646 | }, { 647 | "metadata" : { 648 | "id" : "B94C5A499DEF4344AFAFC71EDEA4E4EB" 649 | }, 650 | "cell_type" : "markdown", 651 | "source" : "How to get the % of survived passengers by port of embarkation in this case?" 652 | }, { 653 | "metadata" : { 654 | "trusted" : true, 655 | "input_collapsed" : false, 656 | "collapsed" : false, 657 | "id" : "42A9EA0451BF421E91A475E4ABD0A186" 658 | }, 659 | "cell_type" : "code", 660 | "source" : "val byEmbark = pdf.groupBy(\"Embarkation\").agg(count(\"PassengerId\").alias(\"totalCount\"))\nval byEmbarkByClass = pdf.groupBy(\"Embarkation\", \"Pclass\").count", 661 | "outputs" : [ ] 662 | }, { 663 | "metadata" : { 664 | "trusted" : true, 665 | "input_collapsed" : false, 666 | "collapsed" : false, 667 | "id" : "02AC8277D19648188507F5312C51E562" 668 | }, 669 | "cell_type" : "code", 670 | "source" : "val embarkClassDistr = byEmbarkByClass.join(byEmbark, usingColumn=\"Embarkation\")\n .select($\"Embarkation\",\n $\"Pclass\", \n ($\"count\" / $\"totalCount\" * 100).alias(\"%\"))\n\nCustomPlotlyChart(embarkClassDistr,\n layout=\"{barmode: 'stack', title: 'Pclass distribution by Embarkation', yaxis: {title: '%'}}\",\n dataOptions=\"{type: 'bar', splitBy: 'Pclass'}\",\n dataSources=\"{x: 'Embarkation', y: '%'}\")", 671 | "outputs" : [ ] 672 | }, { 673 | "metadata" : { 674 | "id" : "ED9BD96273124BE6B502992651149D91" 675 | }, 676 | "cell_type" : "markdown", 677 | "source" : "### Histograms and Box Plots" 678 | }, { 679 | "metadata" : { 680 | "id" : "AB9E955BCB114FB3882A6EDDDF015FAE" 681 | }, 682 | "cell_type" : "markdown", 683 | "source" : "**Q-7 Obtain age distributions by passengers survival status.**" 684 | }, { 685 | "metadata" : { 686 | "trusted" : true, 687 | "input_collapsed" : false, 688 | "collapsed" : false, 689 | "id" : "FD743A9848C74C8DB1B1A7E93E351BAD" 690 | }, 691 | "cell_type" : "code", 692 | "source" : "CustomPlotlyChart(pdf, \n layout=\"{title: 'Age distribution by status', xaxis: {title: 'Age'}, barmode: 'overlay'}\",\n dataOptions=\"{type: 'histogram', opacity: 0.6, splitBy: 'SurvivedStatus'}\",\n dataSources=\"{x: 'Age'}\")", 693 | "outputs" : [ ] 694 | }, { 695 | "metadata" : { 696 | "trusted" : 
true, 697 | "input_collapsed" : false, 698 | "collapsed" : false, 699 | "id" : "A825DE70B29E46A58167CF7D5F9DA4AF" 700 | }, 701 | "cell_type" : "code", 702 | "source" : "CustomPlotlyChart(pdf, \n layout=\"{yaxis: {title: 'Age'}}\",\n dataOptions=\"{type: 'box', splitBy: 'SurvivedStatus'}\",\n dataSources=\"{y: 'Age'}\")", 703 | "outputs" : [ ] 704 | }, { 705 | "metadata" : { 706 | "id" : "3B308D4AABF1491485890B86C4D4BFC8" 707 | }, 708 | "cell_type" : "markdown", 709 | "source" : "**Q-8. Plot box plots of age distributions by passengers classes.**" 710 | }, { 711 | "metadata" : { 712 | "trusted" : true, 713 | "input_collapsed" : false, 714 | "collapsed" : false, 715 | "id" : "D26A8F4FF341485AAD1034460EDA6761" 716 | }, 717 | "cell_type" : "code", 718 | "source" : "CustomPlotlyChart(pdf, \n layout=\"{yaxis: {title: 'Age'}}\",\n dataOptions=\"{type: 'box', splitBy: 'Pclass'}\",\n dataSources=\"{y: 'Age'}\")", 719 | "outputs" : [ ] 720 | }, { 721 | "metadata" : { 722 | "id" : "D683E06CFC934A098175544B47E6C701" 723 | }, 724 | "cell_type" : "markdown", 725 | "source" : "This scatter plots show the dependences of the chances of survival from the cabin class, age and gender:" 726 | }, { 727 | "metadata" : { 728 | "trusted" : true, 729 | "input_collapsed" : false, 730 | "collapsed" : false, 731 | "id" : "A6CB9EB11CED4E1F8623B9D2ED582AAE" 732 | }, 733 | "cell_type" : "code", 734 | "source" : "val survByClassAndAge = List(\"male\", \"female\").map{\n gender =>\n CustomPlotlyChart(pdf.filter($\"Sex\" === gender),\n layout=s\"\"\"{\n title: 'Survival by class and age, $gender.', \n yaxis: {title: 'class'}, \n xaxis: {title: 'age'}\n }\"\"\",\n dataOptions=\"\"\"{\n splitBy: 'SurvivedStatus',\n byTrace: {\n 'survived': {\n mode: 'markers',\n marker: {\n size: 20,\n opacity: 0.3,\n color: 'orange'\n }\n },\n 'died': {\n mode: 'markers',\n marker: {\n size: 15,\n opacity: 0.9,\n color: 'rgba(55, 128, 191, 0.6)'\n }\n }\n }\n }\"\"\",\n dataSources = \"{x: 'Age', y: 'Pclass'}\"\n )\n}", 735 | "outputs" : [ ] 736 | }, { 737 | "metadata" : { 738 | "trusted" : true, 739 | "input_collapsed" : false, 740 | "collapsed" : false, 741 | "id" : "A1F17AC9F5634BCC866AC03BD35CAF52" 742 | }, 743 | "cell_type" : "code", 744 | "source" : "survByClassAndAge(0)", 745 | "outputs" : [ ] 746 | }, { 747 | "metadata" : { 748 | "trusted" : true, 749 | "input_collapsed" : false, 750 | "collapsed" : false, 751 | "id" : "E752E11B5E3E415DB83B1752E0CAA39C" 752 | }, 753 | "cell_type" : "code", 754 | "source" : "survByClassAndAge(1)", 755 | "outputs" : [ ] 756 | }, { 757 | "metadata" : { 758 | "id" : "DF18F97FFCB44B42A657D6002C72B0E5" 759 | }, 760 | "cell_type" : "markdown", 761 | "source" : "### More practice with UDF and Box Plots" 762 | }, { 763 | "metadata" : { 764 | "id" : "895FA63345FC47F288917539A9E50014" 765 | }, 766 | "cell_type" : "markdown", 767 | "source" : "The titles of passengers could be useful source of information. Let's explore that." 768 | }, { 769 | "metadata" : { 770 | "id" : "9BDE6CD9C586463E8CE2C86FEC396932" 771 | }, 772 | "cell_type" : "markdown", 773 | "source" : "**Q-9. 
Plot box plots of age distributions by title.**" 774 | }, { 775 | "metadata" : { 776 | "trusted" : true, 777 | "input_collapsed" : false, 778 | "collapsed" : false, 779 | "id" : "69E2BD21E9E14113AE3842909DC135E7" 780 | }, 781 | "cell_type" : "code", 782 | "source" : "pdf.select(\"Name\").show(3, truncate=false)", 783 | "outputs" : [ ] 784 | }, { 785 | "metadata" : { 786 | "trusted" : true, 787 | "input_collapsed" : false, 788 | "collapsed" : false, 789 | "id" : "A16D246A68A34A13AB667BB060F8785F" 790 | }, 791 | "cell_type" : "code", 792 | "source" : "val parseTitle: String => String = (name: String) =>\n name.split(\", \")(1).split(\"\\\\.\")(0)\n\nval parseTitleUDF = udf(parseTitle)", 793 | "outputs" : [ ] 794 | }, { 795 | "metadata" : { 796 | "trusted" : true, 797 | "input_collapsed" : false, 798 | "collapsed" : false, 799 | "id" : "3FCBCF42719244908AD3271F198B723A" 800 | }, 801 | "cell_type" : "code", 802 | "source" : "CustomPlotlyChart(pdf.withColumn(\"Title\", parseTitleUDF($\"Name\")), \n layout=\"{yaxis: {title: 'Age'}}\",\n dataOptions=\"{type: 'box', splitBy: 'Title'}\",\n dataSources=\"{y: 'Age'}\")", 803 | "outputs" : [ ] 804 | }, { 805 | "metadata" : { 806 | "id" : "7C40B636686F448D880E4EE0E4C1DC4E" 807 | }, 808 | "cell_type" : "markdown", 809 | "source" : "It is often good practice to group the values of a categorical feature, especially when there are rare individual values such as `Don`, `Lady`, and `Capt` in our case." 810 | }, { 811 | "metadata" : { 812 | "id" : "5CAAC988C4C249E9899F435D95FA2BB8" 813 | }, 814 | "cell_type" : "markdown", 815 | "source" : "**Q-10. Write a UDF to group all the titles into five groups according to the following table:**\n\n| Group | Title |\n| :------------:|:------------:|\n| Aristocratic | Capt, Col, Don, Dr, Jonkheer, Lady, Major, Rev, Sir, Countess |\n| Mrs | Mrs, Ms |\n| Miss | Miss, Mlle, Mme |\n| Mr | Mr |\n| Master | Master |\n\n**Create a new column called 'TitleGroup' and plot box plots of age distributions by title group.**" 816 | }, { 817 | "metadata" : { 818 | "trusted" : true, 819 | "input_collapsed" : false, 820 | "collapsed" : false, 821 | "id" : "F32C7967F2C346548B5A2143BCE73D80" 822 | }, 823 | "cell_type" : "code", 824 | "source" : "val titleGroup: String => String = (title: String) => {\n val aristocratic = Set(\"Capt\", \"Col\", \"Don\", \"Dr\", \"Jonkheer\", \"Lady\", \"Major\", \"Rev\", \"Sir\", \"the Countess\")\n val mrs = Set(\"Mrs\", \"Ms\")\n val miss = Set(\"Miss\", \"Mlle\", \"Mme\")\n if (aristocratic.contains(title))\n \"Aristocratic\"\n else if (mrs.contains(title))\n \"Mrs\"\n else if (miss.contains(title))\n \"Miss\"\n else\n title\n}\n\n// Given a column with the passenger name, obtain a column with the passenger's title group.\nval parseTitleGroupUDF = udf(parseTitle andThen titleGroup)", 825 | "outputs" : [ ] 826 | }, { 827 | "metadata" : { 828 | "trusted" : true, 829 | "input_collapsed" : false, 830 | "collapsed" : false, 831 | "id" : "115AAD76AD1C4FB385A3144AFCE13A92" 832 | }, 833 | "cell_type" : "code", 834 | "source" : "val withTitleDF = pdf.withColumn(\"TitleGroup\", parseTitleGroupUDF($\"Name\"))\n\nCustomPlotlyChart(withTitleDF, \n layout=\"{yaxis: {title: 'Age'}}\",\n dataOptions=\"{type: 'box', splitBy: 'TitleGroup'}\",\n dataSources=\"{y: 'Age'}\")", 835 | "outputs" : [ ] 836 | }, { 837 | "metadata" : { 838 | "id" : "5247C9EABE594E778E2012CA5161DA2E" 839 | }, 840 | "cell_type" : "markdown", 841 | "source" : "**Q-11. Plot the % of survivors by title group.**" 842 | }, { 843 | "metadata" : {
844 | "trusted" : true, 845 | "input_collapsed" : false, 846 | "collapsed" : false, 847 | "id" : "C1412161CF1942288B64C419CEDC2A81" 848 | }, 849 | "cell_type" : "code", 850 | "source" : "val byTitleGr = withTitleDF\n .groupBy(\"TitleGroup\")\n .agg((sum(\"Survived\") / count(\"Survived\") * 100).alias(\"%\"))\n\nCustomPlotlyChart(byTitleGr,\n layout=\"{title: '% of survival by title group'}\",\n dataOptions=\"{type: 'bar'}\",\n dataSources=\"{x: 'TitleGroup', y: '%'}\")", 851 | "outputs" : [ ] 852 | }, { 853 | "metadata" : { 854 | "id" : "7C6408089C9F4EB5B32066C66E7E8306" 855 | }, 856 | "cell_type" : "markdown", 857 | "source" : "### Handling missing values" 858 | }, { 859 | "metadata" : { 860 | "trusted" : true, 861 | "input_collapsed" : false, 862 | "collapsed" : false, 863 | "id" : "339B055ADE124967B44CA329F13B857B" 864 | }, 865 | "cell_type" : "code", 866 | "source" : "import org.apache.spark.sql.functions.isnull\n\n100.0 * pdf.filter(isnull($\"Age\")).count / pdf.count", 867 | "outputs" : [ ] 868 | }, { 869 | "metadata" : { 870 | "trusted" : true, 871 | "input_collapsed" : false, 872 | "collapsed" : false, 873 | "id" : "5FD862836CBD4E3A80F284772118043D" 874 | }, 875 | "cell_type" : "code", 876 | "source" : "100.0 * pdf.filter(isnull($\"Cabin\")).count / pdf.count", 877 | "outputs" : [ ] 878 | }, { 879 | "metadata" : { 880 | "trusted" : true, 881 | "input_collapsed" : false, 882 | "collapsed" : false, 883 | "id" : "1EFDAA762DB1431DAEF1F3C9F34B545A" 884 | }, 885 | "cell_type" : "code", 886 | "source" : "val cabinStatus: (String) => String = (cabin: String) =>\n if (cabin == null)\n \"noname\"\n else\n \"hasNumber\"\n\nval cabinStatusUDF = udf(cabinStatus)", 887 | "outputs" : [ ] 888 | }, { 889 | "metadata" : { 890 | "trusted" : true, 891 | "input_collapsed" : false, 892 | "collapsed" : false, 893 | "id" : "0C1F6D0D8C04490385680BC9F370D7A4" 894 | }, 895 | "cell_type" : "code", 896 | "source" : "val withCabinStatusDF = pdf.withColumn(\"CabinStatus\", cabinStatusUDF($\"Cabin\"))", 897 | "outputs" : [ ] 898 | }, { 899 | "metadata" : { 900 | "trusted" : true, 901 | "input_collapsed" : false, 902 | "collapsed" : false, 903 | "id" : "2E23AA3B714F49E286B71AD626AEC7C5" 904 | }, 905 | "cell_type" : "code", 906 | "source" : "CustomPlotlyChart(withCabinStatusDF.groupBy(\"CabinStatus\", \"SurvivedStatus\").count,\n layout=\"{title: 'Number of passengers by survival status and cabin type', xaxis: {title: 'Cabin'}}\",\n dataOptions=\"{type: 'bar', splitBy: 'SurvivedStatus'}\",\n dataSources=\"{x: 'CabinStatus', y: 'count'}\")", 907 | "outputs" : [ ] 908 | }, { 909 | "metadata" : { 910 | "id" : "5AD2074CF32B4E40A18B17FCD63250AB" 911 | }, 912 | "cell_type" : "markdown", 913 | "source" : "### On your own" 914 | }, { 915 | "metadata" : { 916 | "id" : "CE92499FD6F74A79B7A5E3B2A5E90831" 917 | }, 918 | "cell_type" : "markdown", 919 | "source" : "Explore the family relationship variables (SibSp and Parch).\nHow does the number of siblings/spouses aboard affect the chances of survival?\nHow does the number of parents/children aboard affect the chances of survival?\n\nInvent a new variable called `Family` to represent the total number of relatives aboard and explore how it affects the chances of survival." 920 | } ], 921 | "nbformat" : 4 922 | } --------------------------------------------------------------------------------