├── .gitignore ├── LICENSE ├── README.md └── labs ├── BagOfWordsMeetsBagsOfPopcorn ├── BagOfWordsMeetsBagsOfPopcorn.snb ├── README.md └── images │ ├── recallPrecision.png │ └── roc.png ├── DLFramework ├── DLFramework.snb └── README.md ├── DataAnalysisToolbox ├── DataAnalysisToolbox.snb ├── README.md ├── images │ ├── ageHist.png │ ├── ageHistPerClass.png │ ├── ageHistPerClassStacked.png │ └── plotFunction.png └── titanic.csv ├── IntroToMLandSparkMLPipelines ├── Intro To Machine Learning and SparkML Pipelines.snb ├── README.md └── data │ └── data.adult.csv ├── IntroToMachineLearning ├── IntroToMachineLearning.snb ├── README.md ├── data.adult.csv └── images │ ├── ageHistData.png │ ├── cgainHistData.png │ ├── fnlwgtHistData.png │ ├── lrAvgMetrics.png │ ├── rfAvgMetrics.png │ ├── rfAvgMetrics2.png │ └── treeAvgMetrics.png └── TitanicSurvivalExploration ├── README.md ├── TitanicSurvivalExploration.snb └── data └── titanic_train.csv /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | local_settings.py 55 | 56 | # Flask instance folder 57 | instance/ 58 | 59 | # Scrapy stuff: 60 | .scrapy 61 | 62 | # Sphinx documentation 63 | docs/_build/ 64 | 65 | # PyBuilder 66 | target/ 67 | 68 | # IPython Notebook 69 | .ipynb_checkpoints 70 | 71 | # pyenv 72 | .python-version 73 | 74 | # celery beat schedule file 75 | celerybeat-schedule 76 | 77 | # dotenv 78 | .env 79 | 80 | # virtualenv 81 | venv/ 82 | ENV/ 83 | 84 | # Spyder project settings 85 | .spyderproject 86 | 87 | 88 | *.class 89 | *.log 90 | 91 | # sbt specific 92 | .cache 93 | .history 94 | .lib/ 95 | dist/* 96 | target/ 97 | lib_managed/ 98 | src_managed/ 99 | project/boot/ 100 | project/plugins/project/ 101 | 102 | # Scala-IDE specific 103 | .scala_dependencies 104 | .worksheet -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2016 Andrey Romanov 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the 
Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # spark-notebook-ml-labs 2 | All labs are implemented in [Spark Notebook](https://github.com/andypetrella/spark-notebook). In particular spark-notebook-0.6.3 with scala-2.10.5 and spark-1.6.1 was used for the most of the labs. 3 | In these labs we are going to get familiar with tools for data analysis and machine learning: 4 | * [breeze](https://github.com/scalanlp/breeze) 5 | * [spark dataframes](http://spark.apache.org/docs/latest/sql-programming-guide) 6 | * [spark.ml](http://spark.apache.org/docs/latest/ml-guide.html) 7 | * spark-notebook visualization capabilities 8 | 9 | Available labs: 10 | * [Data Analysis Toolbox](https://github.com/drewnoff/spark-notebook-ml-labs/tree/master/labs/DataAnalysisToolbox) 11 | * [Titanic Survival Exploration](https://github.com/drewnoff/spark-notebook-ml-labs/tree/master/labs/TitanicSurvivalExploration) 12 | * [Introduction To Machine Learning and Spark ML Pipelines](https://github.com/drewnoff/spark-notebook-ml-labs/tree/master/labs/IntroToMLandSparkMLPipelines) 13 | * [Bag of Words Meets Bags of Popcorn](https://github.com/drewnoff/spark-notebook-ml-labs/tree/master/labs/BagOfWordsMeetsBagsOfPopcorn) 14 | * [Neural Networks & Backpropagation with ND4J](https://github.com/drewnoff/spark-notebook-ml-labs/tree/master/labs/DLFramework) 15 | -------------------------------------------------------------------------------- /labs/BagOfWordsMeetsBagsOfPopcorn/images/recallPrecision.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/drewnoff/spark-notebook-ml-labs/26f80824cece2c3b78c050937ccb62e843e0de65/labs/BagOfWordsMeetsBagsOfPopcorn/images/recallPrecision.png -------------------------------------------------------------------------------- /labs/BagOfWordsMeetsBagsOfPopcorn/images/roc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/drewnoff/spark-notebook-ml-labs/26f80824cece2c3b78c050937ccb62e843e0de65/labs/BagOfWordsMeetsBagsOfPopcorn/images/roc.png -------------------------------------------------------------------------------- /labs/DLFramework/README.md: -------------------------------------------------------------------------------- 1 | # Neural Networks & Backpropagation with ND4J 2 | 3 | In this lab we're going to implement a small framework for training neural networks for classification tasks using [ND4J](http://nd4j.org/) numerical computing library . 4 | 5 | This lab is not intended to provide full explanation of underlying theory. Recommended materials: [deeplearningbook.org](http://www.deeplearningbook.org/), [Introduction to Deep Learning leacture slides](https://m2dsupsdlclass.github.io/lectures-labs/). 6 | 7 | Our framework will support following neural network layers. 
8 | 9 | 10 | 11 | 12 | - **Fully-connected layer (or dense layer)**. Neurons in a fully connected layer have full connections to all activations in the previous layer. Their activations can hence be computed with a matrix multiplication followed by a bias offset.* 13 | 14 | \mathrm{Dense} \equiv f\left(\textbf{x}\right)=\textbf{W}\textbf{x}+\textbf{b}, 15 | 16 | where 17 | \textbf{W}\in\mathbb{R}^{(k,n)} - weight matrix, 18 | \textbf{b}\in\mathbb{R}^k - bias offset. 19 | 20 | 21 | - **Sigmoid activation layer**. 22 | 23 | \mathrm{Sigmoid} \equiv f\left(\textbf{x}\right)=\frac{1}{1+\exp^{\textbf{-x}}} 24 | 25 | 26 | - **[Dropout layer](https://www.cs.toronto.edu/~hinton/absps/JMLRdropout.pdf)**. It's introduced to prevent overfitting. 27 | It takes parameter $d$ which is equal to probability of individual neuron being "dropped out" during the *training stage* independently for each training example. The removed nodes are then reinserted into the network with their original weights. At *testing stage* we're using the full network with each neuron's output weighted by a factor of $1-d$, so the expected value of the output of any neuron is the same as in the training stages. 28 | 29 |  $$\mathrm{Dropout_{train}} \equiv f\left(\textbf{x}\right)=\textbf{m}\odot\textbf{x}$$    $$\textbf{m} \in \left\{0,1\right\}^{n}$$    $$p\left(m_{i}=0\right)=d$$        $$\mathrm{Dropout_{test}}\equiv f\left(\textbf{x}\right)=\left(1-d\right)\textbf{x}$$ 30 | 31 | 32 | - **Softmax classifier layer**. It's a generalization of binary Logistic Regression classifier to multiple classes. The Softmax classifier gives normalized class probabilities as its output. 33 | 34 |  $$\mathrm{Softmax}_{i} \equiv p_{i}\left(\textbf{x}\right)=\frac{e^{x_{i}}}{\sum_{j}{e^{x_{j}}}}$$ 35 | 36 | We will use the Softmax classifier together with **cross-entropy loss** which is a generalization of binary log loss for multiple classes. 37 | The cross-entropy between a “true” distribution $p$ and an estimated distribution $q$ is defined as: 38 | 39 | $$\mathcal{L}=-\sum_{i}{p_{i}\log{q_{i}}}$$ 40 | 41 | The Softmax classifier is hence minimizing the cross-entropy between the estimated class probabilities and the “true” distribution, where "true" distribution $\textbf{p}=\left[p_{1}...p_{i}...\right]$ with only one element is equal to $1$ (true class) and all the other are equal to $0$. 42 | 43 | ## Install ND4J 44 | 45 | ### Prerequisites 46 | 47 | - [JavaCPP](http://nd4j.org/getstarted#javacpp) 48 | - [BLAS (ATLAS, MKL, or OpenBLAS)](http://nd4j.org/getstarted#blas) 49 | 50 | These will vary depending on whether you’re running on CPUs or GPUs. 51 | The default backend for CPUs is `nd4j-native-platform`, and for CUDA it is `nd4j-cuda-7.5-platform`. 52 | 53 | Assuming the default backend for CPUs is used, `customDeps` section of Spark Notebook metadata (`Edit` -> `Edit Notebook Metadata`) should look like following: 54 | 55 | ``` 56 | "customDeps": [ 57 | "org.bytedeco % javacpp % 1.3.2", 58 | "org.nd4j % nd4j-native-platform % 0.8.0", 59 | "org.nd4j %% nd4s % 0.8.0", 60 | "org.deeplearning4j % deeplearning4j-core % 0.8.0" 61 | ] 62 | ``` 63 | 64 | **[ND4J user guide](http://nd4j.org/userguide)** might be of the great help to track neural network components implementation. 
65 | 66 | ```scala 67 | import org.nd4j.linalg.factory.Nd4j 68 | import org.nd4j.linalg.api.ndarray.INDArray 69 | import org.nd4j.linalg.ops.transforms.Transforms 70 | import org.nd4s.Implicits._ 71 | 72 | import org.nd4j.linalg.cpu.nativecpu.rng.CpuNativeRandom 73 | ``` 74 | 75 | ```scala 76 | val rngSEED = 181 77 | val RNG = new CpuNativeRandom(rngSEED) 78 | ``` 79 | 80 | ## Sigmoid & Softmax functions 81 | 82 | First let's implement **`sigmoid`** and **`sigmoidGrad`** functions: 83 | 84 | - **`sigmoid`** function applies sigmoid transformation in an element-wise manner to each row of the input; 85 | - **`sigmoidGrad`** computes the gradient for the sigmoid function. It takes sigmoid function value as an input. 86 | 87 | ```scala 88 | def sigmoid(x: INDArray): INDArray = { 89 | Transforms.pow(Transforms.exp(-x) + 1, -1) 90 | } 91 | 92 | 93 | def sigmoidGrad(f: INDArray): INDArray = { 94 | f * (-f + 1) 95 | } 96 | ``` 97 | 98 | We used [`Transform ops`](http://nd4j.org/userguide#opstransform) to apply element-wise `exp` and `pow`. 99 | 100 | **`softmax`** computes the softmax function for each row of the input. 101 | 102 | ```scala 103 | def softmax(x: INDArray): INDArray = { 104 | val exps = Transforms.exp(x.addColumnVector(-x.max(1))) 105 | exps.divColumnVector(exps.sum(1)) 106 | } 107 | ``` 108 | 109 | In addition to previously seen `Transforms ops` we also used [`Vector ops`](http://nd4j.org/userguide#opsbroadcast) here to subtract from each row its max element and divide each row by the sum of its elements. 110 | 111 | ```scala 112 | def sigmoidTest(): Unit = { 113 | val x = Array(Array(1, 2), Array(-1, -2)).toNDArray 114 | val f = sigmoid(x) 115 | val g = sigmoidGrad(f) 116 | val sigmoidVals = Array(Array(0.73105858, 0.88079708), 117 | Array(0.26894142, 0.11920292)).toNDArray 118 | val gradVals = Array(Array(0.19661193, 0.10499359), 119 | Array(0.19661193, 0.10499359)).toNDArray 120 | assert((f - Transforms.abs(sigmoidVals)).max(1) < 1e-6) 121 | assert((g - Transforms.abs(gradVals)).max(1) < 1e-6) 122 | println("sigmoid tests passed") 123 | } 124 | 125 | 126 | def softmaxTest(): Unit = { 127 | val x = Array(Array(1001, 1002), 128 | Array(3, 4)).toNDArray 129 | val logits = softmax(x) 130 | val expectedLogits = Array(Array(0.26894142, 0.73105858), 131 | Array(0.26894142, 0.73105858)).toNDArray 132 | assert((logits - Transforms.abs(expectedLogits)).max(1) < 1e-6) 133 | assert( 134 | (softmax(Array(1, 1).toNDArray) - Transforms.abs(Array(0.5, 0.5).toNDArray)).max(1) < 1e-6 135 | ) 136 | println("softmax tests passed") 137 | } 138 | ``` 139 | 140 | ```scala 141 | sigmoidTest 142 | softmaxTest 143 | ``` 144 | 145 | ## Network Layers 146 | 147 | Let's define `NetLayer` trait for building network layers. We need to provide two methods: 148 | - `forwardProp` for forward propagation of input through the neural network in order to generate the network's output. 149 | - `backProp` for delta backpropagation and weights update. 150 | `backProp` takes the weight's output gradients with respect to layer's inputs. The weight's output gradient and input activation are multiplied to find the gradient of the weight. A ratio (gets tuned by `learningRate`) of the weight's gradient is subtracted from the weight. 
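In equation form (a restatement of the above for the dense layer, with input $\textbf{x}$, output $\textbf{y}=\textbf{W}\textbf{x}+\textbf{b}$, loss $\mathcal{L}$ and learning rate $\eta$; gradients are accumulated over the batch):

 $$\frac{\partial \mathcal{L}}{\partial \textbf{W}}=\textbf{x}^{T}\frac{\partial \mathcal{L}}{\partial \textbf{y}},\qquad \frac{\partial \mathcal{L}}{\partial \textbf{b}}=\frac{\partial \mathcal{L}}{\partial \textbf{y}},\qquad \frac{\partial \mathcal{L}}{\partial \textbf{x}}=\frac{\partial \mathcal{L}}{\partial \textbf{y}}\textbf{W}^{T},\qquad \textbf{W}\leftarrow\textbf{W}-\eta\frac{\partial \mathcal{L}}{\partial \textbf{W}},\quad \textbf{b}\leftarrow\textbf{b}-\eta\frac{\partial \mathcal{L}}{\partial \textbf{b}}$$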
151 | 152 | ```scala 153 | trait NetLayer { 154 | def forwardProp(inputs: INDArray, isTrain: Boolean): INDArray 155 | def backProp(outputsGrad: INDArray): INDArray 156 | } 157 | ``` 158 | 159 | ```scala 160 | class Dense(inputDim: Int, outputDim: Int, val learningRate: Double) extends NetLayer { 161 | private val W = Nd4j.rand(Array(inputDim, outputDim), -0.01, 0.01, RNG) 162 | private val b = Nd4j.rand(Array(1, outputDim), -0.01, 0.01, RNG) 163 | private var _inputs = Nd4j.zeros(1, inputDim) 164 | 165 | def forwardProp(inputs: INDArray, isTrain: Boolean): INDArray = { 166 | _inputs = inputs 167 | (inputs mmul W) addRowVector b 168 | } 169 | 170 | def backProp(outputsGrad: INDArray): INDArray = { 171 | val gradW = _inputs.T mmul outputsGrad 172 | val gradb = outputsGrad.sum(0) 173 | val prop = outputsGrad mmul W.T 174 | W -= gradW * learningRate 175 | b -= gradb * learningRate 176 | prop 177 | } 178 | } 179 | ``` 180 | 181 | ```scala 182 | class SigmoidActivation extends NetLayer { 183 | private var _outputs = Nd4j.zeros(1) 184 | 185 | def forwardProp(inputs: INDArray, isTrain: Boolean): INDArray = { 186 | _outputs = sigmoid(inputs) 187 | _outputs 188 | } 189 | 190 | def backProp(outputsGrad: INDArray): INDArray = { 191 | outputsGrad * sigmoidGrad(_outputs) 192 | } 193 | } 194 | ``` 195 | 196 | ```scala 197 | class Dropout(val dropRate: Double = 0.0) extends NetLayer { 198 | var mask: INDArray = Nd4j.zeros(1) 199 | def forwardProp(inputs: INDArray, isTrain: Boolean): INDArray = { 200 | if (isTrain) { 201 | mask = Nd4j.zeros(1, inputs.shape()(1)) 202 | Nd4j.choice(Array(0, 1).toNDArray, Array(dropRate, 1 - dropRate).toNDArray, mask) 203 | inputs.mulRowVector(mask) 204 | } else { 205 | inputs * (1 - dropRate) 206 | } 207 | } 208 | 209 | def backProp(outputsGrad: INDArray): INDArray = { 210 | outputsGrad.mulRowVector(mask) 211 | } 212 | } 213 | ``` 214 | 215 | We assume that the **Softmax** is always the last layer of the network. 216 | 217 | Also it can be shown that the gradient of cross-entropy loss of the outputs of softmax layer with respect to softmax layer's input has a simple form: 218 | 219 |  $$\frac{\partial \mathcal{L}}{\partial x_{i}}=g_{i}-p_{i}$$ 220 | 221 | So to start backpropagation stage let's take the `Softmax` output probabilities alongside with true labels as an input for `backProp` method of the `Softmax` layer. 
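(In the notation used earlier for the cross-entropy loss — "true" distribution $\textbf{p}$ and estimated distribution $\textbf{q}=\mathrm{Softmax}\left(\textbf{x}\right)$ — this gradient reads $\frac{\partial \mathcal{L}}{\partial x_{i}}=q_{i}-p_{i}$: predicted probability minus true label.)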
222 | 223 | 224 | ```scala 225 | import org.nd4s.Implicits._ 226 | 227 | class Softmax extends NetLayer { 228 | def forwardProp(inputs: INDArray, isTrain: Boolean): INDArray = { 229 | softmax(inputs) 230 | } 231 | 232 | def backProp(outputsGrad: INDArray): INDArray = { 233 | val predictions = outputsGrad(0, ->) 234 | val labels = outputsGrad(1, ->) 235 | predictions - labels 236 | } 237 | } 238 | ``` 239 | 240 | ```scala 241 | def crossEntropy(predictions: INDArray, labels: INDArray): Double = { 242 | val cost = - (Transforms.log(predictions) * labels).sumNumber.asInstanceOf[Double] 243 | cost / labels.shape()(0) 244 | } 245 | ``` 246 | 247 | ```scala 248 | def accuracy(predictions: INDArray, labels: INDArray): Double = { 249 | val samplesNum = labels.shape()(0) 250 | val matchesNum = (Nd4j.argMax(predictions, 1) eq Nd4j.argMax(labels, 1)).sumNumber.asInstanceOf[Double] 251 | 100.0 * matchesNum / samplesNum 252 | } 253 | ``` 254 | 255 | ## Neural Network 256 | 257 | ```scala 258 | import org.nd4j.linalg.dataset.api.iterator.DataSetIterator 259 | import org.nd4j.linalg.dataset.DataSet 260 | ``` 261 | 262 | ```scala 263 | case class Metric(epoch: Int, acc: Double, loss: Double) 264 | ``` 265 | 266 | We will use the class called `DataSetIterator` to fetch `DataSet`s. 267 | 268 | ```scala 269 | import scala.collection.JavaConverters._ 270 | 271 | 272 | case class NeuralNet(layers: Vector[NetLayer] = Vector()) { 273 | 274 | def addLayer(layer: NetLayer): NeuralNet = { 275 | this.copy(layers :+ layer) 276 | } 277 | 278 | def fit(trainData: DataSetIterator, numEpochs: Int, validationData: DataSet): Seq[Metric] = { 279 | val history = (1 to numEpochs).foldLeft(List[Metric]()){ (history, epoch) => 280 | trainData.reset() 281 | trainData.asScala.foreach ( ds => trainBatch(ds.getFeatures, ds.getLabels) ) 282 | 283 | // validate on validation Dataset 284 | val prediction = this.predict(validationData.getFeatures) 285 | val loss = crossEntropy(prediction, validationData.getLabels) 286 | val acc = accuracy(prediction, validationData.getLabels) 287 | 288 | println(s"Epoch: $epoch/$numEpochs - loss: $loss - acc: $acc") 289 | 290 | Metric(epoch, acc, loss) :: history 291 | } 292 | history.reverse 293 | } 294 | 295 | def predict(X: INDArray): INDArray = { 296 | layers.foldLeft(X){ 297 | (input, layer) => layer.forwardProp(input, isTrain=false) 298 | } 299 | } 300 | 301 | private def trainBatch(X: INDArray, Y: INDArray): Unit = { 302 | val YPredict = layers.foldLeft(X){ 303 | (input, layer) => layer.forwardProp(input, isTrain=true) 304 | } 305 | val shape = Y.shape 306 | layers.reverse.foldLeft( 307 | Nd4j.vstack(YPredict, Y).reshape(2, shape(0), shape(1)) 308 | ){ 309 | (deriv, layer) => layer.backProp(deriv) 310 | } 311 | } 312 | } 313 | ``` 314 | 315 | ## MNIST 316 | 317 | Now let's apply our framework to build neural network for MNIST dataset classification. 318 | The `DatasetIterator` implementation called `MnistDataSetIterator` is available in `deeplearning4j` to iterate over MNIST dataset. 
319 | 320 | ```scala 321 | import org.deeplearning4j.datasets.iterator.impl.MnistDataSetIterator 322 | ``` 323 | 324 | ```scala 325 | val learningRate = 0.01 326 | val batchSize = 128 327 | 328 | val mnistTrain = new MnistDataSetIterator(batchSize, true, rngSEED) 329 | val mnistTest = new MnistDataSetIterator(batchSize, false, rngSEED) 330 | 331 | val inputDim = mnistTest.next.getFeatures.shape()(1) 332 | val totalTestExamples = mnistTest.numExamples() 333 | ``` 334 | 335 | ```scala 336 | val model = NeuralNet() 337 | .addLayer(new Dense(inputDim=inputDim, outputDim=512, learningRate=learningRate)) 338 | .addLayer(new SigmoidActivation()) 339 | .addLayer(new Dropout(dropRate=0.3)) 340 | .addLayer(new Dense(512, 512, learningRate)) 341 | .addLayer(new SigmoidActivation()) 342 | .addLayer(new Dropout(0.3)) 343 | .addLayer(new Dense(512, 10, learningRate)) 344 | .addLayer(new Softmax()) 345 | ``` 346 | 347 | ```scala 348 | val history = model.fit(mnistTrain, 40, (new MnistDataSetIterator(totalTestExamples, false, rngSEED)).next) 349 | ``` 350 | > Epoch: 1/40 - loss: 0.57162705078125 - acc: 81.94 351 | Epoch: 2/40 - loss: 0.348628173828125 - acc: 89.34 352 | Epoch: 3/40 - loss: 0.273960546875 - acc: 91.83 353 | Epoch: 4/40 - loss: 0.2305306396484375 - acc: 92.76 354 | Epoch: 5/40 - loss: 0.20194395751953126 - acc: 93.76 355 | Epoch: 6/40 - loss: 0.17214320068359376 - acc: 94.87 356 | Epoch: 7/40 - loss: 0.15777041015625 - acc: 95.29 357 | Epoch: 8/40 - loss: 0.1411923583984375 - acc: 95.75 358 | Epoch: 9/40 - loss: 0.1371442138671875 - acc: 95.65 359 | Epoch: 10/40 - loss: 0.1223932373046875 - acc: 96.2 360 | Epoch: 11/40 - loss: 0.11889525146484375 - acc: 96.35 361 | Epoch: 12/40 - loss: 0.11355523681640625 - acc: 96.5 362 | Epoch: 13/40 - loss: 0.10255557861328125 - acc: 96.63 363 | Epoch: 14/40 - loss: 0.10248739013671875 - acc: 96.67 364 | Epoch: 15/40 - loss: 0.10121082153320313 - acc: 96.76 365 | Epoch: 16/40 - loss: 0.09314661254882813 - acc: 97.05 366 | Epoch: 17/40 - loss: 0.0908234619140625 - acc: 97.09 367 | Epoch: 18/40 - loss: 0.08782809448242188 - acc: 97.21 368 | Epoch: 19/40 - loss: 0.084460498046875 - acc: 97.25 369 | Epoch: 20/40 - loss: 0.08508148803710938 - acc: 97.32 370 | Epoch: 21/40 - loss: 0.08242890625 - acc: 97.49 371 | Epoch: 22/40 - loss: 0.07931015014648438 - acc: 97.55 372 | Epoch: 23/40 - loss: 0.07825602416992188 - acc: 97.6 373 | Epoch: 24/40 - loss: 0.07847127685546874 - acc: 97.47 374 | Epoch: 25/40 - loss: 0.07547276611328126 - acc: 97.6 375 | Epoch: 26/40 - loss: 0.074110009765625 - acc: 97.64 376 | Epoch: 27/40 - loss: 0.07486264038085938 - acc: 97.69 377 | Epoch: 28/40 - loss: 0.07151276245117187 - acc: 97.73 378 | Epoch: 29/40 - loss: 0.07469411010742187 - acc: 97.76 379 | Epoch: 30/40 - loss: 0.06966272583007813 - acc: 97.88 380 | Epoch: 31/40 - loss: 0.066982666015625 - acc: 97.84 381 | Epoch: 32/40 - loss: 0.06796741333007812 - acc: 97.87 382 | Epoch: 33/40 - loss: 0.06789564208984375 - acc: 97.95 383 | Epoch: 34/40 - loss: 0.065538916015625 - acc: 98.03 384 | Epoch: 35/40 - loss: 0.066549365234375 - acc: 97.88 385 | Epoch: 36/40 - loss: 0.06736263427734375 - acc: 97.83 386 | Epoch: 37/40 - loss: 0.0646685302734375 - acc: 97.98 387 | Epoch: 38/40 - loss: 0.0628564208984375 - acc: 97.97 388 | Epoch: 39/40 - loss: 0.0657330322265625 - acc: 98.0 389 | Epoch: 40/40 - loss: 0.063365771484375 - acc: 97.98 390 | 391 | ```scala 392 | CustomPlotlyChart(history, 393 | layout="{title: 'Accuracy on validation set', xaxis: {title: 'epoch'}, yaxis: {title: '%'}}", 
394 | dataOptions="{mode: 'lines'}", 395 | dataSources="{x: 'epoch', y: 'acc'}") 396 | ``` 397 | 398 | 399 | 400 | 401 | ```scala 402 | CustomPlotlyChart(history, 403 | layout="{title: 'Cross entropy on validation set', xaxis: {title: 'epoch'}, yaxis: {title: 'loss'}}", 404 | dataOptions="""{ 405 | mode: 'lines', 406 | line: { 407 | color: 'green', 408 | width: 3 409 | } 410 | }""", 411 | dataSources="{x: 'epoch', y: 'loss'}") 412 | ``` 413 | 414 | 415 | 416 | 417 | ## On your own: 418 | - Implement [rectified linear unit (ReLU)](https://en.wikipedia.org/wiki/Rectifier_(neural_networks)) activation. 419 | - Add support for *L2* regularization on `Dense` layer weights. 420 | - Train similar neural network with `relu` activation instead of `sigmoid` activation and added support for `L2` regularization. Compare obtained results. 421 | -------------------------------------------------------------------------------- /labs/DataAnalysisToolbox/README.md: -------------------------------------------------------------------------------- 1 | ## Data Analysis Toolbox 2 | In this lab we are going to get familiar with **Breeze** numerical processing library, Spark **DataFrames** (distributed collections of data organized into named columns) and **C3 Charts** library in a way of solving little challenges. At the beginning of each section are reference materials necessary for solving the problems. 3 | ### Breeze 4 | * [Quick start tutorial](https://github.com/scalanlp/breeze/wiki/Quickstart) 5 | 6 | ```scala 7 | import breeze.linalg._ 8 | import breeze.stats.{mean, stddev} 9 | import breeze.stats.distributions._ 10 | ``` 11 | 12 | 13 | >
 14 | > import breeze.linalg._
 15 | > import breeze.stats.{mean, stddev}
 16 | > import breeze.stats.distributions._
 17 | >
18 | 19 | 20 | 21 | ** Problem 1.** Implement a method that takes Matrix X and two sequences ii and jj of equal size as an input and produces breeze.linalg.DenseVector[Double] of elements [X[ii[0], jj[0]], X[ii[1], jj[1]], ..., X[ii[N-1], jj[N-1]]]. 22 | 23 | ```scala 24 | def constructVector(X: Matrix[Double], ii: Seq[Int], jj: Seq[Int]): DenseVector[Double] = ??? 25 | ``` 26 | 27 | 28 | >
 29 | > constructVector: (X: breeze.linalg.Matrix[Double], ii: Seq[Int], jj: Seq[Int])breeze.linalg.DenseVector[Double]
 30 | >
31 | 32 | 33 | 34 | 35 | ```scala 36 | // Solution for problem 1 37 | def constructVector(X: Matrix[Double], ii: Seq[Int], jj: Seq[Int]): DenseVector[Double] = 38 | DenseVector(ii.zip(jj).map(ix => X(ix._1, ix._2)).toArray) 39 | 40 | constructVector(DenseMatrix((1.0,2.0,3.0), 41 | (4.0,5.0,6.0), 42 | (7.0, 8.0, 9.0)), 43 | List(0, 1, 2), List(0, 1, 2)) 44 | ``` 45 | 46 | 47 | >
 48 | > constructVector: (X: breeze.linalg.Matrix[Double], ii: Seq[Int], jj: Seq[Int])breeze.linalg.DenseVector[Double]
 49 | > res4: breeze.linalg.DenseVector[Double] = DenseVector(1.0, 5.0, 9.0)
 50 | >
51 | 52 | > DenseVector(1.0, 5.0, 9.0) 53 | 54 | ** Problem 2. ** Write a method to calculate the product of nonzero elements on the diagonal of a rectangular matrix. For example, for X = Matrix((1.0, 0.0, 1.0), (2.0, 0.0, 2.0), (3.0, 0.0, 3.0), (4.0, 4.0, 4.0)) the answer is Some(3). If there are no nonzero elements, the method should return None. 55 | 56 | ```scala 57 | def nonzeroProduct(X: Matrix[Double]): Option[Double] = ??? 58 | ``` 59 | 60 | 61 | >
 62 | > nonzeroProduct: (X: breeze.linalg.Matrix[Double])Option[Double]
 63 | >
64 | 65 | 66 | 67 | 68 | ```scala 69 | // Solution for problem 2 70 | def nonzeroProduct(X: Matrix[Double]): Option[Double] = 71 | (0 until min(X.rows, X.cols)).map(i => X(i, i)).filter(_ != 0) match { 72 | case Seq() => None 73 | case xs => Some(xs.reduce(_ * _)) 74 | } 75 | 76 | nonzeroProduct(Matrix((1.0, 0.0, 1.0), (2.0, 0.0, 2.0), (3.0, 0.0, 3.0), (4.0, 4.0, 4.0))) 77 | ``` 78 | 79 | 80 | >
 81 | > nonzeroProduct: (X: breeze.linalg.Matrix[Double])Option[Double]
 82 | > res7: Option[Double] = Some(3.0)
 83 | >
84 | 85 | > Some(3.0) 86 | 87 | ** Problem 3. ** Write a method that finds the maximum among the elements that are immediately preceded by a zero element. For example, for Vector(6, 2, 0, 3, 0, 0, 5, 7, 0) the answer is Some(5). If there are no such elements, the method should return None. 88 | 89 | ```scala 90 | def maxAfterZeroElement(vec: Vector[Double]): Option[Double] = ??? 91 | ``` 92 | 93 | 94 | >
 95 | > maxAfterZeroElement: (vec: breeze.linalg.Vector[Double])Option[Double]
 96 | >
97 | 98 | 99 | 100 | 101 | ```scala 102 | def maxAfterZeroElement(vec: Vector[Double]): Option[Double] = 103 | vec.toArray.foldLeft((None, false): (Option[Double], Boolean))( 104 | (prev: (Option[Double], Boolean), el: Double) => 105 | if (el == 0) { 106 | (prev._1, true) 107 | } else { 108 | prev match { 109 | case (p, false) => (p, false) 110 | case (None, true) => (Some(el), false) 111 | case (Some(m), true) => ({if (el > m) Some(el) else Some(m)}, false) 112 | } 113 | } 114 | )._1 115 | ``` 116 | 117 | 118 | >
119 | > maxAfterZeroElement: (vec: breeze.linalg.Vector[Double])Option[Double]
120 | >
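A quick check on the example from the problem statement (the second call is a made-up input without zero elements, so it should give `None`):

```scala
maxAfterZeroElement(DenseVector(6.0, 2.0, 0.0, 3.0, 0.0, 0.0, 5.0, 7.0, 0.0))  // Some(5.0)
maxAfterZeroElement(DenseVector(1.0, 2.0, 3.0))                                // None
```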
121 | 122 | 123 | 124 | ** Problem 4. ** Write a method that takes Matrix X and some number Double v and returns closest matrix element to given number v. For example: for X = new DenseMatrix(2, 5, DenseVector.range(0, 10).mapValues(_.toDouble).toArray) and v = 3.6 the answer would be 4.0. 125 | 126 | ```scala 127 | def closestValue(X: DenseMatrix[Double], v: Double): Double = ??? 128 | ``` 129 | 130 | 131 | >
132 | > closestValue: (X: breeze.linalg.DenseMatrix[Double], v: Double)Double
133 | >
134 | 135 | 136 | 137 | 138 | ```scala 139 | // Solution for problem 4 140 | import scala.math.abs 141 | 142 | def closestValue(X: DenseMatrix[Double], v: Double): Double = 143 | X(argmin(X.map(e => abs(e - v)))) 144 | ``` 145 | 146 | 147 | >
148 | > import scala.math.abs
149 | > closestValue: (X: breeze.linalg.DenseMatrix[Double], v: Double)Double
150 | >
151 | 152 | 153 | 154 | 155 | ```scala 156 | // Another solution for problem 4 157 | import breeze.numerics.abs 158 | 159 | def closestValue(X: DenseMatrix[Double], v: Double): Double = 160 | X(argmin(abs(X - v))) 161 | ``` 162 | 163 | 164 | >
165 | > import breeze.numerics.abs
166 | > closestValue: (X: breeze.linalg.DenseMatrix[Double], v: Double)Double
167 | >
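A quick check using the example from the problem statement (expected answer: 4.0):

```scala
val testM = new DenseMatrix(2, 5, DenseVector.range(0, 10).mapValues(_.toDouble).toArray)
closestValue(testM, 3.6)  // 4.0
```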
168 | 169 | 170 | 171 | ** Problem 5. ** Write a method that takes Matrix X and scales each column of this matrix by subtracting mean value and dividing by standard deviation of the column. For testing one can generate random matrix. Avoid division by zero. 172 | 173 | ```scala 174 | def scale(X: DenseMatrix[Double]): Unit = ??? 175 | ``` 176 | 177 | 178 | >
179 | > scale: (X: breeze.linalg.DenseMatrix[Double])Unit
180 | >
181 | 182 | 183 | 184 | 185 | ```scala 186 | // Solution for problem 5 187 | def scale(X: DenseMatrix[Double]): Unit = { 188 | val mm = mean(X(::, *)) // using broadcasting 189 | val std = stddev(X(::, *)) // https://github.com/scalanlp/breeze/wiki/Quickstart#broadcasting 190 | (0 until X.cols).foreach{i => 191 | if (std(0, i) == 0.0) { 192 | X(::, i) := 0.0 193 | } else { 194 | X(::, i) := (X(::, i) - mm(0, i)) :/ std(0, i) 195 | } 196 | } 197 | } 198 | ``` 199 | 200 | 201 | >
202 | > scale: (X: breeze.linalg.DenseMatrix[Double])Unit
203 | >
204 | 205 | 206 | 207 | 208 | ```scala 209 | // Another solution for problem 5 210 | def scale(X: DenseMatrix[Double]): Unit = 211 | (0 until X.cols).map{i => 212 | val col = X(::, i) 213 | val std = stddev(col) 214 | if (std != 0.0) { 215 | X(::, i) := (col - mean(col)) / std 216 | } else { 217 | X(::, i) := DenseVector.zeros[Double](col.size) 218 | } 219 | } 220 | ``` 221 | 222 | 223 | >
224 | > scale: (X: breeze.linalg.DenseMatrix[Double])Unit
225 | >
226 | 227 | 228 | 229 | 230 | ```scala 231 | // Let's test our scale method on random data 232 | val nd = new Gaussian(12, 20) 233 | val m = DenseMatrix.rand(10, 3, nd) 234 | println(m) 235 | println("============") 236 | scale(m) 237 | println(m) 238 | ``` 239 | 240 | 241 | >
242 | > 15.590452840444563  26.751701453651677   -3.87442957211206    
243 | > 20.327157147052404  4.872835405186789    -1.723076564770194   
244 | > 8.623837647458954   -12.515032706820008  17.23652514034355    
245 | > -22.6959606971933   -3.5252869052855402  -28.569802562830404  
246 | > 5.084148521366598   6.537587281421278    1.27947368109675     
247 | > 45.550604542120766  33.63584014298664    14.398835562651708   
248 | > 28.39067989774948   21.884251067827837   26.21188242480804    
249 | > 35.760270426060366  33.15913097645061    43.652905311745315   
250 | > -6.957271573704126  30.631777233387844   4.858850308567796    
251 | > 32.17744687777203   8.983683803901943    4.909365750891229    
252 | > ============
253 | > -0.02858428109928919  0.714489638531793    -0.6056134391326071   
254 | > 0.1990918470152323    -0.6204508202172598  -0.4943741445319128   
255 | > -0.36344410568028807  -1.6813727428933674  0.48596367654601474   
256 | > -1.8688727663822855   -1.132862808340346   -1.882528878864535    
257 | > -0.5335840727948753   -0.5188758744023809  -0.3391222773517981   
258 | > 1.411491116397139     1.1345258158947291   0.33923620511713787   
259 | > 0.5866760236928795    0.41750183136681246  0.9500494912429874    
260 | > 0.9409054052767336    1.1054393745747901   1.8518666623146085    
261 | > -1.1123712899791023   0.9512327188133928   -0.1540446402595702   
262 | > 0.7686921235538567    -0.3696271333281644  -0.15143265508032536  
263 | > nd: breeze.stats.distributions.Gaussian = Gaussian(12.0, 20.0)
264 | > m: breeze.linalg.DenseMatrix[Double] = 
265 | > -0.02858428109928919  0.714489638531793    -0.6056134391326071   
266 | > 0.1990918470152323    -0.6204508202172598  -0.4943741445319128   
267 | > -0.36344410568028807  -1.6813727428933674  0.48596367654601474   
268 | > -1.8688727663822855   -1.132862808340346   -1.882528878864535    
269 | > -0.5335840727948753   -0.5188758744023809  -0.3391222773517981   
270 | > 1.411491116397139     1.1345258158947291   0.33923620511713787   
271 | > 0.5866760236928795    0.41750183136681246  0.9500494912429874    
272 | > 0.9409054052767336    1.1054393745747901   1.8518666623146085    
273 | > -1.1123712899791023   0.9512327188133928   -0.1540446402595702   
274 | > 0.7686921235538567    -0.3696271333281644  -0.15143265508032536  
275 | >
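As an extra sanity check (a small sketch reusing the broadcasting expressions from the first solution), after scaling each column should have mean close to 0 and standard deviation close to 1:

```scala
println(mean(m(::, *)))    // each entry should be ~ 0.0
println(stddev(m(::, *)))  // each entry should be ~ 1.0
```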
276 | 277 | 278 | 279 | ** Problem 6. ** Implement a method that for given matrix X finds: 280 | * the determinant 281 | * the trace 282 | * max and min elements 283 | * Frobenius Norm 284 | * eigenvalues 285 | * inverse matrix 286 | 287 | For testing one can generate random matrix from normal distribution $N(10, 1)$. 288 | 289 | ```scala 290 | def getStats(X: Matrix[Double]): Unit = ??? 291 | ``` 292 | 293 | 294 | >
295 | > getStats: (X: breeze.linalg.Matrix[Double])Unit
296 | >
297 | 298 | 299 | 300 | 301 | ```scala 302 | // Solution for problem 6 303 | def getStats(X: DenseMatrix[Double]): String = { 304 | val dt = det(X) 305 | val tr = trace(X) 306 | val minE = min(X) 307 | val maxE = max(X) 308 | val frob = breeze.linalg.norm(X.toDenseVector) 309 | val ev = eig(X).eigenvalues 310 | val invM = inv(X) 311 | 312 | s"""Stats: 313 | determinant: $dt 314 | trace: $tr 315 | min element: $minE 316 | max element: $maxE 317 | Frobenius Norm: $frob 318 | eigenvalues: $ev 319 | inverse matrix:\n$invM""".stripMargin 320 | } 321 | ``` 322 | 323 | 324 | >
325 | > getStats: (X: breeze.linalg.DenseMatrix[Double])String
326 | >
327 | 328 | 329 | 330 | 331 | ```scala 332 | // Let's test our scale method on random data 333 | val nd = new Gaussian(10, 1) 334 | val X = DenseMatrix.rand(4, 4, nd) 335 | ``` 336 | 337 | 338 | >
339 | > nd: breeze.stats.distributions.Gaussian = Gaussian(10.0, 1.0)
340 | > X: breeze.linalg.DenseMatrix[Double] = 
341 | > 10.15867550081024   10.713391519035639  10.18898336794234   11.633517053992334  
342 | > 9.077895190590993   10.687077605375258  9.75691251834008    10.289451974113568  
343 | > 12.419948133142773  8.799359381094582   12.333412584337028  9.616047767507087   
344 | > 9.018762639197664   11.122058811926983  9.603119538562519   10.441697550864596  
345 | >
346 | 347 | 348 | 349 | 350 | ```scala 351 | println(getStats(X)) 352 | ``` 353 | 354 | 355 | >
356 | > Stats:
357 | > determinant: -14.64894396592202
358 | > trace: 43.62086324138712
359 | > min element: 8.799359381094582
360 | > max element: 12.419948133142773
361 | > Frobenius Norm: 41.681818838737364
362 | > eigenvalues: DenseVector(41.461632636433905, 1.182643130384728, 1.182643130384728, -0.20605565581625584)
363 | > inverse matrix:
364 | > 0.37634342430946144  -4.699111409373191   0.45067158561397047   3.796260506021671    
365 | > -0.3874775018168392  -1.7712409918032728  0.09247520887399419   2.0919567065524114   
366 | > -0.6039672881460412  4.807753751137877    -0.20653804947039545  -3.8745431604779714  
367 | > 0.6431296809327107   1.523754431265583    -0.297806479947489    -1.8480459447576396  
368 | >
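A small worked check on this output: the sum of the eigenvalues, $41.46 + 1.18 + 1.18 - 0.21 \approx 43.62$, matches the trace, as it should. The repeated eigenvalue suggests a complex-conjugate pair (breeze's `eig` returns the real and imaginary parts in separate vectors), which is also why the product of these real parts does not reproduce the determinant.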
369 | 370 | 371 | 372 | ### DataFrames 373 | * https://databricks.com/blog/2015/02/17/introducing-dataframes-in-spark-for-large-scale-data-science.html 374 | * http://spark.apache.org/docs/latest/sql-programming-guide.html 375 | 376 | In this lab we will be using [data](https://www.kaggle.com/c/titanic/download/train.csv) from [Titanic dataset](https://www.kaggle.com/c/titanic/data). 377 | To load data from csv file direct to Spark's Dataframe we will use [spark-csv](http://spark-packages.org/package/databricks/spark-csv) package. 378 | To add spark-csv package to spark notebook one could add "com.databricks:spark-csv_2.10:1.4.0" (or "com.databricks:spark-csv_2.11:1.4.0" for Scala 2.11) dependency into customDeps conf section. Alternatively one could specify this dependency in `--packages` command line option while submiting spark application to a cluster (`spark-submit`) or launching spark shell (`spark-shell`). 379 | 380 | ```scala 381 | import org.apache.spark.sql.SQLContext 382 | ``` 383 | 384 | 385 | >
386 | > import org.apache.spark.sql.SQLContext
387 | >
388 | 389 | 390 | 391 | 392 | ```scala 393 | val sqlContext = new SQLContext(sc) 394 | 395 | val df = sqlContext.read 396 | .format("com.databricks.spark.csv") 397 | .option("header", "true") 398 | .option("inferSchema", "true") 399 | .load("notebooks/labs/DataAnalysisToolbox/titanic.csv") 400 | ``` 401 | 402 | 403 | >
404 | > sqlContext: org.apache.spark.sql.SQLContext = org.apache.spark.sql.SQLContext@31b5f894
405 | > df: org.apache.spark.sql.DataFrame = [PassengerId: int, Survived: int, Pclass: int, Name: string, Sex: string, Age: double, SibSp: int, Parch: int, Ticket: string, Fare: double, Cabin: string, Embarked: string]
406 | >
407 | 408 | 409 | 410 | 411 | ```scala 412 | // df.show() 413 | df.limit(5) 414 | ``` 415 | 416 | 417 | >
418 | > res26: org.apache.spark.sql.DataFrame = [PassengerId: int, Survived: int, Pclass: int, Name: string, Sex: string, Age: double, SibSp: int, Parch: int, Ticket: string, Fare: double, Cabin: string, Embarked: string]
419 | >
420 | 421 | 422 | 423 | **Problem 1.** Describe given dataset by answering following questions. How many women and men were on board? How many passengers were in each class? What is the average/minimum/maximum age of passengers? What can you say about the number of the surviving passengers? 424 | 425 | ```scala 426 | // Solution for problem 1 427 | import org.apache.spark.sql.functions.{min, max, mean} 428 | 429 | df.groupBy("Sex").count().show() 430 | df.groupBy("Pclass").count().show() 431 | df.select(mean("Age").alias("Average Age"), min("Age"), max("Age")).show() 432 | 433 | val totalPassengers = df.count() 434 | val survived = df.groupBy("Survived").count() 435 | survived.withColumn("%", (survived("count") / totalPassengers) * 100).show() 436 | ``` 437 | 438 | 439 | >
440 | > +------+-----+
441 | > |   Sex|count|
442 | > +------+-----+
443 | > |female|  314|
444 | > |  male|  577|
445 | > +------+-----+
446 | > 
447 | > +------+-----+
448 | > |Pclass|count|
449 | > +------+-----+
450 | > |     1|  216|
451 | > |     2|  184|
452 | > |     3|  491|
453 | > +------+-----+
454 | > 
455 | > +-----------------+--------+--------+
456 | > |      Average Age|min(Age)|max(Age)|
457 | > +-----------------+--------+--------+
458 | > |29.69911764705882|    0.42|    80.0|
459 | > +-----------------+--------+--------+
460 | > 
461 | > +--------+-----+-----------------+
462 | > |Survived|count|                %|
463 | > +--------+-----+-----------------+
464 | > |       0|  549|61.61616161616161|
465 | > |       1|  342|38.38383838383838|
466 | > +--------+-----+-----------------+
467 | > 
468 | > import org.apache.spark.sql.functions.{min, max, mean}
469 | > totalPassengers: Long = 891
470 | > survived: org.apache.spark.sql.DataFrame = [Survived: int, count: bigint]
471 | >
472 | 473 | 474 | 475 | **Problem 2.** Is it true that women were more likely to survive than men? Who had more chances to survive: the passenger with a cheap ticket or the passenger with an expensive one? Is that true that youngest passengers had more chances to survive? 476 | 477 | ```scala 478 | import org.apache.spark.sql.functions.{sum, count} 479 | import org.apache.spark.sql.types.IntegerType 480 | ``` 481 | 482 | 483 | >
484 | > import org.apache.spark.sql.functions.{sum, count}
485 | > import org.apache.spark.sql.types.IntegerType
486 | >
487 | 488 | 489 | 490 | 491 | ```scala 492 | // Answer for q1 493 | df.groupBy("Sex") 494 | .agg((sum("Survived") / count("Survived")) 495 | .alias("survived part")) 496 | .show() 497 | ``` 498 | 499 | 500 | >
501 | > +------+-------------------+
502 | > |   Sex|      survived part|
503 | > +------+-------------------+
504 | > |female| 0.7420382165605095|
505 | > |  male|0.18890814558058924|
506 | > +------+-------------------+
507 | >
508 | 509 | 510 | 511 | Women were more likely to survive. 512 | 513 | ```scala 514 | // Answer for q2 515 | val survivedByFareRange = df.select(df("Survived"), 516 | ((df("Fare") / (df("SibSp") + df("Parch") + 1) / 5).cast(IntegerType) 517 | ).alias("fareRange")) 518 | 519 | survivedByFareRange.groupBy("fareRange") 520 | .agg((sum("Survived") / count("Survived")).alias("Survived part"), 521 | count("Survived").alias("passengers num")) 522 | .sort("fareRange") 523 | .show() 524 | ``` 525 | 526 | 527 | >
528 | > +---------+-------------------+--------------+
529 | > |fareRange|      Survived part|passengers num|
530 | > +---------+-------------------+--------------+
531 | > |        0|0.26744186046511625|            86|
532 | > |        1|0.27058823529411763|           425|
533 | > |        2| 0.4122137404580153|           131|
534 | > |        3| 0.5652173913043478|            23|
535 | > |        4| 0.2222222222222222|             9|
536 | > |        5| 0.5714285714285714|            70|
537 | > |        6|             0.5625|            32|
538 | > |        7|               0.56|            25|
539 | > |        8|                0.6|            15|
540 | > |        9|               0.75|             8|
541 | > |       10| 0.4166666666666667|            12|
542 | > |       11|                0.8|            10|
543 | > |       13|                1.0|             3|
544 | > |       14|               0.25|             4|
545 | > |       15| 0.6666666666666666|             9|
546 | > |       16|                1.0|             3|
547 | > |       17|                1.0|             3|
548 | > |       18|                1.0|             1|
549 | > |       21|                1.0|             3|
550 | > |       22|                1.0|             2|
551 | > +---------+-------------------+--------------+
552 | > only showing top 20 rows
553 | > 
554 | > survivedByFareRange: org.apache.spark.sql.DataFrame = [Survived: int, fareRange: int]
555 | >
556 | 557 | 558 | 559 | We can see that passengers with cheapest tickets had lowest chances to survive. To obtain ticket cost per passenger we had to divide ticket fare by number of persons (one person itself + number of Siblings/Spouses aboard + number of parents/children aboard) included in fare. 560 | 561 | ```scala 562 | // Answer for q3 563 | val survivedByAgeDecade = df.select(df("Survived"), 564 | ((df("Age") / 10).cast(IntegerType)).alias("decade")) 565 | survivedByAgeDecade.filter(survivedByAgeDecade("decade").isNotNull). 566 | groupBy("decade") 567 | .agg((sum("Survived") / count("Survived")).alias("Survived part"), 568 | count("Survived").alias("passengers num")) 569 | .sort("decade") 570 | .show() 571 | ``` 572 | 573 | 574 | >
575 | > +------+-------------------+--------------+
576 | > |decade|      Survived part|passengers num|
577 | > +------+-------------------+--------------+
578 | > |     0| 0.6129032258064516|            62|
579 | > |     1| 0.4019607843137255|           102|
580 | > |     2|               0.35|           220|
581 | > |     3|  0.437125748502994|           167|
582 | > |     4|0.38202247191011235|            89|
583 | > |     5| 0.4166666666666667|            48|
584 | > |     6| 0.3157894736842105|            19|
585 | > |     7|                0.0|             6|
586 | > |     8|                1.0|             1|
587 | > +------+-------------------+--------------+
588 | > 
589 | > survivedByAgeDecade: org.apache.spark.sql.DataFrame = [Survived: int, decade: int]
590 | >
591 | 592 | 593 | 594 | Here we can see that the youngest passengers had more chances to survive. 595 | **Problem 3.** Find all features with missing values. Suggest ways of handling features with missing values and specify their advantages and disadvantages. Apply these methods to the given dataset. 596 | **A.** Missing values can be replaced by the mean, the median or the most frequent value. The mean is not a robust statistic since it is strongly influenced by outliers and is better suited for normally distributed features. The median is a more robust estimator for data with high-magnitude values and is generally used for skewed distributions. The most frequent value is better suited for categorical features. 597 | 598 | ```scala 599 | df.columns.filter(col => df.filter(df(col).isNull).count > 0) 600 | ``` 601 | 602 | 603 | >
604 | > res37: Array[String] = Array(Age)
605 | >
606 | 607 | 608 | 609 | 610 | ```scala 611 | // using mean value 612 | val meanAge = df.select(mean("Age")).first.getDouble(0) 613 | df.select("Age").na.fill(meanAge).limit(10) 614 | ``` 615 | 616 | 617 | >
618 | > meanAge: Double = 29.69911764705882
619 | > res39: org.apache.spark.sql.DataFrame = [Age: double]
620 | >
621 | 622 | 623 | 624 | 625 | ```scala 626 | // using median value 627 | import org.apache.spark.SparkContext._ 628 | 629 | def getMedian(rdd: RDD[Double]): Double = { 630 | val sorted = rdd.sortBy(identity).zipWithIndex().map { 631 | case (v, idx) => (idx, v) 632 | } 633 | 634 | val count = sorted.count() 635 | 636 | if (count % 2 == 0) { 637 | val l = count / 2 - 1 638 | val r = l + 1 639 | (sorted.lookup(l).head + sorted.lookup(r).head).toDouble / 2 640 | } else sorted.lookup(count / 2).head.toDouble 641 | } 642 | val ageRDD = df.filter(df("Age").isNotNull).select("Age").map(row => row.getDouble(0)) 643 | val medianAge = getMedian(ageRDD) 644 | 645 | df.select("Age").na.fill(medianAge).limit(10) 646 | ``` 647 | 648 | 649 | >
650 | > import org.apache.spark.SparkContext._
651 | > getMedian: (rdd: org.apache.spark.rdd.RDD[Double])Double
652 | > ageRDD: org.apache.spark.rdd.RDD[Double] = MapPartitionsRDD[282] at map at :91
653 | > medianAge: Double = 28.0
654 | > res41: org.apache.spark.sql.DataFrame = [Age: double]
655 | >
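The most-frequent-value strategy mentioned above fits categorical features. `Age` is the only column with missing values here, so the following is purely a hypothetical sketch: it assumes a categorical column such as `Embarked` containing nulls and fills them with the modal value.

```scala
// Hypothetical sketch: fill a categorical column with its most frequent value
import org.apache.spark.sql.functions.desc

val mostFrequent = df.filter(df("Embarked").isNotNull)
                     .groupBy("Embarked").count()
                     .orderBy(desc("count"))
                     .first.getString(0)

df.na.fill(Map("Embarked" -> mostFrequent))
```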
656 | 657 | 658 | 659 | ### C3 Charts 660 | * http://c3js.org/examples.html 661 | * also have a look at `viz/Simple & Flexible Custom C3 Charts` notebook supplied with spark-notebook distribution. 662 | 663 | ```scala 664 | import notebook.front.widgets.CustomC3Chart 665 | ``` 666 | 667 | 668 | >
669 | > import notebook.front.widgets.CustomC3Chart
670 | >
671 | 672 | 673 | 674 | ** Problem 1. ** Plot the function y(x) in blue and its confidence interval as a green shaded area, using data generated by the following function. 675 | 676 | ```scala 677 | import breeze.linalg._ 678 | import breeze.numerics._ 679 | import breeze.stats.distributions._ 680 | import math.{Pi=>pi} 681 | 682 | val genData = () => { 683 | val x = linspace(0, 30, 100) 684 | val y = sin(x*pi/6.0) + DenseVector.rand(x.size, new Gaussian(0, 0.02)) 685 | val error = DenseVector.rand(y.size, new Gaussian(0.1, 0.02)) 686 | (x, y, error) 687 | } 688 | ``` 689 | 690 | 691 | >
692 | > import breeze.linalg._
693 | > import breeze.numerics._
694 | > import breeze.stats.distributions._
695 | > import math.{Pi=>pi}
696 | > genData: () => (breeze.linalg.DenseVector[Double], breeze.linalg.DenseVector[Double], breeze.linalg.DenseVector[Double]) = 
697 | >
698 | 699 | 700 | 701 | 702 | ```scala 703 | // Incomplete solution (follow the issue https://github.com/c3js/c3/issues/402) 704 | 705 | val (x, y, error) = genData() 706 | 707 | case class Point(x: Double, y: Double, plusError: Double, minusError: Double) 708 | 709 | val plotData = x.toArray.zip(y.toArray).zip(error.toArray).map(pp => Point(pp._1._1, 710 | pp._1._2, 711 | pp._1._2 + pp._2, 712 | pp._1._2 - pp._2)) 713 | CustomC3Chart(plotData, 714 | """{ data: { x: 'x', 715 | types: {y: 'line', plusError: 'line', minusError: 'line'}, 716 | colors: {y: 'blue', 717 | plusError: 'green', 718 | minusError: 'green'} 719 | }, 720 | point: { 721 | show: false 722 | } 723 | }""") 724 | ``` 725 | 726 | 727 | >
728 | > x: breeze.linalg.DenseVector[Double] = DenseVector(0.0, 0.30303030303030304, 0.6060606060606061, 0.9090909090909092, 1.2121212121212122, 1.5151515151515151, 1.8181818181818183, 2.121212121212121, 2.4242424242424243, 2.7272727272727275, 3.0303030303030303, 3.3333333333333335, 3.6363636363636367, 3.9393939393939394, 4.242424242424242, 4.545454545454546, 4.848484848484849, 5.151515151515151, 5.454545454545455, 5.757575757575758, 6.0606060606060606, 6.363636363636364, 6.666666666666667, 6.96969696969697, 7.272727272727273, 7.575757575757576, 7.878787878787879, 8.181818181818182, 8.484848484848484, 8.787878787878789, 9.090909090909092, 9.393939393939394, 9.696969696969697, 10.0, 10.303030303030303, 10.606060606060606, 10.90909090909091, 11.212121212121213, 11.515151515151516, 11.818181818181...
729 | >
730 | 731 | plot 732 | 733 | 734 | ** Problem 2. ** Plot histogram of ages for each passenger class (use data from Titanic dataset). 735 | 736 | ```scala 737 | // Let's start with histogram of ages of all passengers. 738 | val ageRdd = df.select("Age").rdd.map(r => r.getAs[Double](0)) 739 | val ageHist = ageRdd.histogram(10) 740 | 741 | case class AgeHistPoint(ageBucket: Double, age: Long) 742 | 743 | val ageHistData = ageHist._1.zip(ageHist._2).map(pp => AgeHistPoint(pp._1, pp._2)) 744 | 745 | CustomC3Chart(ageHistData, 746 | chartOptions = """ 747 | { data: { x: 'ageBucket', 748 | type: 'bar'}, 749 | bar: { 750 | width: {ratio: 0.9} 751 | }, 752 | axis: { 753 | y: { 754 | label: 'Count' 755 | } 756 | } 757 | } 758 | """) 759 | ``` 760 | 761 | 762 | >
763 | > ageRdd: org.apache.spark.rdd.RDD[Double] = MapPartitionsRDD[312] at map at :36
764 | > ageHist: (Array[Double], Array[Long]) = (Array(0.0, 8.0, 16.0, 24.0, 32.0, 40.0, 48.0, 56.0, 64.0, 72.0, 80.0),Array(227, 33, 164, 181, 123, 74, 50, 26, 11, 2))
765 | > defined class AgeHistPoint
766 | > ageHistData: Array[AgeHistPoint] = Array(AgeHistPoint(0.0,227), AgeHistPoint(8.0,33), AgeHistPoint(16.0,164), AgeHistPoint(24.0,181), AgeHistPoint(32.0,123), AgeHistPoint(40.0,74), AgeHistPoint(48.0,50), AgeHistPoint(56.0,26), AgeHistPoint(64.0,11), AgeHistPoint(72.0,2))
767 | > res47: notebook.front.widgets.CustomC3Chart[Array[AgeHistPoint]] = 
768 | >
769 | 770 | hist 771 | 772 | 773 | ```scala 774 | // Now let's expand our solution. 775 | val buckets = linspace(0, 100, 11).toArray 776 | val p1AgesHist = df.filter(df("Pclass")===1) 777 | .select("Age") 778 | .rdd 779 | .map(r => r.getAs[Double](0)) 780 | .histogram(buckets) 781 | val p2AgesHist = df.filter(df("Pclass")===2) 782 | .select("Age") 783 | .rdd 784 | .map(r => r.getAs[Double](0)) 785 | .histogram(buckets) 786 | val p3AgesHist = df.filter(df("Pclass")===3) 787 | .select("Age") 788 | .rdd 789 | .map(r => r.getAs[Double](0)) 790 | .histogram(buckets) 791 | ``` 792 | 793 | 794 | >
795 | > buckets: Array[Double] = Array(0.0, 10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0, 90.0, 100.0)
796 | > p1AgesHist: Array[Long] = Array(33, 18, 34, 50, 37, 27, 13, 3, 1, 0)
797 | > p2AgesHist: Array[Long] = Array(28, 18, 53, 48, 18, 15, 3, 1, 0, 0)
798 | > p3AgesHist: Array[Long] = Array(178, 66, 133, 69, 34, 6, 3, 2, 0, 0)
799 | >
800 | 801 | 802 | 803 | 804 | ```scala 805 | case class AgeHistPoint(ageBucket: Double, c1: Long, c2: Long, c3: Long) 806 | 807 | val ageHistData = (0 until buckets.length - 1).map(i => AgeHistPoint(buckets(i), p1AgesHist(i), p2AgesHist(i), p3AgesHist(i))).toArray 808 | ``` 809 | 810 | 811 | >
812 | > defined class AgeHistPoint
813 | > ageHistData: Array[AgeHistPoint] = Array(AgeHistPoint(0.0,33,28,178), AgeHistPoint(10.0,18,18,66), AgeHistPoint(20.0,34,53,133), AgeHistPoint(30.0,50,48,69), AgeHistPoint(40.0,37,18,34), AgeHistPoint(50.0,27,15,6), AgeHistPoint(60.0,13,3,3), AgeHistPoint(70.0,3,1,2), AgeHistPoint(80.0,1,0,0), AgeHistPoint(90.0,0,0,0))
814 | >
815 | 816 | 817 | 818 | 819 | ```scala 820 | CustomC3Chart(ageHistData, 821 | chartOptions = """ 822 | { data: { x: 'ageBucket', 823 | type: 'bar'}, 824 | bar: { 825 | width: {ratio: 0.9} 826 | }, 827 | axis: { 828 | y: {label: 'Count'} 829 | } 830 | } 831 | """) 832 | ``` 833 | 834 | 835 | >
836 | > res51: notebook.front.widgets.CustomC3Chart[Array[AgeHistPoint]] = 
837 | >
838 | 839 | ageHistPerClassStacked 840 | 841 | 842 | ```scala 843 | // Using stacked bar chart 844 | CustomC3Chart(ageHistData, 845 | chartOptions = """ 846 | { data: { x: 'ageBucket', 847 | type: 'bar', 848 | groups: [['c1', 'c2', 'c3']]}, 849 | bar: { 850 | width: {ratio: 0.9} 851 | }, 852 | axis: { 853 | y: {label: 'Count'} 854 | } 855 | } 856 | """) 857 | ``` 858 | 859 | 860 | >
861 | > res53: notebook.front.widgets.CustomC3Chart[Array[AgeHistPoint]] = 
862 | >
863 | 864 | ageHistPerClassStacked 865 | -------------------------------------------------------------------------------- /labs/DataAnalysisToolbox/images/ageHist.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/drewnoff/spark-notebook-ml-labs/26f80824cece2c3b78c050937ccb62e843e0de65/labs/DataAnalysisToolbox/images/ageHist.png -------------------------------------------------------------------------------- /labs/DataAnalysisToolbox/images/ageHistPerClass.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/drewnoff/spark-notebook-ml-labs/26f80824cece2c3b78c050937ccb62e843e0de65/labs/DataAnalysisToolbox/images/ageHistPerClass.png -------------------------------------------------------------------------------- /labs/DataAnalysisToolbox/images/ageHistPerClassStacked.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/drewnoff/spark-notebook-ml-labs/26f80824cece2c3b78c050937ccb62e843e0de65/labs/DataAnalysisToolbox/images/ageHistPerClassStacked.png -------------------------------------------------------------------------------- /labs/DataAnalysisToolbox/images/plotFunction.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/drewnoff/spark-notebook-ml-labs/26f80824cece2c3b78c050937ccb62e843e0de65/labs/DataAnalysisToolbox/images/plotFunction.png -------------------------------------------------------------------------------- /labs/IntroToMLandSparkMLPipelines/Intro To Machine Learning and SparkML Pipelines.snb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata" : { 3 | "name" : "Intro To Machine Learning and SparkML Pipelines", 4 | "user_save_timestamp" : "1199-01-01T03:00:00.000Z", 5 | "auto_save_timestamp" : "1970-01-01T03:00:00.000Z", 6 | "language_info" : { 7 | "name" : "scala", 8 | "file_extension" : "scala", 9 | "codemirror_mode" : "text/x-scala" 10 | }, 11 | "trusted" : true, 12 | "customLocalRepo" : null, 13 | "customRepos" : null, 14 | "customDeps" : null, 15 | "customImports" : null, 16 | "customArgs" : null, 17 | "customSparkConf" : { 18 | "spark.app.name" : "ScalaIO Machine Learning Pipeline", 19 | "spark.master" : "local[4]", 20 | "spark.executor.memory" : "2G" 21 | } 22 | }, 23 | "cells" : [ { 24 | "metadata" : { 25 | "id" : "2DD07D009297418F8AD85CE169ABCD6F" 26 | }, 27 | "cell_type" : "markdown", 28 | "source" : "# Introduction to Machine Learning and Spark ML Pipelines" 29 | }, { 30 | "metadata" : { 31 | "id" : "3300198B3B0943B080DC7DBE9884D190" 32 | }, 33 | "cell_type" : "markdown", 34 | "source" : "
\n \n \n
" 35 | }, { 36 | "metadata" : { 37 | "id" : "C426E89F077D4458812ADBD3017E7300" 38 | }, 39 | "cell_type" : "markdown", 40 | "source" : "# Machine learning Pipeline" 41 | }, { 42 | "metadata" : { 43 | "id" : "3D7218E315774439978B9F859CCC5CE1" 44 | }, 45 | "cell_type" : "markdown", 46 | "source" : "In this lab we are going to learn how to teach machine learning models, how to correctly set up an experiment, how to tune model hyperparameters and how to compare models. Also we'are going to get familiar with spark.ml package as soon as all of the work we'are going to get done using this package." 47 | }, { 48 | "metadata" : { 49 | "id" : "5EEC312DD9A34425884B50E36008151E" 50 | }, 51 | "cell_type" : "markdown", 52 | "source" : "* http://spark.apache.org/docs/latest/ml-guide.html\n* http://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.ml.package" 53 | }, { 54 | "metadata" : { 55 | "id" : "F1CC9FFD55F045598503E6FDF35276E8" 56 | }, 57 | "cell_type" : "markdown", 58 | "source" : "## Evaluation Metrics\nModel training and model quality assessment is performed on independent sets of examples. As a rule, the available examples are divided into two subsets: training (train) and control (test). The choice of the proportions of the split is a compromise. Indeed, the large size of the training leads to better quality of algorithms, but more noisy estimation of the model on the control. Conversely, the large size of the test sample leads to a less noisy assessment of the quality, however, models are less accurate.\n\nMany classification models produce estimation of belonging to the class $\\tilde{h}(x) \\in R$ (for example, the probability of belonging to the class 1). They then make a decision about the class of the object by comparing the estimates with a certain threshold $\\theta$:\n\n$h(x) = +1$, if $\\tilde{h}(x) \\geq \\theta$, $h(x) = -1$, if $\\tilde{h}(x) < \\theta$\n\nIn this case, we can consider metrics that are able to work with estimates of belonging to a class.\nIn this lab, we will work with [AUC-ROC](https://en.wikipedia.org/wiki/Receiver_operating_characteristic#Area_under_the_curve) metric. Detailed understanding of the operating principle of AUC-ROC metric is not required to perform the lab.\n## Model Hyperparameter Tuning\nIn machine learning problems it is necessary to distinguish the parameters of the model and hyperparameters (structural parameters). The model parameters are adjusted during the training (e.g., weights in the linear model or the structure of the decision tree), while hyperparameters are set in advance (for example, the regularization in linear model or maximum depth of the decision tree). Each model usually has many hyperparameters, and there is no universal set of hyperparameters optimal working in all tasks, for each task one should choose a different set of hyperparameters. _Grid search_ is commonly used to optimize model hyperparameters: for each parameter several values are selected and combination of parameter values where the model shows the best quality (in terms of the metric that is being optimized) is selected. However, in this case, it is necessary to correctly assess the constructed model, namely to do the split into training and test sample. There are several ways how it can be implemented:\n\n - Split the available samples into training and test samples. In this case, the comparison of a large number of models in the search of parameters leads to a situation when the best model on test data does not maintain its quality on new data. 
We can say that there is overfitting on the test data.\n - To eliminate the problem described above, it is possible to split data into 3 disjoint sub-samples: `train`, `validation` and `test`. The `validation` set is used for models comparison, and `test` set is used for the final quality assessment and comparison of families of models with selected parameters.\n - Another way to compare models is [cross-validation](https://en.wikipedia.org/wiki/Cross-validation_(statistics). There are different schemes of cross-validation:\n - Leave-one-out cross-validation\n - K-fold cross-validation\n - Repeated random sub-sampling validation\n \nCross-validation is computationally expensive operation, especially if you are doing a grid search with a very large number of combinations. So there are a number of compromises:\n - the grid can be made more sparse, touching fewer values for each parameter, however, we must not forget that in such case one can skip a good combination of parameters;\n - cross-validation can be done with a smaller number of partitions or folds, but in this case the quality assessment of cross-validation becomes more noisy and increases the risk to choose a suboptimal set of parameters due to the random nature of the split;\n - the parameters can be optimized sequentially (greedy) — one after another, and not to iterate over all combinations; this strategy does not always lead to the optimal set;\n - enumerate only small number of randomly selected combinations of values of hyperparameters." 59 | }, { 60 | "metadata" : { 61 | "id" : "681A5B7696EE4E75941D30477B87D473" 62 | }, 63 | "cell_type" : "markdown", 64 | "source" : "## Data\n\nWe'are going to solve binary classification problem by building the algorithm which determines whether a person makes over 50K a year. Following variables are available:\n* age\n* workclass\n* fnlwgt\n* education\n* education-num\n* marital-status\n* occupation\n* relationship\n* race\n* sex\n* capital-gain\n* capital-loss\n* hours-per-week\n\nMore on this data one can read in [UCI Machine Learning Repository](http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.names)" 65 | }, { 66 | "metadata" : { 67 | "trusted" : true, 68 | "input_collapsed" : false, 69 | "collapsed" : false, 70 | "id" : "3986C01E03884C09B551F434EAB5DA89" 71 | }, 72 | "cell_type" : "code", 73 | "source" : "val spark = sparkSession", 74 | "outputs" : [ ] 75 | }, { 76 | "metadata" : { 77 | "trusted" : true, 78 | "input_collapsed" : false, 79 | "collapsed" : false, 80 | "id" : "80AD7CC4C72645D78E62C11F9F1C838D" 81 | }, 82 | "cell_type" : "code", 83 | "source" : "val df = spark.read\n .option(\"header\", \"true\")\n .option(\"inferSchema\", \"true\")\n .csv(\"notebooks/spark-notebook-ml-labs/labs/IntroToMLandSparkMLPipelines/data/data.adult.csv\") ", 84 | "outputs" : [ ] 85 | }, { 86 | "metadata" : { 87 | "trusted" : true, 88 | "input_collapsed" : false, 89 | "collapsed" : false, 90 | "id" : "5CC83488592D4FFBA9D63A226935D96D" 91 | }, 92 | "cell_type" : "code", 93 | "source" : "df.limit(5)", 94 | "outputs" : [ ] 95 | }, { 96 | "metadata" : { 97 | "id" : "44EA054CC3514338B1F53B9B9722F0BF" 98 | }, 99 | "cell_type" : "markdown", 100 | "source" : "Sometimes there are missing values in the data. Sometimes, in the description of the dataset one can found the description of format of missing values. Particularly in the given dataset missing values are identified by '?' sign.\n\n**Problem** Find all the variables with missing values. 
Remove from the dataset all objects with missing values in any variable." 101 | }, { 102 | "metadata" : { 103 | "trusted" : true, 104 | "input_collapsed" : false, 105 | "collapsed" : false, 106 | "id" : "D22FE57836E541998BA905B0252FB07B" 107 | }, 108 | "cell_type" : "code", 109 | "source" : "val missingValsFeatures = df.columns.filter(column => df.filter(df(column) === \"?\").count > 0)\n\nprintln(\"Features with missing values: \" + missingValsFeatures.mkString(\", \"))\n\nval data = missingValsFeatures.foldLeft(df)((dfstage, column) => dfstage.filter(!dfstage(column).equalTo(\"?\")))", 110 | "outputs" : [ ] 111 | }, { 112 | "metadata" : { 113 | "id" : "D8D5DBE2B17641939EB490F5A2E80C9C" 114 | }, 115 | "cell_type" : "markdown", 116 | "source" : "Split on training and test datasets." 117 | }, { 118 | "metadata" : { 119 | "trusted" : true, 120 | "input_collapsed" : false, 121 | "collapsed" : false, 122 | "id" : "043DA92BF95A46D9B1CE7171EB007DA2" 123 | }, 124 | "cell_type" : "code", 125 | "source" : "val Array(training, test) = data.randomSplit(Array(0.8, 0.2), seed = 1234)", 126 | "outputs" : [ ] 127 | }, { 128 | "metadata" : { 129 | "id" : "7AE65B7403934817B40C0C18D3AFE774" 130 | }, 131 | "cell_type" : "markdown", 132 | "source" : "### MLlib Transformers and Estimators" 133 | }, { 134 | "metadata" : { 135 | "id" : "429E47695E694CCF9F3EDDA47C1B1343" 136 | }, 137 | "cell_type" : "markdown", 138 | "source" : "`Transformer` transforms one `DataFrame` into another `DataFrame`." 139 | }, { 140 | "metadata" : { 141 | "id" : "B3D8D8D439CA4AA583BF354AE382B80B" 142 | }, 143 | "cell_type" : "markdown", 144 | "source" : "
\n \n
" 145 | }, { 146 | "metadata" : { 147 | "id" : "BC347194CA8E49E0A8FD0430038614D6" 148 | }, 149 | "cell_type" : "markdown", 150 | "source" : "`Estimator` fits on a `DataFrame` to produce a `Transformer`." 151 | }, { 152 | "metadata" : { 153 | "id" : "2943774D4879475382D91BCB027CFCD7" 154 | }, 155 | "cell_type" : "markdown", 156 | "source" : "
\n \n
" 157 | }, { 158 | "metadata" : { 159 | "id" : "FDC6151EF3B3469481C85B1CE0B103B3" 160 | }, 161 | "cell_type" : "markdown", 162 | "source" : "## Training classifiers on numeric features" 163 | }, { 164 | "metadata" : { 165 | "id" : "AABD089AF368408F81FAC58D9C7D0DFC" 166 | }, 167 | "cell_type" : "markdown", 168 | "source" : "Some preprocessing steps are usually required after loading and cleaning dataset. In this case, these steps will include the following:\n\n - At first we will work only with numeric features. So let's select them separately in the feature vector \"numFeatures\" using [VectorAssembler](http://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.ml.feature.VectorAssembler).\n - Select the target variable (the one we want to predict, string column of labels) and map it to an ML column of label indices using [StringIndexer](http://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.ml.feature.StringIndexer), give the name \"labelIndex\" to a new variable." 169 | }, { 170 | "metadata" : { 171 | "trusted" : true, 172 | "input_collapsed" : false, 173 | "collapsed" : false, 174 | "id" : "295D6680F4B2436384642C3EAA6E218B" 175 | }, 176 | "cell_type" : "code", 177 | "source" : "import org.apache.spark.ml.feature.VectorAssembler\nimport org.apache.spark.ml.feature.StringIndexer\n\nval assembler = new VectorAssembler()\n .setInputCols(Array(\"age\",\n \"fnlwgt\", \n \"education-num\", \n \"capital-gain\", \n \"capital-loss\",\n \"hours-per-week\"))\n .setOutputCol(\"numFeatures\")\n\nval labelIndexer = new StringIndexer()\n .setInputCol(\">50K,<=50K\")\n .setOutputCol(\"label\")\n .fit(training)", 178 | "outputs" : [ ] 179 | }, { 180 | "metadata" : { 181 | "trusted" : true, 182 | "input_collapsed" : false, 183 | "collapsed" : false, 184 | "id" : "995D5E14FD31463D8C57B7B1485CBC9B" 185 | }, 186 | "cell_type" : "code", 187 | "source" : "labelIndexer.transform(training).select(\">50K,<=50K\", \"label\").show(8)", 188 | "outputs" : [ ] 189 | }, { 190 | "metadata" : { 191 | "trusted" : true, 192 | "input_collapsed" : false, 193 | "collapsed" : false, 194 | "id" : "00D124318ED3471F8D0E9464E0991432" 195 | }, 196 | "cell_type" : "code", 197 | "source" : "assembler.transform(training)\n .select(\"age\", \"fnlwgt\", \"education-num\", \"capital-gain\", \"capital-loss\", \"hours-per-week\", \"numFeatures\")\n .limit(5)", 198 | "outputs" : [ ] 199 | }, { 200 | "metadata" : { 201 | "trusted" : true, 202 | "input_collapsed" : false, 203 | "collapsed" : false, 204 | "id" : "524AD8781A2C40E78FB103D1A215730E" 205 | }, 206 | "cell_type" : "code", 207 | "source" : "val trainData = assembler.transform{\n labelIndexer.transform(training)\n }.select(\"label\", \"numFeatures\")\ntrainData.show(5, truncate=false)", 208 | "outputs" : [ ] 209 | }, { 210 | "metadata" : { 211 | "trusted" : true, 212 | "input_collapsed" : false, 213 | "collapsed" : false, 214 | "id" : "F5C8FDC16C4D482F9D876C00BEA165B5" 215 | }, 216 | "cell_type" : "code", 217 | "source" : "import org.apache.spark.ml.classification.LogisticRegression\nimport org.apache.spark.ml.evaluation.BinaryClassificationEvaluator\n\n\nval lr = new LogisticRegression()\n .setFeaturesCol(\"numFeatures\")\n .setLabelCol(\"label\")\n .setRegParam(0.1)\n\nval lrModel = lr.fit(trainData)", 218 | "outputs" : [ ] 219 | }, { 220 | "metadata" : { 221 | "trusted" : true, 222 | "input_collapsed" : false, 223 | "collapsed" : false, 224 | "id" : "6B1E03D882D84D698E81C3B798FD7E3B" 225 | }, 226 | "cell_type" : "code", 227 | "source" : "val 
testData = assembler.transform{\n labelIndexer.transform(test)\n }", 228 | "outputs" : [ ] 229 | }, { 230 | "metadata" : { 231 | "trusted" : true, 232 | "input_collapsed" : false, 233 | "collapsed" : false, 234 | "id" : "BC029CEEF51E4A8596F87D2B78A77F94" 235 | }, 236 | "cell_type" : "code", 237 | "source" : "val eval = new BinaryClassificationEvaluator()\n .setMetricName(\"areaUnderROC\")\n\nprintln(eval.evaluate(lrModel.transform(testData)))", 238 | "outputs" : [ ] 239 | }, { 240 | "metadata" : { 241 | "id" : "E1F6FC29B90A428D800EF818579F58D8" 242 | }, 243 | "cell_type" : "markdown", 244 | "source" : "## Model selection with MLlib\nApache Spark MLlib supports model hyperparameter tuning using tools such as `CrossValidator` and `TrainValidationSplit`. These tools require the following items:\n\n - Estimator: algorithm or Pipeline to tune\n - Set of ParamMaps: parameters to choose from, sometimes called a “parameter grid” to search over\n - Evaluator: metric to measure how well a fitted Model does on held-out test data" 245 | }, { 246 | "metadata" : { 247 | "id" : "2797280B7D8E490D8C69DE421E730265" 248 | }, 249 | "cell_type" : "markdown", 250 | "source" : "In this section we will need to work only with numeric features and a target variable.\nAt the beginning let's have a look at grid search in action.\nWe will consider 2 algorithms:\n - [LogisticRegression](http://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.ml.classification.LogisticRegression)\n - [DecisionTreeClassifier](http://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.ml.classification.DecisionTreeClassifier)\n \nTo start with, let's choose one parameter to optimize for each algorithm:\n - LogisticRegression — regularization parameter (*regParam*)\n - DecisonTreeClassifier — maximum depth of the tree (*maxDepth*)\n \nThe remaining parameters we will leave at their default values. \nTo implement grid search procedure one can use\n[CrossValidator](http://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.ml.tuning.CrossValidator) class\ncombining with [ParamGridBuilder](http://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.ml.tuning.ParamGridBuilder) class. \nAlso we need to specify appropriate evaluator for this task, in our case we should use [BinaryClassificationEvaluator](http://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.ml.evaluation.BinaryClassificationEvaluator)\n(note that its default metric is areaUnderROC, so we don't neet to specify metric via `setMetricName` method call).\nSet up 5-fold cross validation scheme." 251 | }, { 252 | "metadata" : { 253 | "id" : "FF01A8CDB111416ABE957E877A793ABB" 254 | }, 255 | "cell_type" : "markdown", 256 | "source" : "
K-fold cross-validation
\n
\n \n
\n
\n By Fabian Flöck (Own work) [CC BY-SA 3.0 (http://creativecommons.org/licenses/by-sa/3.0)], via Wikimedia Commons\n
" 257 | }, { 258 | "metadata" : { 259 | "id" : "EF27CAA566E04232AEC9CFC49A2F2B81" 260 | }, 261 | "cell_type" : "markdown", 262 | "source" : "**Problem** Try to find the optimal values of these hyperparameters for each algorithm. Plot the average cross-validation metrics for a given value of hyperparameter for each algorithm (hint: use `avgMetrics` field of resulting `CrossValidatorModel`)." 263 | }, { 264 | "metadata" : { 265 | "trusted" : true, 266 | "input_collapsed" : false, 267 | "collapsed" : false, 268 | "id" : "7BFE57FA1AC340E28BBA55FE9513BD60" 269 | }, 270 | "cell_type" : "code", 271 | "source" : "import org.apache.spark.ml.classification.{LogisticRegression, DecisionTreeClassifier, RandomForestClassifier}\nimport org.apache.spark.ml.evaluation.BinaryClassificationEvaluator\nimport org.apache.spark.ml.tuning.{ParamGridBuilder, CrossValidator}\n\n\nval lr = new LogisticRegression()\n .setFeaturesCol(\"numFeatures\")\n .setLabelCol(\"label\")\n\nval lrParamGrid = new ParamGridBuilder()\n .addGrid(lr.regParam, Array(1e-2, 5e-3, 1e-3, 5e-4, 1e-4))\n .build()\n\nval lrCV = new CrossValidator()\n .setEstimator(lr)\n .setEvaluator(new BinaryClassificationEvaluator)\n .setEstimatorParamMaps(lrParamGrid)\n .setNumFolds(5)\n\nval lrCVModel = lrCV.fit(trainData)", 272 | "outputs" : [ ] 273 | }, { 274 | "metadata" : { 275 | "trusted" : true, 276 | "input_collapsed" : false, 277 | "collapsed" : false, 278 | "id" : "8E42871F7DF2428F8C7901A12A063319" 279 | }, 280 | "cell_type" : "code", 281 | "source" : "println(\"cross-validated areaUnderROC: \" + lrCVModel.avgMetrics.max)\nprintln(\"test areaUnderROC: \" + eval.evaluate(lrCVModel.transform(testData)))", 282 | "outputs" : [ ] 283 | }, { 284 | "metadata" : { 285 | "trusted" : true, 286 | "input_collapsed" : false, 287 | "collapsed" : false, 288 | "id" : "D2CB230B21174A878C811CE6F13F21A3" 289 | }, 290 | "cell_type" : "code", 291 | "source" : "val tree = new DecisionTreeClassifier()\n .setFeaturesCol(\"numFeatures\")\n .setLabelCol(\"label\")\n\nval treeParamGrid = new ParamGridBuilder()\n .addGrid(tree.maxDepth, Array(5, 10, 20, 25, 30))\n .build()\n\nval treeCV = new CrossValidator()\n .setEstimator(tree)\n .setEvaluator(new BinaryClassificationEvaluator)\n .setEstimatorParamMaps(treeParamGrid)\n .setNumFolds(5)\n\nval treeCVModel = treeCV.fit(trainData)", 292 | "outputs" : [ ] 293 | }, { 294 | "metadata" : { 295 | "trusted" : true, 296 | "input_collapsed" : false, 297 | "collapsed" : false, 298 | "id" : "2185D3BB0F144FC284AF94D49B1F3F57" 299 | }, 300 | "cell_type" : "code", 301 | "source" : "println(\"cross-validated areaUnderROC: \" + treeCVModel.avgMetrics.max)\nprintln(\"test areaUnderROC: \" + eval.evaluate(treeCVModel.transform(testData)))", 302 | "outputs" : [ ] 303 | }, { 304 | "metadata" : { 305 | "trusted" : true, 306 | "input_collapsed" : false, 307 | "collapsed" : false, 308 | "presentation" : { 309 | "tabs_state" : "{\n \"tab_id\": \"#tab1099791619-2\"\n}", 310 | "pivot_chart_state" : "{\n \"hiddenAttributes\": [],\n \"menuLimit\": 200,\n \"cols\": [],\n \"rows\": [],\n \"vals\": [],\n \"exclusions\": {},\n \"inclusions\": {},\n \"unusedAttrsVertical\": 85,\n \"autoSortUnusedAttrs\": false,\n \"inclusionsInfo\": {},\n \"aggregatorName\": \"Count\",\n \"rendererName\": \"Table\"\n}" 311 | }, 312 | "id" : "7D0EFB1819E742B19D2276280F94B5E0" 313 | }, 314 | "cell_type" : "code", 315 | "source" : "lrCVModel.getEstimatorParamMaps\n .map(paramMap => paramMap(lr.regParam))\n .zip(lrCVModel.avgMetrics)\n .toSeq.toDF(\"regParam\", 
\"AUC-ROC\")\n .collect", 316 | "outputs" : [ ] 317 | }, { 318 | "metadata" : { 319 | "trusted" : true, 320 | "input_collapsed" : false, 321 | "collapsed" : false, 322 | "presentation" : { 323 | "tabs_state" : "{\n \"tab_id\": \"#tab1136416947-2\"\n}", 324 | "pivot_chart_state" : "{\n \"hiddenAttributes\": [],\n \"menuLimit\": 200,\n \"cols\": [],\n \"rows\": [],\n \"vals\": [],\n \"exclusions\": {},\n \"inclusions\": {},\n \"unusedAttrsVertical\": 85,\n \"autoSortUnusedAttrs\": false,\n \"inclusionsInfo\": {},\n \"aggregatorName\": \"Count\",\n \"rendererName\": \"Table\"\n}" 325 | }, 326 | "id" : "D576959DAFBC443E80CFDA84833A5BBE" 327 | }, 328 | "cell_type" : "code", 329 | "source" : "treeCVModel.getEstimatorParamMaps\n .map(paramMap => paramMap(tree.maxDepth))\n .zip(treeCVModel.avgMetrics)\n .toSeq.toDF(\"maxDepth\", \"AUC-ROC\")\n .collect", 330 | "outputs" : [ ] 331 | }, { 332 | "metadata" : { 333 | "id" : "5796CB09E6A04C868B06834372440656" 334 | }, 335 | "cell_type" : "markdown", 336 | "source" : "## Adding categorical features" 337 | }, { 338 | "metadata" : { 339 | "id" : "F329C50DFF5F4439BDABB14D308E1632" 340 | }, 341 | "cell_type" : "markdown", 342 | "source" : "Up to this point we did not use categorical features from the dataset. Let's see how additional categorical features will affect the quality of the classification. A common technique to convert categorical feature into numerical ones is [one-hot](https://en.wikipedia.org/wiki/One-hot) encoding. This can be done using [StringIndexer](http://spark.apache.org/docs/1.6.1/api/scala/index.html#org.apache.spark.ml.feature.StringIndexer) transformation followed by [OneHotEncoder](http://spark.apache.org/docs/1.6.1/api/scala/index.html#org.apache.spark.ml.feature.OneHotEncoder) transformation.\n\n*Let's start with encoding just one new feature `occupation` and after that generalize encoding step for all categorical features and combine all processing steps using [pipeline](http://spark.apache.org/docs/1.6.1/ml-guide.html#pipeline)*" 343 | }, { 344 | "metadata" : { 345 | "trusted" : true, 346 | "input_collapsed" : false, 347 | "collapsed" : false, 348 | "id" : "1EF8BB8D58BE403AADE26E26E14DEEEE" 349 | }, 350 | "cell_type" : "code", 351 | "source" : "data.groupBy(\"occupation\").count.show(truncate=false)\nprintln(data.select(\"occupation\").distinct.count)", 352 | "outputs" : [ ] 353 | }, { 354 | "metadata" : { 355 | "trusted" : true, 356 | "input_collapsed" : false, 357 | "collapsed" : false, 358 | "id" : "80CD66B45BE146998C04BFF9D13FCE6A" 359 | }, 360 | "cell_type" : "code", 361 | "source" : "import org.apache.spark.ml.feature.OneHotEncoder\n\nval occupationIndexer = new StringIndexer()\n .setInputCol(\"occupation\")\n .setOutputCol(\"occupationIndex\")\n .fit(training)\n\nval indexedTrainData = occupationIndexer.transform(training)\n\nval occupationEncoder = new OneHotEncoder()\n .setInputCol(\"occupationIndex\")\n .setOutputCol(\"occupationVec\")\n\nval oheEncodedTrainData = occupationEncoder.transform(indexedTrainData)\n\noheEncodedTrainData.select(\"occupation\", \"occupationVec\").limit(5)", 362 | "outputs" : [ ] 363 | }, { 364 | "metadata" : { 365 | "trusted" : true, 366 | "input_collapsed" : false, 367 | "collapsed" : false, 368 | "id" : "2E841C874D65405188408597B0FC8F50" 369 | }, 370 | "cell_type" : "code", 371 | "source" : "val assembler = new VectorAssembler()\n .setInputCols(Array(\"age\",\n \"fnlwgt\", \n \"education-num\", \n \"capital-gain\", \n \"capital-loss\",\n \"hours-per-week\",\n \"occupationVec\"))\n 
.setOutputCol(\"features\")\n\n\nval trainDataWithOccupation = assembler.transform{\n labelIndexer.transform(oheEncodedTrainData)\n }.select(\"label\", \"features\")", 372 | "outputs" : [ ] 373 | }, { 374 | "metadata" : { 375 | "id" : "A165BD5B117349CA91CEF187CEAFF007" 376 | }, 377 | "cell_type" : "markdown", 378 | "source" : "*For the sake of brevity, from now let's use only LogisticRegression model.*" 379 | }, { 380 | "metadata" : { 381 | "trusted" : true, 382 | "input_collapsed" : false, 383 | "collapsed" : false, 384 | "id" : "F16D7F76854D452B8E5133CD8A5EBC0F" 385 | }, 386 | "cell_type" : "code", 387 | "source" : "val lr = new LogisticRegression()\n .setFeaturesCol(\"features\")\n\nval lrParamGrid = new ParamGridBuilder()\n .addGrid(lr.regParam, Array(1e-2, 5e-3, 1e-3, 5e-4, 1e-4))\n .build()\n\nval lrCV = new CrossValidator()\n .setEstimator(lr)\n .setEvaluator(new BinaryClassificationEvaluator)\n .setEstimatorParamMaps(lrParamGrid)\n .setNumFolds(5)\n\nval lrCVModel = lrCV.fit(trainDataWithOccupation)", 388 | "outputs" : [ ] 389 | }, { 390 | "metadata" : { 391 | "trusted" : true, 392 | "input_collapsed" : false, 393 | "collapsed" : false, 394 | "id" : "CF24B52C5DA141CA865C45735CE4551C" 395 | }, 396 | "cell_type" : "code", 397 | "source" : "val testDataWithOccupation = assembler.transform{\n labelIndexer.transform(occupationEncoder.transform(occupationIndexer.transform(test)))\n }.select(\"label\", \"features\")", 398 | "outputs" : [ ] 399 | }, { 400 | "metadata" : { 401 | "trusted" : true, 402 | "input_collapsed" : false, 403 | "collapsed" : false, 404 | "id" : "E7902B1AC34C4B8A8FA321B5951987E0" 405 | }, 406 | "cell_type" : "code", 407 | "source" : "println(\"cross-validated areaUnderROC: \" + lrCVModel.avgMetrics.max)\nprintln(\"test areaUnderROC: \" + eval.evaluate(lrCVModel.transform(testDataWithOccupation)))", 408 | "outputs" : [ ] 409 | }, { 410 | "metadata" : { 411 | "id" : "04CDB18D4A9E4A6384B950358A54675C" 412 | }, 413 | "cell_type" : "markdown", 414 | "source" : "Adding `occupation` categorical variable yielded an increase in quality." 415 | }, { 416 | "metadata" : { 417 | "id" : "D893CE81ECB84308A552138EB876433A" 418 | }, 419 | "cell_type" : "markdown", 420 | "source" : "## Pipelines" 421 | }, { 422 | "metadata" : { 423 | "id" : "D279C9B3198946D48A1A9C9E40A368EE" 424 | }, 425 | "cell_type" : "markdown", 426 | "source" : "Using [pipelines](http://spark.apache.org/docs/1.6.1/ml-guide.html#pipeline) one can combine all the processing stages into one pipeline and perform grid search against hyperparameters of all stages included in the pipeline. Also it's easy to extend given pipeline with new steps.\n\nA Pipeline chains multiple Transformers and Estimators together to specify an ML workflow." 427 | }, { 428 | "metadata" : { 429 | "trusted" : true, 430 | "input_collapsed" : false, 431 | "collapsed" : true, 432 | "id" : "8FBBD6915E364B3BBE564CC6DD77A0E6" 433 | }, 434 | "cell_type" : "markdown", 435 | "source" : "
\n \n
" 436 | }, { 437 | "metadata" : { 438 | "id" : "F33CFE5BAB1C4B278FB9A7C28016ABB8" 439 | }, 440 | "cell_type" : "markdown", 441 | "source" : " Let's see how we can combine all the preprocessing steps made so far into one pipeline." 442 | }, { 443 | "metadata" : { 444 | "trusted" : true, 445 | "input_collapsed" : false, 446 | "collapsed" : false, 447 | "id" : "8CBA081604CC4D11A7B370250DB6A335" 448 | }, 449 | "cell_type" : "code", 450 | "source" : "import org.apache.spark.ml.Pipeline\n\n\n// Chain indexers, encoders and assembler in a Pipeline\nval featurePipelineModel = new Pipeline()\n .setStages(Array(occupationIndexer, \n occupationEncoder,\n assembler,\n labelIndexer))\n .fit(training)", 451 | "outputs" : [ ] 452 | }, { 453 | "metadata" : { 454 | "trusted" : true, 455 | "input_collapsed" : false, 456 | "collapsed" : false, 457 | "id" : "C6B076016C2F406F808218B6E4A6D859" 458 | }, 459 | "cell_type" : "code", 460 | "source" : "featurePipelineModel.transform(test).select(\"features\", \"label\").limit(3)", 461 | "outputs" : [ ] 462 | }, { 463 | "metadata" : { 464 | "trusted" : true, 465 | "input_collapsed" : false, 466 | "collapsed" : false, 467 | "id" : "56F99D760F4D46CC8F0366495D55842B" 468 | }, 469 | "cell_type" : "code", 470 | "source" : "eval.evaluate(lrCVModel.transform(labelIndexer.transform(assembler.transform(occupationEncoder.transform(occupationIndexer.transform(test))))))", 471 | "outputs" : [ ] 472 | }, { 473 | "metadata" : { 474 | "trusted" : true, 475 | "input_collapsed" : false, 476 | "collapsed" : false, 477 | "id" : "5A6C6F21D60340D4BCF548C4D3C6AA42" 478 | }, 479 | "cell_type" : "code", 480 | "source" : "eval.evaluate(lrCVModel.transform(featurePipelineModel.transform(test)))", 481 | "outputs" : [ ] 482 | }, { 483 | "metadata" : { 484 | "id" : "547273F4614F4E8E95E374D2F9759053" 485 | }, 486 | "cell_type" : "markdown", 487 | "source" : "Now let's extend our pipeline by adding one-hot encoding step for each categorical feature." 
488 | }, { 489 | "metadata" : { 490 | "trusted" : true, 491 | "input_collapsed" : false, 492 | "collapsed" : false, 493 | "id" : "E62240E6A06B42CC8AA49AD8A7272910" 494 | }, 495 | "cell_type" : "code", 496 | "source" : "val categCols = Array(\"workclass\", \"education\", \"marital-status\", \"occupation\", \"relationship\", \"race\", \"sex\")\n\nval featureIndexers: Array[org.apache.spark.ml.PipelineStage] = categCols.map(\n cname => new StringIndexer()\n .setInputCol(cname)\n .setOutputCol(s\"${cname}_index\")\n)\n\nval oneHotEncoders = categCols.map(\n cname => new OneHotEncoder()\n .setInputCol(s\"${cname}_index\")\n .setOutputCol(s\"${cname}_vec\")\n)\n\nval assembler = new VectorAssembler()\n .setInputCols(Array(\"age\",\n \"fnlwgt\", \n \"education-num\", \n \"capital-gain\", \n \"capital-loss\",\n \"hours-per-week\") ++\n categCols.map(cname => s\"${cname}_vec\"))\n .setOutputCol(\"features\")\n\nval rawDataProcessor = new Pipeline()\n .setStages(featureIndexers ++\n oneHotEncoders ++\n Array(assembler, labelIndexer))\n .fit(training)", 497 | "outputs" : [ ] 498 | }, { 499 | "metadata" : { 500 | "trusted" : true, 501 | "input_collapsed" : false, 502 | "collapsed" : false, 503 | "id" : "8DD31D3BA4054DF08AE02D82489228EB" 504 | }, 505 | "cell_type" : "code", 506 | "source" : "rawDataProcessor.transform(test).limit(3).select(\"features\", \"label\")", 507 | "outputs" : [ ] 508 | }, { 509 | "metadata" : { 510 | "trusted" : true, 511 | "input_collapsed" : false, 512 | "collapsed" : false, 513 | "id" : "BF3105D54D0646488BB1BD7168A2E95E" 514 | }, 515 | "cell_type" : "code", 516 | "source" : "val lr = new LogisticRegression()\n .setFeaturesCol(\"features\")\n\nval lrParamGrid = new ParamGridBuilder()\n .addGrid(lr.regParam, Array(1e-2, 5e-3, 1e-3, 5e-4, 1e-4))\n .build()\n\nval lrCV = new CrossValidator()\n .setEstimator(lr)\n .setEvaluator(new BinaryClassificationEvaluator)\n .setEstimatorParamMaps(lrParamGrid)\n .setNumFolds(5)\n\nval lrCVModel = lrCV.fit(rawDataProcessor.transform(training))", 517 | "outputs" : [ ] 518 | }, { 519 | "metadata" : { 520 | "trusted" : true, 521 | "input_collapsed" : false, 522 | "collapsed" : false, 523 | "id" : "E6CE1231B7914499BE7F01C498F38ED1" 524 | }, 525 | "cell_type" : "code", 526 | "source" : "println(\"cross-validated areaUnderROC: \" + lrCVModel.avgMetrics.max)\nprintln(\"test areaUnderROC: \" + eval.evaluate(lrCVModel.transform(rawDataProcessor.transform(test))))", 527 | "outputs" : [ ] 528 | }, { 529 | "metadata" : { 530 | "id" : "544E5F9A75484F238B8E03A4A32CC949" 531 | }, 532 | "cell_type" : "markdown", 533 | "source" : "Adding one-hot encoding for each categorical variable yielded a significant increase in quality." 534 | }, { 535 | "metadata" : { 536 | "id" : "381BE3ADF02346668B5C2F242089ED97" 537 | }, 538 | "cell_type" : "markdown", 539 | "source" : "We also can combine several stages with LogisticRegression stage into one pipeline and perform grid search against hyperparameters of several stages included in the pipeline." 540 | }, { 541 | "metadata" : { 542 | "trusted" : true, 543 | "input_collapsed" : false, 544 | "collapsed" : true, 545 | "id" : "85ECF5E33F154845A038EC9F71F2F5A3" 546 | }, 547 | "cell_type" : "markdown", 548 | "source" : "For example, let's try to add [Buketizer](http://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.ml.feature.Bucketizer)\ntransformation applied to `age` column and add `splits` parameter values\nto pipeline parameters grid and see how it will affect metric score." 
549 | }, { 550 | "metadata" : { 551 | "trusted" : true, 552 | "input_collapsed" : false, 553 | "collapsed" : false, 554 | "id" : "BEF557AD59414C568B55E45B1CC38606" 555 | }, 556 | "cell_type" : "code", 557 | "source" : "data.select(min(\"age\"), max(\"age\"))", 558 | "outputs" : [ ] 559 | }, { 560 | "metadata" : { 561 | "trusted" : true, 562 | "input_collapsed" : false, 563 | "collapsed" : false, 564 | "id" : "0C7C0F51A07C436A823D55D5E7319155" 565 | }, 566 | "cell_type" : "code", 567 | "source" : "// We need to cast age column to DoubleType to apply Bucketizer transformation.\nimport org.apache.spark.sql.types.DoubleType\n\nval castData = data.withColumn(\"age\", data(\"age\").cast(DoubleType))\n\nval Array(castTraining, castTest) = castData.randomSplit(Array(0.8, 0.2), seed = 12345)", 568 | "outputs" : [ ] 569 | }, { 570 | "metadata" : { 571 | "trusted" : true, 572 | "input_collapsed" : false, 573 | "collapsed" : false, 574 | "id" : "9601D696BA2B44C193B42429F0A96AA9" 575 | }, 576 | "cell_type" : "code", 577 | "source" : "import org.apache.spark.ml.feature.Bucketizer\n\nval ageBucketizer = new Bucketizer()\n .setInputCol(\"age\")\n .setOutputCol(\"age-buckets\")\n\nval lr = new LogisticRegression()\n .setFeaturesCol(\"features\")\n\nval pipelineParamGrid = new ParamGridBuilder()\n .addGrid(lr.regParam, Array(1e-3, 5e-4, 1e-4, 5e-5, 1e-5))\n .addGrid(ageBucketizer.splits, Array(Array(15.0, 30.0, 40.0, 50.0, 100.0),\n Array(15.0, 21.0, 25.0, 30.0, 40.0, 50.0, 70.0, 100.0)))\n .build()\n\nval assembler = new VectorAssembler()\n .setInputCols(Array(\"age-buckets\",\n \"fnlwgt\", \n \"education-num\", \n \"capital-gain\", \n \"capital-loss\",\n \"hours-per-week\") ++\n categCols.map(cname => s\"${cname}_vec\"))\n .setOutputCol(\"features\")\n\nval mlPipeline = new Pipeline()\n .setStages(Array(ageBucketizer) ++\n featureIndexers ++\n oneHotEncoders ++\n Array(assembler, labelIndexer, lr))\n\nval pipelineCV = new CrossValidator()\n .setEstimator(mlPipeline)\n .setEvaluator(new BinaryClassificationEvaluator)\n .setEstimatorParamMaps(pipelineParamGrid)\n .setNumFolds(5)\n\nval pipelineCVModel = pipelineCV.fit(castTraining)", 578 | "outputs" : [ ] 579 | }, { 580 | "metadata" : { 581 | "trusted" : true, 582 | "input_collapsed" : false, 583 | "collapsed" : false, 584 | "id" : "7F129445DA8F4AEC8B4DFC8720451AA8" 585 | }, 586 | "cell_type" : "code", 587 | "source" : "println(\"cross-validated areaUnderROC: \" + pipelineCVModel.avgMetrics.max)\nprintln(\"test areaUnderROC: \" + eval.evaluate(pipelineCVModel.transform(castTest)))", 588 | "outputs" : [ ] 589 | }, { 590 | "metadata" : { 591 | "trusted" : true, 592 | "input_collapsed" : false, 593 | "collapsed" : true, 594 | "id" : "981D044870FA41F8B6B927580837FA53" 595 | }, 596 | "cell_type" : "markdown", 597 | "source" : "We can see what adding `Bucketizer` step into pipeline combained with simultanious param grid search over several stages (`Bucketizer` and `LogisticRegression`) boosted the quality of our ml pipeline.\n\nYou can continue to modify and expand the pipeline by adding new stages of data transformation and add new parameters into parameter grid for cross-validation." 598 | } ], 599 | "nbformat" : 4 600 | } -------------------------------------------------------------------------------- /labs/IntroToMLandSparkMLPipelines/README.md: -------------------------------------------------------------------------------- 1 | # Introduction to Machine Learning and Spark ML Pipelines 2 | 3 |
4 | 5 | 6 |
7 | 8 | # Machine learning Pipeline 9 | 10 | In this lab we are going to learn how to train machine learning models, how to correctly set up an experiment, how to tune model hyperparameters and how to compare models. We are also going to get familiar with the spark.ml package, since all of the work will be done with this package. 11 | 12 | * http://spark.apache.org/docs/latest/ml-guide.html 13 | * http://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.ml.package 14 | 15 | ## Evaluation Metrics 16 | Model training and model quality assessment are performed on independent sets of examples. As a rule, the available examples are divided into two subsets: training (train) and control (test). The choice of the split proportions is a compromise: a larger training set leads to better models but a noisier estimate of the model on the control set, while a larger test sample gives a less noisy quality assessment but less accurate models. 17 | 18 | Many classification models produce an estimate of class membership $\tilde{h}(x) \in R$ (for example, the probability of belonging to class 1). They then make a decision about the class of the object by comparing the estimate with a certain threshold $\theta$: 19 | 20 | $h(x) = +1$ if $\tilde{h}(x) \geq \theta$, and $h(x) = -1$ if $\tilde{h}(x) < \theta$ 21 | 22 | 23 | In this case, we can consider metrics that are able to work with estimates of belonging to a class. 24 | In this lab, we will work with the [AUC-ROC](https://en.wikipedia.org/wiki/Receiver_operating_characteristic#Area_under_the_curve) metric. A detailed understanding of how the AUC-ROC metric works is not required to complete the lab. 25 | ## Model Hyperparameter Tuning 26 | In machine learning problems it is necessary to distinguish the parameters of the model from its hyperparameters (structural parameters). The model parameters are adjusted during training (e.g., the weights in a linear model or the structure of a decision tree), while hyperparameters are set in advance (for example, the regularization strength in a linear model or the maximum depth of a decision tree). Each model usually has many hyperparameters, and there is no universal set of hyperparameters that works optimally in all tasks; for each task one should choose a different set. _Grid search_ is commonly used to optimize model hyperparameters: several values are selected for each parameter, and the combination of parameter values on which the model shows the best quality (in terms of the metric being optimized) is selected. However, in this case it is necessary to assess the constructed model correctly, namely to split the data into training and test samples. There are several ways this can be implemented: 27 | 28 | - Split the available samples into training and test samples. In this case, comparing a large number of models during the parameter search leads to a situation where the best model on the test data does not maintain its quality on new data. We can say that there is overfitting on the test data. 29 | - To eliminate the problem described above, it is possible to split the data into 3 disjoint sub-samples: `train`, `validation` and `test`. The `validation` set is used for model comparison, and the `test` set is used for the final quality assessment and comparison of families of models with selected parameters. 30 | - Another way to compare models is [cross-validation](https://en.wikipedia.org/wiki/Cross-validation_(statistics)). 
There are different schemes of cross-validation: 31 | - Leave-one-out cross-validation 32 | - K-fold cross-validation 33 | - Repeated random sub-sampling validation 34 | 35 | Cross-validation is computationally expensive operation, especially if you are doing a grid search with a very large number of combinations. So there are a number of compromises: 36 | - the grid can be made more sparse, touching fewer values for each parameter, however, we must not forget that in such case one can skip a good combination of parameters; 37 | - cross-validation can be done with a smaller number of partitions or folds, but in this case the quality assessment of cross-validation becomes more noisy and increases the risk to choose a suboptimal set of parameters due to the random nature of the split; 38 | - the parameters can be optimized sequentially (greedy) — one after another, and not to iterate over all combinations; this strategy does not always lead to the optimal set; 39 | - enumerate only small number of randomly selected combinations of values of hyperparameters. 40 | 41 | ## Data 42 | 43 | We'are going to solve binary classification problem by building the algorithm which determines whether a person makes over 50K a year. Following variables are available: 44 | * age 45 | * workclass 46 | * fnlwgt 47 | * education 48 | * education-num 49 | * marital-status 50 | * occupation 51 | * relationship 52 | * race 53 | * sex 54 | * capital-gain 55 | * capital-loss 56 | * hours-per-week 57 | 58 | More on this data one can read in [UCI Machine Learning Repository](http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.names) 59 | 60 | ```scala 61 | val spark = sparkSession 62 | 63 | val df = spark.read 64 | .option("header", "true") 65 | .option("inferSchema", "true") 66 | .csv("notebooks/spark-notebook-ml-labs/labs/IntroToMLandSparkMLPipelines/data/data.adult.csv") 67 | 68 | df.show(5) 69 | ``` 70 | 71 | ``` 72 | +---+---------+------+------------+-------------+------------------+---------------+-------------+-----+------+------------+------------+--------------+----------+ 73 | |age|workclass|fnlwgt| education|education-num| marital-status| occupation| relationship| race| sex|capital-gain|capital-loss|hours-per-week|>50K,<=50K| 74 | +---+---------+------+------------+-------------+------------------+---------------+-------------+-----+------+------------+------------+--------------+----------+ 75 | | 34|Local-gov|284843| HS-grad| 9| Never-married|Farming-fishing|Not-in-family|Black| Male| 594| 0| 60| <=50K| 76 | | 40| Private|190290|Some-college| 10| Divorced| Sales|Not-in-family|White| Male| 0| 0| 40| <=50K| 77 | | 36|Local-gov|177858| Bachelors| 13|Married-civ-spouse| Prof-specialty| Own-child|White| Male| 0| 0| 40| <=50K| 78 | | 22| Private|184756|Some-college| 10| Never-married| Sales| Own-child|White|Female| 0| 0| 30| <=50K| 79 | | 47| Private|149700| Bachelors| 13|Married-civ-spouse| Tech-support| Husband|White| Male| 15024| 0| 40| >50K| 80 | +---+---------+------+------------+-------------+------------------+---------------+-------------+-----+------+------------+------------+--------------+----------+ 81 | only showing top 5 rows 82 | ``` 83 | 84 | Sometimes there are missing values in the data. Sometimes, in the description of the dataset one can found the description of format of missing values. Particularly in the given dataset missing values are identified by '?' sign. 85 | 86 | **Problem** Find all the variables with missing values. 
Remove from the dataset all objects with missing values in any variable. 87 | 88 | ```scala 89 | val missingValsFeatures = df.columns.filter(column => df.filter(df(column) === "?").count > 0) 90 | 91 | println("Features with missing values: " + missingValsFeatures.mkString(", ")) 92 | 93 | val data = missingValsFeatures.foldLeft(df)((dfstage, column) => dfstage.filter(!dfstage(column).equalTo("?"))) 94 | ``` 95 | 96 | Split the data into training and test datasets. 97 | 98 | ```scala 99 | val Array(training, test) = data.randomSplit(Array(0.8, 0.2), seed = 1234) 100 | ``` 101 | 102 | ### MLlib Transformers and Estimators 103 | 104 | `Transformer` transforms one `DataFrame` into another `DataFrame`. 105 | 106 |
107 | 108 |
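As a quick illustration (a sketch, not part of the original lab), `VectorAssembler`, which we will use below, is a plain `Transformer`: it needs no training and simply appends a new column to the `DataFrame` it is given. The column names here come from the dataset described above.

```scala
import org.apache.spark.ml.feature.VectorAssembler

// A Transformer: packs the chosen numeric columns into a single vector column.
// transform() returns a new DataFrame; the input `training` DataFrame is left unchanged.
val toVector = new VectorAssembler()
  .setInputCols(Array("age", "hours-per-week"))
  .setOutputCol("ageAndHours")

toVector.transform(training).select("age", "hours-per-week", "ageAndHours").show(3)
```
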
109 | 110 | `Estimator` fits on a `DataFrame` to produce a `Transformer`. 111 | 112 |
113 | 114 |
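And a minimal sketch of an `Estimator` (again an illustration, not part of the original lab): `StringIndexer` has to see the data first, so `fit()` learns the mapping from string values to indices and returns a fitted model, which is itself a `Transformer`.

```scala
import org.apache.spark.ml.feature.StringIndexer

// An Estimator: fit() learns the value-to-index mapping from the training data...
val sexIndexer = new StringIndexer()
  .setInputCol("sex")
  .setOutputCol("sexIndex")

// ...and produces a Transformer (a StringIndexerModel) that applies this mapping.
val sexIndexerModel = sexIndexer.fit(training)
sexIndexerModel.transform(training).select("sex", "sexIndex").show(3)
```
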
115 | 116 | ## Training classifiers on numeric features 117 | 118 | Some preprocessing steps are usually required after loading and cleaning dataset. In this case, these steps will include the following: 119 | 120 | - At first we will work only with numeric features. So let's select them separately in the feature vector "numFeatures" using [VectorAssembler](http://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.ml.feature.VectorAssembler). 121 | - Select the target variable (the one we want to predict, string column of labels) and map it to an ML column of label indices using [StringIndexer](http://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.ml.feature.StringIndexer), give the name "labelIndex" to a new variable. 122 | 123 | ```scala 124 | import org.apache.spark.ml.feature.VectorAssembler 125 | import org.apache.spark.ml.feature.StringIndexer 126 | 127 | val assembler = new VectorAssembler() 128 | .setInputCols(Array("age", 129 | "fnlwgt", 130 | "education-num", 131 | "capital-gain", 132 | "capital-loss", 133 | "hours-per-week")) 134 | .setOutputCol("numFeatures") 135 | 136 | val labelIndexer = new StringIndexer() 137 | .setInputCol(">50K,<=50K") 138 | .setOutputCol("label") 139 | .fit(training) 140 | ``` 141 | 142 | ```scala 143 | labelIndexer.transform(training).select(">50K,<=50K", "label").show(8) 144 | ``` 145 | ``` 146 | +----------+-----+ 147 | |>50K,<=50K|label| 148 | +----------+-----+ 149 | | <=50K| 0.0| 150 | | <=50K| 0.0| 151 | | <=50K| 0.0| 152 | | <=50K| 0.0| 153 | | <=50K| 0.0| 154 | | <=50K| 0.0| 155 | | <=50K| 0.0| 156 | | <=50K| 0.0| 157 | +----------+-----+ 158 | only showing top 8 rows 159 | ``` 160 | 161 | ```scala 162 | assembler.transform(training) 163 | .select("age", "fnlwgt", "education-num", "capital-gain", "capital-loss", "hours-per-week", "numFeatures") 164 | .show(5, truncate=false) 165 | ``` 166 | ``` 167 | +-----+--------------------------------+ 168 | |label|numFeatures | 169 | +-----+--------------------------------+ 170 | |0.0 |[17.0,192387.0,5.0,0.0,0.0,45.0]| 171 | |0.0 |[17.0,340043.0,8.0,0.0,0.0,12.0]| 172 | |0.0 |[17.0,24090.0,9.0,0.0,0.0,35.0] | 173 | |0.0 |[17.0,25690.0,6.0,0.0,0.0,10.0] | 174 | |0.0 |[17.0,28031.0,5.0,0.0,0.0,16.0] | 175 | +-----+--------------------------------+ 176 | only showing top 5 rows 177 | ``` 178 | 179 | ```scala 180 | import org.apache.spark.ml.classification.LogisticRegression 181 | import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator 182 | 183 | 184 | val lr = new LogisticRegression() 185 | .setFeaturesCol("numFeatures") 186 | .setLabelCol("label") 187 | .setRegParam(0.1) 188 | 189 | val lrModel = lr.fit(trainData) 190 | 191 | val testData = assembler.transform{ 192 | labelIndexer.transform(test) 193 | } 194 | 195 | val eval = new BinaryClassificationEvaluator() 196 | .setMetricName("areaUnderROC") 197 | 198 | println(eval.evaluate(lrModel.transform(testData))) 199 | ``` 200 | ``` 201 | 0.7937381854879748 202 | ``` 203 | 204 | ## Model selection with MLlib 205 | Apache Spark MLlib supports model hyperparameter tuning using tools such as `CrossValidator` and `TrainValidationSplit`. These tools require the following items: 206 | 207 | - Estimator: algorithm or Pipeline to tune 208 | - Set of ParamMaps: parameters to choose from, sometimes called a “parameter grid” to search over 209 | - Evaluator: metric to measure how well a fitted Model does on held-out test data 210 | 211 | In this section we will need to work only with numeric features and a target variable. 
212 | To begin with, let's have a look at grid search in action. 213 | We will consider 2 algorithms: 214 | - [LogisticRegression](http://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.ml.classification.LogisticRegression) 215 | - [DecisionTreeClassifier](http://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.ml.classification.DecisionTreeClassifier) 216 | 217 | To start with, let's choose one parameter to optimize for each algorithm: 218 | - LogisticRegression — regularization parameter (*regParam*) 219 | - DecisionTreeClassifier — maximum depth of the tree (*maxDepth*) 220 | 221 | The remaining parameters we will leave at their default values. 222 | To implement the grid search procedure one can use the 223 | [CrossValidator](http://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.ml.tuning.CrossValidator) class 224 | combined with the [ParamGridBuilder](http://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.ml.tuning.ParamGridBuilder) class. 225 | We also need to specify an appropriate evaluator for this task; in our case we should use [BinaryClassificationEvaluator](http://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.ml.evaluation.BinaryClassificationEvaluator) 226 | (note that its default metric is areaUnderROC, so we don't need to specify the metric via a `setMetricName` method call). 227 | Set up a 5-fold cross-validation scheme. 228 | 229 |
*Figure: K-fold cross-validation. By Fabian Flöck (Own work) [CC BY-SA 3.0 (http://creativecommons.org/licenses/by-sa/3.0)], via Wikimedia Commons*
236 | 237 | **Problem** Try to find the optimal values of these hyperparameters for each algorithm. Plot the average cross-validation metrics for a given value of hyperparameter for each algorithm (hint: use `avgMetrics` field of resulting `CrossValidatorModel`). 238 | 239 | ```scala 240 | import org.apache.spark.ml.classification.{LogisticRegression, DecisionTreeClassifier, RandomForestClassifier} 241 | import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator 242 | import org.apache.spark.ml.tuning.{ParamGridBuilder, CrossValidator} 243 | 244 | 245 | val lr = new LogisticRegression() 246 | .setFeaturesCol("numFeatures") 247 | .setLabelCol("label") 248 | 249 | val lrParamGrid = new ParamGridBuilder() 250 | .addGrid(lr.regParam, Array(1e-2, 5e-3, 1e-3, 5e-4, 1e-4)) 251 | .build() 252 | 253 | val lrCV = new CrossValidator() 254 | .setEstimator(lr) 255 | .setEvaluator(new BinaryClassificationEvaluator) 256 | .setEstimatorParamMaps(lrParamGrid) 257 | .setNumFolds(5) 258 | 259 | val lrCVModel = lrCV.fit(trainData) 260 | 261 | println("cross-validated areaUnderROC: " + lrCVModel.avgMetrics.max) 262 | println("test areaUnderROC: " + eval.evaluate(lrCVModel.transform(testData))) 263 | ``` 264 | ``` 265 | cross-validated areaUnderROC: 0.8297755442702006 266 | test areaUnderROC: 0.8068812315222861 267 | ``` 268 | 269 | ```scala 270 | val tree = new DecisionTreeClassifier() 271 | .setFeaturesCol("numFeatures") 272 | .setLabelCol("label") 273 | 274 | val treeParamGrid = new ParamGridBuilder() 275 | .addGrid(tree.maxDepth, Array(5, 10, 20, 25, 30)) 276 | .build() 277 | 278 | val treeCV = new CrossValidator() 279 | .setEstimator(tree) 280 | .setEvaluator(new BinaryClassificationEvaluator) 281 | .setEstimatorParamMaps(treeParamGrid) 282 | .setNumFolds(5) 283 | 284 | val treeCVModel = treeCV.fit(trainData) 285 | 286 | println("cross-validated areaUnderROC: " + treeCVModel.avgMetrics.max) 287 | println("test areaUnderROC: " + eval.evaluate(treeCVModel.transform(testData))) 288 | ``` 289 | ``` 290 | cross-validated areaUnderROC: 0.7105377328054816 291 | test areaUnderROC: 0.6934402983359256 292 | ``` 293 | 294 | ```scala 295 | lrCVModel.getEstimatorParamMaps 296 | .map(paramMap => paramMap(lr.regParam)) 297 | .zip(lrCVModel.avgMetrics) 298 | .toSeq.toDF("regParam", "AUC-ROC") 299 | .collect 300 | ``` 301 | 302 | 303 | 304 | 305 | ```scala 306 | treeCVModel.getEstimatorParamMaps 307 | .map(paramMap => paramMap(tree.maxDepth)) 308 | .zip(treeCVModel.avgMetrics) 309 | .toSeq.toDF("maxDepth", "AUC-ROC") 310 | .collect 311 | ``` 312 | 313 | 314 | 315 | 316 | ## Adding categorical features 317 | 318 | Up to this point we did not use categorical features from the dataset. Let's see how additional categorical features will affect the quality of the classification. A common technique to convert categorical feature into numerical ones is [one-hot](https://en.wikipedia.org/wiki/One-hot) encoding. This can be done using [StringIndexer](http://spark.apache.org/docs/1.6.1/api/scala/index.html#org.apache.spark.ml.feature.StringIndexer) transformation followed by [OneHotEncoder](http://spark.apache.org/docs/1.6.1/api/scala/index.html#org.apache.spark.ml.feature.OneHotEncoder) transformation. 
319 | 320 | *Let's start with encoding just one new feature `occupation` and after that generalize encoding step for all categorical features and combine all processing steps using [pipeline](http://spark.apache.org/docs/1.6.1/ml-guide.html#pipeline)* 321 | 322 | ```scala 323 | data.groupBy("occupation").count.show(truncate=false) 324 | println(data.select("occupation").distinct.count) 325 | ``` 326 | ``` 327 | +-----------------+-----+ 328 | |occupation |count| 329 | +-----------------+-----+ 330 | |Sales |1840 | 331 | |Exec-managerial |2017 | 332 | |Prof-specialty |2095 | 333 | |Handlers-cleaners|674 | 334 | |Farming-fishing |481 | 335 | |Craft-repair |2057 | 336 | |Transport-moving |799 | 337 | |Priv-house-serv |90 | 338 | |Protective-serv |343 | 339 | |Other-service |1617 | 340 | |Tech-support |464 | 341 | |Machine-op-inspct|1023 | 342 | |Armed-Forces |3 | 343 | |Adm-clerical |1844 | 344 | +-----------------+-----+ 345 | 346 | 14 347 | ``` 348 | 349 | ```scala 350 | import org.apache.spark.ml.feature.OneHotEncoder 351 | 352 | val occupationIndexer = new StringIndexer() 353 | .setInputCol("occupation") 354 | .setOutputCol("occupationIndex") 355 | .fit(training) 356 | 357 | val indexedTrainData = occupationIndexer.transform(training) 358 | 359 | val occupationEncoder = new OneHotEncoder() 360 | .setInputCol("occupationIndex") 361 | .setOutputCol("occupationVec") 362 | 363 | val oheEncodedTrainData = occupationEncoder.transform(indexedTrainData) 364 | 365 | oheEncodedTrainData.select("occupation", "occupationVec").show(5, truncate=false) 366 | ``` 367 | ``` 368 | +---------------+--------------+ 369 | |occupation |occupationVec | 370 | +---------------+--------------+ 371 | |Other-service |(13,[5],[1.0])| 372 | |Adm-clerical |(13,[4],[1.0])| 373 | |Exec-managerial|(13,[2],[1.0])| 374 | |Other-service |(13,[5],[1.0])| 375 | |Other-service |(13,[5],[1.0])| 376 | +---------------+--------------+ 377 | only showing top 5 rows 378 | ``` 379 | 380 | ```scala 381 | val assembler = new VectorAssembler() 382 | .setInputCols(Array("age", 383 | "fnlwgt", 384 | "education-num", 385 | "capital-gain", 386 | "capital-loss", 387 | "hours-per-week", 388 | "occupationVec")) 389 | .setOutputCol("features") 390 | 391 | 392 | val trainDataWithOccupation = assembler.transform{ 393 | labelIndexer.transform(oheEncodedTrainData) 394 | }.select("label", "features") 395 | ``` 396 | 397 | *For the sake of brevity, from now let's use only LogisticRegression model.* 398 | 399 | ```scala 400 | val lr = new LogisticRegression() 401 | .setFeaturesCol("features") 402 | 403 | val lrParamGrid = new ParamGridBuilder() 404 | .addGrid(lr.regParam, Array(1e-2, 5e-3, 1e-3, 5e-4, 1e-4)) 405 | .build() 406 | 407 | val lrCV = new CrossValidator() 408 | .setEstimator(lr) 409 | .setEvaluator(new BinaryClassificationEvaluator) 410 | .setEstimatorParamMaps(lrParamGrid) 411 | .setNumFolds(5) 412 | 413 | val lrCVModel = lrCV.fit(trainDataWithOccupation) 414 | 415 | val testDataWithOccupation = assembler.transform{ 416 | labelIndexer.transform(occupationEncoder.transform(occupationIndexer.transform(test))) 417 | }.select("label", "features") 418 | 419 | println("cross-validated areaUnderROC: " + lrCVModel.avgMetrics.max) 420 | println("test areaUnderROC: " + eval.evaluate(lrCVModel.transform(testDataWithOccupation))) 421 | ``` 422 | ``` 423 | cross-validated areaUnderROC: 0.8447936545404254 424 | test areaUnderROC: 0.823490779891881 425 | ``` 426 | 427 | Adding `occupation` categorical variable yielded an increase in quality. 
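A side note (an illustration, not part of the original lab): the encoded `occupationVec` vector above has 13 slots even though there are 14 distinct occupations, because `OneHotEncoder` drops the last category by default; this behaviour is controlled by its `dropLast` parameter.

```scala
// Sketch: keep all 14 categories instead of dropping the last one (the default is dropLast = true).
val fullOccupationEncoder = new OneHotEncoder()
  .setInputCol("occupationIndex")
  .setOutputCol("occupationVecFull")
  .setDropLast(false)

fullOccupationEncoder.transform(indexedTrainData)
  .select("occupation", "occupationVecFull")
  .show(3, truncate = false)
```
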
428 | 429 | ## Pipelines 430 | 431 | Using [pipelines](http://spark.apache.org/docs/1.6.1/ml-guide.html#pipeline) one can combine all the processing stages into a single pipeline and perform grid search over the hyperparameters of all stages included in the pipeline. It is also easy to extend a given pipeline with new steps. 432 | 433 | A Pipeline chains multiple Transformers and Estimators together to specify an ML workflow. 434 | 435 |
436 | 437 |
438 | 439 | Let's see how we can combine all the preprocessing steps made so far into one pipeline. 440 | 441 | ```scala 442 | import org.apache.spark.ml.Pipeline 443 | 444 | 445 | // Chain indexers, encoders and assembler in a Pipeline 446 | val featurePipelineModel = new Pipeline() 447 | .setStages(Array(occupationIndexer, 448 | occupationEncoder, 449 | assembler, 450 | labelIndexer)) 451 | .fit(training) 452 | 453 | featurePipelineModel.transform(test).select("features", "label").show(3, truncate=false) 454 | ``` 455 | ``` 456 | +----------------------------------------------+-----+ 457 | |features |label| 458 | +----------------------------------------------+-----+ 459 | |(19,[0,1,2,5,11],[17.0,39815.0,6.0,25.0,1.0]) |0.0 | 460 | |(19,[0,1,2,5,17],[17.0,175587.0,7.0,30.0,1.0])|0.0 | 461 | |(19,[0,1,2,5,9],[17.0,191910.0,7.0,20.0,1.0]) |0.0 | 462 | +----------------------------------------------+-----+ 463 | only showing top 3 rows 464 | ``` 465 | 466 | Now compare this 467 | ```scala 468 | eval.evaluate(lrCVModel.transform(labelIndexer.transform(assembler.transform(occupationEncoder.transform(occupationIndexer.transform(test)))))) 469 | ``` 470 | ``` 471 | 0.823490779891881 472 | ``` 473 | 474 | and this 475 | 476 | ```scala 477 | eval.evaluate(lrCVModel.transform(featurePipelineModel.transform(test))) 478 | ``` 479 | ``` 480 | 0.823490779891881 481 | ``` 482 | 483 | Now let's extend our pipeline by adding one-hot encoding step for each categorical feature. 484 | 485 | ```scala 486 | val categCols = Array("workclass", "education", "marital-status", "occupation", "relationship", "race", "sex") 487 | 488 | val featureIndexers: Array[org.apache.spark.ml.PipelineStage] = categCols.map( 489 | cname => new StringIndexer() 490 | .setInputCol(cname) 491 | .setOutputCol(s"${cname}_index") 492 | ) 493 | 494 | val oneHotEncoders = categCols.map( 495 | cname => new OneHotEncoder() 496 | .setInputCol(s"${cname}_index") 497 | .setOutputCol(s"${cname}_vec") 498 | ) 499 | 500 | val assembler = new VectorAssembler() 501 | .setInputCols(Array("age", 502 | "fnlwgt", 503 | "education-num", 504 | "capital-gain", 505 | "capital-loss", 506 | "hours-per-week") ++ 507 | categCols.map(cname => s"${cname}_vec")) 508 | .setOutputCol("features") 509 | 510 | val rawDataProcessor = new Pipeline() 511 | .setStages(featureIndexers ++ 512 | oneHotEncoders ++ 513 | Array(assembler, labelIndexer)) 514 | .fit(training) 515 | 516 | rawDataProcessor.transform(test).limit(3).select("features", "label").show(truncate=false) 517 | ``` 518 | ``` 519 | +---------------------------------------------------------------------------------------+-----+ 520 | |features |label| 521 | +---------------------------------------------------------------------------------------+-----+ 522 | |(56,[0,1,2,5,8,19,28,38,48,51],[17.0,39815.0,6.0,25.0,1.0,1.0,1.0,1.0,1.0,1.0]) |0.0 | 523 | |(56,[0,1,2,5,8,18,28,44,48,51,55],[17.0,175587.0,7.0,30.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0])|0.0 | 524 | |(56,[0,1,2,5,8,18,28,36,48,51,55],[17.0,191910.0,7.0,20.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0])|0.0 | 525 | +---------------------------------------------------------------------------------------+-----+ 526 | ``` 527 | 528 | ```scala 529 | val lr = new LogisticRegression() 530 | .setFeaturesCol("features") 531 | 532 | val lrParamGrid = new ParamGridBuilder() 533 | .addGrid(lr.regParam, Array(1e-2, 5e-3, 1e-3, 5e-4, 1e-4)) 534 | .build() 535 | 536 | val lrCV = new CrossValidator() 537 | .setEstimator(lr) 538 | .setEvaluator(new BinaryClassificationEvaluator) 539 | 
.setEstimatorParamMaps(lrParamGrid) 540 | .setNumFolds(5) 541 | 542 | val lrCVModel = lrCV.fit(rawDataProcessor.transform(training)) 543 | 544 | println("cross-validated areaUnderROC: " + lrCVModel.avgMetrics.max) 545 | println("test areaUnderROC: " + eval.evaluate(lrCVModel.transform(rawDataProcessor.transform(test)))) 546 | ``` 547 | ``` 548 | cross-validated areaUnderROC: 0.9070537976977229 549 | test areaUnderROC: 0.8893619862500176 550 | ``` 551 | 552 | Adding one-hot encoding for each categorical variable yielded a significant increase in quality. 553 | 554 | We can also combine several stages with the LogisticRegression stage into one pipeline and perform grid search over the hyperparameters of several stages included in the pipeline. 555 | 556 | For example, let's try to add a [Bucketizer](http://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.ml.feature.Bucketizer) 557 | transformation applied to the `age` column, add `splits` parameter values 558 | to the pipeline parameter grid, and see how it will affect the metric score. 559 | 560 | ```scala 561 | data.select(min("age"), max("age")).show() 562 | ``` 563 | ``` 564 | +--------+--------+ 565 | |min(age)|max(age)| 566 | +--------+--------+ 567 | | 17| 90| 568 | +--------+--------+ 569 | ``` 570 | 571 | ```scala 572 | // We need to cast the age column to DoubleType to apply the Bucketizer transformation. 573 | import org.apache.spark.sql.types.DoubleType 574 | 575 | val castData = data.withColumn("age", data("age").cast(DoubleType)) 576 | 577 | val Array(castTraining, castTest) = castData.randomSplit(Array(0.8, 0.2), seed = 12345) 578 | ``` 579 | 580 | ```scala 581 | import org.apache.spark.ml.feature.Bucketizer 582 | 583 | val ageBucketizer = new Bucketizer() 584 | .setInputCol("age") 585 | .setOutputCol("age-buckets") 586 | 587 | val lr = new LogisticRegression() 588 | .setFeaturesCol("features") 589 | 590 | val pipelineParamGrid = new ParamGridBuilder() 591 | .addGrid(lr.regParam, Array(1e-3, 5e-4, 1e-4, 5e-5, 1e-5)) 592 | .addGrid(ageBucketizer.splits, Array(Array(15.0, 30.0, 40.0, 50.0, 100.0), 593 | Array(15.0, 21.0, 25.0, 30.0, 40.0, 50.0, 70.0, 100.0))) 594 | .build() 595 | 596 | val assembler = new VectorAssembler() 597 | .setInputCols(Array("age-buckets", 598 | "fnlwgt", 599 | "education-num", 600 | "capital-gain", 601 | "capital-loss", 602 | "hours-per-week") ++ 603 | categCols.map(cname => s"${cname}_vec")) 604 | .setOutputCol("features") 605 | 606 | val mlPipeline = new Pipeline() 607 | .setStages(Array(ageBucketizer) ++ 608 | featureIndexers ++ 609 | oneHotEncoders ++ 610 | Array(assembler, labelIndexer, lr)) 611 | 612 | val pipelineCV = new CrossValidator() 613 | .setEstimator(mlPipeline) 614 | .setEvaluator(new BinaryClassificationEvaluator) 615 | .setEstimatorParamMaps(pipelineParamGrid) 616 | .setNumFolds(5) 617 | 618 | val pipelineCVModel = pipelineCV.fit(castTraining) 619 | 620 | println("cross-validated areaUnderROC: " + pipelineCVModel.avgMetrics.max) 621 | println("test areaUnderROC: " + eval.evaluate(pipelineCVModel.transform(castTest))) 622 | ``` 623 | ``` 624 | cross-validated areaUnderROC: 0.9052412424175416 625 | test areaUnderROC: 0.9033115341268361 626 | ``` 627 | 628 | We can see that adding the `Bucketizer` step to the pipeline, combined with a simultaneous param grid search over several stages (`Bucketizer` and `LogisticRegression`), boosted the quality of our ML pipeline. 
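As a follow-up (a sketch that is not part of the original lab), one way to see which combination of hyperparameters actually won the cross-validation is to pair each `ParamMap` with its average metric, the same way we tabulated `avgMetrics` earlier:

```scala
// Find the ParamMap with the best average cross-validated areaUnderROC.
val (bestParams, bestScore) = pipelineCVModel.getEstimatorParamMaps
  .zip(pipelineCVModel.avgMetrics)
  .maxBy(_._2)

println(s"best cross-validated areaUnderROC: $bestScore")
println(bestParams)  // shows the winning regParam value and age splits

// The fitted stages themselves are available from the best PipelineModel.
import org.apache.spark.ml.PipelineModel
val bestPipeline = pipelineCVModel.bestModel.asInstanceOf[PipelineModel]
```
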
629 | 630 | You can continue to modify and extend the pipeline by adding new data transformation stages and new parameters to the parameter grid for cross-validation. 631 | -------------------------------------------------------------------------------- /labs/IntroToMachineLearning/images/ageHistData.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/drewnoff/spark-notebook-ml-labs/26f80824cece2c3b78c050937ccb62e843e0de65/labs/IntroToMachineLearning/images/ageHistData.png -------------------------------------------------------------------------------- /labs/IntroToMachineLearning/images/cgainHistData.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/drewnoff/spark-notebook-ml-labs/26f80824cece2c3b78c050937ccb62e843e0de65/labs/IntroToMachineLearning/images/cgainHistData.png -------------------------------------------------------------------------------- /labs/IntroToMachineLearning/images/fnlwgtHistData.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/drewnoff/spark-notebook-ml-labs/26f80824cece2c3b78c050937ccb62e843e0de65/labs/IntroToMachineLearning/images/fnlwgtHistData.png -------------------------------------------------------------------------------- /labs/IntroToMachineLearning/images/lrAvgMetrics.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/drewnoff/spark-notebook-ml-labs/26f80824cece2c3b78c050937ccb62e843e0de65/labs/IntroToMachineLearning/images/lrAvgMetrics.png -------------------------------------------------------------------------------- /labs/IntroToMachineLearning/images/rfAvgMetrics.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/drewnoff/spark-notebook-ml-labs/26f80824cece2c3b78c050937ccb62e843e0de65/labs/IntroToMachineLearning/images/rfAvgMetrics.png -------------------------------------------------------------------------------- /labs/IntroToMachineLearning/images/rfAvgMetrics2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/drewnoff/spark-notebook-ml-labs/26f80824cece2c3b78c050937ccb62e843e0de65/labs/IntroToMachineLearning/images/rfAvgMetrics2.png -------------------------------------------------------------------------------- /labs/IntroToMachineLearning/images/treeAvgMetrics.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/drewnoff/spark-notebook-ml-labs/26f80824cece2c3b78c050937ccb62e843e0de65/labs/IntroToMachineLearning/images/treeAvgMetrics.png -------------------------------------------------------------------------------- /labs/TitanicSurvivalExploration/README.md: -------------------------------------------------------------------------------- 1 | # Titanic Survival Exploration 2 | 3 |
4 | 5 | 6 |
7 | 8 | ## Spark quick review 9 | 10 | Spark provides a convenient programming abstraction and a parallel runtime that hide the complexities of distributed computation. 11 | 12 | 13 | 14 | 15 | 16 | In this first lab we will focus on DataFrames and SQL. 17 | In the second lab we will use Spark MLlib to build machine learning pipelines. 18 | 19 | ### Spark Cluster 20 | 21 |
22 | 23 |
24 | 25 | Main entry point for Spark functionality is a `SparkContex`. `SparkContext` tells Spark how to access a cluster. 26 | `Spark Notebook` automatically creates `SparkContext`. 27 | 28 | Examples of `master` parameter configuration for `SparkContext`: 29 | 30 | | Master Parameter | Description | 31 | | ----------------- |----------------------------------------:| 32 | | local[K] | run Spark locally with K worker threads | 33 | | spark://HOST:PORT | connect to Spark Standalone cluster | 34 | | mesos://HOST:PORT | connect to Mesos cluster | 35 | 36 | ```scala 37 | sparkContext 38 | ``` 39 | 40 | ## Spark SQL and DataFrames 41 | 42 | * http://spark.apache.org/docs/latest/sql-programming-guide.html 43 | * http://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.sql.Dataset 44 | 45 | A DataFrame is a distributed collection of data organized into named columns. 46 | It is conceptually equivalent to a table in a relational database or a data frame in R/Python 47 | 48 | The entry point to programming Spark with SQL and DataFrame API in Spark 2.0 is the new `SparkSession` class: 49 | 50 | ```scala 51 | sparkSession 52 | ``` 53 | 54 | ```scala 55 | val spark = sparkSession 56 | 57 | // This import is needed to use the $-notation 58 | import spark.implicits._ 59 | ``` 60 | 61 | With a SparkSession you can create DataFrames from an existing RDD, from files in HDFS or any other storage system, or from Scala collections. 62 | 63 | ```scala 64 | Seq(("Alice", 20, "female"), ("Bob", 31, "male"), ("Eva", 16, "female")).toDF("name", "age", "gender").show() 65 | ``` 66 | 67 | ``` 68 | +-----+---+------+ 69 | | name|age|gender| 70 | +-----+---+------+ 71 | |Alice| 20|female| 72 | | Bob| 31| male| 73 | | Eva| 16|female| 74 | +-----+---+------+ 75 | ``` 76 | 77 | ```scala 78 | case class Person(name: String, age: Int, gender: String) 79 | 80 | val persons = Seq(Person("Alice", 20, "female"), Person("Bob", 31, "male"), Person("Eva", 16, "female")).toDF() 81 | persons.show() 82 | ``` 83 | 84 | ``` 85 | +-----+---+------+ 86 | | name|age|gender| 87 | +-----+---+------+ 88 | |Alice| 20|female| 89 | | Bob| 31| male| 90 | | Eva| 16|female| 91 | +-----+---+------+ 92 | 93 | persons: org.apache.spark.sql.DataFrame = [name: string, age: int ... 1 more field] 94 | ``` 95 | 96 | ```scala 97 | persons.select("name", "age").show() 98 | ``` 99 | 100 | ``` 101 | +-----+---+ 102 | | name|age| 103 | +-----+---+ 104 | |Alice| 20| 105 | | Bob| 31| 106 | | Eva| 16| 107 | +-----+---+ 108 | ``` 109 | 110 | ```scala 111 | val young = persons.filter($"age" < 21) 112 | young.show() 113 | ``` 114 | 115 | ``` 116 | +-----+---+------+ 117 | | name|age|gender| 118 | +-----+---+------+ 119 | |Alice| 20|female| 120 | | Eva| 16|female| 121 | +-----+---+------+ 122 | 123 | young: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [name: string, age: int ... 1 more field] 124 | ``` 125 | 126 | ```scala 127 | young.select(young("name"), ($"age" + 1).alias("incremented age")).show() 128 | ``` 129 | 130 | ``` 131 | +-----+---------------+ 132 | | name|incremented age| 133 | +-----+---------------+ 134 | |Alice| 21| 135 | | Eva| 17| 136 | +-----+---------------+ 137 | ``` 138 | 139 | ```scala 140 | persons.groupBy("gender").count.show 141 | ``` 142 | 143 | ``` 144 | +------+-----+ 145 | |gender|count| 146 | +------+-----+ 147 | |female| 2| 148 | | male| 1| 149 | +------+-----+ 150 | ``` 151 | 152 | # Titanic Dataset 153 | 154 | More on this dataset you can read [here](https://www.kaggle.com/c/titanic/data). 155 | 156 |
157 | 158 |
159 |
160 | By Willy Stöwer, died on 31st May 1931 - Magazine Die Gartenlaube, en:Die Gartenlaube and de:Die Gartenlaube, Public Domain, Link 161 |
162 | 163 | Out of the box, DataFrame supports reading data from the most popular formats, including JSON files, CSV files, Parquet files, Hive tables. 164 | 165 | ```scala 166 | val passengersDF = spark.read 167 | .option("header", "true") 168 | .option("inferSchema", "true") 169 | .csv("notebooks/spark-notebook-ml-labs/labs/TitanicSurvivalExploration/data/titanic_train.csv") 170 | 171 | passengersDF.printSchema 172 | ``` 173 | 174 | ``` 175 | root 176 | |-- PassengerId: integer (nullable = true) 177 | |-- Survived: integer (nullable = true) 178 | |-- Pclass: integer (nullable = true) 179 | |-- Name: string (nullable = true) 180 | |-- Sex: string (nullable = true) 181 | |-- Age: double (nullable = true) 182 | |-- SibSp: integer (nullable = true) 183 | |-- Parch: integer (nullable = true) 184 | |-- Ticket: string (nullable = true) 185 | |-- Fare: double (nullable = true) 186 | |-- Cabin: string (nullable = true) 187 | |-- Embarked: string (nullable = true) 188 | ``` 189 | 190 | Look at 5 records in passengers DataFrame: 191 | 192 | ```scala 193 | passengersDF.show(5, truncate=false) 194 | ``` 195 | 196 | ``` 197 | +-----------+--------+------+---------------------------------------------------+------+----+-----+-----+----------------+-------+-----+--------+ 198 | |PassengerId|Survived|Pclass|Name |Sex |Age |SibSp|Parch|Ticket |Fare |Cabin|Embarked| 199 | +-----------+--------+------+---------------------------------------------------+------+----+-----+-----+----------------+-------+-----+--------+ 200 | |1 |0 |3 |Braund, Mr. Owen Harris |male |22.0|1 |0 |A/5 21171 |7.25 |null |S | 201 | |2 |1 |1 |Cumings, Mrs. John Bradley (Florence Briggs Thayer)|female|38.0|1 |0 |PC 17599 |71.2833|C85 |C | 202 | |3 |1 |3 |Heikkinen, Miss. Laina |female|26.0|0 |0 |STON/O2. 3101282|7.925 |null |S | 203 | |4 |1 |1 |Futrelle, Mrs. Jacques Heath (Lily May Peel) |female|35.0|1 |0 |113803 |53.1 |C123 |S | 204 | |5 |0 |3 |Allen, Mr. William Henry |male |35.0|0 |0 |373450 |8.05 |null |S | 205 | +-----------+--------+------+---------------------------------------------------+------+----+-----+-----+----------------+-------+-----+--------+ 206 | only showing top 5 rows 207 | ``` 208 | 209 | The sql function on a SparkSession enables applications to run SQL queries programmatically and returns the result as a DataFrame. 210 | To do this we need to register the DataFrame as a SQL temporary view 211 | 212 | ```scala 213 | passengersDF.createOrReplaceTempView("passengers") 214 | 215 | spark.sql(""" 216 | SELECT Name, Age, Pclass, Survived FROM passengers 217 | WHERE Age < 30 218 | """).show(3, truncate=false) 219 | ``` 220 | 221 | ``` 222 | +------------------------------+----+------+--------+ 223 | |Name |Age |Pclass|Survived| 224 | +------------------------------+----+------+--------+ 225 | |Braund, Mr. Owen Harris |22.0|3 |0 | 226 | |Heikkinen, Miss. Laina |26.0|3 |1 | 227 | |Palsson, Master. Gosta Leonard|2.0 |3 |0 | 228 | +------------------------------+----+------+--------+ 229 | only showing top 3 rows 230 | ``` 231 | 232 | ### Transformations and Actions 233 | 234 | Spark operations on DataFrames are one of two types. 235 | * Transformations are lazily evaluated and create new Dataframes from existing ones. 236 | * Actions trigger computation and return results or write DataFrames to storage. 237 | 238 | *Computations are only triggered when an action is invoked.* 239 | 240 | Here are some examples. 
241 | 242 | 243 | | Transformations | Actions | 244 | | :-----------------: |:------------:| 245 | | select | count | 246 | | filter | show | 247 | | groupBy | save | 248 | | orderBy | **collect** | 249 | | sample | take | 250 | | limit | reduce | 251 | | withColumn || 252 | | join || 253 | 254 | **Q-1. How many different classes of passengers were aboard the Titanic?** 255 | 256 | ```scala 257 | val pclasses = passengersDF.select("Pclass").distinct 258 | 259 | pclasses.count 260 | ``` 261 | ``` 262 | res141: Long = 3 263 | 3 264 | ``` 265 | 266 | ```scala 267 | pclasses.show 268 | ``` 269 | ``` 270 | +------+ 271 | |Pclass| 272 | +------+ 273 | | 1| 274 | | 3| 275 | | 2| 276 | +------+ 277 | ``` 278 | 279 | ```scala 280 | spark.sql(""" 281 | SELECT DISTINCT Pclass from passengers 282 | """).count 283 | ``` 284 | ``` 285 | res145: Long = 3 286 | 3 287 | ``` 288 | 289 | **Q-2. How many passengers were in each class?** 290 | 291 | ```scala 292 | val numByClass = passengersDF.groupBy("Pclass").count 293 | numByClass.show 294 | ``` 295 | ``` 296 | +------+-----+ 297 | |Pclass|count| 298 | +------+-----+ 299 | | 1| 216| 300 | | 3| 491| 301 | | 2| 184| 302 | +------+-----+ 303 | ``` 304 | 305 | ```scala 306 | spark.sql(""" 307 | SELECT Pclass, count(PassengerID) as class_count FROM passengers 308 | GROUP BY Pclass 309 | ORDER BY class_count DESC 310 | """).show 311 | ``` 312 | ``` 313 | +------+-----------+ 314 | |Pclass|class_count| 315 | +------+-----------+ 316 | | 3| 491| 317 | | 1| 216| 318 | | 2| 184| 319 | +------+-----------+ 320 | ``` 321 | 322 | ```scala 323 | CustomPlotlyChart(numByClass, 324 | layout="{title: 'Passengers per class', xaxis: {title: 'Pclass'}}", 325 | dataOptions="{type: 'bar'}", 326 | dataSources="{x: 'Pclass', y: 'count'}") 327 | ``` 328 | 329 | 330 | 331 | 332 | **Q-3. 
How many women and men were in each class?** 333 | ```scala 334 | val grByGenderAndClass = passengersDF.groupBy("Pclass", "Sex").count 335 | grByGenderAndClass.show() 336 | ``` 337 | ``` 338 | +------+------+-----+ 339 | |Pclass| Sex|count| 340 | +------+------+-----+ 341 | | 2|female| 76| 342 | | 3| male| 347| 343 | | 1| male| 122| 344 | | 3|female| 144| 345 | | 1|female| 94| 346 | | 2| male| 108| 347 | +------+------+-----+ 348 | ``` 349 | 350 | ```scala 351 | CustomPlotlyChart(grByGenderAndClass, 352 | layout="{title: 'Passengers per class', xaxis: {title: 'Pclass'}, barmode: 'group'}", 353 | dataOptions="{type: 'bar', splitBy: 'Sex'}", 354 | dataSources="{x: 'Pclass', y: 'count'}") 355 | ``` 356 | 357 | 358 | 359 | 360 | 361 | ### DataFrame Functions and UDF 362 | 363 | http://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.sql.functions$ 364 | 365 | ```scala 366 | import org.apache.spark.sql.functions.{mean, min, max} 367 | 368 | passengersDF.select(mean("Age").alias("Average Age"), min("Age"), max("Age")).show() 369 | ``` 370 | ``` 371 | +-----------------+--------+--------+ 372 | | Average Age|min(Age)|max(Age)| 373 | +-----------------+--------+--------+ 374 | |29.69911764705882| 0.42| 80.0| 375 | +-----------------+--------+--------+ 376 | ``` 377 | 378 | ```scala 379 | import org.apache.spark.sql.functions.count 380 | 381 | passengersDF.groupBy("Pclass") 382 | .agg(count("Pclass").alias("class_count")) 383 | .orderBy(-$"class_count") 384 | .show 385 | ``` 386 | ``` 387 | +------+-----------+ 388 | |Pclass|class_count| 389 | +------+-----------+ 390 | | 3| 491| 391 | | 1| 216| 392 | | 2| 184| 393 | +------+-----------+ 394 | ``` 395 | 396 | For more specific tasks one can use User Defined Functions. 397 | 398 | Let's say we want to get a column with full names of port of embarkation. 399 | 400 | ```scala 401 | passengersDF.select("Embarked").distinct.show 402 | ``` 403 | ``` 404 | +--------+ 405 | |Embarked| 406 | +--------+ 407 | | Q| 408 | | null| 409 | | C| 410 | | S| 411 | +--------+ 412 | ``` 413 | 414 | From dataset description we know that C = Cherbourg; Q = Queenstown; S = Southampton. 415 | 416 | ```scala 417 | import org.apache.spark.sql.functions.udf 418 | 419 | val embarkedFullName: (String) => String = (embarked: String) => 420 | if (embarked == "Q") 421 | "Queenstown" 422 | else if (embarked == "C") 423 | "Cherbourg" 424 | else 425 | "Southampton" 426 | 427 | 428 | val embarkedFullNameUDF = udf(embarkedFullName) 429 | ``` 430 | 431 | Also we want to get a column with more verbose survival status of passenger: `survived` and `died`. 432 | 433 | ```scala 434 | val survivedStatus: (Integer) => String = (survived: Integer) => 435 | if (survived == 1) 436 | "survived" 437 | else 438 | "died" 439 | 440 | val survivedStatusUDF = udf(survivedStatus) 441 | 442 | val pdf = passengersDF 443 | .withColumn("Embarkation", embarkedFullNameUDF($"Embarked")) 444 | .drop("Embarked") 445 | .withColumn("SurvivedStatus", survivedStatusUDF($"Survived")) 446 | .cache() 447 | 448 | pdf.select("Name", "Embarkation", "SurvivedStatus").show(5, truncate=false) 449 | ``` 450 | ``` 451 | +---------------------------------------------------+-----------+--------------+ 452 | |Name |Embarkation|SurvivedStatus| 453 | +---------------------------------------------------+-----------+--------------+ 454 | |Braund, Mr. Owen Harris |Southampton|died | 455 | |Cumings, Mrs. John Bradley (Florence Briggs Thayer)|Cherbourg |survived | 456 | |Heikkinen, Miss. 
Laina |Southampton|survived | 457 | |Futrelle, Mrs. Jacques Heath (Lily May Peel) |Southampton|survived | 458 | |Allen, Mr. William Henry |Southampton|died | 459 | +---------------------------------------------------+-----------+--------------+ 460 | only showing top 5 rows 461 | ``` 462 | 463 | **Q-5. Count the number and percentage of survivors and dead passengers.** 464 | 465 | ```scala 466 | import org.apache.spark.sql.functions.count 467 | 468 | val numPassengers = pdf.count() 469 | 470 | val grBySurvived = pdf.groupBy("SurvivedStatus") 471 | .agg(count("PassengerId").alias("count"), 472 | ((count("PassengerId") / numPassengers) * 100).alias("%")) 473 | grBySurvived.show 474 | ``` 475 | ``` 476 | +--------------+-----+-----------------+ 477 | |SurvivedStatus|count| %| 478 | +--------------+-----+-----------------+ 479 | | died| 549|61.61616161616161| 480 | | survived| 342|38.38383838383838| 481 | +--------------+-----+-----------------+ 482 | ``` 483 | 484 | **Q-6.** 485 | - **Plot the distribution of dead and surviving passengers.** 486 | - **Plot the distribution of survivors and dead passengers by class.** 487 | - **Plot the distribution of survivors and dead passengers by gender.** 488 | - **Plot the distribution of survivors and dead passengers by port of embarkation.** 489 | - **Plot the % of survivors by port of embarkation.** 490 | - **Plot the distribution of passenger classes by port of embarkation.** 491 | 492 | ```scala 493 | // Distribution of dead and survived passengers 494 | 495 | CustomPlotlyChart(grBySurvived, 496 | layout="{title: 'Passengers by status', xaxis: {title: 'status'}, yaxis: {title: '%'}}", 497 | dataOptions="{type: 'bar'}", 498 | dataSources="{x: 'SurvivedStatus', y: '%'}") 499 | ``` 500 | 501 | 502 | 503 | 504 | ```scala 505 | // Distribution of the number of survivors and dead passengers by class. 506 | 507 | CustomPlotlyChart(pdf.groupBy("SurvivedStatus", "Pclass").count, 508 | layout="{title: 'Number of passengers by survival status per class', xaxis: {title: 'Pclass'}, barmode: 'group'}", 509 | dataOptions="{type: 'bar', splitBy: 'SurvivedStatus'}", 510 | dataSources="{x: 'Pclass', y: 'count'}") 511 | ``` 512 | 513 | 514 | 515 | 516 | ```scala 517 | // Distribution of survivors and dead passengers by gender. 518 | 519 | CustomPlotlyChart(pdf.groupBy("SurvivedStatus", "Sex").count, 520 | layout="{title: 'Number of passengers by status by gender', xaxis: {title: 'Gender'}, barmode: 'group'}", 521 | dataOptions="{type: 'bar', splitBy: 'SurvivedStatus'}", 522 | dataSources="{x: 'Sex', y: 'count'}") 523 | ``` 524 | 525 | 526 | 527 | 528 | ```scala 529 | // Distribution of survivors and dead passengers by port of embarkation. 530 | 531 | CustomPlotlyChart(pdf.groupBy("Embarkation", "SurvivedStatus").count, 532 | layout="{barmode: 'group'}", 533 | dataOptions="{type: 'bar', splitBy: 'SurvivedStatus'}", 534 | dataSources="{x: 'Embarkation', y: 'count'}") 535 | ``` 536 | 537 | 538 | 539 | 540 | ```scala 541 | // % of survivors by port of embarkation. 542 | 543 | CustomPlotlyChart(pdf.groupBy("Embarkation").agg((sum("Survived") / count("Survived") * 100).alias("SurvivalRate")), 544 | layout="{title: '% of survival per embarkation'}", 545 | dataOptions="{type: 'bar'}", 546 | dataSources="{x: 'Embarkation', y: 'SurvivalRate'}") 547 | ``` 548 | 549 | 550 | 551 | 552 | ```scala 553 | // Distribution of passenger classes by port of embarkation. 
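// Note (added comment): barmode 'stack' together with splitBy 'Pclass' stacks the per-class
// counts inside each Embarkation bar, so the full bar height equals the total number of
// passengers embarked at that port.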
554 | 555 | CustomPlotlyChart(pdf.groupBy("Embarkation", "Pclass").count, 556 | layout="{barmode: 'stack', title: 'Pclass distribution by Embarkation'}", 557 | dataOptions="{type: 'bar', splitBy: 'Pclass'}", 558 | dataSources="{x: 'Embarkation', y: 'count'}") 559 | ``` 560 | 561 | 562 | 563 | How to get the % of survived passengers by port of embarkation in this case? 564 | 565 | ```scala 566 | val byEmbark = pdf.groupBy("Embarkation").agg(count("PassengerId").alias("totalCount")) 567 | val byEmbarkByClass = pdf.groupBy("Embarkation", "Pclass").count 568 | 569 | val embarkClassDistr = byEmbarkByClass.join(byEmbark, usingColumn="Embarkation") 570 | .select($"Embarkation", 571 | $"Pclass", 572 | ($"count" / $"totalCount" * 100).alias("%")) 573 | 574 | CustomPlotlyChart(embarkClassDistr, 575 | layout="{barmode: 'stack', title: 'Pclass distribution by Embarkation', yaxis: {title: '%'}}", 576 | dataOptions="{type: 'bar', splitBy: 'Pclass'}", 577 | dataSources="{x: 'Embarkation', y: '%'}") 578 | ``` 579 | 580 | 581 | 582 | 583 | ### Histograms and Box Plots 584 | 585 | **Q-7 Obtain age distributions by passengers survival status.** 586 | 587 | ```scala 588 | CustomPlotlyChart(pdf, 589 | layout="{title: 'Age distribution by status', xaxis: {title: 'Age'}, barmode: 'overlay'}", 590 | dataOptions="{type: 'histogram', opacity: 0.6, splitBy: 'SurvivedStatus'}", 591 | dataSources="{x: 'Age'}") 592 | ``` 593 | 594 | 595 | 596 | ```scala 597 | CustomPlotlyChart(pdf, 598 | layout="{yaxis: {title: 'Age'}}", 599 | dataOptions="{type: 'box', splitBy: 'SurvivedStatus'}", 600 | dataSources="{y: 'Age'}") 601 | ``` 602 | 603 | 604 | 605 | **Q-8. Plot box plots of age distributions by passengers classes.** 606 | 607 | ```scala 608 | CustomPlotlyChart(pdf, 609 | layout="{yaxis: {title: 'Age'}}", 610 | dataOptions="{type: 'box', splitBy: 'Pclass'}", 611 | dataSources="{y: 'Age'}") 612 | ``` 613 | 614 | 615 | 616 | 617 | This scatter plots show the dependences of the chances of survival from the cabin class, age and gender: 618 | 619 | ```scala 620 | val survByClassAndAge = List("male", "female").map{ 621 | gender => 622 | CustomPlotlyChart(pdf.filter($"Sex" === gender), 623 | layout=s"""{ 624 | title: 'Survival by class and age, $gender.', 625 | yaxis: {title: 'class'}, 626 | xaxis: {title: 'age'} 627 | }""", 628 | dataOptions="""{ 629 | splitBy: 'SurvivedStatus', 630 | byTrace: { 631 | 'survived': { 632 | mode: 'markers', 633 | marker: { 634 | size: 20, 635 | opacity: 0.3, 636 | color: 'orange' 637 | } 638 | }, 639 | 'died': { 640 | mode: 'markers', 641 | marker: { 642 | size: 15, 643 | opacity: 0.9, 644 | color: 'rgba(55, 128, 191, 0.6)' 645 | } 646 | } 647 | } 648 | }""", 649 | dataSources = "{x: 'Age', y: 'Pclass'}" 650 | ) 651 | } 652 | 653 | survByClassAndAge(0) 654 | ``` 655 | 656 | 657 | 658 | 659 | ```scala 660 | survByClassAndAge(1) 661 | ``` 662 | 663 | 664 | 665 | 666 | ### More practice with UDF and Box Plots 667 | 668 | The titles of passengers could be useful source of information. Let's explore that. 669 | 670 | **Q-9. Plot box plots of age distributions by title.** 671 | 672 | ```scala 673 | pdf.select("Name").show(3, truncate=false) 674 | ``` 675 | ``` 676 | +---------------------------------------------------+ 677 | |Name | 678 | +---------------------------------------------------+ 679 | |Braund, Mr. Owen Harris | 680 | |Cumings, Mrs. John Bradley (Florence Briggs Thayer)| 681 | |Heikkinen, Miss. 
Laina | 682 | +---------------------------------------------------+ 683 | only showing top 3 rows 684 | ``` 685 | 686 | ```scala 687 | val parseTitle: String => String = (name: String) => 688 | name.split(", ")(1).split("\\.")(0) 689 | 690 | val parseTitleUDF = udf(parseTitle) 691 | 692 | CustomPlotlyChart(pdf.withColumn("Title", parseTitleUDF($"Name")), 693 | layout="{yaxis: {title: 'Age'}}", 694 | dataOptions="{type: 'box', splitBy: 'Title'}", 695 | dataSources="{y: 'Age'}") 696 | ``` 697 | 698 | 699 | 700 | 701 | Often it is good practice to group the values of the categorical feature, especially when there are rare individual feature values such as `Don`, `Lady`, `Capt` in our case. 702 | 703 | **Q-10. Write UDF to group all the titles into five groups according to the following table:** 704 | 705 | | Group | Title | 706 | | :------------:|:------------:| 707 | | Aristocratic | Capt, Col, Don, Dr, Jonkheer, Lady, Major, Rev, Sir, Countess | 708 | | Mrs | Mrs, Ms | 709 | | Miss | Miss, Mlle, Mme | 710 | | Mr | Mr | 711 | | Master | Master | 712 | 713 | ** Create new column called 'TitleGroup' and plot box plots of age distributions by title group.** 714 | 715 | ```scala 716 | val titleGroup: String => String = (title: String) => { 717 | val aristocratic = Set("Capt", "Col", "Don", "Dr", "Jonkheer", "Lady", "Major", "Rev", "Sir", "the Countess") 718 | val mrs = Set("Mrs", "Ms") 719 | val miss = Set("Miss", "Mlle", "Mme") 720 | if (aristocratic.contains(title)) 721 | "Aristocratic" 722 | else if (mrs.contains(title)) 723 | "Mrs" 724 | else if (miss.contains(title)) 725 | "Miss" 726 | else 727 | title 728 | } 729 | 730 | // given column with passenger name obtain column with passenger title group. 731 | val parseTitleGroupUDF = udf(parseTitle andThen titleGroup) 732 | ``` 733 | 734 | ```scala 735 | val withTitleDF = pdf.withColumn("TitleGroup", parseTitleGroupUDF($"Name")) 736 | 737 | CustomPlotlyChart(withTitleDF, 738 | layout="{yaxis: {title: 'Age'}}", 739 | dataOptions="{type: 'box', splitBy: 'TitleGroup'}", 740 | dataSources="{y: 'Age'}") 741 | ``` 742 | 743 | 744 | 745 | 746 | 747 | **Q-11 Plot the distribution of the % of survivors by title group.** 748 | 749 | ```scala 750 | val byTitleGr = withTitleDF 751 | .groupBy("TitleGroup") 752 | .agg((sum("Survived") / count("Survived") * 100).alias("%")) 753 | 754 | CustomPlotlyChart(byTitleGr, 755 | layout="{title: '% of survival by title group'}", 756 | dataOptions="{type: 'bar'}", 757 | dataSources="{x: 'TitleGroup', y: '%'}") 758 | ``` 759 | 760 | 761 | 762 | 763 | ### Handling missing values 764 | 765 | ```scala 766 | import org.apache.spark.sql.functions.isnull 767 | 768 | 100.0 * pdf.filter(isnull($"Age")).count / pdf.count 769 | ``` 770 | ``` 771 | res209: Double = 19.865319865319865 772 | 19.865319865319865 773 | ``` 774 | 775 | ```scala 776 | 100.0 * pdf.filter(isnull($"Cabin")).count / pdf.count 777 | ``` 778 | ``` 779 | res237: Double = 77.10437710437711 780 | 77.10437710437711 781 | ``` 782 | 783 | ```scala 784 | val cabinStatus: (String) => String = (cabin: String) => 785 | if (cabin == null) 786 | "noname" 787 | else 788 | "hasNumber" 789 | 790 | val cabinStatusUDF = udf(cabinStatus) 791 | ``` 792 | 793 | ```scala 794 | val withCabinStatusDF = pdf.withColumn("CabinStatus", cabinStatusUDF($"Cabin")) 795 | ``` 796 | 797 | ```scala 798 | CustomPlotlyChart(withCabinStatusDF.groupBy("CabinStatus", "SurvivedStatus").count, 799 | layout="{title: 'Number of passengers by survival status by cabin type', xaxis: {title: 'Cabin'}}", 800 
| dataOptions="{type: 'bar', splitBy: 'SurvivedStatus'}", 801 | dataSources="{x: 'CabinStatus', y: 'count'}") 802 | ``` 803 | 804 | 805 | 806 | 807 | ### On your own 808 | 809 | Explore family relationships variables (SibSp and Parch). 810 | How does the number of siblings/spouses aboard affect the chances of survival? 811 | How does the number of parents/children aboard affect the chances of survival? 812 | 813 | Invent a new variable called `Family` to represent total number of relatives aboard and explore how does it affect hte chances of survival. 814 | -------------------------------------------------------------------------------- /labs/TitanicSurvivalExploration/TitanicSurvivalExploration.snb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata" : { 3 | "name" : "TitanicSurvivalExploration", 4 | "user_save_timestamp" : "1970-01-01T03:00:00.000Z", 5 | "auto_save_timestamp" : "1970-01-01T03:00:00.000Z", 6 | "language_info" : { 7 | "name" : "scala", 8 | "file_extension" : "scala", 9 | "codemirror_mode" : "text/x-scala" 10 | }, 11 | "trusted" : true, 12 | "customLocalRepo" : null, 13 | "customRepos" : null, 14 | "customDeps" : null, 15 | "customImports" : null, 16 | "customArgs" : null, 17 | "customSparkConf" : { 18 | "spark.app.name" : "ScalaIO Getting Started", 19 | "spark.master" : "local[4]", 20 | "spark.executor.memory" : "2G" 21 | } 22 | }, 23 | "cells" : [ { 24 | "metadata" : { 25 | "id" : "0BA359D1BEC942DB8031E9858B9DD1AA" 26 | }, 27 | "cell_type" : "markdown", 28 | "source" : "# Titanic Survival Exploration" 29 | }, { 30 | "metadata" : { 31 | "id" : "A7F1BB711F984380B18A2940C47EF8E5" 32 | }, 33 | "cell_type" : "markdown", 34 | "source" : "
\n \n \n
" 35 | }, { 36 | "metadata" : { 37 | "id" : "E4266CA3E6134B5E8A554817A707C970" 38 | }, 39 | "cell_type" : "markdown", 40 | "source" : "## Spark quick review" 41 | }, { 42 | "metadata" : { 43 | "id" : "567C1BC18DE446EE98E0CE36009E9831" 44 | }, 45 | "cell_type" : "markdown", 46 | "source" : "Spark provides convenient programming abstraction and parallel runtime to hide distributed computations complexities.\n\n\n\n\n\nIn this first lab we will focus on DataFrames and SQL.\nIn second lab we will use Spark MLlib for building machine learning pipelines." 47 | }, { 48 | "metadata" : { 49 | "id" : "C8DC749EF8F4478DB39C467D73068FF6" 50 | }, 51 | "cell_type" : "markdown", 52 | "source" : "### Spark Cluster" 53 | }, { 54 | "metadata" : { 55 | "id" : "4AD326A983274CEA8B87BE4C98D3CDAD" 56 | }, 57 | "cell_type" : "markdown", 58 | "source" : "
\n \n
" 59 | }, { 60 | "metadata" : { 61 | "id" : "15413540BBFD4F388A44153BB4CF069B" 62 | }, 63 | "cell_type" : "markdown", 64 | "source" : "Main entry point for Spark functionality is a `SparkContex`. `SparkContext` tells Spark how to access a cluster.\n`Spark Notebook` automatically creates `SparkContext`." 65 | }, { 66 | "metadata" : { 67 | "id" : "C825C743E52D43C7BCAEA3F3891995E7" 68 | }, 69 | "cell_type" : "markdown", 70 | "source" : "Examples of `master` parameter configuration for `SparkContext`:\n\n| Master Parameter | Description |\n| ----------------- |----------------------------------------:|\n| local[K] | run Spark locally with K worker threads |\n| spark://HOST:PORT | connect to Spark Standalone cluster |\n| mesos://HOST:PORT | connect to Mesos cluster |" 71 | }, { 72 | "metadata" : { 73 | "trusted" : true, 74 | "input_collapsed" : false, 75 | "collapsed" : false, 76 | "id" : "ABD9BF935E294C88805CD1AEC1E96ADA" 77 | }, 78 | "cell_type" : "code", 79 | "source" : "sparkContext", 80 | "outputs" : [ ] 81 | }, { 82 | "metadata" : { 83 | "trusted" : true, 84 | "input_collapsed" : false, 85 | "collapsed" : true, 86 | "id" : "0A2F5803720C45CA9A889FD74DC877DA" 87 | }, 88 | "cell_type" : "markdown", 89 | "source" : "## Spark SQL and DataFrames" 90 | }, { 91 | "metadata" : { 92 | "id" : "1D4AF5C1BA434D9E829C556E8C2DEE9B" 93 | }, 94 | "cell_type" : "markdown", 95 | "source" : "* http://spark.apache.org/docs/latest/sql-programming-guide.html\n* http://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.sql.Dataset" 96 | }, { 97 | "metadata" : { 98 | "id" : "824436EF882B4EACBAC707DF156DD69F" 99 | }, 100 | "cell_type" : "markdown", 101 | "source" : "A DataFrame is a distributed collection of data organized into named columns.\nIt is conceptually equivalent to a table in a relational database or a data frame in R/Python" 102 | }, { 103 | "metadata" : { 104 | "trusted" : true, 105 | "input_collapsed" : false, 106 | "collapsed" : true, 107 | "id" : "F3CCD76E01224741BFFC12D770595288" 108 | }, 109 | "cell_type" : "markdown", 110 | "source" : "The entry point to programming Spark with SQL and DataFrame API in Spark 2.0 is the new `SparkSession` class:" 111 | }, { 112 | "metadata" : { 113 | "trusted" : true, 114 | "input_collapsed" : false, 115 | "collapsed" : false, 116 | "id" : "2E2F6A73B390466997E51CE0BADA4ECD" 117 | }, 118 | "cell_type" : "code", 119 | "source" : "sparkSession", 120 | "outputs" : [ ] 121 | }, { 122 | "metadata" : { 123 | "trusted" : true, 124 | "input_collapsed" : false, 125 | "collapsed" : false, 126 | "id" : "F0D3F939A3B44F5893ECEC40FA6D1B84" 127 | }, 128 | "cell_type" : "code", 129 | "source" : "val spark = sparkSession", 130 | "outputs" : [ ] 131 | }, { 132 | "metadata" : { 133 | "trusted" : true, 134 | "input_collapsed" : false, 135 | "collapsed" : true, 136 | "id" : "07E8DBC462CA4A7282B9A36CED349356" 137 | }, 138 | "cell_type" : "code", 139 | "source" : "// This import is needed to use the $-notation\nimport spark.implicits._", 140 | "outputs" : [ ] 141 | }, { 142 | "metadata" : { 143 | "id" : "2CDC4126ACA04B7AA763E6D33DD9625A" 144 | }, 145 | "cell_type" : "markdown", 146 | "source" : "With a SparkSession you can create DataFrames from an existing RDD, from files in HDFS or any other storage system, or from Scala collections." 
147 | }, { 148 | "metadata" : { 149 | "trusted" : true, 150 | "input_collapsed" : false, 151 | "collapsed" : false, 152 | "id" : "A1BDE4957E344C76963EEF5F0FA057D7" 153 | }, 154 | "cell_type" : "code", 155 | "source" : "Seq((\"Alice\", 20, \"female\"), (\"Bob\", 31, \"male\"), (\"Eva\", 16, \"female\")).toDF(\"name\", \"age\", \"gender\").show()", 156 | "outputs" : [ ] 157 | }, { 158 | "metadata" : { 159 | "trusted" : true, 160 | "input_collapsed" : false, 161 | "collapsed" : false, 162 | "presentation" : { 163 | "tabs_state" : "{\n \"tab_id\": \"#tab539102187-0\"\n}", 164 | "pivot_chart_state" : "{\n \"hiddenAttributes\": [],\n \"menuLimit\": 200,\n \"cols\": [],\n \"rows\": [],\n \"vals\": [],\n \"exclusions\": {},\n \"inclusions\": {},\n \"unusedAttrsVertical\": 85,\n \"autoSortUnusedAttrs\": false,\n \"inclusionsInfo\": {},\n \"aggregatorName\": \"Count\",\n \"rendererName\": \"Table\"\n}" 165 | }, 166 | "id" : "F953AF369A1D49999B1F547696C898F9" 167 | }, 168 | "cell_type" : "code", 169 | "source" : "case class Person(name: String, age: Int, gender: String)", 170 | "outputs" : [ ] 171 | }, { 172 | "metadata" : { 173 | "trusted" : true, 174 | "input_collapsed" : false, 175 | "collapsed" : false, 176 | "id" : "4D8CE8E8E05340DCB6E4087D9CC583C2" 177 | }, 178 | "cell_type" : "code", 179 | "source" : "val persons = Seq(Person(\"Alice\", 20, \"female\"), Person(\"Bob\", 31, \"male\"), Person(\"Eva\", 16, \"female\")).toDF()\npersons.show()", 180 | "outputs" : [ ] 181 | }, { 182 | "metadata" : { 183 | "trusted" : true, 184 | "input_collapsed" : false, 185 | "collapsed" : false, 186 | "id" : "F4837DEF00DB49518CA0B3E9A79A4B2B" 187 | }, 188 | "cell_type" : "code", 189 | "source" : "persons.select(\"name\", \"age\").show()", 190 | "outputs" : [ ] 191 | }, { 192 | "metadata" : { 193 | "trusted" : true, 194 | "input_collapsed" : false, 195 | "collapsed" : false, 196 | "id" : "12C92D4BA4FE4C35954609339303CDBA" 197 | }, 198 | "cell_type" : "code", 199 | "source" : "val young = persons.filter($\"age\" < 21)\nyoung.show()", 200 | "outputs" : [ ] 201 | }, { 202 | "metadata" : { 203 | "trusted" : true, 204 | "input_collapsed" : false, 205 | "collapsed" : false, 206 | "id" : "153803FA749D4E768373D0FCEABEB951" 207 | }, 208 | "cell_type" : "code", 209 | "source" : "young.select(young(\"name\"), ($\"age\" + 1).alias(\"incremented age\"))", 210 | "outputs" : [ ] 211 | }, { 212 | "metadata" : { 213 | "trusted" : true, 214 | "input_collapsed" : false, 215 | "collapsed" : false, 216 | "id" : "B3D06BA78056468F9359DC1262B02FAA" 217 | }, 218 | "cell_type" : "code", 219 | "source" : "persons.groupBy(\"gender\").count.show", 220 | "outputs" : [ ] 221 | }, { 222 | "metadata" : { 223 | "id" : "E4C4B34A08814E478FB00FB3788F0E36" 224 | }, 225 | "cell_type" : "markdown", 226 | "source" : "## Titanic Dataset" 227 | }, { 228 | "metadata" : { 229 | "id" : "7BF47E9F62104B648B12C00F0B139D3F" 230 | }, 231 | "cell_type" : "markdown", 232 | "source" : "More on this dataset you can read [here](https://www.kaggle.com/c/titanic/data)." 233 | }, { 234 | "metadata" : { 235 | "id" : "ABE465E9052F4B84A8369D2872C159D3" 236 | }, 237 | "cell_type" : "markdown", 238 | "source" : "
\n \n
\n
\n By Willy Stöwer, died on 31st May 1931 - Magazine Die Gartenlaube, en:Die Gartenlaube and de:Die Gartenlaube, Public Domain, Link\n
" 239 | }, { 240 | "metadata" : { 241 | "id" : "6C9768C541FA402583A3991F6F64F981" 242 | }, 243 | "cell_type" : "markdown", 244 | "source" : "Out of the box, DataFrame supports reading data from the most popular formats, including JSON files, CSV files, Parquet files, Hive tables." 245 | }, { 246 | "metadata" : { 247 | "trusted" : true, 248 | "input_collapsed" : false, 249 | "collapsed" : false, 250 | "id" : "39C8B97E811A4F48810AEF1D886F26BF" 251 | }, 252 | "cell_type" : "code", 253 | "source" : "val passengersDF = spark.read\n .option(\"header\", \"true\")\n .option(\"inferSchema\", \"true\")\n .csv(\"notebooks/spark-notebook-ml-labs/labs/TitanicSurvivalExploration/data/titanic_train.csv\") ", 254 | "outputs" : [ ] 255 | }, { 256 | "metadata" : { 257 | "trusted" : true, 258 | "input_collapsed" : false, 259 | "collapsed" : false, 260 | "id" : "1FCA01D29C584C3C8CD81564234B66B8" 261 | }, 262 | "cell_type" : "code", 263 | "source" : "passengersDF.printSchema", 264 | "outputs" : [ ] 265 | }, { 266 | "metadata" : { 267 | "id" : "8BFFFBF5773B431CB9B48760CB5D2E4C" 268 | }, 269 | "cell_type" : "markdown", 270 | "source" : "Look at 5 records in passengers DataFrame:" 271 | }, { 272 | "metadata" : { 273 | "trusted" : true, 274 | "input_collapsed" : false, 275 | "collapsed" : false, 276 | "presentation" : { 277 | "tabs_state" : "{\n \"tab_id\": \"#tab1709273702-0\"\n}", 278 | "pivot_chart_state" : "{\n \"hiddenAttributes\": [],\n \"menuLimit\": 200,\n \"cols\": [],\n \"rows\": [],\n \"vals\": [],\n \"exclusions\": {},\n \"inclusions\": {},\n \"unusedAttrsVertical\": 85,\n \"autoSortUnusedAttrs\": false,\n \"inclusionsInfo\": {},\n \"aggregatorName\": \"Count\",\n \"rendererName\": \"Table\"\n}" 279 | }, 280 | "id" : "0FBC0EB917884F5092DB5870A778C20D" 281 | }, 282 | "cell_type" : "code", 283 | "source" : "passengersDF.limit(5)", 284 | "outputs" : [ ] 285 | }, { 286 | "metadata" : { 287 | "id" : "C7F79F4F8FC44F9FBC0D0007C0CA9D02" 288 | }, 289 | "cell_type" : "markdown", 290 | "source" : "The sql function on a SparkSession enables applications to run SQL queries programmatically and returns the result as a DataFrame.\nTo do this we need to register the DataFrame as a SQL temporary view" 291 | }, { 292 | "metadata" : { 293 | "trusted" : true, 294 | "input_collapsed" : false, 295 | "collapsed" : false, 296 | "id" : "CE6823262FC64FDB9F3F0A561F1A3EA6" 297 | }, 298 | "cell_type" : "code", 299 | "source" : "passengersDF.createOrReplaceTempView(\"passengers\")", 300 | "outputs" : [ ] 301 | }, { 302 | "metadata" : { 303 | "trusted" : true, 304 | "input_collapsed" : false, 305 | "collapsed" : false, 306 | "id" : "1EBC8EA3E63D47418D87F6C9D643344A" 307 | }, 308 | "cell_type" : "code", 309 | "source" : "spark.sql(\"\"\"\n SELECT Name, Age, Pclass, Survived FROM passengers\n WHERE Age < 30\n\"\"\").show(3, truncate=false)", 310 | "outputs" : [ ] 311 | }, { 312 | "metadata" : { 313 | "id" : "4DD3EA91E8F647C285CBF053308ECE28" 314 | }, 315 | "cell_type" : "markdown", 316 | "source" : "### Transformations and Actions" 317 | }, { 318 | "metadata" : { 319 | "id" : "1AD3002411144A64B75E212B903D2D2A" 320 | }, 321 | "cell_type" : "markdown", 322 | "source" : "Spark operations on DataFrames are one of two types. \n* Transformations are lazily evaluated and create new Dataframes from existing ones. 
\n* Actions trigger computation and return results or write DataFrames to storage.\n\n*Computations are only triggered when an action is invoked.*\n\nHere are some examples.\n\n\n| Transformations | Actions |\n| :-----------------: |:------------:|\n| select | count |\n| filter | show |\n| groupBy | save |\n| orderBy | **collect** |\n| sample | take |\n| limit | reduce |\n| withColumn ||\n| join ||" 323 | }, { 324 | "metadata" : { 325 | "trusted" : true, 326 | "input_collapsed" : false, 327 | "collapsed" : true, 328 | "id" : "66265A8E3F3C4682B120E4491E19FE80" 329 | }, 330 | "cell_type" : "markdown", 331 | "source" : "**Q-1. How many different classes of passengers were aboard the Titanic?**" 332 | }, { 333 | "metadata" : { 334 | "trusted" : true, 335 | "input_collapsed" : false, 336 | "collapsed" : false, 337 | "id" : "E06494D18F2A41A988ED237FEE686936" 338 | }, 339 | "cell_type" : "code", 340 | "source" : "val pclasses = passengersDF.select(\"Pclass\").distinct\n\npclasses.count", 341 | "outputs" : [ ] 342 | }, { 343 | "metadata" : { 344 | "trusted" : true, 345 | "input_collapsed" : false, 346 | "collapsed" : false, 347 | "id" : "2D1113AEF17F420C9FE869661E10A7A7" 348 | }, 349 | "cell_type" : "code", 350 | "source" : "pclasses.show", 351 | "outputs" : [ ] 352 | }, { 353 | "metadata" : { 354 | "trusted" : true, 355 | "input_collapsed" : false, 356 | "collapsed" : false, 357 | "id" : "E689CD08EF394D178E415C5F1EB5D164" 358 | }, 359 | "cell_type" : "code", 360 | "source" : "spark.sql(\"\"\"\n SELECT DISTINCT Pclass from passengers\n\"\"\").count", 361 | "outputs" : [ ] 362 | }, { 363 | "metadata" : { 364 | "id" : "ABB5DAB6E64B49F589EC1A3E0EA25756" 365 | }, 366 | "cell_type" : "markdown", 367 | "source" : "**Q-2. How many passengers were in each class?**" 368 | }, { 369 | "metadata" : { 370 | "trusted" : true, 371 | "input_collapsed" : false, 372 | "collapsed" : false, 373 | "id" : "F83908E659D148A29C2B08C20D3F20D3" 374 | }, 375 | "cell_type" : "code", 376 | "source" : "val numByClass = passengersDF.groupBy(\"Pclass\").count\nnumByClass.show", 377 | "outputs" : [ ] 378 | }, { 379 | "metadata" : { 380 | "trusted" : true, 381 | "input_collapsed" : false, 382 | "collapsed" : false, 383 | "id" : "2DE1CB257F9B42979B67818B65835825" 384 | }, 385 | "cell_type" : "code", 386 | "source" : "spark.sql(\"\"\"\n SELECT Pclass, count(PassengerID) as class_count FROM passengers\n GROUP BY Pclass\n ORDER BY class_count DESC\n\"\"\").show", 387 | "outputs" : [ ] 388 | }, { 389 | "metadata" : { 390 | "trusted" : true, 391 | "input_collapsed" : false, 392 | "collapsed" : false, 393 | "presentation" : { 394 | "tabs_state" : "{\n \"tab_id\": \"#tab1686677314-0\"\n}", 395 | "pivot_chart_state" : "{\n \"hiddenAttributes\": [],\n \"menuLimit\": 200,\n \"cols\": [],\n \"rows\": [],\n \"vals\": [],\n \"exclusions\": {},\n \"inclusions\": {},\n \"unusedAttrsVertical\": 85,\n \"autoSortUnusedAttrs\": false,\n \"inclusionsInfo\": {},\n \"aggregatorName\": \"Count\",\n \"rendererName\": \"Table\"\n}" 396 | }, 397 | "id" : "241BF06A77504EE5835B7706DC3D6A80" 398 | }, 399 | "cell_type" : "code", 400 | "source" : "numByClass.collect", 401 | "outputs" : [ ] 402 | }, { 403 | "metadata" : { 404 | "trusted" : true, 405 | "input_collapsed" : false, 406 | "collapsed" : false, 407 | "id" : "539395181C18426A8A92BDFE30B38FF5" 408 | }, 409 | "cell_type" : "code", 410 | "source" : "CustomPlotlyChart(numByClass,\n layout=\"{title: 'Passengers per class', xaxis: {title: 'Pclass'}}\",\n dataOptions=\"{type: 'bar'}\",\n dataSources=\"{x: 
'Pclass', y: 'count'}\")", 411 | "outputs" : [ ] 412 | }, { 413 | "metadata" : { 414 | "trusted" : true, 415 | "input_collapsed" : false, 416 | "collapsed" : true, 417 | "id" : "B22F8FEBB82E49348B5788109E706B07" 418 | }, 419 | "cell_type" : "markdown", 420 | "source" : "**Q-3. How many women and men were in each class?**" 421 | }, { 422 | "metadata" : { 423 | "trusted" : true, 424 | "input_collapsed" : false, 425 | "collapsed" : false, 426 | "id" : "D9767491DA4147ED883CC035089C8C76" 427 | }, 428 | "cell_type" : "code", 429 | "source" : "val grByGenderAndClass = passengersDF.groupBy(\"Pclass\", \"Sex\").count", 430 | "outputs" : [ ] 431 | }, { 432 | "metadata" : { 433 | "trusted" : true, 434 | "input_collapsed" : false, 435 | "collapsed" : false, 436 | "id" : "D6A740F6875146578887A67D62CABE51" 437 | }, 438 | "cell_type" : "code", 439 | "source" : "grByGenderAndClass", 440 | "outputs" : [ ] 441 | }, { 442 | "metadata" : { 443 | "trusted" : true, 444 | "input_collapsed" : false, 445 | "collapsed" : false, 446 | "id" : "B3E3EC5E5ED8444883D27AE4050BAB4D" 447 | }, 448 | "cell_type" : "code", 449 | "source" : "CustomPlotlyChart(grByGenderAndClass,\n layout=\"{title: 'Passengers per class', xaxis: {title: 'Pclass'}, barmode: 'group'}\",\n dataOptions=\"{type: 'bar', splitBy: 'Sex'}\",\n dataSources=\"{x: 'Pclass', y: 'count'}\")", 450 | "outputs" : [ ] 451 | }, { 452 | "metadata" : { 453 | "id" : "8852C6D27A7548678558B726FA4EC0CA" 454 | }, 455 | "cell_type" : "markdown", 456 | "source" : "### DataFrame Functions and UDF" 457 | }, { 458 | "metadata" : { 459 | "id" : "0C12589398E54FDB9CB5303E2ACF6600" 460 | }, 461 | "cell_type" : "markdown", 462 | "source" : "http://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.sql.functions$" 463 | }, { 464 | "metadata" : { 465 | "trusted" : true, 466 | "input_collapsed" : false, 467 | "collapsed" : false, 468 | "id" : "B4A586C3124641D08FD92E291914325C" 469 | }, 470 | "cell_type" : "code", 471 | "source" : "import org.apache.spark.sql.functions.{mean, min, max}\n\npassengersDF.select(mean(\"Age\").alias(\"Average Age\"), min(\"Age\"), max(\"Age\")).show()", 472 | "outputs" : [ ] 473 | }, { 474 | "metadata" : { 475 | "trusted" : true, 476 | "input_collapsed" : false, 477 | "collapsed" : false, 478 | "id" : "36483A5F3D3148FA867F7C00FCBDFE5E" 479 | }, 480 | "cell_type" : "code", 481 | "source" : "import org.apache.spark.sql.functions.count\n\npassengersDF.groupBy(\"Pclass\")\n .agg(count(\"Pclass\").alias(\"class_count\"))\n .orderBy(-$\"class_count\")\n .show", 482 | "outputs" : [ ] 483 | }, { 484 | "metadata" : { 485 | "id" : "857AD3D31D3447388F2EF03F2150EF7E" 486 | }, 487 | "cell_type" : "markdown", 488 | "source" : "For more specific tasks one can use User Defined Functions.\n\nLet's say we want to get a column with full names of port of embarkation." 489 | }, { 490 | "metadata" : { 491 | "trusted" : true, 492 | "input_collapsed" : false, 493 | "collapsed" : false, 494 | "id" : "51DB33FCFB394E358EDE6C6C7D4AEF8B" 495 | }, 496 | "cell_type" : "code", 497 | "source" : "passengersDF.select(\"Embarked\").distinct.show", 498 | "outputs" : [ ] 499 | }, { 500 | "metadata" : { 501 | "id" : "1ACADA9161D94D6393E9639507EF6B77" 502 | }, 503 | "cell_type" : "markdown", 504 | "source" : "From dataset description we know that C = Cherbourg; Q = Queenstown; S = Southampton." 
505 | }, { 506 | "metadata" : { 507 | "trusted" : true, 508 | "input_collapsed" : false, 509 | "collapsed" : false, 510 | "id" : "B2C67EAA4DAD4D048297CCEF98601BD9" 511 | }, 512 | "cell_type" : "code", 513 | "source" : "import org.apache.spark.sql.functions.udf\n\nval embarkedFullName: (String) => String = (embarked: String) =>\n if (embarked == \"Q\")\n \"Queenstown\"\n else if (embarked == \"C\")\n \"Cherbourg\"\n else\n \"Southampton\"\n\n\nval embarkedFullNameUDF = udf(embarkedFullName)", 514 | "outputs" : [ ] 515 | }, { 516 | "metadata" : { 517 | "id" : "CBA7286CBB9B40C08E961A5AFFC562AA" 518 | }, 519 | "cell_type" : "markdown", 520 | "source" : "Also we want to get a column with more verbose survival status of passenger: `survived` and `died`." 521 | }, { 522 | "metadata" : { 523 | "trusted" : true, 524 | "input_collapsed" : false, 525 | "collapsed" : false, 526 | "id" : "E7158FE3637440D29AA7C0611532686D" 527 | }, 528 | "cell_type" : "code", 529 | "source" : "val survivedStatus: (Integer) => String = (survived: Integer) =>\n if (survived == 1)\n \"survived\"\n else\n \"died\"\n\nval survivedStatusUDF = udf(survivedStatus)", 530 | "outputs" : [ ] 531 | }, { 532 | "metadata" : { 533 | "trusted" : true, 534 | "input_collapsed" : false, 535 | "collapsed" : false, 536 | "id" : "6EBB943DEAB247D084A70AD43DAAD151" 537 | }, 538 | "cell_type" : "code", 539 | "source" : "val pdf = passengersDF\n .withColumn(\"Embarkation\", embarkedFullNameUDF($\"Embarked\"))\n .drop(\"Embarked\")\n .withColumn(\"SurvivedStatus\", survivedStatusUDF($\"Survived\"))\n .cache()", 540 | "outputs" : [ ] 541 | }, { 542 | "metadata" : { 543 | "trusted" : true, 544 | "input_collapsed" : false, 545 | "collapsed" : false, 546 | "id" : "9111446574A142638399B4B2FDBFE2E0" 547 | }, 548 | "cell_type" : "code", 549 | "source" : "pdf.select(\"Name\", \"Embarkation\", \"SurvivedStatus\").limit(5)", 550 | "outputs" : [ ] 551 | }, { 552 | "metadata" : { 553 | "id" : "7478E5D34328498FAFAFCE7587733B8B" 554 | }, 555 | "cell_type" : "markdown", 556 | "source" : "### Practice session" 557 | }, { 558 | "metadata" : { 559 | "trusted" : true, 560 | "input_collapsed" : false, 561 | "collapsed" : false, 562 | "id" : "FE0F6FE181A14D70B5BC90E69DC471BE" 563 | }, 564 | "cell_type" : "markdown", 565 | "source" : "**Q-5. 
Count the number and percentage of survivors and dead passengers.**" 566 | }, { 567 | "metadata" : { 568 | "trusted" : true, 569 | "input_collapsed" : false, 570 | "collapsed" : false, 571 | "id" : "D8C3FC886C224BA1B40FC6B96E221410" 572 | }, 573 | "cell_type" : "code", 574 | "source" : "import org.apache.spark.sql.functions.count\n\nval numPassengers = pdf.count()\n\nval grBySurvived = pdf.groupBy(\"SurvivedStatus\")\n .agg(count(\"PassengerId\").alias(\"count\"), \n ((count(\"PassengerId\") / numPassengers) * 100).alias(\"%\"))\ngrBySurvived.show", 575 | "outputs" : [ ] 576 | }, { 577 | "metadata" : { 578 | "id" : "F2E89C82208D4CE1BD756D6147752C26" 579 | }, 580 | "cell_type" : "markdown", 581 | "source" : "**Q-6.** \n- **Plot the distribution of dead and surviving passengers.**\n- **Plot the distribution of survivors and dead passengers by class.**\n- **Plot the distribution of survivors and dead passengers by gender.**\n- **Plot the distribution of survivors and dead passengers by port of embarkation.**\n- **Plot the % of survivors by port of embarkation.**\n- **Plot the distribution of passenger classes by port of embarkation.**" 582 | }, { 583 | "metadata" : { 584 | "trusted" : true, 585 | "input_collapsed" : false, 586 | "collapsed" : false, 587 | "id" : "DCFE583EBC614827B2192913336A270C" 588 | }, 589 | "cell_type" : "code", 590 | "source" : "// Distribution of dead and survived passengers\n\nCustomPlotlyChart(grBySurvived,\n layout=\"{title: 'Passengers by status', xaxis: {title: 'status'}, yaxis: {title: '%'}}\",\n dataOptions=\"{type: 'bar'}\",\n dataSources=\"{x: 'SurvivedStatus', y: '%'}\")", 591 | "outputs" : [ ] 592 | }, { 593 | "metadata" : { 594 | "trusted" : true, 595 | "input_collapsed" : false, 596 | "collapsed" : false, 597 | "id" : "FF53CEB29DED42CCBBEF5061F090D300" 598 | }, 599 | "cell_type" : "code", 600 | "source" : "// Distribution of the number of survivors and dead passengers by class.\n\nCustomPlotlyChart(pdf.groupBy(\"SurvivedStatus\", \"Pclass\").count,\n layout=\"{title: 'Number of passengers by survival status per class', xaxis: {title: 'Pclass'}, barmode: 'group'}\",\n dataOptions=\"{type: 'bar', splitBy: 'SurvivedStatus'}\",\n dataSources=\"{x: 'Pclass', y: 'count'}\")", 601 | "outputs" : [ ] 602 | }, { 603 | "metadata" : { 604 | "trusted" : true, 605 | "input_collapsed" : false, 606 | "collapsed" : false, 607 | "id" : "B0A55CA3DB65485D8EAD5DC01B1EBD43" 608 | }, 609 | "cell_type" : "code", 610 | "source" : "// Distribution of survivors and dead passengers by gender.\n\nCustomPlotlyChart(pdf.groupBy(\"SurvivedStatus\", \"Sex\").count,\n layout=\"{title: 'Number of passengers by status by gender', xaxis: {title: 'Gender'}, barmode: 'group'}\",\n dataOptions=\"{type: 'bar', splitBy: 'SurvivedStatus'}\",\n dataSources=\"{x: 'Sex', y: 'count'}\")", 611 | "outputs" : [ ] 612 | }, { 613 | "metadata" : { 614 | "trusted" : true, 615 | "input_collapsed" : false, 616 | "collapsed" : false, 617 | "id" : "4382CC3408474AC591A917BE71C0D6DD" 618 | }, 619 | "cell_type" : "code", 620 | "source" : "// Distribution of survivors and dead passengers by port of embarkation.\n\nCustomPlotlyChart(pdf.groupBy(\"Embarkation\", \"SurvivedStatus\").count,\n layout=\"{barmode: 'group'}\",\n dataOptions=\"{type: 'bar', splitBy: 'SurvivedStatus'}\",\n dataSources=\"{x: 'Embarkation', y: 'count'}\")", 621 | "outputs" : [ ] 622 | }, { 623 | "metadata" : { 624 | "trusted" : true, 625 | "input_collapsed" : false, 626 | "collapsed" : false, 627 | "presentation" : { 628 | "tabs_state" : "{\n 
\"tab_id\": \"#tab348620047-1\"\n}", 629 | "pivot_chart_state" : "{\n \"hiddenAttributes\": [],\n \"menuLimit\": 200,\n \"cols\": [],\n \"rows\": [],\n \"vals\": [],\n \"exclusions\": {},\n \"inclusions\": {},\n \"unusedAttrsVertical\": 85,\n \"autoSortUnusedAttrs\": false,\n \"inclusionsInfo\": {},\n \"aggregatorName\": \"Count\",\n \"rendererName\": \"Table\"\n}" 630 | }, 631 | "id" : "C92E240948514A769695125D47A6F3E8" 632 | }, 633 | "cell_type" : "code", 634 | "source" : "// % of survivors by port of embarkation.\n\nCustomPlotlyChart(pdf.groupBy(\"Embarkation\").agg((sum(\"Survived\") / count(\"Survived\") * 100).alias(\"SurvivalRate\")),\n layout=\"{title: '% of survival per embarkation'}\",\n dataOptions=\"{type: 'bar'}\",\n dataSources=\"{x: 'Embarkation', y: 'SurvivalRate'}\")", 635 | "outputs" : [ ] 636 | }, { 637 | "metadata" : { 638 | "trusted" : true, 639 | "input_collapsed" : false, 640 | "collapsed" : false, 641 | "id" : "EE39CF0D57B1481A917D276C5D55275D" 642 | }, 643 | "cell_type" : "code", 644 | "source" : "// Distribution of passenger classes by port of embarkation.\n\nCustomPlotlyChart(pdf.groupBy(\"Embarkation\", \"Pclass\").count,\n layout=\"{barmode: 'stack', title: 'Pclass distribution by Embarkation'}\",\n dataOptions=\"{type: 'bar', splitBy: 'Pclass'}\",\n dataSources=\"{x: 'Embarkation', y: 'count'}\")", 645 | "outputs" : [ ] 646 | }, { 647 | "metadata" : { 648 | "id" : "B94C5A499DEF4344AFAFC71EDEA4E4EB" 649 | }, 650 | "cell_type" : "markdown", 651 | "source" : "How to get the % of survived passengers by port of embarkation in this case?" 652 | }, { 653 | "metadata" : { 654 | "trusted" : true, 655 | "input_collapsed" : false, 656 | "collapsed" : false, 657 | "id" : "42A9EA0451BF421E91A475E4ABD0A186" 658 | }, 659 | "cell_type" : "code", 660 | "source" : "val byEmbark = pdf.groupBy(\"Embarkation\").agg(count(\"PassengerId\").alias(\"totalCount\"))\nval byEmbarkByClass = pdf.groupBy(\"Embarkation\", \"Pclass\").count", 661 | "outputs" : [ ] 662 | }, { 663 | "metadata" : { 664 | "trusted" : true, 665 | "input_collapsed" : false, 666 | "collapsed" : false, 667 | "id" : "02AC8277D19648188507F5312C51E562" 668 | }, 669 | "cell_type" : "code", 670 | "source" : "val embarkClassDistr = byEmbarkByClass.join(byEmbark, usingColumn=\"Embarkation\")\n .select($\"Embarkation\",\n $\"Pclass\", \n ($\"count\" / $\"totalCount\" * 100).alias(\"%\"))\n\nCustomPlotlyChart(embarkClassDistr,\n layout=\"{barmode: 'stack', title: 'Pclass distribution by Embarkation', yaxis: {title: '%'}}\",\n dataOptions=\"{type: 'bar', splitBy: 'Pclass'}\",\n dataSources=\"{x: 'Embarkation', y: '%'}\")", 671 | "outputs" : [ ] 672 | }, { 673 | "metadata" : { 674 | "id" : "ED9BD96273124BE6B502992651149D91" 675 | }, 676 | "cell_type" : "markdown", 677 | "source" : "### Histograms and Box Plots" 678 | }, { 679 | "metadata" : { 680 | "id" : "AB9E955BCB114FB3882A6EDDDF015FAE" 681 | }, 682 | "cell_type" : "markdown", 683 | "source" : "**Q-7 Obtain age distributions by passengers survival status.**" 684 | }, { 685 | "metadata" : { 686 | "trusted" : true, 687 | "input_collapsed" : false, 688 | "collapsed" : false, 689 | "id" : "FD743A9848C74C8DB1B1A7E93E351BAD" 690 | }, 691 | "cell_type" : "code", 692 | "source" : "CustomPlotlyChart(pdf, \n layout=\"{title: 'Age distribution by status', xaxis: {title: 'Age'}, barmode: 'overlay'}\",\n dataOptions=\"{type: 'histogram', opacity: 0.6, splitBy: 'SurvivedStatus'}\",\n dataSources=\"{x: 'Age'}\")", 693 | "outputs" : [ ] 694 | }, { 695 | "metadata" : { 696 | "trusted" : 
true, 697 | "input_collapsed" : false, 698 | "collapsed" : false, 699 | "id" : "A825DE70B29E46A58167CF7D5F9DA4AF" 700 | }, 701 | "cell_type" : "code", 702 | "source" : "CustomPlotlyChart(pdf, \n layout=\"{yaxis: {title: 'Age'}}\",\n dataOptions=\"{type: 'box', splitBy: 'SurvivedStatus'}\",\n dataSources=\"{y: 'Age'}\")", 703 | "outputs" : [ ] 704 | }, { 705 | "metadata" : { 706 | "id" : "3B308D4AABF1491485890B86C4D4BFC8" 707 | }, 708 | "cell_type" : "markdown", 709 | "source" : "**Q-8. Plot box plots of age distributions by passengers classes.**" 710 | }, { 711 | "metadata" : { 712 | "trusted" : true, 713 | "input_collapsed" : false, 714 | "collapsed" : false, 715 | "id" : "D26A8F4FF341485AAD1034460EDA6761" 716 | }, 717 | "cell_type" : "code", 718 | "source" : "CustomPlotlyChart(pdf, \n layout=\"{yaxis: {title: 'Age'}}\",\n dataOptions=\"{type: 'box', splitBy: 'Pclass'}\",\n dataSources=\"{y: 'Age'}\")", 719 | "outputs" : [ ] 720 | }, { 721 | "metadata" : { 722 | "id" : "D683E06CFC934A098175544B47E6C701" 723 | }, 724 | "cell_type" : "markdown", 725 | "source" : "This scatter plots show the dependences of the chances of survival from the cabin class, age and gender:" 726 | }, { 727 | "metadata" : { 728 | "trusted" : true, 729 | "input_collapsed" : false, 730 | "collapsed" : false, 731 | "id" : "A6CB9EB11CED4E1F8623B9D2ED582AAE" 732 | }, 733 | "cell_type" : "code", 734 | "source" : "val survByClassAndAge = List(\"male\", \"female\").map{\n gender =>\n CustomPlotlyChart(pdf.filter($\"Sex\" === gender),\n layout=s\"\"\"{\n title: 'Survival by class and age, $gender.', \n yaxis: {title: 'class'}, \n xaxis: {title: 'age'}\n }\"\"\",\n dataOptions=\"\"\"{\n splitBy: 'SurvivedStatus',\n byTrace: {\n 'survived': {\n mode: 'markers',\n marker: {\n size: 20,\n opacity: 0.3,\n color: 'orange'\n }\n },\n 'died': {\n mode: 'markers',\n marker: {\n size: 15,\n opacity: 0.9,\n color: 'rgba(55, 128, 191, 0.6)'\n }\n }\n }\n }\"\"\",\n dataSources = \"{x: 'Age', y: 'Pclass'}\"\n )\n}", 735 | "outputs" : [ ] 736 | }, { 737 | "metadata" : { 738 | "trusted" : true, 739 | "input_collapsed" : false, 740 | "collapsed" : false, 741 | "id" : "A1F17AC9F5634BCC866AC03BD35CAF52" 742 | }, 743 | "cell_type" : "code", 744 | "source" : "survByClassAndAge(0)", 745 | "outputs" : [ ] 746 | }, { 747 | "metadata" : { 748 | "trusted" : true, 749 | "input_collapsed" : false, 750 | "collapsed" : false, 751 | "id" : "E752E11B5E3E415DB83B1752E0CAA39C" 752 | }, 753 | "cell_type" : "code", 754 | "source" : "survByClassAndAge(1)", 755 | "outputs" : [ ] 756 | }, { 757 | "metadata" : { 758 | "id" : "DF18F97FFCB44B42A657D6002C72B0E5" 759 | }, 760 | "cell_type" : "markdown", 761 | "source" : "### More practice with UDF and Box Plots" 762 | }, { 763 | "metadata" : { 764 | "id" : "895FA63345FC47F288917539A9E50014" 765 | }, 766 | "cell_type" : "markdown", 767 | "source" : "The titles of passengers could be useful source of information. Let's explore that." 768 | }, { 769 | "metadata" : { 770 | "id" : "9BDE6CD9C586463E8CE2C86FEC396932" 771 | }, 772 | "cell_type" : "markdown", 773 | "source" : "**Q-9. 
Plot box plots of age distributions by title.**" 774 | }, { 775 | "metadata" : { 776 | "trusted" : true, 777 | "input_collapsed" : false, 778 | "collapsed" : false, 779 | "id" : "69E2BD21E9E14113AE3842909DC135E7" 780 | }, 781 | "cell_type" : "code", 782 | "source" : "pdf.select(\"Name\").show(3, truncate=false)", 783 | "outputs" : [ ] 784 | }, { 785 | "metadata" : { 786 | "trusted" : true, 787 | "input_collapsed" : false, 788 | "collapsed" : false, 789 | "id" : "A16D246A68A34A13AB667BB060F8785F" 790 | }, 791 | "cell_type" : "code", 792 | "source" : "val parseTitle: String => String = (name: String) =>\n name.split(\", \")(1).split(\"\\\\.\")(0)\n\nval parseTitleUDF = udf(parseTitle)", 793 | "outputs" : [ ] 794 | }, { 795 | "metadata" : { 796 | "trusted" : true, 797 | "input_collapsed" : false, 798 | "collapsed" : false, 799 | "id" : "3FCBCF42719244908AD3271F198B723A" 800 | }, 801 | "cell_type" : "code", 802 | "source" : "CustomPlotlyChart(pdf.withColumn(\"Title\", parseTitleUDF($\"Name\")), \n layout=\"{yaxis: {title: 'Age'}}\",\n dataOptions=\"{type: 'box', splitBy: 'Title'}\",\n dataSources=\"{y: 'Age'}\")", 803 | "outputs" : [ ] 804 | }, { 805 | "metadata" : { 806 | "id" : "7C40B636686F448D880E4EE0E4C1DC4E" 807 | }, 808 | "cell_type" : "markdown", 809 | "source" : "It is often good practice to group the values of a categorical feature, especially when there are rare individual values such as `Don`, `Lady`, and `Capt` in our case." 810 | }, { 811 | "metadata" : { 812 | "id" : "5CAAC988C4C249E9899F435D95FA2BB8" 813 | }, 814 | "cell_type" : "markdown", 815 | "source" : "**Q-10. Write a UDF to group all the titles into five groups according to the following table:**\n\n| Group | Title |\n| :------------:|:------------:|\n| Aristocratic | Capt, Col, Don, Dr, Jonkheer, Lady, Major, Rev, Sir, Countess |\n| Mrs | Mrs, Ms |\n| Miss | Miss, Mlle, Mme |\n| Mr | Mr |\n| Master | Master |\n\n**Create a new column called 'TitleGroup' and plot box plots of age distributions by title group.**" 816 | }, { 817 | "metadata" : { 818 | "trusted" : true, 819 | "input_collapsed" : false, 820 | "collapsed" : false, 821 | "id" : "F32C7967F2C346548B5A2143BCE73D80" 822 | }, 823 | "cell_type" : "code", 824 | "source" : "val titleGroup: String => String = (title: String) => {\n val aristocratic = Set(\"Capt\", \"Col\", \"Don\", \"Dr\", \"Jonkheer\", \"Lady\", \"Major\", \"Rev\", \"Sir\", \"the Countess\")\n val mrs = Set(\"Mrs\", \"Ms\")\n val miss = Set(\"Miss\", \"Mlle\", \"Mme\")\n if (aristocratic.contains(title))\n \"Aristocratic\"\n else if (mrs.contains(title))\n \"Mrs\"\n else if (miss.contains(title))\n \"Miss\"\n else\n title\n}\n\n// Given a column with the passenger name, obtain a column with the passenger's title group.\nval parseTitleGroupUDF = udf(parseTitle andThen titleGroup)", 825 | "outputs" : [ ] 826 | }, { 827 | "metadata" : { 828 | "trusted" : true, 829 | "input_collapsed" : false, 830 | "collapsed" : false, 831 | "id" : "115AAD76AD1C4FB385A3144AFCE13A92" 832 | }, 833 | "cell_type" : "code", 834 | "source" : "val withTitleDF = pdf.withColumn(\"TitleGroup\", parseTitleGroupUDF($\"Name\"))\n\nCustomPlotlyChart(withTitleDF, \n layout=\"{yaxis: {title: 'Age'}}\",\n dataOptions=\"{type: 'box', splitBy: 'TitleGroup'}\",\n dataSources=\"{y: 'Age'}\")", 835 | "outputs" : [ ] 836 | }, { 837 | "metadata" : { 838 | "id" : "5247C9EABE594E778E2012CA5161DA2E" 839 | }, 840 | "cell_type" : "markdown", 841 | "source" : "**Q-11. Plot the % of survivors by title group.**" 842 | }, { 843 | "metadata" : {
844 | "trusted" : true, 845 | "input_collapsed" : false, 846 | "collapsed" : false, 847 | "id" : "C1412161CF1942288B64C419CEDC2A81" 848 | }, 849 | "cell_type" : "code", 850 | "source" : "val byTitleGr = withTitleDF\n .groupBy(\"TitleGroup\")\n .agg((sum(\"Survived\") / count(\"Survived\") * 100).alias(\"%\"))\n\nCustomPlotlyChart(byTitleGr,\n layout=\"{title: '% of survival by title group'}\",\n dataOptions=\"{type: 'bar'}\",\n dataSources=\"{x: 'TitleGroup', y: '%'}\")", 851 | "outputs" : [ ] 852 | }, { 853 | "metadata" : { 854 | "id" : "7C6408089C9F4EB5B32066C66E7E8306" 855 | }, 856 | "cell_type" : "markdown", 857 | "source" : "### Handling missing values" 858 | }, { 859 | "metadata" : { 860 | "trusted" : true, 861 | "input_collapsed" : false, 862 | "collapsed" : false, 863 | "id" : "339B055ADE124967B44CA329F13B857B" 864 | }, 865 | "cell_type" : "code", 866 | "source" : "import org.apache.spark.sql.functions.isnull\n\n100.0 * pdf.filter(isnull($\"Age\")).count / pdf.count", 867 | "outputs" : [ ] 868 | }, { 869 | "metadata" : { 870 | "trusted" : true, 871 | "input_collapsed" : false, 872 | "collapsed" : false, 873 | "id" : "5FD862836CBD4E3A80F284772118043D" 874 | }, 875 | "cell_type" : "code", 876 | "source" : "100.0 * pdf.filter(isnull($\"Cabin\")).count / pdf.count", 877 | "outputs" : [ ] 878 | }, { 879 | "metadata" : { 880 | "trusted" : true, 881 | "input_collapsed" : false, 882 | "collapsed" : false, 883 | "id" : "1EFDAA762DB1431DAEF1F3C9F34B545A" 884 | }, 885 | "cell_type" : "code", 886 | "source" : "val cabinStatus: (String) => String = (cabin: String) =>\n if (cabin == null)\n \"noname\"\n else\n \"hasNumber\"\n\nval cabinStatusUDF = udf(cabinStatus)", 887 | "outputs" : [ ] 888 | }, { 889 | "metadata" : { 890 | "trusted" : true, 891 | "input_collapsed" : false, 892 | "collapsed" : false, 893 | "id" : "0C1F6D0D8C04490385680BC9F370D7A4" 894 | }, 895 | "cell_type" : "code", 896 | "source" : "val withCabinStatusDF = pdf.withColumn(\"CabinStatus\", cabinStatusUDF($\"Cabin\"))", 897 | "outputs" : [ ] 898 | }, { 899 | "metadata" : { 900 | "trusted" : true, 901 | "input_collapsed" : false, 902 | "collapsed" : false, 903 | "id" : "2E23AA3B714F49E286B71AD626AEC7C5" 904 | }, 905 | "cell_type" : "code", 906 | "source" : "CustomPlotlyChart(withCabinStatusDF.groupBy(\"CabinStatus\", \"SurvivedStatus\").count,\n layout=\"{title: 'Number of passengers by survival status and cabin type', xaxis: {title: 'Cabin'}}\",\n dataOptions=\"{type: 'bar', splitBy: 'SurvivedStatus'}\",\n dataSources=\"{x: 'CabinStatus', y: 'count'}\")", 907 | "outputs" : [ ] 908 | }, { 909 | "metadata" : { 910 | "id" : "5AD2074CF32B4E40A18B17FCD63250AB" 911 | }, 912 | "cell_type" : "markdown", 913 | "source" : "### On your own" 914 | }, { 915 | "metadata" : { 916 | "id" : "CE92499FD6F74A79B7A5E3B2A5E90831" 917 | }, 918 | "cell_type" : "markdown", 919 | "source" : "Explore the family relationship variables (SibSp and Parch).\nHow does the number of siblings/spouses aboard affect the chances of survival?\nHow does the number of parents/children aboard affect the chances of survival?\n\nInvent a new variable called `Family` to represent the total number of relatives aboard and explore how it affects the chances of survival." 920 | } ], 921 | "nbformat" : 4 922 | } --------------------------------------------------------------------------------