├── Chapter 02 └── Chapter_2_code.ipynb ├── Chapter 03 └── Chapter_3_code.ipynb ├── Chapter 04 └── Chapter_4_code.ipynb ├── Chapter 05 └── Chapter_5_code.ipynb ├── Chapter 06 └── Chapter_6_code.ipynb ├── Chapter 07 └── Chapter_7_code.ipynb ├── Chapter 08 ├── Chapter_8_code_HDFS.ipynb ├── Chapter_8_code_MR.ipynb ├── Chapter_8_code_Spark.ipynb └── Chapter_8_code_Vagrantfile ├── Chapter 09 ├── Chapter_9_code_01.ipynb └── Chapter_9_code_02.ipynb ├── LICENSE ├── README.md └── vowpal_wabbit_for_windows ├── x64 └── vw.exe └── x86 └── vw.exe /Chapter 03/Chapter_3_code.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Datasets for experimenting yourself" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": { 14 | "collapsed": true 15 | }, 16 | "outputs": [], 17 | "source": [ 18 | "import urllib2 # import urllib.request as urllib2 in Python3\n", 19 | "import requests, io, os, StringIO\n", 20 | "import numpy as np\n", 21 | "import tarfile, zipfile, gzip\n", 22 | "\n", 23 | "\n", 24 | "def unzip_from_UCI(UCI_url, dest=''):\n", 25 | " \"\"\"\n", 26 | " Downloads and unpacks datasets from UCI in zip format\n", 27 | " \"\"\"\n", 28 | " response = requests.get(UCI_url)\n", 29 | " compressed_file = io.BytesIO(response.content)\n", 30 | " z = zipfile.ZipFile(compressed_file)\n", 31 | " print ('Extracting in %s' % os.getcwd()+'\\\\'+dest)\n", 32 | " for name in z.namelist():\n", 33 | " if '.csv' in name:\n", 34 | " print ('\\tunzipping %s' %name)\n", 35 | " z.extract(name, path=os.getcwd()+'\\\\'+dest)\n", 36 | "\n", 37 | "def gzip_from_UCI(UCI_url, dest=''):\n", 38 | " \"\"\"\n", 39 | " Downloads and unpacks datasets from UCI in gzip format\n", 40 | " \"\"\"\n", 41 | " response = urllib2.urlopen(UCI_url)\n", 42 | " compressed_file = io.BytesIO(response.read())\n", 43 | " decompressed_file = gzip.GzipFile(fileobj=compressed_file)\n", 44 | " filename = UCI_url.split('/')[-1][:-3]\n", 45 | " with open(os.getcwd()+'\\\\'+filename, 'wb') as outfile:\n", 46 | " outfile.write(decompressed_file.read())\n", 47 | " print ('File %s decompressed' % filename)\n", 48 | " \n", 49 | "def targzip_from_UCI(UCI_url, dest='.'):\n", 50 | " \"\"\"\n", 51 | " Downloads and unpacks datasets from UCI in tar.gz format\n", 52 | " \"\"\"\n", 53 | " response = urllib2.urlopen(UCI_url)\n", 54 | " compressed_file = StringIO.StringIO(response.read())\n", 55 | " tar = tarfile.open(mode=\"r:gz\", fileobj = compressed_file)\n", 56 | " tar.extractall(path=dest)\n", 57 | " datasets = tar.getnames()\n", 58 | " for dataset in datasets:\n", 59 | " size = os.path.getsize(dest+'\\\\'+dataset)\n", 60 | " print ('File %s is %i bytes' % (dataset,size))\n", 61 | " tar.close()\n", 62 | "\n", 63 | "def load_matrix(UCI_url):\n", 64 | " \"\"\"\n", 65 | " Downloads datasets from UCI in matrix form\n", 66 | " \"\"\"\n", 67 | " return np.loadtxt(urllib2.urlopen(UCI_url))" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": 2, 73 | "metadata": { 74 | "collapsed": false 75 | }, 76 | "outputs": [ 77 | { 78 | "name": "stdout", 79 | "output_type": "stream", 80 | "text": [ 81 | "Current directory is: \"C:\\scisoft\\WinPython-64bit-2.7.9.4\\notebooks\\Packt - Large Scale\"\n" 82 | ] 83 | } 84 | ], 85 | "source": [ 86 | "import os\n", 87 | "print \"Current directory is: \\\"%s\\\"\" % (os.getcwd())" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": 3, 93 
| "metadata": { 94 | "collapsed": true 95 | }, 96 | "outputs": [], 97 | "source": [ 98 | "import zlib\n", 99 | "from random import shuffle, seed\n", 100 | "\n", 101 | "def ram_shuffle(filename_in, filename_out, header=True, random_seed=0):\n", 102 | " with open(filename_in, 'rb') as f:\n", 103 | " zlines = [zlib.compress(line, 9) for line in f]\n", 104 | " if header:\n", 105 | " first_row = zlines.pop(0)\n", 106 | " seed(random_seed)\n", 107 | " shuffle(zlines)\n", 108 | " with open(filename_out, 'wb') as f:\n", 109 | " if header:\n", 110 | " f.write(zlib.decompress(first_row))\n", 111 | " for zline in zlines:\n", 112 | " f.write(zlib.decompress(zline))" 113 | ] 114 | }, 115 | { 116 | "cell_type": "markdown", 117 | "metadata": {}, 118 | "source": [ 119 | "###Bike Sharing Dataset Data Set" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": 8, 125 | "metadata": { 126 | "collapsed": false 127 | }, 128 | "outputs": [ 129 | { 130 | "name": "stdout", 131 | "output_type": "stream", 132 | "text": [ 133 | "Extracting in C:\\scisoft\\WinPython-64bit-2.7.9.4\\notebooks\\Packt - Large Scale\\bikesharing\n", 134 | "\tunzipping day.csv\n", 135 | "\tunzipping hour.csv\n" 136 | ] 137 | } 138 | ], 139 | "source": [ 140 | "UCI_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00275/Bike-Sharing-Dataset.zip'\n", 141 | "unzip_from_UCI(UCI_url, dest='bikesharing')" 142 | ] 143 | }, 144 | { 145 | "cell_type": "markdown", 146 | "metadata": {}, 147 | "source": [ 148 | "###Covertype Data Set " 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": 10, 154 | "metadata": { 155 | "collapsed": false 156 | }, 157 | "outputs": [ 158 | { 159 | "name": "stdout", 160 | "output_type": "stream", 161 | "text": [ 162 | "File covtype.data decompressed\n" 163 | ] 164 | } 165 | ], 166 | "source": [ 167 | "UCI_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/covtype/covtype.data.gz'\n", 168 | "gzip_from_UCI(UCI_url)" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": 3, 174 | "metadata": { 175 | "collapsed": true 176 | }, 177 | "outputs": [], 178 | "source": [ 179 | "import os\n", 180 | "from random import seed\n", 181 | "local_path = os.getcwd()\n", 182 | "source = 'covtype.data'\n", 183 | "ram_shuffle(filename_in=local_path+'\\\\'+source, \\\n", 184 | " filename_out=local_path+'\\\\shuffled_covtype.data', header=False)" 185 | ] 186 | }, 187 | { 188 | "cell_type": "markdown", 189 | "metadata": {}, 190 | "source": [ 191 | "#Non-linear & faster with Vowpal Wabbit " 192 | ] 193 | }, 194 | { 195 | "cell_type": "markdown", 196 | "metadata": {}, 197 | "source": [ 198 | "###Useful functions" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": 1, 204 | "metadata": { 205 | "collapsed": true 206 | }, 207 | "outputs": [], 208 | "source": [ 209 | "import numpy as np\n", 210 | "\n", 211 | "def sigmoid(x):\n", 212 | " return 1. / (1. 
+ np.exp(-x))\n", 213 | "\n", 214 | "def apply_log(x): \n", 215 | " return np.log(x + 1.0)\n", 216 | "\n", 217 | "def apply_exp(x): \n", 218 | " return np.exp(x) - 1.0" 219 | ] 220 | }, 221 | { 222 | "cell_type": "markdown", 223 | "metadata": {}, 224 | "source": [ 225 | "###Useful dataset examples" 226 | ] 227 | }, 228 | { 229 | "cell_type": "code", 230 | "execution_count": 37, 231 | "metadata": { 232 | "collapsed": false 233 | }, 234 | "outputs": [ 235 | { 236 | "name": "stdout", 237 | "output_type": "stream", 238 | "text": [ 239 | "0 | price:.23 sqft:.25 age:.05 2006\n", 240 | "1 2 'second_house | price:.18 sqft:.15 age:.35 1976\n", 241 | "0 1 0.5 'third_house | price:.53 sqft:.32 age:.87 1924\n" 242 | ] 243 | } 244 | ], 245 | "source": [ 246 | "with open('house_dataset','wb') as W:\n", 247 | " W.write(\"0 | price:.23 sqft:.25 age:.05 2006\\n\")\n", 248 | " W.write(\"1 2 'second_house | price:.18 sqft:.15 age:.35 1976\\n\")\n", 249 | " W.write(\"0 1 0.5 'third_house | price:.53 sqft:.32 age:.87 1924\\n\")\n", 250 | "\n", 251 | "with open('house_dataset','rb') as R:\n", 252 | " for line in R:\n", 253 | " print line.strip()" 254 | ] 255 | }, 256 | { 257 | "cell_type": "markdown", 258 | "metadata": {}, 259 | "source": [ 260 | "###A way to call VW from Python" 261 | ] 262 | }, 263 | { 264 | "cell_type": "code", 265 | "execution_count": 2, 266 | "metadata": { 267 | "collapsed": false 268 | }, 269 | "outputs": [ 270 | { 271 | "name": "stdout", 272 | "output_type": "stream", 273 | "text": [ 274 | "Num weight bits = 18\n", 275 | "learning rate = 0.5\n", 276 | "initial_t = 0\n", 277 | "power_t = 0.5\n", 278 | "using no cache\n", 279 | "Reading datafile = house_dataset\n", 280 | "num sources = 1\n", 281 | "average since example example current current current\n", 282 | "loss last counter weight label predict features\n", 283 | "0.000000 0.000000 1 1.0 0.0000 0.0000 5\n", 284 | "0.666667 1.000000 2 3.0 1.0000 0.0000 5\n", 285 | "\n", 286 | "finished run\n", 287 | "number of examples per pass = 3\n", 288 | "passes used = 1\n", 289 | "weighted example sum = 4.000000\n", 290 | "weighted label sum = 2.000000\n", 291 | "average loss = 0.750000\n", 292 | "best constant = 0.500000\n", 293 | "best constant's loss = 0.250000\n", 294 | "total feature number = 15\n", 295 | "------------ COMPLETED ------------\n", 296 | "\n" 297 | ] 298 | } 299 | ], 300 | "source": [ 301 | "import subprocess\n", 302 | "\n", 303 | "def execute_vw(parameters):\n", 304 | " execution = subprocess.Popen('vw '+parameters, shell=True, stderr=subprocess.PIPE)\n", 305 | " line = \"\"\n", 306 | " history = \"\"\n", 307 | " while True:\n", 308 | " out = execution.stderr.read(1)\n", 309 | " history += out\n", 310 | " if out == '' and execution.poll() != None:\n", 311 | " print '------------ COMPLETED ------------\\n'\n", 312 | " break\n", 313 | " if out != '':\n", 314 | " line += out\n", 315 | " if '\\n' in line[-2:]:\n", 316 | " print line[:-2]\n", 317 | " line = ''\n", 318 | " return history.split('\\r\\n')\n", 319 | "\n", 320 | "\n", 321 | "params = \"house_dataset\"\n", 322 | "results = execute_vw(params)" 323 | ] 324 | }, 325 | { 326 | "cell_type": "markdown", 327 | "metadata": {}, 328 | "source": [ 329 | "###Processing examples" 330 | ] 331 | }, 332 | { 333 | "cell_type": "code", 334 | "execution_count": 2, 335 | "metadata": { 336 | "collapsed": true 337 | }, 338 | "outputs": [], 339 | "source": [ 340 | "import csv\n", 341 | "\n", 342 | "def vw_convert(origin_file, target_file, binary_features, numeric_features, target, 
transform_target=lambda(x):x,\n", 343 | " separator=',', classification=True, multiclass=False, fieldnames= None, header=True, sparse=True):\n", 344 | " \"\"\"\n", 345 | " Reads a online style stream and returns a generator of normalized feature vectors\n", 346 | " \n", 347 | " Parameters\n", 348 | " ‐‐‐‐‐‐‐‐‐‐\n", 349 | " original_file = the csv file you are taken the data from \n", 350 | " target file = the file to stream from\n", 351 | " binary_features = the list of qualitative features to consider\n", 352 | " numeric_features = the list of numeric features to consider\n", 353 | " target = the label of the response variable\n", 354 | " transform_target = a function transforming the response\n", 355 | " separator = the field separator character\n", 356 | " classification = a Boolean indicating if it is classification\n", 357 | " multiclass = a Boolean indicating if it is multiclass classification\n", 358 | " fieldnames = the fields' labels (can be ommitted and read from file)\n", 359 | " header = a boolean indicating if the original file has an header\n", 360 | " sparse = if a sparse vector is to be returned from the generator\n", 361 | " \"\"\"\n", 362 | " with open(target_file, 'wb') as W:\n", 363 | " with open(origin_file, 'rb') as R:\n", 364 | " iterator = csv.DictReader(R, fieldnames, delimiter=separator)\n", 365 | " for n, row in enumerate(iterator):\n", 366 | " if not header or n>0:\n", 367 | " # DATA PROCESSING\n", 368 | " response = transform_target(float(row[target]))\n", 369 | " if classification and not multiclass:\n", 370 | " if response == 0:\n", 371 | " stream_row = '-1 '\n", 372 | " else:\n", 373 | " stream_row = '1 '\n", 374 | " else:\n", 375 | " stream_row = str(response)+' '\n", 376 | " quantitative = list()\n", 377 | " qualitative = list()\n", 378 | " for k,v in row.iteritems():\n", 379 | " if k in binary_features:\n", 380 | " qualitative.append(str(k)+'_'+str(v)+':1')\n", 381 | " else:\n", 382 | " if k in numeric_features and (float(v)!=0 or not sparse):\n", 383 | " quantitative.append(str(k)+':'+str(v))\n", 384 | " if quantitative:\n", 385 | " stream_row += '|n '+' '.join(quantitative)\n", 386 | " if qualitative:\n", 387 | " stream_row += '|q ' + ' '.join(qualitative)\n", 388 | " W.write(stream_row+'\\n')" 389 | ] 390 | }, 391 | { 392 | "cell_type": "markdown", 393 | "metadata": {}, 394 | "source": [ 395 | "###Examples with toys datasets" 396 | ] 397 | }, 398 | { 399 | "cell_type": "code", 400 | "execution_count": 210, 401 | "metadata": { 402 | "collapsed": true 403 | }, 404 | "outputs": [], 405 | "source": [ 406 | "import numpy as np\n", 407 | "from sklearn.datasets import load_iris, load_boston\n", 408 | "from random import seed\n", 409 | "iris = load_iris()\n", 410 | "seed(2)\n", 411 | "re_order = np.random.permutation(len(iris.target))\n", 412 | "with open('iris_versicolor.vw','wb') as W1:\n", 413 | " for k in re_order:\n", 414 | " y = iris.target[k]\n", 415 | " X = iris.values()[1][k,:]\n", 416 | " features = ' |f '+' '.join([a+':'+str(b) for a,b in zip(map(lambda(a): a[:-5].replace(' ','_'), iris.feature_names),X)])\n", 417 | " target = '1' if y==1 else '-1'\n", 418 | " W1.write(target+features+'\\n')" 419 | ] 420 | }, 421 | { 422 | "cell_type": "code", 423 | "execution_count": 203, 424 | "metadata": { 425 | "collapsed": false 426 | }, 427 | "outputs": [], 428 | "source": [ 429 | "boston = load_boston()\n", 430 | "seed(2)\n", 431 | "re_order = np.random.permutation(len(boston.target))\n", 432 | "with open('boston.vw','wb') as W1:\n", 433 | " for k in 
re_order:\n", 434 | " y = boston.target[k]\n", 435 | " X = boston.data[k,:]\n", 436 | " features = ' |f '+' '.join([a+':'+str(b) for a,b in zip(map(lambda(a): a[:-5].replace(' ','_'), iris.feature_names),X)])\n", 437 | " W1.write(str(y)+features+'\\n')" 438 | ] 439 | }, 440 | { 441 | "cell_type": "markdown", 442 | "metadata": {}, 443 | "source": [ 444 | "###Binary Iris" 445 | ] 446 | }, 447 | { 448 | "cell_type": "code", 449 | "execution_count": 197, 450 | "metadata": { 451 | "collapsed": false 452 | }, 453 | "outputs": [ 454 | { 455 | "name": "stdout", 456 | "output_type": "stream", 457 | "text": [ 458 | "using l2 regularization = 1e-006\n", 459 | "predictions = iris_bin.test\n", 460 | "Lambda = 1e-006\n", 461 | "Kernel = rbf\n", 462 | "bandwidth = 0.1\n", 463 | "Num weight bits = 18\n", 464 | "learning rate = 0.5\n", 465 | "initial_t = 0\n", 466 | "power_t = 0.5\n", 467 | "using no cache\n", 468 | "Reading datafile = iris_versicolor.vw\n", 469 | "num sources = 1\n", 470 | "average since example example current current current\n", 471 | "loss last counter weight label predict features\n", 472 | "1.000000 1.000000 1 1.0 -1.0000 0.0000 5\n", 473 | "0.960606 0.921212 2 2.0 -1.0000 -0.0788 5\n", 474 | "1.030685 1.100763 4 4.0 -1.0000 -0.7865 5\n", 475 | "0.790707 0.550729 8 8.0 -1.0000 -0.3755 5\n", 476 | "0.647808 0.504909 16 16.0 -1.0000 -1.2473 5\n", 477 | "0.477695 0.307582 32 32.0 1.0000 0.8621 5\n", 478 | "0.319804 0.161914 64 64.0 -1.0000 -1.7015 5\n", 479 | "0.272695 0.225585 128 128.0 -1.0000 -1.3150 5\n", 480 | "\n", 481 | "finished run\n", 482 | "number of examples = 150\n", 483 | "weighted example sum = 150.000000\n", 484 | "weighted label sum = -50.000000\n", 485 | "average loss = 0.248892\n", 486 | "best constant = -0.333333\n", 487 | "best constant's loss = 0.888889\n", 488 | "total feature number = 750\n", 489 | "Num support = 49\n", 490 | "Number of kernel evaluations = 8836 Number of cache queries = 18555\n", 491 | "Total loss = 37.333748\n", 492 | "Done freeing model\n", 493 | "Done freeing kernel params\n", 494 | "Done with finish \n", 495 | "------------ COMPLETED ------------\n", 496 | "\n" 497 | ] 498 | } 499 | ], 500 | "source": [ 501 | "params = '--ksvm --l2 0.000001 --reprocess 2 -b 18 --kernel rbf --bandwidth=0.1 -p iris_bin.test -d iris_versicolor.vw'\n", 502 | "results = execute_vw(params)" 503 | ] 504 | }, 505 | { 506 | "cell_type": "code", 507 | "execution_count": 198, 508 | "metadata": { 509 | "collapsed": false 510 | }, 511 | "outputs": [ 512 | { 513 | "name": "stdout", 514 | "output_type": "stream", 515 | "text": [ 516 | "holdout accuracy: 0.966\n" 517 | ] 518 | } 519 | ], 520 | "source": [ 521 | "import numpy as np\n", 522 | "def sigmoid(x):\n", 523 | " return 1. / (1. 
+ np.exp(-x))\n", 524 | "\n", 525 | "accuracy = 0\n", 526 | "with open('iris_bin.test', 'rb') as R:\n", 527 | " with open('iris_versicolor.vw', 'rb') as TRAIN:\n", 528 | " holdouts = 0.0\n", 529 | " for n,(line, example) in enumerate(zip(R,TRAIN)):\n", 530 | " if (n+1) % 10==0:\n", 531 | " predicted = float(line.strip())\n", 532 | " y = float(example.split('|')[0])\n", 533 | " accuracy += np.sign(predicted)==np.sign(y)\n", 534 | " holdouts += 1 \n", 535 | "print 'holdout accuracy: %0.3f' % ((accuracy / holdouts)**0.5)" 536 | ] 537 | }, 538 | { 539 | "cell_type": "markdown", 540 | "metadata": {}, 541 | "source": [ 542 | "###Boston" 543 | ] 544 | }, 545 | { 546 | "cell_type": "code", 547 | "execution_count": 211, 548 | "metadata": { 549 | "collapsed": false 550 | }, 551 | "outputs": [ 552 | { 553 | "name": "stdout", 554 | "output_type": "stream", 555 | "text": [ 556 | "final_regressor = boston.model\n", 557 | "using dropout for neural network training\n", 558 | "Num weight bits = 18\n", 559 | "learning rate = 0.5\n", 560 | "initial_t = 0\n", 561 | "power_t = 0.5\n", 562 | "decay_learning_rate = 1\n", 563 | "creating cache_file = cache_train.vw\n", 564 | "Reading datafile = boston.vw\n", 565 | "num sources = 1\n", 566 | "average since example example current current current\n", 567 | "loss last counter weight label predict features\n", 568 | "2500.000000 2500.000000 1 1.0 50.0000 0.0000 4\n", 569 | "1570.433136 640.866272 2 2.0 26.4000 1.0847 3\n", 570 | "945.682968 320.932800 4 4.0 21.0000 3.4834 3\n", 571 | "738.617393 531.551817 8 8.0 35.4000 6.9177 4\n", 572 | "559.106543 379.595694 16 16.0 23.1000 6.6911 3\n", 573 | "362.538769 165.970995 32 32.0 16.7000 12.2397 3\n", 574 | "301.716126 240.893483 64 64.0 19.7000 12.3789 3\n", 575 | "236.351873 170.987621 128 128.0 16.1000 15.3972 3\n", 576 | "180.695258 125.038643 256 256.0 26.5000 24.0065 3\n", 577 | "99.536619 99.536619 512 512.0 28.7000 18.4439 3 h\n", 578 | "83.688702 67.840785 1024 1024.0 50.0000 20.8653 4 h\n", 579 | "72.301786 60.914870 2048 2048.0 10.4000 0.0000 3 h\n", 580 | "59.041621 45.840391 4096 4096.0 20.6000 21.1746 4 h\n", 581 | "\n", 582 | "finished run\n", 583 | "number of examples per pass = 456\n", 584 | "passes used = 10\n", 585 | "weighted example sum = 4560.000000\n", 586 | "weighted label sum = 103341.001506\n", 587 | "average loss = 43.299850 h\n", 588 | "best constant = 22.662500\n", 589 | "total feature number = 15220\n", 590 | "------------ COMPLETED ------------\n", 591 | "\n" 592 | ] 593 | } 594 | ], 595 | "source": [ 596 | "params = 'boston.vw -f boston.model --loss_function squared -k --cache_file cache_train.vw --passes=20 --nn 5 --dropout'\n", 597 | "results = execute_vw(params)" 598 | ] 599 | }, 600 | { 601 | "cell_type": "code", 602 | "execution_count": 212, 603 | "metadata": { 604 | "collapsed": false 605 | }, 606 | "outputs": [ 607 | { 608 | "name": "stdout", 609 | "output_type": "stream", 610 | "text": [ 611 | "only testing\n", 612 | "predictions = boston.test\n", 613 | "using dropout for neural network testing\n", 614 | "Num weight bits = 18\n", 615 | "learning rate = 0.5\n", 616 | "initial_t = 0\n", 617 | "power_t = 0.5\n", 618 | "creating cache_file = cache_test.vw\n", 619 | "Reading datafile = boston.vw\n", 620 | "num sources = 1\n", 621 | "average since example example current current current\n", 622 | "loss last counter weight label predict features\n", 623 | "922.607483 922.607483 1 1.0 50.0000 19.6255 4\n", 624 | "464.302045 5.996608 2 2.0 26.4000 23.9512 3\n", 625 | "253.949617 43.597188 4 
4.0 21.0000 21.2530 3\n", 626 | "175.713928 97.478239 8 8.0 35.4000 25.5958 4\n", 627 | "130.466937 85.219947 16 16.0 15.2000 15.8726 3\n", 628 | "79.291346 28.115755 32 32.0 15.6000 19.7057 4\n", 629 | "85.270478 91.249610 64 64.0 22.8000 20.4866 3\n", 630 | "83.265921 81.261364 128 128.0 20.8000 18.1267 3\n", 631 | "70.838572 58.411224 256 256.0 27.5000 16.6386 3\n", 632 | "\n", 633 | "finished run\n", 634 | "number of examples per pass = 506\n", 635 | "passes used = 1\n", 636 | "weighted example sum = 506.000000\n", 637 | "weighted label sum = 11401.600174\n", 638 | "average loss = 65.960779\n", 639 | "best constant = 22.532808\n", 640 | "total feature number = 1687\n", 641 | "------------ COMPLETED ------------\n", 642 | "\n" 643 | ] 644 | } 645 | ], 646 | "source": [ 647 | "params = '-t boston.vw -i boston.model -k --cache_file cache_test.vw -p boston.test'\n", 648 | "results = execute_vw(params)" 649 | ] 650 | }, 651 | { 652 | "cell_type": "code", 653 | "execution_count": 214, 654 | "metadata": { 655 | "collapsed": false 656 | }, 657 | "outputs": [ 658 | { 659 | "name": "stdout", 660 | "output_type": "stream", 661 | "text": [ 662 | "holdout RMSE: 7.010\n" 663 | ] 664 | } 665 | ], 666 | "source": [ 667 | "val_rmse = 0\n", 668 | "with open('boston.test', 'rb') as R:\n", 669 | " with open('boston.vw', 'rb') as TRAIN:\n", 670 | " holdouts = 0.0\n", 671 | " for n,(line, example) in enumerate(zip(R,TRAIN)):\n", 672 | " if (n+1) % 10==0:\n", 673 | " predicted = float(line.strip())\n", 674 | " y = float(example.split('|')[0])\n", 675 | " val_rmse += (predicted - y)**2\n", 676 | " holdouts += 1 \n", 677 | "print 'holdout RMSE: %0.3f' % ((val_rmse / holdouts)**0.5)" 678 | ] 679 | }, 680 | { 681 | "cell_type": "markdown", 682 | "metadata": {}, 683 | "source": [ 684 | "###Bike sharing" 685 | ] 686 | }, 687 | { 688 | "cell_type": "code", 689 | "execution_count": 6, 690 | "metadata": { 691 | "collapsed": false 692 | }, 693 | "outputs": [], 694 | "source": [ 695 | "import os\n", 696 | "local_path = os.getcwd()\n", 697 | "b_vars = ['holiday','hr','mnth', 'season','weathersit','weekday','workingday','yr']\n", 698 | "n_vars = ['hum', 'temp', 'atemp', 'windspeed']\n", 699 | "source = '\\\\bikesharing\\\\hour.csv'\n", 700 | "origin = target_file=local_path+'\\\\'+source\n", 701 | "target = target_file=local_path+'\\\\'+'bike.vw'\n", 702 | "vw_convert(origin, target, binary_features=b_vars, numeric_features=n_vars, target = 'cnt', transform_target=apply_log,\n", 703 | " separator=',', classification=False, multiclass=False, fieldnames= None, header=True)" 704 | ] 705 | }, 706 | { 707 | "cell_type": "code", 708 | "execution_count": 45, 709 | "metadata": { 710 | "collapsed": false 711 | }, 712 | "outputs": [ 713 | { 714 | "name": "stdout", 715 | "output_type": "stream", 716 | "text": [ 717 | "final_regressor = regression.model\n", 718 | "Num weight bits = 18\n", 719 | "learning rate = 0.5\n", 720 | "initial_t = 0\n", 721 | "power_t = 0.5\n", 722 | "decay_learning_rate = 1\n", 723 | "creating cache_file = cache_train.vw\n", 724 | "Reading datafile = bike.vw\n", 725 | "num sources = 1\n", 726 | "average since example example current current current\n", 727 | "loss last counter weight label predict features\n", 728 | "8.027098 8.027098 1 1.0 2.8332 0.0000 12\n", 729 | "7.243733 6.460369 2 2.0 3.7136 1.1718 12\n", 730 | "4.184013 1.124293 4 4.0 2.6391 2.4762 12\n", 731 | "2.709537 1.235061 8 8.0 1.3863 1.5636 12\n", 732 | "2.265795 1.822052 16 16.0 4.7095 3.7598 13\n", 733 | "1.325281 0.384768 32 32.0 2.1972 
1.5774 13\n", 734 | "1.350559 1.375836 64 64.0 5.0626 3.8186 13\n", 735 | "1.395717 1.440876 128 128.0 4.2195 4.0547 13\n", 736 | "1.165618 0.935518 256 256.0 2.0794 3.3485 13\n", 737 | "0.952714 0.739810 512 512.0 4.0775 3.6438 13\n", 738 | "0.757944 0.563175 1024 1024.0 5.4116 4.0760 13\n", 739 | "0.583856 0.409769 2048 2048.0 1.0986 1.0007 13\n", 740 | "0.453590 0.323324 4096 4096.0 5.4027 5.5651 13\n", 741 | "0.393729 0.333867 8192 8192.0 3.8286 4.1227 12\n", 742 | "0.561750 0.561750 16384 16384.0 4.3944 4.0809 13 h\n", 743 | "0.509105 0.456460 32768 32768.0 4.4659 4.4656 13 h\n", 744 | "0.468332 0.427559 65536 65536.0 4.5951 4.4378 13 h\n", 745 | "\n", 746 | "finished run\n", 747 | "number of examples per pass = 15999\n", 748 | "passes used = 6\n", 749 | "weighted example sum = 95994.000000\n", 750 | "weighted label sum = 439183.191893\n", 751 | "average loss = 0.427485 h\n", 752 | "best constant = 4.575111\n", 753 | "total feature number = 1235898\n", 754 | "------------ COMPLETED ------------\n", 755 | "\n" 756 | ] 757 | } 758 | ], 759 | "source": [ 760 | "params = 'bike.vw -f regression.model -k --cache_file cache_train.vw --passes=1000 --hash strings --holdout_after 16000'\n", 761 | "results = execute_vw(params)" 762 | ] 763 | }, 764 | { 765 | "cell_type": "code", 766 | "execution_count": 47, 767 | "metadata": { 768 | "collapsed": false 769 | }, 770 | "outputs": [ 771 | { 772 | "name": "stdout", 773 | "output_type": "stream", 774 | "text": [ 775 | "only testing\n", 776 | "predictions = pred.test\n", 777 | "Num weight bits = 18\n", 778 | "learning rate = 0.5\n", 779 | "initial_t = 0\n", 780 | "power_t = 0.5\n", 781 | "creating cache_file = cache_test.vw\n", 782 | "Reading datafile = bike.vw\n", 783 | "num sources = 1\n", 784 | "average since example example current current current\n", 785 | "loss last counter weight label predict features\n", 786 | "0.127379 0.127379 1 1.0 2.8332 3.1901 12\n", 787 | "0.751745 1.376112 2 2.0 3.7136 2.5405 12\n", 788 | "1.210345 1.668944 4 4.0 2.6391 1.5334 12\n", 789 | "2.774795 4.339245 8 8.0 1.3863 4.3803 12\n", 790 | "2.276018 1.777242 16 16.0 4.7095 4.8526 13\n", 791 | "2.179675 2.083333 32 32.0 2.1972 4.6568 13\n", 792 | "1.411963 0.644251 64 64.0 5.0626 5.1554 13\n", 793 | "0.836451 0.260938 128 128.0 4.2195 4.6608 13\n", 794 | "0.677186 0.517921 256 256.0 2.0794 2.8816 13\n", 795 | "0.600932 0.524678 512 512.0 4.0775 4.0583 13\n", 796 | "0.512835 0.424738 1024 1024.0 5.4116 4.8593 13\n", 797 | "0.498590 0.484345 2048 2048.0 1.0986 1.0587 13\n", 798 | "0.422767 0.346943 4096 4096.0 5.4027 5.7840 13\n", 799 | "0.407376 0.391985 8192 8192.0 3.8286 3.9312 12\n", 800 | "0.374806 0.342236 16384 16384.0 5.7900 5.4536 12\n", 801 | "\n", 802 | "finished run\n", 803 | "number of examples per pass = 17379\n", 804 | "passes used = 1\n", 805 | "weighted example sum = 17379.000000\n", 806 | "weighted label sum = 79504.382239\n", 807 | "average loss = 0.380562\n", 808 | "best constant = 4.574739\n", 809 | "total feature number = 223723\n", 810 | "------------ COMPLETED ------------\n", 811 | "\n" 812 | ] 813 | } 814 | ], 815 | "source": [ 816 | "params = '-t bike.vw -i regression.model -k --cache_file cache_test.vw -p pred.test'\n", 817 | "results = execute_vw(params)" 818 | ] 819 | }, 820 | { 821 | "cell_type": "code", 822 | "execution_count": 10, 823 | "metadata": { 824 | "collapsed": false 825 | }, 826 | "outputs": [ 827 | { 828 | "name": "stdout", 829 | "output_type": "stream", 830 | "text": [ 831 | "holdout RMSE: 135.306\n", 832 | "holdout RMSLE: 
0.845\n" 833 | ] 834 | } 835 | ], 836 | "source": [ 837 | "val_rmse = 0\n", 838 | "val_rmsle = 0\n", 839 | "with open('pred.test', 'rb') as R:\n", 840 | " with open('bike.vw', 'rb') as TRAIN:\n", 841 | " holdouts = 0.0\n", 842 | " for n,(line, example) in enumerate(zip(R,TRAIN)):\n", 843 | " if n > 16000:\n", 844 | " predicted = float(line.strip())\n", 845 | " y_log = float(example.split('|')[0])\n", 846 | " y = apply_exp(y_log)\n", 847 | " val_rmse += (apply_exp(predicted) - y)**2\n", 848 | " val_rmsle += (predicted - y_log)**2\n", 849 | " holdouts += 1\n", 850 | " \n", 851 | "print 'holdout RMSE: %0.3f' % ((val_rmse / holdouts)**0.5)\n", 852 | "print 'holdout RMSLE: %0.3f' % ((val_rmsle / holdouts)**0.5)\n" 853 | ] 854 | }, 855 | { 856 | "cell_type": "markdown", 857 | "metadata": {}, 858 | "source": [ 859 | "###Covertype" 860 | ] 861 | }, 862 | { 863 | "cell_type": "code", 864 | "execution_count": 8, 865 | "metadata": { 866 | "collapsed": false 867 | }, 868 | "outputs": [], 869 | "source": [ 870 | "import os\n", 871 | "local_path = os.getcwd()\n", 872 | "n_vars = ['var_'+'0'*int(j<10)+str(j) for j in range(54)]\n", 873 | "source = 'shuffled_covtype.data'\n", 874 | "origin = target_file=local_path+'\\\\'+source\n", 875 | "target = target_file=local_path+'\\\\'+'covtype.vw'\n", 876 | "vw_convert(origin, target, binary_features=list(), fieldnames= n_vars+['covertype'], numeric_features=n_vars,\n", 877 | " target = 'covertype', separator=',', classification=True, multiclass=True, header=False, sparse=False)" 878 | ] 879 | }, 880 | { 881 | "cell_type": "code", 882 | "execution_count": 20, 883 | "metadata": { 884 | "collapsed": false 885 | }, 886 | "outputs": [ 887 | { 888 | "name": "stdout", 889 | "output_type": "stream", 890 | "text": [ 891 | "creating cubic features for triples: nnn \n", 892 | "final_regressor = multiclass.model\n", 893 | "Num weight bits = 18\n", 894 | "learning rate = 1\n", 895 | "initial_t = 0\n", 896 | "power_t = 0.5\n", 897 | "decay_learning_rate = 1\n", 898 | "creating cache_file = cache_train.vw\n", 899 | "Reading datafile = covtype.vw\n", 900 | "num sources = 1\n", 901 | "average since example example current current current\n", 902 | "loss last counter weight label predict features\n", 903 | "0.000000 0.000000 1 1.0 1 1 377\n", 904 | "0.000000 0.000000 2 2.0 1 1 377\n", 905 | "0.250000 0.500000 4 4.0 2 1 377\n", 906 | "0.375000 0.500000 8 8.0 1 2 377\n", 907 | "0.437500 0.500000 16 16.0 2 1 231\n", 908 | "0.531250 0.625000 32 32.0 1 2 377\n", 909 | "0.546875 0.562500 64 64.0 2 1 377\n", 910 | "0.500000 0.453125 128 128.0 1 1 377\n", 911 | "0.519531 0.539063 256 256.0 2 2 377\n", 912 | "0.484375 0.449219 512 512.0 2 2 377\n", 913 | "0.446289 0.408203 1024 1024.0 3 6 377\n", 914 | "0.416504 0.386719 2048 2048.0 2 2 377\n", 915 | "0.402100 0.387695 4096 4096.0 1 1 377\n", 916 | "0.372559 0.343018 8192 8192.0 1 1 298\n", 917 | "0.348694 0.324829 16384 16384.0 1 1 377\n", 918 | "0.319092 0.289490 32768 32768.0 2 2 377\n", 919 | "0.297256 0.275421 65536 65536.0 2 2 377\n", 920 | "0.278419 0.259583 131072 131072.0 2 2 377\n", 921 | "0.263660 0.248901 262144 262144.0 2 2 377\n", 922 | "0.253858 0.253858 524288 524288.0 1 1 377 h\n", 923 | "\n", 924 | "finished run\n", 925 | "number of examples per pass = 522911\n", 926 | "passes used = 2\n", 927 | "weighted example sum = 1045822.000000\n", 928 | "weighted label sum = 0.000000\n", 929 | "average loss = 0.235538 h\n", 930 | "total feature number = 384838154\n", 931 | "------------ COMPLETED ------------\n", 932 | "\n" 933 | 
] 934 | } 935 | ], 936 | "source": [ 937 | "params = 'covtype.vw --ect 7 -f multiclass.model -k --cache_file cache_train.vw --passes=2 -l 1.0 --cubic nnn'\n", 938 | "results = execute_vw(params)" 939 | ] 940 | }, 941 | { 942 | "cell_type": "code", 943 | "execution_count": 21, 944 | "metadata": { 945 | "collapsed": false 946 | }, 947 | "outputs": [ 948 | { 949 | "name": "stdout", 950 | "output_type": "stream", 951 | "text": [ 952 | "creating cubic features for triples: nnn \n", 953 | "only testing\n", 954 | "predictions = covertype.test\n", 955 | "Num weight bits = 18\n", 956 | "learning rate = 0.5\n", 957 | "initial_t = 0\n", 958 | "power_t = 0.5\n", 959 | "creating cache_file = cache_test.vw\n", 960 | "Reading datafile = covtype.vw\n", 961 | "num sources = 1\n", 962 | "average since example example current current current\n", 963 | "loss last counter weight label predict features\n", 964 | "0.000000 0.000000 1 1.0 1 1 377\n", 965 | "0.000000 0.000000 2 2.0 1 1 377\n", 966 | "0.000000 0.000000 4 4.0 2 2 377\n", 967 | "0.000000 0.000000 8 8.0 1 1 377\n", 968 | "0.187500 0.375000 16 16.0 1 2 377\n", 969 | "0.156250 0.125000 32 32.0 3 3 377\n", 970 | "0.156250 0.156250 64 64.0 2 1 377\n", 971 | "0.218750 0.281250 128 128.0 2 2 377\n", 972 | "0.222656 0.226563 256 256.0 2 2 377\n", 973 | "0.240234 0.257813 512 512.0 2 2 377\n", 974 | "0.234375 0.228516 1024 1024.0 2 2 377\n", 975 | "0.242676 0.250977 2048 2048.0 2 2 377\n", 976 | "0.242920 0.243164 4096 4096.0 1 1 377\n", 977 | "0.236328 0.229736 8192 8192.0 1 1 377\n", 978 | "0.231079 0.225830 16384 16384.0 1 1 298\n", 979 | "0.229858 0.228638 32768 32768.0 1 1 377\n", 980 | "0.232224 0.234589 65536 65536.0 1 1 377\n", 981 | "0.231529 0.230835 131072 131072.0 2 2 377\n", 982 | "0.231815 0.232101 262144 262144.0 2 2 377\n", 983 | "0.231606 0.231396 524288 524288.0 1 1 377\n", 984 | "\n", 985 | "finished run\n", 986 | "number of examples per pass = 581012\n", 987 | "passes used = 1\n", 988 | "weighted example sum = 581012.000000\n", 989 | "weighted label sum = 0.000000\n", 990 | "average loss = 0.231111\n", 991 | "total feature number = 213797603\n", 992 | "------------ COMPLETED ------------\n", 993 | "\n" 994 | ] 995 | } 996 | ], 997 | "source": [ 998 | "params = '-t covtype.vw -i multiclass.model -k --cache_file cache_test.vw -p covertype.test'\n", 999 | "results = execute_vw(params)" 1000 | ] 1001 | }, 1002 | { 1003 | "cell_type": "code", 1004 | "execution_count": 8, 1005 | "metadata": { 1006 | "collapsed": false 1007 | }, 1008 | "outputs": [ 1009 | { 1010 | "name": "stdout", 1011 | "output_type": "stream", 1012 | "text": [ 1013 | "holdout accuracy: 0.769\n" 1014 | ] 1015 | } 1016 | ], 1017 | "source": [ 1018 | "accuracy = 0\n", 1019 | "with open('covertype.test', 'rb') as R:\n", 1020 | " with open('covtype.vw', 'rb') as TRAIN:\n", 1021 | " holdouts = 0.0\n", 1022 | " for n,(line, example) in enumerate(zip(R,TRAIN)):\n", 1023 | " if (n+1) % 10==0:\n", 1024 | " predicted = float(line.strip())\n", 1025 | " y = float(example.split('|')[0])\n", 1026 | " accuracy += predicted ==y\n", 1027 | " holdouts += 1\n", 1028 | "print 'holdout accuracy: %0.3f' % (accuracy / holdouts)" 1029 | ] 1030 | } 1031 | ], 1032 | "metadata": { 1033 | "kernelspec": { 1034 | "display_name": "Python 2", 1035 | "language": "python", 1036 | "name": "python2" 1037 | }, 1038 | "language_info": { 1039 | "codemirror_mode": { 1040 | "name": "ipython", 1041 | "version": 2 1042 | }, 1043 | "file_extension": ".py", 1044 | "mimetype": "text/x-python", 1045 | "name": "python", 1046 
| "nbconvert_exporter": "python", 1047 | "pygments_lexer": "ipython2", 1048 | "version": "2.7.9" 1049 | } 1050 | }, 1051 | "nbformat": 4, 1052 | "nbformat_minor": 0 1053 | } 1054 | -------------------------------------------------------------------------------- /Chapter 08/Chapter_8_code_HDFS.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [ 10 | { 11 | "name": "stdout", 12 | "output_type": "stream", 13 | "text": [ 14 | "16/05/10 19:34:19 INFO fs.TrashPolicyDefault: Namenode trash configuration: Deletion interval = 0 minutes, Emptier interval = 0 minutes.\n", 15 | "Deleted /tmp\n", 16 | "16/05/10 19:34:22 INFO fs.TrashPolicyDefault: Namenode trash configuration: Deletion interval = 0 minutes, Emptier interval = 0 minutes.\n" 17 | ] 18 | } 19 | ], 20 | "source": [ 21 | "# Clean up\n", 22 | "!hdfs dfs -rm -r -f /datasets /tmp\n", 23 | "!rm -rf /tmp/hadoop_git_readme*\n", 24 | "!hdfs dfs -expunge" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": {}, 30 | "source": [ 31 | "## Command line" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 2, 37 | "metadata": { 38 | "collapsed": false 39 | }, 40 | "outputs": [ 41 | { 42 | "name": "stdout", 43 | "output_type": "stream", 44 | "text": [ 45 | "Configured Capacity: 42241163264 (39.34 GB)\r\n", 46 | "Present Capacity: 37536710656 (34.96 GB)\r\n", 47 | "DFS Remaining: 37346992128 (34.78 GB)\r\n", 48 | "DFS Used: 189718528 (180.93 MB)\r\n", 49 | "DFS Used%: 0.51%\r\n", 50 | "Under replicated blocks: 0\r\n", 51 | "Blocks with corrupt replicas: 0\r\n", 52 | "Missing blocks: 0\r\n", 53 | "\r\n", 54 | "-------------------------------------------------\r\n", 55 | "Live datanodes (1):\r\n", 56 | "\r\n", 57 | "Name: 127.0.0.1:50010 (localhost)\r\n", 58 | "Hostname: sparkbox\r\n", 59 | "Decommission Status : Normal\r\n", 60 | "Configured Capacity: 42241163264 (39.34 GB)\r\n", 61 | "DFS Used: 189718528 (180.93 MB)\r\n", 62 | "Non DFS Used: 4704452608 (4.38 GB)\r\n", 63 | "DFS Remaining: 37346992128 (34.78 GB)\r\n", 64 | "DFS Used%: 0.45%\r\n", 65 | "DFS Remaining%: 88.41%\r\n", 66 | "Configured Cache Capacity: 0 (0 B)\r\n", 67 | "Cache Used: 0 (0 B)\r\n", 68 | "Cache Remaining: 0 (0 B)\r\n", 69 | "Cache Used%: 100.00%\r\n", 70 | "Cache Remaining%: 0.00%\r\n", 71 | "Xceivers: 1\r\n", 72 | "Last contact: Tue May 10 19:34:23 UTC 2016\r\n", 73 | "\r\n", 74 | "\r\n" 75 | ] 76 | } 77 | ], 78 | "source": [ 79 | "!hdfs dfsadmin -report" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": 3, 85 | "metadata": { 86 | "collapsed": false 87 | }, 88 | "outputs": [ 89 | { 90 | "name": "stdout", 91 | "output_type": "stream", 92 | "text": [ 93 | "Found 2 items\r\n", 94 | "drwxr-xr-x - vagrant supergroup 0 2016-05-10 19:05 /spark\r\n", 95 | "drwxr-xr-x - vagrant supergroup 0 2016-05-10 18:48 /user\r\n" 96 | ] 97 | } 98 | ], 99 | "source": [ 100 | "!hdfs dfs -ls /" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": 4, 106 | "metadata": { 107 | "collapsed": false 108 | }, 109 | "outputs": [ 110 | { 111 | "name": "stdout", 112 | "output_type": "stream", 113 | "text": [ 114 | "Filesystem Size Used Available Use%\r\n", 115 | "hdfs://localhost:9000 39.3 G 180.9 M 34.8 G 0%\r\n" 116 | ] 117 | } 118 | ], 119 | "source": [ 120 | "!hdfs dfs -df -h /" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | 
"execution_count": 5, 126 | "metadata": { 127 | "collapsed": false 128 | }, 129 | "outputs": [ 130 | { 131 | "name": "stdout", 132 | "output_type": "stream", 133 | "text": [ 134 | "179.0 M /spark\r\n", 135 | "473.4 K /user\r\n" 136 | ] 137 | } 138 | ], 139 | "source": [ 140 | "!hdfs dfs -du -h /" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": 6, 146 | "metadata": { 147 | "collapsed": false 148 | }, 149 | "outputs": [], 150 | "source": [ 151 | "!hdfs dfs -mkdir /datasets" 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": 7, 157 | "metadata": { 158 | "collapsed": false 159 | }, 160 | "outputs": [], 161 | "source": [ 162 | "!wget -q http://www.gutenberg.org/cache/epub/100/pg100.txt \\\n", 163 | " -O ../datasets/shakespeare_all.txt" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": 8, 169 | "metadata": { 170 | "collapsed": false 171 | }, 172 | "outputs": [], 173 | "source": [ 174 | "!hdfs dfs -put ../datasets/shakespeare_all.txt \\\n", 175 | " /datasets/shakespeare_all.txt\n", 176 | "\n", 177 | "!hdfs dfs -put ../datasets/hadoop_git_readme.txt \\\n", 178 | " /datasets/hadoop_git_readme.txt" 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": 9, 184 | "metadata": { 185 | "collapsed": false 186 | }, 187 | "outputs": [ 188 | { 189 | "name": "stdout", 190 | "output_type": "stream", 191 | "text": [ 192 | "Found 2 items\r\n", 193 | "-rw-r--r-- 1 vagrant supergroup 1365 2016-05-10 19:34 /datasets/hadoop_git_readme.txt\r\n", 194 | "-rw-r--r-- 1 vagrant supergroup 5589889 2016-05-10 19:34 /datasets/shakespeare_all.txt\r\n" 195 | ] 196 | } 197 | ], 198 | "source": [ 199 | "!hdfs dfs -ls /datasets" 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": 10, 205 | "metadata": { 206 | "collapsed": false 207 | }, 208 | "outputs": [ 209 | { 210 | "name": "stdout", 211 | "output_type": "stream", 212 | "text": [ 213 | "30\r\n" 214 | ] 215 | } 216 | ], 217 | "source": [ 218 | "!hdfs dfs -cat /datasets/hadoop_git_readme.txt | wc -l" 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": 11, 224 | "metadata": { 225 | "collapsed": false 226 | }, 227 | "outputs": [ 228 | { 229 | "name": "stdout", 230 | "output_type": "stream", 231 | "text": [ 232 | "60\r\n" 233 | ] 234 | } 235 | ], 236 | "source": [ 237 | "!hdfs dfs -cat \\\n", 238 | " hdfs:///datasets/hadoop_git_readme.txt \\\n", 239 | " file:///home/vagrant/datasets/hadoop_git_readme.txt | wc -l" 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": 12, 245 | "metadata": { 246 | "collapsed": true 247 | }, 248 | "outputs": [], 249 | "source": [ 250 | "!hdfs dfs -cp /datasets/hadoop_git_readme.txt \\\n", 251 | " /datasets/copy_hadoop_git_readme.txt" 252 | ] 253 | }, 254 | { 255 | "cell_type": "code", 256 | "execution_count": 13, 257 | "metadata": { 258 | "collapsed": false 259 | }, 260 | "outputs": [ 261 | { 262 | "name": "stdout", 263 | "output_type": "stream", 264 | "text": [ 265 | "16/05/10 19:35:07 INFO fs.TrashPolicyDefault: Namenode trash configuration: Deletion interval = 0 minutes, Emptier interval = 0 minutes.\r\n", 266 | "Deleted /datasets/copy_hadoop_git_readme.txt\r\n" 267 | ] 268 | } 269 | ], 270 | "source": [ 271 | "!hdfs dfs -rm /datasets/copy_hadoop_git_readme.txt" 272 | ] 273 | }, 274 | { 275 | "cell_type": "code", 276 | "execution_count": 14, 277 | "metadata": { 278 | "collapsed": false 279 | }, 280 | "outputs": [ 281 | { 282 | "name": "stdout", 283 | "output_type": "stream", 
284 | "text": [ 285 | "16/05/10 19:35:09 INFO fs.TrashPolicyDefault: Namenode trash configuration: Deletion interval = 0 minutes, Emptier interval = 0 minutes.\r\n" 286 | ] 287 | } 288 | ], 289 | "source": [ 290 | "!hdfs dfs -expunge" 291 | ] 292 | }, 293 | { 294 | "cell_type": "code", 295 | "execution_count": 15, 296 | "metadata": { 297 | "collapsed": false 298 | }, 299 | "outputs": [], 300 | "source": [ 301 | "!hdfs dfs -get /datasets/hadoop_git_readme.txt \\\n", 302 | " /tmp/hadoop_git_readme.txt" 303 | ] 304 | }, 305 | { 306 | "cell_type": "code", 307 | "execution_count": 16, 308 | "metadata": { 309 | "collapsed": false 310 | }, 311 | "outputs": [ 312 | { 313 | "name": "stdout", 314 | "output_type": "stream", 315 | "text": [ 316 | "ntry, of \r\n", 317 | "encryption software. BEFORE using any encryption software, please \r\n", 318 | "check your country's laws, regulations and policies concerning the\r\n", 319 | "import, possession, or use, and re-export of encryption software, to \r\n", 320 | "see if this is permitted. See for more\r\n", 321 | "information.\r\n", 322 | "\r\n", 323 | "The U.S. Government Department of Commerce, Bureau of Industry and\r\n", 324 | "Security (BIS), has classified this software as Export Commodity \r\n", 325 | "Control Number (ECCN) 5D002.C.1, which includes information security\r\n", 326 | "software using or performing cryptographic functions with asymmetric\r\n", 327 | "algorithms. The form and manner of this Apache Software Foundation\r\n", 328 | "distribution makes it eligible for export under the License Exception\r\n", 329 | "ENC Technology Software Unrestricted (TSU) exception (see the BIS \r\n", 330 | "Export Administration Regulations, Section 740.13) for both object \r\n", 331 | "code and source code.\r\n", 332 | "\r\n", 333 | "The following provides more details on the included cryptographic\r\n", 334 | "software:\r\n", 335 | " Hadoop Core uses the SSL libraries from the Jetty project written \r\n", 336 | "by mortbay.org." 
337 | ] 338 | } 339 | ], 340 | "source": [ 341 | "!hdfs dfs -tail /datasets/hadoop_git_readme.txt" 342 | ] 343 | }, 344 | { 345 | "cell_type": "markdown", 346 | "metadata": { 347 | "collapsed": true 348 | }, 349 | "source": [ 350 | "## Snakebite" 351 | ] 352 | }, 353 | { 354 | "cell_type": "code", 355 | "execution_count": 17, 356 | "metadata": { 357 | "collapsed": false 358 | }, 359 | "outputs": [], 360 | "source": [ 361 | "from snakebite.client import Client\n", 362 | "client = Client(\"localhost\", 9000)" 363 | ] 364 | }, 365 | { 366 | "cell_type": "code", 367 | "execution_count": 18, 368 | "metadata": { 369 | "collapsed": false 370 | }, 371 | "outputs": [ 372 | { 373 | "data": { 374 | "text/plain": [ 375 | "{'blockSize': 134217728L,\n", 376 | " 'bytesPerChecksum': 512,\n", 377 | " 'checksumType': 2,\n", 378 | " 'encryptDataTransfer': False,\n", 379 | " 'fileBufferSize': 4096,\n", 380 | " 'replication': 1,\n", 381 | " 'trashInterval': 0L,\n", 382 | " 'writePacketSize': 65536}" 383 | ] 384 | }, 385 | "execution_count": 18, 386 | "metadata": {}, 387 | "output_type": "execute_result" 388 | } 389 | ], 390 | "source": [ 391 | "client.serverdefaults()" 392 | ] 393 | }, 394 | { 395 | "cell_type": "code", 396 | "execution_count": 19, 397 | "metadata": { 398 | "collapsed": false 399 | }, 400 | "outputs": [ 401 | { 402 | "name": "stdout", 403 | "output_type": "stream", 404 | "text": [ 405 | "/datasets\n", 406 | "/spark\n", 407 | "/user\n" 408 | ] 409 | } 410 | ], 411 | "source": [ 412 | "for x in client.ls(['/']):\n", 413 | " print x['path']" 414 | ] 415 | }, 416 | { 417 | "cell_type": "code", 418 | "execution_count": 20, 419 | "metadata": { 420 | "collapsed": false 421 | }, 422 | "outputs": [ 423 | { 424 | "data": { 425 | "text/plain": [ 426 | "{'capacity': 42241163264L,\n", 427 | " 'corrupt_blocks': 0L,\n", 428 | " 'filesystem': 'hdfs://localhost:9000',\n", 429 | " 'missing_blocks': 0L,\n", 430 | " 'remaining': 37341663232L,\n", 431 | " 'under_replicated': 0L,\n", 432 | " 'used': 195353480L}" 433 | ] 434 | }, 435 | "execution_count": 20, 436 | "metadata": {}, 437 | "output_type": "execute_result" 438 | } 439 | ], 440 | "source": [ 441 | "client.df()" 442 | ] 443 | }, 444 | { 445 | "cell_type": "code", 446 | "execution_count": 21, 447 | "metadata": { 448 | "collapsed": false 449 | }, 450 | "outputs": [ 451 | { 452 | "data": { 453 | "text/plain": [ 454 | "[{'length': 5591254L, 'path': '/datasets'},\n", 455 | " {'length': 187698038L, 'path': '/spark'},\n", 456 | " {'length': 484810L, 'path': '/user'}]" 457 | ] 458 | }, 459 | "execution_count": 21, 460 | "metadata": {}, 461 | "output_type": "execute_result" 462 | } 463 | ], 464 | "source": [ 465 | "list(client.du([\"/\"]))" 466 | ] 467 | }, 468 | { 469 | "cell_type": "code", 470 | "execution_count": 22, 471 | "metadata": { 472 | "collapsed": false 473 | }, 474 | "outputs": [], 475 | "source": [ 476 | "# Note:\n", 477 | "# put command is not yet available" 478 | ] 479 | }, 480 | { 481 | "cell_type": "code", 482 | "execution_count": 23, 483 | "metadata": { 484 | "collapsed": false 485 | }, 486 | "outputs": [ 487 | { 488 | "name": "stdout", 489 | "output_type": "stream", 490 | "text": [ 491 | "30\n" 492 | ] 493 | } 494 | ], 495 | "source": [ 496 | "for el in client.cat(['/datasets/hadoop_git_readme.txt']):\n", 497 | " print el.next().count(\"\\n\")" 498 | ] 499 | }, 500 | { 501 | "cell_type": "code", 502 | "execution_count": 24, 503 | "metadata": { 504 | "collapsed": false 505 | }, 506 | "outputs": [], 507 | "source": [ 508 | "# Note:\n", 509 | "# copy 
command is not yet available" 510 | ] 511 | }, 512 | { 513 | "cell_type": "code", 514 | "execution_count": 25, 515 | "metadata": { 516 | "collapsed": false 517 | }, 518 | "outputs": [ 519 | { 520 | "data": { 521 | "text/plain": [ 522 | "{'path': '/datasets/shakespeare_all.txt', 'result': True}" 523 | ] 524 | }, 525 | "execution_count": 25, 526 | "metadata": {}, 527 | "output_type": "execute_result" 528 | } 529 | ], 530 | "source": [ 531 | "client.delete(['/datasets/shakespeare_all.txt']).next()" 532 | ] 533 | }, 534 | { 535 | "cell_type": "code", 536 | "execution_count": 26, 537 | "metadata": { 538 | "collapsed": false 539 | }, 540 | "outputs": [ 541 | { 542 | "data": { 543 | "text/plain": [ 544 | "{'error': '',\n", 545 | " 'path': '/tmp/hadoop_git_readme_2.txt',\n", 546 | " 'result': True,\n", 547 | " 'source_path': '/datasets/hadoop_git_readme.txt'}" 548 | ] 549 | }, 550 | "execution_count": 26, 551 | "metadata": {}, 552 | "output_type": "execute_result" 553 | } 554 | ], 555 | "source": [ 556 | "(client\n", 557 | ".copyToLocal(['/datasets/hadoop_git_readme.txt'], \n", 558 | " '/tmp/hadoop_git_readme_2.txt')\n", 559 | ".next())" 560 | ] 561 | }, 562 | { 563 | "cell_type": "code", 564 | "execution_count": 27, 565 | "metadata": { 566 | "collapsed": false 567 | }, 568 | "outputs": [ 569 | { 570 | "data": { 571 | "text/plain": [ 572 | "[{'path': '/datasets_2', 'result': True}]" 573 | ] 574 | }, 575 | "execution_count": 27, 576 | "metadata": {}, 577 | "output_type": "execute_result" 578 | } 579 | ], 580 | "source": [ 581 | "list(client.mkdir(['/datasets_2']))" 582 | ] 583 | }, 584 | { 585 | "cell_type": "code", 586 | "execution_count": 28, 587 | "metadata": { 588 | "collapsed": false 589 | }, 590 | "outputs": [ 591 | { 592 | "data": { 593 | "text/plain": [ 594 | "[{'path': '/datasets', 'result': True},\n", 595 | " {'path': '/datasets_2', 'result': True}]" 596 | ] 597 | }, 598 | "execution_count": 28, 599 | "metadata": {}, 600 | "output_type": "execute_result" 601 | } 602 | ], 603 | "source": [ 604 | "list(client.delete(['/datasets*'], recurse=True))" 605 | ] 606 | }, 607 | { 608 | "cell_type": "code", 609 | "execution_count": null, 610 | "metadata": { 611 | "collapsed": true 612 | }, 613 | "outputs": [], 614 | "source": [] 615 | } 616 | ], 617 | "metadata": { 618 | "kernelspec": { 619 | "display_name": "Python 2", 620 | "language": "python", 621 | "name": "python2" 622 | }, 623 | "language_info": { 624 | "codemirror_mode": { 625 | "name": "ipython", 626 | "version": 2 627 | }, 628 | "file_extension": ".py", 629 | "mimetype": "text/x-python", 630 | "name": "python", 631 | "nbconvert_exporter": "python", 632 | "pygments_lexer": "ipython2", 633 | "version": "2.7.6" 634 | } 635 | }, 636 | "nbformat": 4, 637 | "nbformat_minor": 0 638 | } 639 | -------------------------------------------------------------------------------- /Chapter 08/Chapter_8_code_MR.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "Let's first insert some data in the HDFS" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": { 14 | "collapsed": false 15 | }, 16 | "outputs": [ 17 | { 18 | "name": "stdout", 19 | "output_type": "stream", 20 | "text": [ 21 | "Found 2 items\r\n", 22 | "-rw-r--r-- 1 vagrant supergroup 1365 2016-05-10 19:58 /datasets/hadoop_git_readme.txt\r\n", 23 | "-rw-r--r-- 1 vagrant supergroup 5589889 2016-05-10 19:58 
/datasets/shakespeare_all.txt\r\n" 24 | ] 25 | } 26 | ], 27 | "source": [ 28 | "!hdfs dfs -mkdir -p /datasets\n", 29 | "!wget -q http://www.gutenberg.org/cache/epub/100/pg100.txt \\\n", 30 | " -O ../datasets/shakespeare_all.txt\n", 31 | "!hdfs dfs -put -f ../datasets/shakespeare_all.txt /datasets/shakespeare_all.txt\n", 32 | "!hdfs dfs -put -f ../datasets/hadoop_git_readme.txt /datasets/hadoop_git_readme.txt\n", 33 | "!hdfs dfs -ls /datasets" 34 | ] 35 | }, 36 | { 37 | "cell_type": "markdown", 38 | "metadata": {}, 39 | "source": [ 40 | "## MR with Hadoop streaming" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 2, 46 | "metadata": { 47 | "collapsed": true 48 | }, 49 | "outputs": [], 50 | "source": [ 51 | "with open('mapper_hadoop.py', 'w') as fh:\n", 52 | " fh.write(\"\"\"#!/usr/bin/env python\n", 53 | "\n", 54 | "import sys\n", 55 | "\n", 56 | "for line in sys.stdin:\n", 57 | " print \"chars\", len(line.rstrip('\\\\n'))\n", 58 | " print \"words\", len(line.split())\n", 59 | " print \"lines\", 1\n", 60 | " \"\"\")\n", 61 | "\n", 62 | "\n", 63 | "with open('reducer_hadoop.py', 'w') as fh:\n", 64 | " fh.write(\"\"\"#!/usr/bin/env python\n", 65 | "\n", 66 | "import sys\n", 67 | "\n", 68 | "counts = {\"chars\": 0, \"words\":0, \"lines\":0}\n", 69 | "\n", 70 | "for line in sys.stdin:\n", 71 | " kv = line.rstrip().split()\n", 72 | " counts[kv[0]] += int(kv[1])\n", 73 | "\n", 74 | "for k,v in counts.items():\n", 75 | " print k, v\n", 76 | " \"\"\") " 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 3, 82 | "metadata": { 83 | "collapsed": true 84 | }, 85 | "outputs": [], 86 | "source": [ 87 | "!chmod a+x *_hadoop.py" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": 4, 93 | "metadata": { 94 | "collapsed": false 95 | }, 96 | "outputs": [ 97 | { 98 | "name": "stdout", 99 | "output_type": "stream", 100 | "text": [ 101 | "chars 1335\r\n", 102 | "lines 31\r\n", 103 | "words 179\r\n" 104 | ] 105 | } 106 | ], 107 | "source": [ 108 | "!cat ../datasets/hadoop_git_readme.txt | ./mapper_hadoop.py | sort -k1,1 | ./reducer_hadoop.py" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": 5, 114 | "metadata": { 115 | "collapsed": false 116 | }, 117 | "outputs": [ 118 | { 119 | "name": "stdout", 120 | "output_type": "stream", 121 | "text": [ 122 | "16/05/10 19:58:48 INFO fs.TrashPolicyDefault: Namenode trash configuration: Deletion interval = 0 minutes, Emptier interval = 0 minutes.\n", 123 | "Deleted /tmp/mr.out\n", 124 | "packageJobJar: [/tmp/hadoop-unjar5384590696382062055/] [] /tmp/streamjob1965588122940844531.jar tmpDir=null\n", 125 | "16/05/10 19:58:50 INFO client.RMProxy: Connecting to ResourceManager at /0.0.0.0:8032\n", 126 | "16/05/10 19:58:51 INFO client.RMProxy: Connecting to ResourceManager at /0.0.0.0:8032\n", 127 | "16/05/10 19:58:51 INFO mapred.FileInputFormat: Total input paths to process : 1\n", 128 | "16/05/10 19:58:51 INFO mapreduce.JobSubmitter: number of splits:2\n", 129 | "16/05/10 19:58:52 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1462906052477_0019\n", 130 | "16/05/10 19:58:52 INFO impl.YarnClientImpl: Submitted application application_1462906052477_0019\n", 131 | "16/05/10 19:58:52 INFO mapreduce.Job: The url to track the job: http://sparkbox:8088/proxy/application_1462906052477_0019/\n", 132 | "16/05/10 19:58:52 INFO mapreduce.Job: Running job: job_1462906052477_0019\n", 133 | "16/05/10 19:58:58 INFO mapreduce.Job: Job job_1462906052477_0019 running in uber mode : false\n", 134 
| "16/05/10 19:58:58 INFO mapreduce.Job: map 0% reduce 0%\n", 135 | "16/05/10 19:59:03 INFO mapreduce.Job: map 50% reduce 0%\n", 136 | "16/05/10 19:59:08 INFO mapreduce.Job: map 100% reduce 0%\n", 137 | "16/05/10 19:59:14 INFO mapreduce.Job: map 100% reduce 100%\n", 138 | "16/05/10 19:59:14 INFO mapreduce.Job: Job job_1462906052477_0019 completed successfully\n", 139 | "16/05/10 19:59:14 INFO mapreduce.Job: Counters: 49\n", 140 | "\tFile System Counters\n", 141 | "\t\tFILE: Number of bytes read=1060\n", 142 | "\t\tFILE: Number of bytes written=332854\n", 143 | "\t\tFILE: Number of read operations=0\n", 144 | "\t\tFILE: Number of large read operations=0\n", 145 | "\t\tFILE: Number of write operations=0\n", 146 | "\t\tHDFS: Number of bytes read=2256\n", 147 | "\t\tHDFS: Number of bytes written=33\n", 148 | "\t\tHDFS: Number of read operations=9\n", 149 | "\t\tHDFS: Number of large read operations=0\n", 150 | "\t\tHDFS: Number of write operations=2\n", 151 | "\tJob Counters \n", 152 | "\t\tLaunched map tasks=2\n", 153 | "\t\tLaunched reduce tasks=1\n", 154 | "\t\tData-local map tasks=2\n", 155 | "\t\tTotal time spent by all maps in occupied slots (ms)=6732\n", 156 | "\t\tTotal time spent by all reduces in occupied slots (ms)=3739\n", 157 | "\t\tTotal time spent by all map tasks (ms)=6732\n", 158 | "\t\tTotal time spent by all reduce tasks (ms)=3739\n", 159 | "\t\tTotal vcore-milliseconds taken by all map tasks=6732\n", 160 | "\t\tTotal vcore-milliseconds taken by all reduce tasks=3739\n", 161 | "\t\tTotal megabyte-milliseconds taken by all map tasks=6893568\n", 162 | "\t\tTotal megabyte-milliseconds taken by all reduce tasks=3828736\n", 163 | "\tMap-Reduce Framework\n", 164 | "\t\tMap input records=31\n", 165 | "\t\tMap output records=93\n", 166 | "\t\tMap output bytes=868\n", 167 | "\t\tMap output materialized bytes=1066\n", 168 | "\t\tInput split bytes=208\n", 169 | "\t\tCombine input records=0\n", 170 | "\t\tCombine output records=0\n", 171 | "\t\tReduce input groups=23\n", 172 | "\t\tReduce shuffle bytes=1066\n", 173 | "\t\tReduce input records=93\n", 174 | "\t\tReduce output records=3\n", 175 | "\t\tSpilled Records=186\n", 176 | "\t\tShuffled Maps =2\n", 177 | "\t\tFailed Shuffles=0\n", 178 | "\t\tMerged Map outputs=2\n", 179 | "\t\tGC time elapsed (ms)=78\n", 180 | "\t\tCPU time spent (ms)=1830\n", 181 | "\t\tPhysical memory (bytes) snapshot=699170816\n", 182 | "\t\tVirtual memory (bytes) snapshot=2495647744\n", 183 | "\t\tTotal committed heap usage (bytes)=512229376\n", 184 | "\tShuffle Errors\n", 185 | "\t\tBAD_ID=0\n", 186 | "\t\tCONNECTION=0\n", 187 | "\t\tIO_ERROR=0\n", 188 | "\t\tWRONG_LENGTH=0\n", 189 | "\t\tWRONG_MAP=0\n", 190 | "\t\tWRONG_REDUCE=0\n", 191 | "\tFile Input Format Counters \n", 192 | "\t\tBytes Read=2048\n", 193 | "\tFile Output Format Counters \n", 194 | "\t\tBytes Written=33\n", 195 | "16/05/10 19:59:14 INFO streaming.StreamJob: Output directory: /tmp/mr.out\n" 196 | ] 197 | } 198 | ], 199 | "source": [ 200 | "!hdfs dfs -mkdir -p /tmp\n", 201 | "!hdfs dfs -rm -f -r /tmp/mr.out\n", 202 | "\n", 203 | "!hadoop jar /usr/local/hadoop/share/hadoop/tools/lib/hadoop-streaming-2.6.4.jar \\\n", 204 | "-files mapper_hadoop.py,reducer_hadoop.py \\\n", 205 | "-mapper mapper_hadoop.py -reducer reducer_hadoop.py \\\n", 206 | "-input /datasets/hadoop_git_readme.txt -output /tmp/mr.out\n", 207 | "\n" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": 6, 213 | "metadata": { 214 | "collapsed": false 215 | }, 216 | "outputs": [ 217 | { 218 | "name": 
"stdout", 219 | "output_type": "stream", 220 | "text": [ 221 | "Found 2 items\r\n", 222 | "-rw-r--r-- 1 vagrant supergroup 0 2016-05-10 19:59 /tmp/mr.out/_SUCCESS\r\n", 223 | "-rw-r--r-- 1 vagrant supergroup 33 2016-05-10 19:59 /tmp/mr.out/part-00000\r\n" 224 | ] 225 | } 226 | ], 227 | "source": [ 228 | "!hdfs dfs -ls /tmp/mr.out" 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": 7, 234 | "metadata": { 235 | "collapsed": false 236 | }, 237 | "outputs": [ 238 | { 239 | "name": "stdout", 240 | "output_type": "stream", 241 | "text": [ 242 | "chars 1335\t\r\n", 243 | "lines 31\t\r\n", 244 | "words 179\t\r\n" 245 | ] 246 | } 247 | ], 248 | "source": [ 249 | "!hdfs dfs -cat /tmp/mr.out/part-00000" 250 | ] 251 | }, 252 | { 253 | "cell_type": "code", 254 | "execution_count": null, 255 | "metadata": { 256 | "collapsed": true 257 | }, 258 | "outputs": [], 259 | "source": [] 260 | }, 261 | { 262 | "cell_type": "markdown", 263 | "metadata": {}, 264 | "source": [ 265 | "## MR with Python MrJob library" 266 | ] 267 | }, 268 | { 269 | "cell_type": "code", 270 | "execution_count": 8, 271 | "metadata": { 272 | "collapsed": true 273 | }, 274 | "outputs": [], 275 | "source": [ 276 | "with open(\"MrJob_job1.py\", \"w\") as fh:\n", 277 | " fh.write(\"\"\"\n", 278 | "from mrjob.job import MRJob\n", 279 | "\n", 280 | "\n", 281 | "class MRWordFrequencyCount(MRJob):\n", 282 | "\n", 283 | " def mapper(self, _, line):\n", 284 | " yield \"chars\", len(line)\n", 285 | " yield \"words\", len(line.split())\n", 286 | " yield \"lines\", 1\n", 287 | "\n", 288 | " def reducer(self, key, values):\n", 289 | " yield key, sum(values)\n", 290 | "\n", 291 | "\n", 292 | "if __name__ == '__main__':\n", 293 | " MRWordFrequencyCount.run() \n", 294 | " \"\"\")" 295 | ] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "execution_count": 9, 300 | "metadata": { 301 | "collapsed": false 302 | }, 303 | "outputs": [ 304 | { 305 | "name": "stdout", 306 | "output_type": "stream", 307 | "text": [ 308 | "No configs found; falling back on auto-configuration\r\n", 309 | "Creating temp directory /tmp/MrJob_job1.vagrant.20160510.195920.590984\r\n", 310 | "Running step 1 of 1...\r\n", 311 | "Streaming final output from /tmp/MrJob_job1.vagrant.20160510.195920.590984/output...\r\n", 312 | "\"chars\"\t1335\r\n", 313 | "\"lines\"\t31\r\n", 314 | "\"words\"\t179\r\n", 315 | "Removing temp directory /tmp/MrJob_job1.vagrant.20160510.195920.590984...\r\n" 316 | ] 317 | } 318 | ], 319 | "source": [ 320 | "!python MrJob_job1.py ../datasets/hadoop_git_readme.txt" 321 | ] 322 | }, 323 | { 324 | "cell_type": "code", 325 | "execution_count": 10, 326 | "metadata": { 327 | "collapsed": false 328 | }, 329 | "outputs": [ 330 | { 331 | "name": "stdout", 332 | "output_type": "stream", 333 | "text": [ 334 | "No configs found; falling back on auto-configuration\n", 335 | "Looking for hadoop binary in /usr/local/hadoop/bin...\n", 336 | "Found hadoop binary: /usr/local/hadoop/bin/hadoop\n", 337 | "Creating temp directory /tmp/MrJob_job1.vagrant.20160510.195920.870616\n", 338 | "Using Hadoop version 2.6.4\n", 339 | "Copying local files to hdfs:///user/vagrant/tmp/mrjob/MrJob_job1.vagrant.20160510.195920.870616/files/...\n", 340 | "Looking for Hadoop streaming jar in /usr/local/hadoop...\n", 341 | "Found Hadoop streaming jar: /usr/local/hadoop/share/hadoop/tools/lib/hadoop-streaming-2.6.4.jar\n", 342 | "Running step 1 of 1...\n", 343 | " packageJobJar: [/tmp/hadoop-unjar7634308048659876233/] [] /tmp/streamjob5879999650692493094.jar tmpDir=null\n", 
344 | " Connecting to ResourceManager at /0.0.0.0:8032\n", 345 | " Connecting to ResourceManager at /0.0.0.0:8032\n", 346 | " Total input paths to process : 1\n", 347 | " number of splits:2\n", 348 | " Submitting tokens for job: job_1462906052477_0020\n", 349 | " Submitted application application_1462906052477_0020\n", 350 | " The url to track the job: http://sparkbox:8088/proxy/application_1462906052477_0020/\n", 351 | " Running job: job_1462906052477_0020\n", 352 | " Job job_1462906052477_0020 running in uber mode : false\n", 353 | " map 0% reduce 0%\n", 354 | " map 50% reduce 0%\n", 355 | " map 100% reduce 0%\n", 356 | " map 100% reduce 100%\n", 357 | " Job job_1462906052477_0020 completed successfully\n", 358 | " Output directory: hdfs:///user/vagrant/tmp/mrjob/MrJob_job1.vagrant.20160510.195920.870616/output\n", 359 | "Counters: 50\n", 360 | "\tFile Input Format Counters \n", 361 | "\t\tBytes Read=2048\n", 362 | "\tFile Output Format Counters \n", 363 | "\t\tBytes Written=36\n", 364 | "\tFile System Counters\n", 365 | "\t\tFILE: Number of bytes read=1153\n", 366 | "\t\tFILE: Number of bytes written=337717\n", 367 | "\t\tFILE: Number of large read operations=0\n", 368 | "\t\tFILE: Number of read operations=0\n", 369 | "\t\tFILE: Number of write operations=0\n", 370 | "\t\tHDFS: Number of bytes read=2256\n", 371 | "\t\tHDFS: Number of bytes written=36\n", 372 | "\t\tHDFS: Number of large read operations=0\n", 373 | "\t\tHDFS: Number of read operations=9\n", 374 | "\t\tHDFS: Number of write operations=2\n", 375 | "\tJob Counters \n", 376 | "\t\tData-local map tasks=2\n", 377 | "\t\tKilled map tasks=1\n", 378 | "\t\tLaunched map tasks=2\n", 379 | "\t\tLaunched reduce tasks=1\n", 380 | "\t\tTotal megabyte-milliseconds taken by all map tasks=7394304\n", 381 | "\t\tTotal megabyte-milliseconds taken by all reduce tasks=3846144\n", 382 | "\t\tTotal time spent by all map tasks (ms)=7221\n", 383 | "\t\tTotal time spent by all maps in occupied slots (ms)=7221\n", 384 | "\t\tTotal time spent by all reduce tasks (ms)=3756\n", 385 | "\t\tTotal time spent by all reduces in occupied slots (ms)=3756\n", 386 | "\t\tTotal vcore-milliseconds taken by all map tasks=7221\n", 387 | "\t\tTotal vcore-milliseconds taken by all reduce tasks=3756\n", 388 | "\tMap-Reduce Framework\n", 389 | "\t\tCPU time spent (ms)=1830\n", 390 | "\t\tCombine input records=0\n", 391 | "\t\tCombine output records=0\n", 392 | "\t\tFailed Shuffles=0\n", 393 | "\t\tGC time elapsed (ms)=66\n", 394 | "\t\tInput split bytes=208\n", 395 | "\t\tMap input records=31\n", 396 | "\t\tMap output bytes=961\n", 397 | "\t\tMap output materialized bytes=1159\n", 398 | "\t\tMap output records=93\n", 399 | "\t\tMerged Map outputs=2\n", 400 | "\t\tPhysical memory (bytes) snapshot=726175744\n", 401 | "\t\tReduce input groups=3\n", 402 | "\t\tReduce input records=93\n", 403 | "\t\tReduce output records=3\n", 404 | "\t\tReduce shuffle bytes=1159\n", 405 | "\t\tShuffled Maps =2\n", 406 | "\t\tSpilled Records=186\n", 407 | "\t\tTotal committed heap usage (bytes)=515899392\n", 408 | "\t\tVirtual memory (bytes) snapshot=2496479232\n", 409 | "\tShuffle Errors\n", 410 | "\t\tBAD_ID=0\n", 411 | "\t\tCONNECTION=0\n", 412 | "\t\tIO_ERROR=0\n", 413 | "\t\tWRONG_LENGTH=0\n", 414 | "\t\tWRONG_MAP=0\n", 415 | "\t\tWRONG_REDUCE=0\n", 416 | "Streaming final output from hdfs:///user/vagrant/tmp/mrjob/MrJob_job1.vagrant.20160510.195920.870616/output...\n", 417 | "\"chars\"\t1335\n", 418 | "\"lines\"\t31\n", 419 | "\"words\"\t179\n", 420 | "Removing HDFS temp directory 
hdfs:///user/vagrant/tmp/mrjob/MrJob_job1.vagrant.20160510.195920.870616...\n", 421 | "Removing temp directory /tmp/MrJob_job1.vagrant.20160510.195920.870616...\n" 422 | ] 423 | } 424 | ], 425 | "source": [ 426 | "!python MrJob_job1.py -r hadoop hdfs:///datasets/hadoop_git_readme.txt" 427 | ] 428 | }, 429 | { 430 | "cell_type": "code", 431 | "execution_count": 11, 432 | "metadata": { 433 | "collapsed": true 434 | }, 435 | "outputs": [], 436 | "source": [ 437 | "with open(\"MrJob_job2.py\", \"w\") as fh:\n", 438 | " fh.write(\"\"\"\n", 439 | "from mrjob.job import MRJob\n", 440 | "from mrjob.step import MRStep\n", 441 | "import re\n", 442 | "\n", 443 | "WORD_RE = re.compile(r\"[\\w']+\")\n", 444 | "\n", 445 | "\n", 446 | "class MRMostUsedWord(MRJob):\n", 447 | "\n", 448 | " def steps(self):\n", 449 | " return [\n", 450 | " MRStep(mapper=self.mapper_get_words,\n", 451 | " reducer=self.reducer_count_words),\n", 452 | " MRStep(mapper=self.mapper_word_count_one_key,\n", 453 | " reducer=self.reducer_find_max_word)\n", 454 | " ]\n", 455 | "\n", 456 | " def mapper_get_words(self, _, line):\n", 457 | " # yield each word in the line\n", 458 | " for word in WORD_RE.findall(line):\n", 459 | " yield (word.lower(), 1)\n", 460 | "\n", 461 | " def reducer_count_words(self, word, counts):\n", 462 | " # send all (num_occurrences, word) pairs to the same reducer.\n", 463 | " yield (word, sum(counts))\n", 464 | " \n", 465 | " def mapper_word_count_one_key(self, word, counts):\n", 466 | " # send all the tuples to same reducer\n", 467 | " yield None, (counts, word)\n", 468 | "\n", 469 | " def reducer_find_max_word(self, _, count_word_pairs):\n", 470 | " # each item of word_count_pairs is a tuple (count, word),\n", 471 | " yield max(count_word_pairs)\n", 472 | "\n", 473 | "\n", 474 | "if __name__ == '__main__':\n", 475 | " MRMostUsedWord.run()\n", 476 | "\"\"\")" 477 | ] 478 | }, 479 | { 480 | "cell_type": "code", 481 | "execution_count": 12, 482 | "metadata": { 483 | "collapsed": false 484 | }, 485 | "outputs": [ 486 | { 487 | "name": "stdout", 488 | "output_type": "stream", 489 | "text": [ 490 | "27801\t\"the\"\r\n" 491 | ] 492 | } 493 | ], 494 | "source": [ 495 | "# This time is running on a big dataset\n", 496 | "!python MrJob_job2.py --quiet ../datasets/shakespeare_all.txt" 497 | ] 498 | }, 499 | { 500 | "cell_type": "code", 501 | "execution_count": 13, 502 | "metadata": { 503 | "collapsed": false 504 | }, 505 | "outputs": [ 506 | { 507 | "name": "stdout", 508 | "output_type": "stream", 509 | "text": [ 510 | "27801\t\"the\"\r\n" 511 | ] 512 | } 513 | ], 514 | "source": [ 515 | "!python MrJob_job2.py -r hadoop --quiet hdfs:///datasets/shakespeare_all.txt" 516 | ] 517 | }, 518 | { 519 | "cell_type": "code", 520 | "execution_count": null, 521 | "metadata": { 522 | "collapsed": true 523 | }, 524 | "outputs": [], 525 | "source": [] 526 | } 527 | ], 528 | "metadata": { 529 | "kernelspec": { 530 | "display_name": "Python 2", 531 | "language": "python", 532 | "name": "python2" 533 | }, 534 | "language_info": { 535 | "codemirror_mode": { 536 | "name": "ipython", 537 | "version": 2 538 | }, 539 | "file_extension": ".py", 540 | "mimetype": "text/x-python", 541 | "name": "python", 542 | "nbconvert_exporter": "python", 543 | "pygments_lexer": "ipython2", 544 | "version": "2.7.6" 545 | } 546 | }, 547 | "nbformat": 4, 548 | "nbformat_minor": 0 549 | } 550 | -------------------------------------------------------------------------------- /Chapter 08/Chapter_8_code_Spark.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "Let's first insert some data in the HDFS" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": { 14 | "collapsed": false 15 | }, 16 | "outputs": [ 17 | { 18 | "name": "stdout", 19 | "output_type": "stream", 20 | "text": [ 21 | "Found 2 items\n", 22 | "-rw-r--r-- 1 vagrant supergroup 1365 2016-05-10 20:06 /datasets/hadoop_git_readme.txt\n", 23 | "-rw-r--r-- 1 vagrant supergroup 5589889 2016-05-10 20:06 /datasets/shakespeare_all.txt\n", 24 | "16/05/10 20:06:36 INFO fs.TrashPolicyDefault: Namenode trash configuration: Deletion interval = 0 minutes, Emptier interval = 0 minutes.\n", 25 | "Deleted /tmp\n" 26 | ] 27 | } 28 | ], 29 | "source": [ 30 | "!hdfs dfs -mkdir -p /datasets\n", 31 | "!wget -q http://www.gutenberg.org/cache/epub/100/pg100.txt \\\n", 32 | " -O ../datasets/shakespeare_all.txt\n", 33 | "!hdfs dfs -put -f ../datasets/shakespeare_all.txt /datasets/shakespeare_all.txt\n", 34 | "!hdfs dfs -put -f ../datasets/hadoop_git_readme.txt /datasets/hadoop_git_readme.txt\n", 35 | "!hdfs dfs -ls /datasets\n", 36 | "!hdfs dfs -rm -r /tmp" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": {}, 42 | "source": [ 43 | "## pySpark" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 2, 49 | "metadata": { 50 | "collapsed": false 51 | }, 52 | "outputs": [ 53 | { 54 | "data": { 55 | "text/plain": [ 56 | "[(u'spark.rdd.compress', u'True'),\n", 57 | " (u'spark.master', u'yarn-client'),\n", 58 | " (u'spark.serializer.objectStreamReset', u'100'),\n", 59 | " (u'spark.yarn.isPython', u'true'),\n", 60 | " (u'spark.submit.deployMode', u'client'),\n", 61 | " (u'spark.executor.cores', u'2'),\n", 62 | " (u'spark.app.name', u'PySparkShell')]" 63 | ] 64 | }, 65 | "execution_count": 2, 66 | "metadata": {}, 67 | "output_type": "execute_result" 68 | } 69 | ], 70 | "source": [ 71 | "sc._conf.getAll()" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 3, 77 | "metadata": { 78 | "collapsed": false 79 | }, 80 | "outputs": [ 81 | { 82 | "data": { 83 | "text/plain": [ 84 | "ParallelCollectionRDD[0] at parallelize at PythonRDD.scala:423" 85 | ] 86 | }, 87 | "execution_count": 3, 88 | "metadata": {}, 89 | "output_type": "execute_result" 90 | } 91 | ], 92 | "source": [ 93 | "numbers = range(10)\n", 94 | "numbers_rdd = sc.parallelize(numbers)\n", 95 | "\n", 96 | "numbers_rdd" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": 4, 102 | "metadata": { 103 | "collapsed": false 104 | }, 105 | "outputs": [ 106 | { 107 | "data": { 108 | "text/plain": [ 109 | "[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]" 110 | ] 111 | }, 112 | "execution_count": 4, 113 | "metadata": {}, 114 | "output_type": "execute_result" 115 | } 116 | ], 117 | "source": [ 118 | "numbers_rdd.collect()" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": 5, 124 | "metadata": { 125 | "collapsed": false 126 | }, 127 | "outputs": [ 128 | { 129 | "data": { 130 | "text/plain": [ 131 | "[0, 1, 2, 3]" 132 | ] 133 | }, 134 | "execution_count": 5, 135 | "metadata": {}, 136 | "output_type": "execute_result" 137 | } 138 | ], 139 | "source": [ 140 | "numbers_rdd.take(4)" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": 6, 146 | "metadata": { 147 | "collapsed": false 148 | }, 149 | "outputs": [ 150 | { 151 | "data": { 152 | "text/plain": [ 153 | 
"u'For the latest information about Hadoop, please visit our website at:'" 154 | ] 155 | }, 156 | "execution_count": 6, 157 | "metadata": {}, 158 | "output_type": "execute_result" 159 | } 160 | ], 161 | "source": [ 162 | "sc.textFile(\"hdfs:///datasets/hadoop_git_readme.txt\").first()" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": 7, 168 | "metadata": { 169 | "collapsed": false 170 | }, 171 | "outputs": [ 172 | { 173 | "data": { 174 | "text/plain": [ 175 | "u'For the latest information about Hadoop, please visit our website at:'" 176 | ] 177 | }, 178 | "execution_count": 7, 179 | "metadata": {}, 180 | "output_type": "execute_result" 181 | } 182 | ], 183 | "source": [ 184 | "sc.textFile(\"file:///home/vagrant/datasets/hadoop_git_readme.txt\").first()" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": 8, 190 | "metadata": { 191 | "collapsed": true 192 | }, 193 | "outputs": [], 194 | "source": [ 195 | "numbers_rdd.saveAsTextFile(\"hdfs:///tmp/numbers_1_10.txt\")" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": 9, 201 | "metadata": { 202 | "collapsed": false 203 | }, 204 | "outputs": [ 205 | { 206 | "name": "stdout", 207 | "output_type": "stream", 208 | "text": [ 209 | "Found 5 items\r\n", 210 | "-rw-r--r-- 1 vagrant supergroup 0 2016-05-10 20:06 /tmp/numbers_1_10.txt/_SUCCESS\r\n", 211 | "-rw-r--r-- 1 vagrant supergroup 4 2016-05-10 20:06 /tmp/numbers_1_10.txt/part-00000\r\n", 212 | "-rw-r--r-- 1 vagrant supergroup 4 2016-05-10 20:06 /tmp/numbers_1_10.txt/part-00001\r\n", 213 | "-rw-r--r-- 1 vagrant supergroup 4 2016-05-10 20:06 /tmp/numbers_1_10.txt/part-00002\r\n", 214 | "-rw-r--r-- 1 vagrant supergroup 8 2016-05-10 20:06 /tmp/numbers_1_10.txt/part-00003\r\n" 215 | ] 216 | } 217 | ], 218 | "source": [ 219 | "!hdfs dfs -ls /tmp/numbers_1_10.txt" 220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": 10, 225 | "metadata": { 226 | "collapsed": true 227 | }, 228 | "outputs": [], 229 | "source": [ 230 | "numbers_rdd.coalesce(1).saveAsTextFile(\"hdfs:///tmp/numbers_1_10_one_file.txt\")" 231 | ] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "execution_count": 11, 236 | "metadata": { 237 | "collapsed": false 238 | }, 239 | "outputs": [ 240 | { 241 | "name": "stdout", 242 | "output_type": "stream", 243 | "text": [ 244 | "Found 2 items\r\n", 245 | "-rw-r--r-- 1 vagrant supergroup 0 2016-05-10 20:06 /tmp/numbers_1_10_one_file.txt/_SUCCESS\r\n", 246 | "-rw-r--r-- 1 vagrant supergroup 20 2016-05-10 20:06 /tmp/numbers_1_10_one_file.txt/part-00000\r\n" 247 | ] 248 | } 249 | ], 250 | "source": [ 251 | "!hdfs dfs -ls /tmp/numbers_1_10_one_file.txt" 252 | ] 253 | }, 254 | { 255 | "cell_type": "code", 256 | "execution_count": 12, 257 | "metadata": { 258 | "collapsed": false 259 | }, 260 | "outputs": [ 261 | { 262 | "name": "stdout", 263 | "output_type": "stream", 264 | "text": [ 265 | "0\r\n", 266 | "1\r\n", 267 | "2\r\n", 268 | "3\r\n", 269 | "4\r\n", 270 | "5\r\n", 271 | "6\r\n", 272 | "7\r\n", 273 | "8\r\n", 274 | "9\r\n" 275 | ] 276 | } 277 | ], 278 | "source": [ 279 | "!hdfs dfs -cat /tmp/numbers_1_10_one_file.txt/part-00000" 280 | ] 281 | }, 282 | { 283 | "cell_type": "code", 284 | "execution_count": 13, 285 | "metadata": { 286 | "collapsed": true 287 | }, 288 | "outputs": [], 289 | "source": [ 290 | "numbers_rdd.saveAsTextFile(\"file:///tmp/numbers_1_10.txt\")" 291 | ] 292 | }, 293 | { 294 | "cell_type": "code", 295 | "execution_count": 14, 296 | "metadata": { 297 | "collapsed": false 298 | 
}, 299 | "outputs": [ 300 | { 301 | "name": "stdout", 302 | "output_type": "stream", 303 | "text": [ 304 | "part-00000 part-00001\tpart-00002 part-00003\t_SUCCESS\r\n" 305 | ] 306 | } 307 | ], 308 | "source": [ 309 | "!ls /tmp/numbers_1_10.txt" 310 | ] 311 | }, 312 | { 313 | "cell_type": "code", 314 | "execution_count": 15, 315 | "metadata": { 316 | "collapsed": false 317 | }, 318 | "outputs": [ 319 | { 320 | "data": { 321 | "text/plain": [ 322 | "[0, 1, 4, 9, 16, 25, 36, 49, 64, 81]" 323 | ] 324 | }, 325 | "execution_count": 15, 326 | "metadata": {}, 327 | "output_type": "execute_result" 328 | } 329 | ], 330 | "source": [ 331 | "def sq(x):\n", 332 | " return x**2\n", 333 | "\n", 334 | "numbers_rdd.map(sq).collect()" 335 | ] 336 | }, 337 | { 338 | "cell_type": "code", 339 | "execution_count": 16, 340 | "metadata": { 341 | "collapsed": false 342 | }, 343 | "outputs": [ 344 | { 345 | "data": { 346 | "text/plain": [ 347 | "[0, 1, 4, 9, 16, 25, 36, 49, 64, 81]" 348 | ] 349 | }, 350 | "execution_count": 16, 351 | "metadata": {}, 352 | "output_type": "execute_result" 353 | } 354 | ], 355 | "source": [ 356 | "numbers_rdd.map(lambda x: x**2).collect()" 357 | ] 358 | }, 359 | { 360 | "cell_type": "code", 361 | "execution_count": 17, 362 | "metadata": { 363 | "collapsed": false 364 | }, 365 | "outputs": [ 366 | { 367 | "data": { 368 | "text/plain": [ 369 | "285" 370 | ] 371 | }, 372 | "execution_count": 17, 373 | "metadata": {}, 374 | "output_type": "execute_result" 375 | } 376 | ], 377 | "source": [ 378 | "numbers_rdd.map(lambda x: x**2).reduce(lambda a,b: a+b)" 379 | ] 380 | }, 381 | { 382 | "cell_type": "code", 383 | "execution_count": 18, 384 | "metadata": { 385 | "collapsed": false 386 | }, 387 | "outputs": [ 388 | { 389 | "data": { 390 | "text/plain": [ 391 | "285" 392 | ] 393 | }, 394 | "execution_count": 18, 395 | "metadata": {}, 396 | "output_type": "execute_result" 397 | } 398 | ], 399 | "source": [ 400 | "numbers_rdd.map(lambda x: x**2).sum()" 401 | ] 402 | }, 403 | { 404 | "cell_type": "code", 405 | "execution_count": 19, 406 | "metadata": { 407 | "collapsed": false 408 | }, 409 | "outputs": [ 410 | { 411 | "data": { 412 | "text/plain": [ 413 | "[('even', 0),\n", 414 | " ('odd', 1),\n", 415 | " ('even', 2),\n", 416 | " ('odd', 3),\n", 417 | " ('even', 4),\n", 418 | " ('odd', 5),\n", 419 | " ('even', 6),\n", 420 | " ('odd', 7),\n", 421 | " ('even', 8),\n", 422 | " ('odd', 9)]" 423 | ] 424 | }, 425 | "execution_count": 19, 426 | "metadata": {}, 427 | "output_type": "execute_result" 428 | } 429 | ], 430 | "source": [ 431 | "def tag(x):\n", 432 | " return \"even\" if x%2==0 else \"odd\"\n", 433 | " \n", 434 | " \n", 435 | "numbers_rdd.map(lambda x: (tag(x), x) ).collect()" 436 | ] 437 | }, 438 | { 439 | "cell_type": "code", 440 | "execution_count": 20, 441 | "metadata": { 442 | "collapsed": false 443 | }, 444 | "outputs": [ 445 | { 446 | "data": { 447 | "text/plain": [ 448 | "[('even', 20), ('odd', 25)]" 449 | ] 450 | }, 451 | "execution_count": 20, 452 | "metadata": {}, 453 | "output_type": "execute_result" 454 | } 455 | ], 456 | "source": [ 457 | "numbers_rdd.map(lambda x: (tag(x), x) ).reduceByKey(lambda a,b: a+b).collect()" 458 | ] 459 | }, 460 | { 461 | "cell_type": "code", 462 | "execution_count": null, 463 | "metadata": { 464 | "collapsed": true 465 | }, 466 | "outputs": [], 467 | "source": [] 468 | }, 469 | { 470 | "cell_type": "code", 471 | "execution_count": null, 472 | "metadata": { 473 | "collapsed": true 474 | }, 475 | "outputs": [], 476 | "source": [] 477 | }, 478 | { 479 | 
"cell_type": "code", 480 | "execution_count": 21, 481 | "metadata": { 482 | "collapsed": false 483 | }, 484 | "outputs": [ 485 | { 486 | "name": "stdout", 487 | "output_type": "stream", 488 | "text": [ 489 | "{'chars': 1335, 'lines': 31, 'words': 179}\n" 490 | ] 491 | } 492 | ], 493 | "source": [ 494 | "def emit_feats(line):\n", 495 | " return [(\"chars\", len(line)), \\\n", 496 | " (\"words\", len(line.split())), \\\n", 497 | " (\"lines\", 1)]\n", 498 | "\n", 499 | "print (sc.textFile(\"/datasets/hadoop_git_readme.txt\")\n", 500 | " .flatMap(emit_feats)\n", 501 | " .reduceByKey(lambda a,b: a+b)\n", 502 | " .collectAsMap())" 503 | ] 504 | }, 505 | { 506 | "cell_type": "code", 507 | "execution_count": 22, 508 | "metadata": { 509 | "collapsed": false 510 | }, 511 | "outputs": [ 512 | { 513 | "name": "stdout", 514 | "output_type": "stream", 515 | "text": [ 516 | "[(27801, u'the')]\n" 517 | ] 518 | } 519 | ], 520 | "source": [ 521 | "import re\n", 522 | "WORD_RE = re.compile(r\"[\\w']+\")\n", 523 | "\n", 524 | "print (sc.textFile(\"/datasets/shakespeare_all.txt\")\n", 525 | " .flatMap(lambda line: WORD_RE.findall(line))\n", 526 | " .map(lambda word: (word.lower(), 1))\n", 527 | " .reduceByKey(lambda a,b: a+b)\n", 528 | " .map(lambda (k,v): (v,k))\n", 529 | " .takeOrdered(1, key = lambda x: -x[0]))" 530 | ] 531 | }, 532 | { 533 | "cell_type": "code", 534 | "execution_count": 23, 535 | "metadata": { 536 | "collapsed": false 537 | }, 538 | "outputs": [ 539 | { 540 | "name": "stdout", 541 | "output_type": "stream", 542 | "text": [ 543 | "[(u'the', 27801)]\n" 544 | ] 545 | } 546 | ], 547 | "source": [ 548 | "print (sc.textFile(\"/datasets/shakespeare_all.txt\")\n", 549 | " .flatMap(lambda line: [(word.lower(), 1) for word in WORD_RE.findall(line)])\n", 550 | " .reduceByKey(lambda a,b: a+b)\n", 551 | " .takeOrdered(1, key = lambda x: -x[1]))" 552 | ] 553 | }, 554 | { 555 | "cell_type": "code", 556 | "execution_count": null, 557 | "metadata": { 558 | "collapsed": true 559 | }, 560 | "outputs": [], 561 | "source": [] 562 | } 563 | ], 564 | "metadata": { 565 | "kernelspec": { 566 | "display_name": "Python 2", 567 | "language": "python", 568 | "name": "python2" 569 | }, 570 | "language_info": { 571 | "codemirror_mode": { 572 | "name": "ipython", 573 | "version": 2 574 | }, 575 | "file_extension": ".py", 576 | "mimetype": "text/x-python", 577 | "name": "python", 578 | "nbconvert_exporter": "python", 579 | "pygments_lexer": "ipython2", 580 | "version": "2.7.6" 581 | } 582 | }, 583 | "nbformat": 4, 584 | "nbformat_minor": 0 585 | } 586 | -------------------------------------------------------------------------------- /Chapter 08/Chapter_8_code_Vagrantfile: -------------------------------------------------------------------------------- 1 | Vagrant.configure("2") do |config| 2 | config.vm.box = "sparkpy/sparkbox_test_1" 3 | config.vm.hostname = "sparkbox" 4 | config.ssh.insert_key = false 5 | 6 | # Hadoop ResourceManager 7 | config.vm.network :forwarded_port, guest: 8088, host: 8088, auto_correct: true 8 | 9 | # Hadoop NameNode 10 | config.vm.network :forwarded_port, guest: 50070, host: 50070, auto_correct: true 11 | 12 | # Hadoop DataNode 13 | config.vm.network :forwarded_port, guest: 50075, host: 50075, auto_correct: true 14 | 15 | # Ipython notebooks (yarn and standalone) 16 | config.vm.network :forwarded_port, guest: 8888, host: 8888, auto_correct: true 17 | 18 | 19 | config.vm.provider "virtualbox" do |v| 20 | v.customize ["modifyvm", :id, "--natdnshostresolver1", "on"] 21 | v.customize ["modifyvm", 
:id, "--natdnsproxy1", "on"] 22 | v.customize ["modifyvm", :id, "--nictype1", "virtio"] 23 | 24 | v.name = "sparkbox_test" 25 | v.memory = "4096" 26 | v.cpus = "2" 27 | end 28 | 29 | end 30 | -------------------------------------------------------------------------------- /Chapter 09/Chapter_9_code_01.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Sharing data within the cluster" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "##### Read-only variables (broadcast)" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 1, 20 | "metadata": { 21 | "collapsed": false 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "# Example: let's encode the gender found in the demographic data\n", 26 | "# As a hot encode. Note: the association should be the same\n", 27 | "# on every machine in the cluster, requiring a shared mapping\n", 28 | "\n", 29 | "one_hot_encoding = {\"M\": (1, 0, 0),\n", 30 | " \"F\": (0, 1, 0),\n", 31 | " \"U\": (0, 0, 1)\n", 32 | " }" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 2, 38 | "metadata": { 39 | "collapsed": false 40 | }, 41 | "outputs": [ 42 | { 43 | "data": { 44 | "text/plain": [ 45 | "[(1, 0, 0), (0, 1, 0), (0, 0, 1), (0, 1, 0), (1, 0, 0), (0, 0, 1)]" 46 | ] 47 | }, 48 | "execution_count": 2, 49 | "metadata": {}, 50 | "output_type": "execute_result" 51 | } 52 | ], 53 | "source": [ 54 | "# Gender one-hot-encoding\n", 55 | "(sc.parallelize([\"M\", \"F\", \"U\", \"F\", \"M\", \"U\"])\n", 56 | " .map(lambda x: one_hot_encoding[x])\n", 57 | " .collect())\n", 58 | "\n", 59 | "# The command above works only in the single node configuration\n", 60 | "# since the variable \"one_hot_encoding\" is defined only on this machine\n", 61 | "# On a multi-node cluster, it will raise a Java error" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": 3, 67 | "metadata": { 68 | "collapsed": false 69 | }, 70 | "outputs": [ 71 | { 72 | "data": { 73 | "text/plain": [ 74 | "[(1, 0, 0), (0, 1, 0), (0, 0, 1), (0, 1, 0), (1, 0, 0), (0, 0, 1)]" 75 | ] 76 | }, 77 | "execution_count": 3, 78 | "metadata": {}, 79 | "output_type": "execute_result" 80 | } 81 | ], 82 | "source": [ 83 | "# Solution 1: include the encoding map in the .map() function \n", 84 | "# In this way, all the nodes will see it\n", 85 | "\n", 86 | "def map_ohe(x):\n", 87 | " ohe = {\"M\": (1, 0, 0),\n", 88 | " \"F\": (0, 1, 0),\n", 89 | " \"U\": (0, 0, 1)\n", 90 | " }\n", 91 | " return ohe[x]\n", 92 | "\n", 93 | "sc.parallelize([\"M\", \"F\", \"U\", \"F\", \"M\", \"U\"]).map(map_ohe).collect()\n", 94 | "\n" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": 4, 100 | "metadata": { 101 | "collapsed": false 102 | }, 103 | "outputs": [ 104 | { 105 | "data": { 106 | "text/plain": [ 107 | "[(1, 0, 0), (0, 1, 0), (0, 0, 1), (0, 1, 0), (1, 0, 0), (0, 0, 1)]" 108 | ] 109 | }, 110 | "execution_count": 4, 111 | "metadata": {}, 112 | "output_type": "execute_result" 113 | } 114 | ], 115 | "source": [ 116 | "# Solution 2: broadcast the map to all the nodes.\n", 117 | "# All of them will be able to read-only it\n", 118 | "\n", 119 | "bcast_map = sc.broadcast(one_hot_encoding)\n", 120 | "\n", 121 | "def bcast_map_ohe(x, shared_ohe):\n", 122 | " return shared_ohe[x]\n", 123 | "\n", 124 | "(sc.parallelize([\"M\", \"F\", \"U\", \"F\", \"M\", \"U\"])\n", 125 | " .map(lambda x: 
bcast_map_ohe(x, bcast_map.value))\n", 126 | " .collect())" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": 5, 132 | "metadata": { 133 | "collapsed": true 134 | }, 135 | "outputs": [], 136 | "source": [ 137 | "bcast_map.unpersist()" 138 | ] 139 | }, 140 | { 141 | "cell_type": "markdown", 142 | "metadata": {}, 143 | "source": [ 144 | "##### Write-only variables (accumulators)" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": 6, 150 | "metadata": { 151 | "collapsed": false 152 | }, 153 | "outputs": [ 154 | { 155 | "name": "stdout", 156 | "output_type": "stream", 157 | "text": [ 158 | "The number of empty lines is:\n" 159 | ] 160 | }, 161 | { 162 | "data": { 163 | "text/plain": [ 164 | "6" 165 | ] 166 | }, 167 | "execution_count": 6, 168 | "metadata": {}, 169 | "output_type": "execute_result" 170 | } 171 | ], 172 | "source": [ 173 | "# Let's count the empty lines in a file\n", 174 | "\n", 175 | "print \"The number of empty lines is:\"\n", 176 | "\n", 177 | "(sc.textFile('file:///home/vagrant/datasets/hadoop_git_readme.txt')\n", 178 | " .filter(lambda line: len(line) == 0)\n", 179 | " .count())" 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": 7, 185 | "metadata": { 186 | "collapsed": false 187 | }, 188 | "outputs": [ 189 | { 190 | "name": "stdout", 191 | "output_type": "stream", 192 | "text": [ 193 | "In the file there are 31 lines\n", 194 | "And 6 lines are empty\n" 195 | ] 196 | } 197 | ], 198 | "source": [ 199 | "# Let's count the lines in a file, and at the same time,\n", 200 | "# count the empty ones\n", 201 | "\n", 202 | "accum = sc.accumulator(0)\n", 203 | "\n", 204 | "def split_line(line): \n", 205 | " if len(line) == 0:\n", 206 | " accum.add(1)\n", 207 | " return 1\n", 208 | "\n", 209 | "tot_lines = (\n", 210 | " sc.textFile('file:///home/vagrant/datasets/hadoop_git_readme.txt')\n", 211 | " .map(split_line)\n", 212 | " .count())\n", 213 | "\n", 214 | "empty_lines = accum.value\n", 215 | "\n", 216 | "\n", 217 | "print \"In the file there are %d lines\" % tot_lines\n", 218 | "print \"And %d lines are empty\" % empty_lines" 219 | ] 220 | }, 221 | { 222 | "cell_type": "markdown", 223 | "metadata": {}, 224 | "source": [ 225 | "# Real-world example with broadcast and accumulator variables\n", 226 | "### Train multiple classifiers and select the best one, accumulating the errors" 227 | ] 228 | }, 229 | { 230 | "cell_type": "code", 231 | "execution_count": 8, 232 | "metadata": { 233 | "collapsed": true 234 | }, 235 | "outputs": [], 236 | "source": [ 237 | "# step 1: load the dataset\n", 238 | "# note: if the dataset is large, you should read the next section\n", 239 | "\n", 240 | "from sklearn.datasets import load_iris\n", 241 | "\n", 242 | "bcast_dataset = sc.broadcast(load_iris())" 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": 9, 248 | "metadata": { 249 | "collapsed": false 250 | }, 251 | "outputs": [], 252 | "source": [ 253 | "# step 2: create an accumulator that stores the errors in a list\n", 254 | "\n", 255 | "from pyspark import AccumulatorParam\n", 256 | "\n", 257 | "class ErrorAccumulator(AccumulatorParam):\n", 258 | " def zero(self, initialList):\n", 259 | " return initialList\n", 260 | "\n", 261 | " def addInPlace(self, v1, v2):\n", 262 | " if not isinstance(v1, list):\n", 263 | " v1 = [v1]\n", 264 | " if not isinstance(v2, list):\n", 265 | " v2 = [v2]\n", 266 | " return v1 + v2\n", 267 | "\n", 268 | "errAccum = sc.accumulator([], ErrorAccumulator())" 269 | ] 270 | }, 271 | { 272 
| "cell_type": "code", 273 | "execution_count": 10, 274 | "metadata": { 275 | "collapsed": true 276 | }, 277 | "outputs": [], 278 | "source": [ 279 | "# step 3: create mappers: each of them will use a classifier\n", 280 | "\n", 281 | "def apply_classifier(clf, dataset):\n", 282 | " \n", 283 | " clf_name = clf.__class__.__name__\n", 284 | " X = dataset.value.data\n", 285 | " y = dataset.value.target\n", 286 | " \n", 287 | " try:\n", 288 | " from sklearn.metrics import accuracy_score\n", 289 | " \n", 290 | " clf.fit(X, y)\n", 291 | " y_pred = clf.predict(X)\n", 292 | " acc = accuracy_score(y, y_pred)\n", 293 | "\n", 294 | " return [(clf_name, acc)]\n", 295 | "\n", 296 | " except Exception as e:\n", 297 | " errAccum.add((clf_name, str(e)))\n", 298 | " return []\n" 299 | ] 300 | }, 301 | { 302 | "cell_type": "code", 303 | "execution_count": 11, 304 | "metadata": { 305 | "collapsed": false 306 | }, 307 | "outputs": [ 308 | { 309 | "data": { 310 | "text/plain": [ 311 | "[('DummyClassifier', 0.33333333333333331),\n", 312 | " ('SGDClassifier', 0.66666666666666663)]" 313 | ] 314 | }, 315 | "execution_count": 11, 316 | "metadata": {}, 317 | "output_type": "execute_result" 318 | } 319 | ], 320 | "source": [ 321 | "from sklearn.linear_model import SGDClassifier\n", 322 | "from sklearn.dummy import DummyClassifier\n", 323 | "from sklearn.decomposition import PCA\n", 324 | "from sklearn.manifold import MDS\n", 325 | "\n", 326 | "classifiers = [DummyClassifier('most_frequent'), \n", 327 | " SGDClassifier(), \n", 328 | " PCA(), \n", 329 | " MDS()]\n", 330 | "\n", 331 | "(sc.parallelize(classifiers)\n", 332 | " .flatMap(lambda x: apply_classifier(x, bcast_dataset))\n", 333 | " .collect())" 334 | ] 335 | }, 336 | { 337 | "cell_type": "code", 338 | "execution_count": 12, 339 | "metadata": { 340 | "collapsed": false 341 | }, 342 | "outputs": [ 343 | { 344 | "name": "stdout", 345 | "output_type": "stream", 346 | "text": [ 347 | "The errors are:\n" 348 | ] 349 | }, 350 | { 351 | "data": { 352 | "text/plain": [ 353 | "[('PCA', \"'PCA' object has no attribute 'predict'\"),\n", 354 | " ('MDS',\n", 355 | " \"Proximity must be 'precomputed' or 'euclidean'. 
Got euclidean instead\")]" 356 | ] 357 | }, 358 | "execution_count": 12, 359 | "metadata": {}, 360 | "output_type": "execute_result" 361 | } 362 | ], 363 | "source": [ 364 | "print \"The errors are:\"\n", 365 | "errAccum.value" 366 | ] 367 | }, 368 | { 369 | "cell_type": "code", 370 | "execution_count": 13, 371 | "metadata": { 372 | "collapsed": false 373 | }, 374 | "outputs": [], 375 | "source": [ 376 | "bcast_dataset.unpersist()" 377 | ] 378 | }, 379 | { 380 | "cell_type": "markdown", 381 | "metadata": {}, 382 | "source": [ 383 | "# Load the data" 384 | ] 385 | }, 386 | { 387 | "cell_type": "code", 388 | "execution_count": 14, 389 | "metadata": { 390 | "collapsed": true 391 | }, 392 | "outputs": [], 393 | "source": [ 394 | "from pyspark.sql import SQLContext\n", 395 | "sqlContext = SQLContext(sc)" 396 | ] 397 | }, 398 | { 399 | "cell_type": "code", 400 | "execution_count": 15, 401 | "metadata": { 402 | "collapsed": false 403 | }, 404 | "outputs": [ 405 | { 406 | "name": "stdout", 407 | "output_type": "stream", 408 | "text": [ 409 | "{\"user_id\":0, \"balance\": 10.0}\r\n", 410 | "{\"user_id\":1, \"gender\":\"M\", \"balance\": 1.0}\r\n", 411 | "{\"user_id\":2, \"gender\":\"F\", \"balance\": -0.5}\r\n", 412 | "{\"user_id\":3, \"gender\":\"F\", \"balance\": 0.0}\r\n", 413 | "{\"user_id\":4, \"balance\": 5.0}\r\n", 414 | "{\"user_id\":5, \"gender\":\"M\", \"balance\": 3.0}" 415 | ] 416 | } 417 | ], 418 | "source": [ 419 | "!cat /home/vagrant/datasets/users.json" 420 | ] 421 | }, 422 | { 423 | "cell_type": "code", 424 | "execution_count": 16, 425 | "metadata": { 426 | "collapsed": false 427 | }, 428 | "outputs": [ 429 | { 430 | "name": "stdout", 431 | "output_type": "stream", 432 | "text": [ 433 | "+-------+------+-------+\n", 434 | "|balance|gender|user_id|\n", 435 | "+-------+------+-------+\n", 436 | "| 10.0| null| 0|\n", 437 | "| 1.0| M| 1|\n", 438 | "| -0.5| F| 2|\n", 439 | "| 0.0| F| 3|\n", 440 | "| 5.0| null| 4|\n", 441 | "| 3.0| M| 5|\n", 442 | "+-------+------+-------+\n", 443 | "\n" 444 | ] 445 | } 446 | ], 447 | "source": [ 448 | "df = sqlContext.read.json(\"file:///home/vagrant/datasets/users.json\")\n", 449 | "df.show()" 450 | ] 451 | }, 452 | { 453 | "cell_type": "code", 454 | "execution_count": 17, 455 | "metadata": { 456 | "collapsed": false 457 | }, 458 | "outputs": [ 459 | { 460 | "name": "stdout", 461 | "output_type": "stream", 462 | "text": [ 463 | "root\n", 464 | " |-- balance: double (nullable = true)\n", 465 | " |-- gender: string (nullable = true)\n", 466 | " |-- user_id: long (nullable = true)\n", 467 | "\n" 468 | ] 469 | } 470 | ], 471 | "source": [ 472 | "df.printSchema()" 473 | ] 474 | }, 475 | { 476 | "cell_type": "code", 477 | "execution_count": 18, 478 | "metadata": { 479 | "collapsed": false 480 | }, 481 | "outputs": [ 482 | { 483 | "name": "stdout", 484 | "output_type": "stream", 485 | "text": [ 486 | "+-------+------+-------+\n", 487 | "|balance|gender|user_id|\n", 488 | "+-------+------+-------+\n", 489 | "| 1.0| M| 1|\n", 490 | "| 3.0| M| 5|\n", 491 | "+-------+------+-------+\n", 492 | "\n" 493 | ] 494 | } 495 | ], 496 | "source": [ 497 | "(df.filter(df['gender'] != 'null')\n", 498 | " .filter(df['balance'] > 0)\n", 499 | " .select(['balance', 'gender', 'user_id'])\n", 500 | " .show())" 501 | ] 502 | }, 503 | { 504 | "cell_type": "code", 505 | "execution_count": 19, 506 | "metadata": { 507 | "collapsed": false 508 | }, 509 | "outputs": [ 510 | { 511 | "name": "stdout", 512 | "output_type": "stream", 513 | "text": [ 514 | "+-------+------+-------+\n", 515 | 
"|balance|gender|user_id|\n", 516 | "+-------+------+-------+\n", 517 | "| 1.0| M| 1|\n", 518 | "| 3.0| M| 5|\n", 519 | "+-------+------+-------+\n", 520 | "\n" 521 | ] 522 | } 523 | ], 524 | "source": [ 525 | "(df.filter('gender is not null')\n", 526 | " .filter('balance > 0').select(\"*\").show())" 527 | ] 528 | }, 529 | { 530 | "cell_type": "code", 531 | "execution_count": 20, 532 | "metadata": { 533 | "collapsed": false 534 | }, 535 | "outputs": [ 536 | { 537 | "name": "stdout", 538 | "output_type": "stream", 539 | "text": [ 540 | "+-------+------+-------+\n", 541 | "|balance|gender|user_id|\n", 542 | "+-------+------+-------+\n", 543 | "| 1.0| M| 1|\n", 544 | "| 3.0| M| 5|\n", 545 | "+-------+------+-------+\n", 546 | "\n" 547 | ] 548 | } 549 | ], 550 | "source": [ 551 | "df.filter('gender is not null and balance > 0').show()" 552 | ] 553 | }, 554 | { 555 | "cell_type": "code", 556 | "execution_count": 21, 557 | "metadata": { 558 | "collapsed": false 559 | }, 560 | "outputs": [ 561 | { 562 | "name": "stdout", 563 | "output_type": "stream", 564 | "text": [ 565 | "+-------+------+-------+\n", 566 | "|balance|gender|user_id|\n", 567 | "+-------+------+-------+\n", 568 | "| 1.0| M| 1|\n", 569 | "| -0.5| F| 2|\n", 570 | "| 0.0| F| 3|\n", 571 | "| 3.0| M| 5|\n", 572 | "+-------+------+-------+\n", 573 | "\n" 574 | ] 575 | } 576 | ], 577 | "source": [ 578 | "df.na.drop().show()" 579 | ] 580 | }, 581 | { 582 | "cell_type": "code", 583 | "execution_count": 22, 584 | "metadata": { 585 | "collapsed": false 586 | }, 587 | "outputs": [ 588 | { 589 | "name": "stdout", 590 | "output_type": "stream", 591 | "text": [ 592 | "+-------+------+-------+\n", 593 | "|balance|gender|user_id|\n", 594 | "+-------+------+-------+\n", 595 | "| 1.0| M| 1|\n", 596 | "| -0.5| F| 2|\n", 597 | "| 0.0| F| 3|\n", 598 | "| 3.0| M| 5|\n", 599 | "+-------+------+-------+\n", 600 | "\n" 601 | ] 602 | } 603 | ], 604 | "source": [ 605 | "df.na.drop(subset=[\"gender\"]).show()" 606 | ] 607 | }, 608 | { 609 | "cell_type": "code", 610 | "execution_count": 23, 611 | "metadata": { 612 | "collapsed": false 613 | }, 614 | "outputs": [ 615 | { 616 | "name": "stdout", 617 | "output_type": "stream", 618 | "text": [ 619 | "+-------+------+-------+\n", 620 | "|balance|gender|user_id|\n", 621 | "+-------+------+-------+\n", 622 | "| 10.0| U| 0|\n", 623 | "| 1.0| M| 1|\n", 624 | "| -0.5| F| 2|\n", 625 | "| 0.0| F| 3|\n", 626 | "| 5.0| U| 4|\n", 627 | "| 3.0| M| 5|\n", 628 | "+-------+------+-------+\n", 629 | "\n" 630 | ] 631 | } 632 | ], 633 | "source": [ 634 | "df.na.fill({'gender': \"U\", 'balance': 0.0}).show()" 635 | ] 636 | }, 637 | { 638 | "cell_type": "code", 639 | "execution_count": 24, 640 | "metadata": { 641 | "collapsed": false 642 | }, 643 | "outputs": [ 644 | { 645 | "name": "stdout", 646 | "output_type": "stream", 647 | "text": [ 648 | "+------+------------+\n", 649 | "|gender|avg(balance)|\n", 650 | "+------+------------+\n", 651 | "| F| -0.25|\n", 652 | "| M| 2.0|\n", 653 | "| U| 7.5|\n", 654 | "+------+------------+\n", 655 | "\n" 656 | ] 657 | } 658 | ], 659 | "source": [ 660 | "(df.na.fill({'gender': \"U\", 'balance': 0.0})\n", 661 | " .groupBy(\"gender\").avg('balance').show())" 662 | ] 663 | }, 664 | { 665 | "cell_type": "code", 666 | "execution_count": 25, 667 | "metadata": { 668 | "collapsed": true 669 | }, 670 | "outputs": [], 671 | "source": [ 672 | "df.registerTempTable(\"users\")" 673 | ] 674 | }, 675 | { 676 | "cell_type": "code", 677 | "execution_count": 26, 678 | "metadata": { 679 | "collapsed": false 680 | 
}, 681 | "outputs": [ 682 | { 683 | "name": "stdout", 684 | "output_type": "stream", 685 | "text": [ 686 | "+------+-----+\n", 687 | "|gender| _c1|\n", 688 | "+------+-----+\n", 689 | "| F|-0.25|\n", 690 | "| M| 2.0|\n", 691 | "+------+-----+\n", 692 | "\n" 693 | ] 694 | } 695 | ], 696 | "source": [ 697 | "sqlContext.sql(\"\"\"\n", 698 | " SELECT gender, AVG(balance) \n", 699 | " FROM users \n", 700 | " WHERE gender IS NOT NULL \n", 701 | " GROUP BY gender\"\"\").show()" 702 | ] 703 | }, 704 | { 705 | "cell_type": "code", 706 | "execution_count": 27, 707 | "metadata": { 708 | "collapsed": false 709 | }, 710 | "outputs": [ 711 | { 712 | "data": { 713 | "text/plain": [ 714 | "pyspark.sql.dataframe.DataFrame" 715 | ] 716 | }, 717 | "execution_count": 27, 718 | "metadata": {}, 719 | "output_type": "execute_result" 720 | } 721 | ], 722 | "source": [ 723 | "type(sqlContext.table(\"users\"))" 724 | ] 725 | }, 726 | { 727 | "cell_type": "code", 728 | "execution_count": 28, 729 | "metadata": { 730 | "collapsed": false 731 | }, 732 | "outputs": [ 733 | { 734 | "data": { 735 | "text/plain": [ 736 | "[Row(balance=10.0, gender=None, user_id=0),\n", 737 | " Row(balance=1.0, gender=u'M', user_id=1),\n", 738 | " Row(balance=-0.5, gender=u'F', user_id=2),\n", 739 | " Row(balance=0.0, gender=u'F', user_id=3),\n", 740 | " Row(balance=5.0, gender=None, user_id=4),\n", 741 | " Row(balance=3.0, gender=u'M', user_id=5)]" 742 | ] 743 | }, 744 | "execution_count": 28, 745 | "metadata": {}, 746 | "output_type": "execute_result" 747 | } 748 | ], 749 | "source": [ 750 | "sqlContext.table(\"users\").collect()" 751 | ] 752 | }, 753 | { 754 | "cell_type": "code", 755 | "execution_count": 29, 756 | "metadata": { 757 | "collapsed": false 758 | }, 759 | "outputs": [ 760 | { 761 | "data": { 762 | "text/plain": [ 763 | "Row(balance=10.0, gender=None, user_id=0)" 764 | ] 765 | }, 766 | "execution_count": 29, 767 | "metadata": {}, 768 | "output_type": "execute_result" 769 | } 770 | ], 771 | "source": [ 772 | "a_row = sqlContext.sql(\"SELECT * FROM users\").first()\n", 773 | "a_row" 774 | ] 775 | }, 776 | { 777 | "cell_type": "code", 778 | "execution_count": 30, 779 | "metadata": { 780 | "collapsed": false 781 | }, 782 | "outputs": [ 783 | { 784 | "name": "stdout", 785 | "output_type": "stream", 786 | "text": [ 787 | "10.0\n", 788 | "10.0\n" 789 | ] 790 | } 791 | ], 792 | "source": [ 793 | "print a_row['balance']\n", 794 | "print a_row.balance" 795 | ] 796 | }, 797 | { 798 | "cell_type": "code", 799 | "execution_count": 31, 800 | "metadata": { 801 | "collapsed": false 802 | }, 803 | "outputs": [ 804 | { 805 | "data": { 806 | "text/plain": [ 807 | "{'balance': 10.0, 'gender': None, 'user_id': 0}" 808 | ] 809 | }, 810 | "execution_count": 31, 811 | "metadata": {}, 812 | "output_type": "execute_result" 813 | } 814 | ], 815 | "source": [ 816 | "a_row.asDict()" 817 | ] 818 | }, 819 | { 820 | "cell_type": "code", 821 | "execution_count": 32, 822 | "metadata": { 823 | "collapsed": true 824 | }, 825 | "outputs": [], 826 | "source": [ 827 | "!rm -rf /tmp/complete_users*" 828 | ] 829 | }, 830 | { 831 | "cell_type": "code", 832 | "execution_count": 33, 833 | "metadata": { 834 | "collapsed": false 835 | }, 836 | "outputs": [], 837 | "source": [ 838 | "(df.na.drop().write\n", 839 | " .save(\"file:///tmp/complete_users.json\", format='json'))" 840 | ] 841 | }, 842 | { 843 | "cell_type": "code", 844 | "execution_count": 34, 845 | "metadata": { 846 | "collapsed": false 847 | }, 848 | "outputs": [ 849 | { 850 | "name": "stdout", 851 | 
"output_type": "stream", 852 | "text": [ 853 | "total 28\r\n", 854 | "4 drwxrwxr-x 2 vagrant vagrant 4096 May 10 20:36 .\r\n", 855 | "4 drwxrwxrwt 22 root root 4096 May 10 20:36 ..\r\n", 856 | "4 -rw-r--r-- 1 vagrant vagrant 83 May 10 20:36 part-r-00000-f5728f74-10d9-4c7a-8865-64cb80c7ca0a\r\n", 857 | "4 -rw-rw-r-- 1 vagrant vagrant 12 May 10 20:36 .part-r-00000-f5728f74-10d9-4c7a-8865-64cb80c7ca0a.crc\r\n", 858 | "4 -rw-r--r-- 1 vagrant vagrant 82 May 10 20:36 part-r-00001-f5728f74-10d9-4c7a-8865-64cb80c7ca0a\r\n", 859 | "4 -rw-rw-r-- 1 vagrant vagrant 12 May 10 20:36 .part-r-00001-f5728f74-10d9-4c7a-8865-64cb80c7ca0a.crc\r\n", 860 | "0 -rw-r--r-- 1 vagrant vagrant 0 May 10 20:36 _SUCCESS\r\n", 861 | "4 -rw-rw-r-- 1 vagrant vagrant 8 May 10 20:36 ._SUCCESS.crc\r\n" 862 | ] 863 | } 864 | ], 865 | "source": [ 866 | "!ls -als /tmp/complete_users.json" 867 | ] 868 | }, 869 | { 870 | "cell_type": "code", 871 | "execution_count": 35, 872 | "metadata": { 873 | "collapsed": false 874 | }, 875 | "outputs": [ 876 | { 877 | "name": "stdout", 878 | "output_type": "stream", 879 | "text": [ 880 | "+-------+------+-------+\n", 881 | "|balance|gender|user_id|\n", 882 | "+-------+------+-------+\n", 883 | "| 0.0| F| 3|\n", 884 | "| 3.0| M| 5|\n", 885 | "| 1.0| M| 1|\n", 886 | "| -0.5| F| 2|\n", 887 | "+-------+------+-------+\n", 888 | "\n" 889 | ] 890 | } 891 | ], 892 | "source": [ 893 | "sqlContext.sql(\n", 894 | " \"SELECT * FROM json.`file:///tmp/complete_users.json`\").show()" 895 | ] 896 | }, 897 | { 898 | "cell_type": "code", 899 | "execution_count": 36, 900 | "metadata": { 901 | "collapsed": true 902 | }, 903 | "outputs": [], 904 | "source": [ 905 | "df.na.drop().write.save(\n", 906 | " \"file:///tmp/complete_users.parquet\", format='parquet')" 907 | ] 908 | }, 909 | { 910 | "cell_type": "code", 911 | "execution_count": 37, 912 | "metadata": { 913 | "collapsed": false 914 | }, 915 | "outputs": [ 916 | { 917 | "name": "stdout", 918 | "output_type": "stream", 919 | "text": [ 920 | "total 44\r\n", 921 | "4 drwxrwxr-x 2 vagrant vagrant 4096 May 10 20:36 .\r\n", 922 | "4 drwxrwxrwt 23 root root 4096 May 10 20:36 ..\r\n", 923 | "4 -rw-r--r-- 1 vagrant vagrant 376 May 10 20:36 _common_metadata\r\n", 924 | "4 -rw-rw-r-- 1 vagrant vagrant 12 May 10 20:36 ._common_metadata.crc\r\n", 925 | "4 -rw-r--r-- 1 vagrant vagrant 1082 May 10 20:36 _metadata\r\n", 926 | "4 -rw-rw-r-- 1 vagrant vagrant 20 May 10 20:36 ._metadata.crc\r\n", 927 | "4 -rw-r--r-- 1 vagrant vagrant 750 May 10 20:36 part-r-00000-810195c2-ffa9-4a54-add7-61e6a7c92095.gz.parquet\r\n", 928 | "4 -rw-rw-r-- 1 vagrant vagrant 16 May 10 20:36 .part-r-00000-810195c2-ffa9-4a54-add7-61e6a7c92095.gz.parquet.crc\r\n", 929 | "4 -rw-r--r-- 1 vagrant vagrant 746 May 10 20:36 part-r-00001-810195c2-ffa9-4a54-add7-61e6a7c92095.gz.parquet\r\n", 930 | "4 -rw-rw-r-- 1 vagrant vagrant 16 May 10 20:36 .part-r-00001-810195c2-ffa9-4a54-add7-61e6a7c92095.gz.parquet.crc\r\n", 931 | "0 -rw-r--r-- 1 vagrant vagrant 0 May 10 20:36 _SUCCESS\r\n", 932 | "4 -rw-rw-r-- 1 vagrant vagrant 8 May 10 20:36 ._SUCCESS.crc\r\n" 933 | ] 934 | } 935 | ], 936 | "source": [ 937 | "!ls -als /tmp/complete_users.parquet/" 938 | ] 939 | }, 940 | { 941 | "cell_type": "code", 942 | "execution_count": 38, 943 | "metadata": { 944 | "collapsed": false 945 | }, 946 | "outputs": [], 947 | "source": [ 948 | "from pyspark.sql import Row\n", 949 | "\n", 950 | "rdd_gender = \\\n", 951 | " sc.parallelize([Row(short_gender=\"M\", long_gender=\"Male\"),\n", 952 | " Row(short_gender=\"F\", 
long_gender=\"Female\")])\n", 953 | "\n", 954 | "(sqlContext.createDataFrame(rdd_gender)\n", 955 | " .registerTempTable(\"gender_maps\"))" 956 | ] 957 | }, 958 | { 959 | "cell_type": "code", 960 | "execution_count": 39, 961 | "metadata": { 962 | "collapsed": false 963 | }, 964 | "outputs": [ 965 | { 966 | "name": "stdout", 967 | "output_type": "stream", 968 | "text": [ 969 | "+-----------+------------+\n", 970 | "|long_gender|short_gender|\n", 971 | "+-----------+------------+\n", 972 | "| Male| M|\n", 973 | "| Female| F|\n", 974 | "+-----------+------------+\n", 975 | "\n" 976 | ] 977 | } 978 | ], 979 | "source": [ 980 | "sqlContext.table(\"gender_maps\").show()" 981 | ] 982 | }, 983 | { 984 | "cell_type": "code", 985 | "execution_count": 40, 986 | "metadata": { 987 | "collapsed": false 988 | }, 989 | "outputs": [ 990 | { 991 | "name": "stdout", 992 | "output_type": "stream", 993 | "text": [ 994 | "+-------+-----------+-------+\n", 995 | "|balance|long_gender|user_id|\n", 996 | "+-------+-----------+-------+\n", 997 | "| 1.0| Male| 1|\n", 998 | "| 3.0| Male| 5|\n", 999 | "| -0.5| Female| 2|\n", 1000 | "| 0.0| Female| 3|\n", 1001 | "+-------+-----------+-------+\n", 1002 | "\n" 1003 | ] 1004 | } 1005 | ], 1006 | "source": [ 1007 | "sqlContext.sql(\"\"\"\n", 1008 | " SELECT balance, long_gender, user_id \n", 1009 | " FROM parquet.`file:///tmp/complete_users.parquet` \n", 1010 | " JOIN gender_maps ON gender=short_gender\"\"\").show()" 1011 | ] 1012 | }, 1013 | { 1014 | "cell_type": "code", 1015 | "execution_count": 41, 1016 | "metadata": { 1017 | "collapsed": false 1018 | }, 1019 | "outputs": [ 1020 | { 1021 | "data": { 1022 | "text/plain": [ 1023 | "[u'gender_maps', u'users']" 1024 | ] 1025 | }, 1026 | "execution_count": 41, 1027 | "metadata": {}, 1028 | "output_type": "execute_result" 1029 | } 1030 | ], 1031 | "source": [ 1032 | "sqlContext.tableNames()" 1033 | ] 1034 | }, 1035 | { 1036 | "cell_type": "code", 1037 | "execution_count": 42, 1038 | "metadata": { 1039 | "collapsed": true 1040 | }, 1041 | "outputs": [], 1042 | "source": [ 1043 | "for table in sqlContext.tableNames():\n", 1044 | " sqlContext.dropTempTable(table)" 1045 | ] 1046 | }, 1047 | { 1048 | "cell_type": "code", 1049 | "execution_count": null, 1050 | "metadata": { 1051 | "collapsed": true 1052 | }, 1053 | "outputs": [], 1054 | "source": [] 1055 | } 1056 | ], 1057 | "metadata": { 1058 | "kernelspec": { 1059 | "display_name": "Python 2", 1060 | "language": "python", 1061 | "name": "python2" 1062 | }, 1063 | "language_info": { 1064 | "codemirror_mode": { 1065 | "name": "ipython", 1066 | "version": 2 1067 | }, 1068 | "file_extension": ".py", 1069 | "mimetype": "text/x-python", 1070 | "name": "python", 1071 | "nbconvert_exporter": "python", 1072 | "pygments_lexer": "ipython2", 1073 | "version": "2.7.6" 1074 | } 1075 | }, 1076 | "nbformat": 4, 1077 | "nbformat_minor": 0 1078 | } 1079 | -------------------------------------------------------------------------------- /Chapter 09/Chapter_9_code_02.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "!rm -rf kdd*\n", 12 | "\n", 13 | "# !wget -q -O ../datasets/kddtrain.gz \\\n", 14 | "# http://kdd.ics.uci.edu/databases/kddcup99/kddcup.data.gz\n", 15 | "\n", 16 | "!wget -q -O ../datasets/kddtrain.gz \\\n", 17 | "http://kdd.ics.uci.edu/databases/kddcup99/kddcup.data_10_percent.gz\n", 18 
| "\n", 19 | "!wget -q -O ../datasets/kddtest.gz \\\n", 20 | "http://kdd.ics.uci.edu/databases/kddcup99/corrected.gz\n", 21 | " \n", 22 | "!wget -q -O ../datasets/kddnames \\\n", 23 | "http://kdd.ics.uci.edu/databases/kddcup99/kddcup.names\n", 24 | "\n", 25 | "!gunzip ../datasets/kdd*gz" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 2, 31 | "metadata": { 32 | "collapsed": false 33 | }, 34 | "outputs": [ 35 | { 36 | "name": "stdout", 37 | "output_type": "stream", 38 | "text": [ 39 | "0,tcp,http,SF,181,5450,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,8,8,0.00,0.00,0.00,0.00,1.00,0.00,0.00,9,9,1.00,0.00,0.11,0.00,0.00,0.00,0.00,0.00,normal.\r\n", 40 | "0,tcp,http,SF,239,486,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,8,8,0.00,0.00,0.00,0.00,1.00,0.00,0.00,19,19,1.00,0.00,0.05,0.00,0.00,0.00,0.00,0.00,normal.\r\n", 41 | "0,tcp,http,SF,235,1337,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,8,8,0.00,0.00,0.00,0.00,1.00,0.00,0.00,29,29,1.00,0.00,0.03,0.00,0.00,0.00,0.00,0.00,normal.\r\n" 42 | ] 43 | } 44 | ], 45 | "source": [ 46 | "!head -3 ../datasets/kddtrain" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 3, 52 | "metadata": { 53 | "collapsed": false 54 | }, 55 | "outputs": [ 56 | { 57 | "name": "stdout", 58 | "output_type": "stream", 59 | "text": [ 60 | "Num features: 41\n", 61 | "First 10: ['duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot']\n" 62 | ] 63 | } 64 | ], 65 | "source": [ 66 | "with open('../datasets/kddnames', 'r') as fh:\n", 67 | " header = [line.split(':')[0] \n", 68 | " for line in fh.read().splitlines()][1:]\n", 69 | "\n", 70 | "header.append('target')\n", 71 | "\n", 72 | "print \"Num features:\", len(header)-1\n", 73 | "print \"First 10:\", header[:10]" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": 4, 79 | "metadata": { 80 | "collapsed": false 81 | }, 82 | "outputs": [], 83 | "source": [ 84 | "train_rdd = sc.textFile('file:///home/vagrant/datasets/kddtrain')\n", 85 | "test_rdd = sc.textFile('file:///home/vagrant/datasets/kddtest')" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": 5, 91 | "metadata": { 92 | "collapsed": false, 93 | "scrolled": false 94 | }, 95 | "outputs": [], 96 | "source": [ 97 | "def line_parser(line):\n", 98 | "\n", 99 | " def piece_parser(piece):\n", 100 | " if \".\" in piece or piece.isdigit():\n", 101 | " return float(piece)\n", 102 | " else:\n", 103 | " return piece\n", 104 | "\n", 105 | " return [piece_parser(piece) for piece in line[:-1].split(',')]\n", 106 | " \n", 107 | "train_df = sqlContext.createDataFrame(\n", 108 | " train_rdd.map(line_parser), header)\n", 109 | "\n", 110 | "test_df = sqlContext.createDataFrame(\n", 111 | " test_rdd.map(line_parser), header)" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": 6, 117 | "metadata": { 118 | "collapsed": false 119 | }, 120 | "outputs": [ 121 | { 122 | "name": "stdout", 123 | "output_type": "stream", 124 | "text": [ 125 | "Train observations: 494021\n", 126 | "Test observations: 311029\n" 127 | ] 128 | } 129 | ], 130 | "source": [ 131 | "print \"Train observations:\", train_df.count()\n", 132 | "print \"Test observations:\", test_df.count()" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": 7, 138 | "metadata": { 139 | "collapsed": false 140 | }, 141 | "outputs": [ 142 | { 143 | "name": "stdout", 144 | "output_type": "stream", 145 | "text": [ 146 | "root\n", 147 | " |-- duration: double (nullable = true)\n", 148 | " |-- 
protocol_type: string (nullable = true)\n", 149 | " |-- service: string (nullable = true)\n", 150 | " |-- flag: string (nullable = true)\n", 151 | " |-- src_bytes: double (nullable = true)\n", 152 | " |-- dst_bytes: double (nullable = true)\n", 153 | " |-- land: double (nullable = true)\n", 154 | " |-- wrong_fragment: double (nullable = true)\n", 155 | " |-- urgent: double (nullable = true)\n", 156 | " |-- hot: double (nullable = true)\n", 157 | " |-- num_failed_logins: double (nullable = true)\n", 158 | " |-- logged_in: double (nullable = true)\n", 159 | " |-- num_compromised: double (nullable = true)\n", 160 | " |-- root_shell: double (nullable = true)\n", 161 | " |-- su_attempted: double (nullable = true)\n", 162 | " |-- num_root: double (nullable = true)\n", 163 | " |-- num_file_creations: double (nullable = true)\n", 164 | " |-- num_shells: double (nullable = true)\n", 165 | " |-- num_access_files: double (nullable = true)\n", 166 | " |-- num_outbound_cmds: double (nullable = true)\n", 167 | " |-- is_host_login: double (nullable = true)\n", 168 | " |-- is_guest_login: double (nullable = true)\n", 169 | " |-- count: double (nullable = true)\n", 170 | " |-- srv_count: double (nullable = true)\n", 171 | " |-- serror_rate: double (nullable = true)\n", 172 | " |-- srv_serror_rate: double (nullable = true)\n", 173 | " |-- rerror_rate: double (nullable = true)\n", 174 | " |-- srv_rerror_rate: double (nullable = true)\n", 175 | " |-- same_srv_rate: double (nullable = true)\n", 176 | " |-- diff_srv_rate: double (nullable = true)\n", 177 | " |-- srv_diff_host_rate: double (nullable = true)\n", 178 | " |-- dst_host_count: double (nullable = true)\n", 179 | " |-- dst_host_srv_count: double (nullable = true)\n", 180 | " |-- dst_host_same_srv_rate: double (nullable = true)\n", 181 | " |-- dst_host_diff_srv_rate: double (nullable = true)\n", 182 | " |-- dst_host_same_src_port_rate: double (nullable = true)\n", 183 | " |-- dst_host_srv_diff_host_rate: double (nullable = true)\n", 184 | " |-- dst_host_serror_rate: double (nullable = true)\n", 185 | " |-- dst_host_srv_serror_rate: double (nullable = true)\n", 186 | " |-- dst_host_rerror_rate: double (nullable = true)\n", 187 | " |-- dst_host_srv_rerror_rate: double (nullable = true)\n", 188 | " |-- target: string (nullable = true)\n", 189 | "\n" 190 | ] 191 | } 192 | ], 193 | "source": [ 194 | "train_df.printSchema()" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": 8, 200 | "metadata": { 201 | "collapsed": false 202 | }, 203 | "outputs": [], 204 | "source": [ 205 | "from pyspark.ml import Pipeline\n", 206 | "from pyspark.ml.feature import StringIndexer\n", 207 | "\n", 208 | "\n", 209 | "cols_categorical = [\"protocol_type\", \"service\", \"flag\",\"target\"]\n", 210 | "preproc_stages = []\n", 211 | "\n", 212 | "for col in cols_categorical:\n", 213 | " out_col = col + \"_cat\"\n", 214 | " preproc_stages.append(\n", 215 | " StringIndexer(\n", 216 | " inputCol=col, outputCol=out_col, handleInvalid=\"skip\"))\n", 217 | "\n", 218 | "pipeline = Pipeline(stages=preproc_stages)\n", 219 | "indexer = pipeline.fit(train_df)\n", 220 | "\n", 221 | "train_num_df = indexer.transform(train_df)\n", 222 | "test_num_df = indexer.transform(test_df)" 223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "execution_count": 9, 228 | "metadata": { 229 | "collapsed": false 230 | }, 231 | "outputs": [ 232 | { 233 | "name": "stdout", 234 | "output_type": "stream", 235 | "text": [ 236 | "[StringIndexer_46ae881ca7febd4a4e81, 
StringIndexer_49f6bbd151ce1e9bb5a7, StringIndexer_4cfcb173a161bbe6cd60, StringIndexer_4aa581cc25ad8d6eed7e]\n", 237 | "\n", 238 | "Pipeline_450a8f0d2083e96d03ca\n", 239 | "PipelineModel_475d9917035781236edb\n" 240 | ] 241 | } 242 | ], 243 | "source": [ 244 | "print pipeline.getStages()\n", 245 | "print\n", 246 | "print pipeline\n", 247 | "print indexer" 248 | ] 249 | }, 250 | { 251 | "cell_type": "code", 252 | "execution_count": 10, 253 | "metadata": { 254 | "collapsed": false 255 | }, 256 | "outputs": [ 257 | { 258 | "name": "stdout", 259 | "output_type": "stream", 260 | "text": [ 261 | "First observation, after the 4 StringIndexers:\n", 262 | "\n", 263 | "Row(duration=0.0, protocol_type=u'tcp', service=u'http', flag=u'SF', src_bytes=181.0, dst_bytes=5450.0, land=0.0, wrong_fragment=0.0, urgent=0.0, hot=0.0, num_failed_logins=0.0, logged_in=1.0, num_compromised=0.0, root_shell=0.0, su_attempted=0.0, num_root=0.0, num_file_creations=0.0, num_shells=0.0, num_access_files=0.0, num_outbound_cmds=0.0, is_host_login=0.0, is_guest_login=0.0, count=8.0, srv_count=8.0, serror_rate=0.0, srv_serror_rate=0.0, rerror_rate=0.0, srv_rerror_rate=0.0, same_srv_rate=1.0, diff_srv_rate=0.0, srv_diff_host_rate=0.0, dst_host_count=9.0, dst_host_srv_count=9.0, dst_host_same_srv_rate=1.0, dst_host_diff_srv_rate=0.0, dst_host_same_src_port_rate=0.11, dst_host_srv_diff_host_rate=0.0, dst_host_serror_rate=0.0, dst_host_srv_serror_rate=0.0, dst_host_rerror_rate=0.0, dst_host_srv_rerror_rate=0.0, target=u'normal', protocol_type_cat=1.0, service_cat=2.0, flag_cat=0.0, target_cat=2.0)\n" 264 | ] 265 | } 266 | ], 267 | "source": [ 268 | "print \"First observation, after the 4 StringIndexers:\\n\"\n", 269 | "print train_num_df.first()" 270 | ] 271 | }, 272 | { 273 | "cell_type": "code", 274 | "execution_count": 11, 275 | "metadata": { 276 | "collapsed": false 277 | }, 278 | "outputs": [ 279 | { 280 | "name": "stdout", 281 | "output_type": "stream", 282 | "text": [ 283 | "['num_access_files', 'src_bytes', 'srv_count', 'num_outbound_cmds', 'rerror_rate', 'urgent', 'protocol_type_cat', 'dst_host_same_srv_rate', 'duration', 'dst_host_diff_srv_rate', 'srv_serror_rate', 'is_host_login', 'wrong_fragment', 'serror_rate', 'num_compromised', 'is_guest_login', 'dst_host_rerror_rate', 'dst_host_srv_serror_rate', 'hot', 'dst_host_srv_count', 'logged_in', 'srv_rerror_rate', 'dst_host_srv_diff_host_rate', 'srv_diff_host_rate', 'dst_host_same_src_port_rate', 'root_shell', 'service_cat', 'su_attempted', 'dst_host_count', 'num_file_creations', 'flag_cat', 'count', 'land', 'same_srv_rate', 'dst_bytes', 'num_shells', 'dst_host_srv_rerror_rate', 'num_root', 'diff_srv_rate', 'num_failed_logins', 'dst_host_serror_rate']\n", 284 | "Total numerical features: 41\n" 285 | ] 286 | } 287 | ], 288 | "source": [ 289 | "features_header = set(header) \\\n", 290 | " - set(cols_categorical) \\\n", 291 | " | set([c + \"_cat\" for c in cols_categorical]) \\\n", 292 | " - set([\"target\", \"target_cat\"])\n", 293 | "features_header = list(features_header)\n", 294 | "print features_header\n", 295 | "print \"Total numerical features:\", len(features_header)" 296 | ] 297 | }, 298 | { 299 | "cell_type": "code", 300 | "execution_count": 12, 301 | "metadata": { 302 | "collapsed": false 303 | }, 304 | "outputs": [], 305 | "source": [ 306 | "from pyspark.mllib.linalg import Vectors\n", 307 | "from pyspark.ml.feature import VectorAssembler\n", 308 | "\n", 309 | "assembler = VectorAssembler(\n", 310 | " inputCols=features_header,\n", 311 | " 
outputCol=\"features\")\n", 312 | "\n", 313 | "Xy_train = (assembler\n", 314 | " .transform(train_num_df)\n", 315 | " .select(\"features\", \"target_cat\"))\n", 316 | "Xy_test = (assembler\n", 317 | " .transform(test_num_df)\n", 318 | " .select(\"features\", \"target_cat\"))" 319 | ] 320 | }, 321 | { 322 | "cell_type": "code", 323 | "execution_count": 13, 324 | "metadata": { 325 | "collapsed": false 326 | }, 327 | "outputs": [ 328 | { 329 | "data": { 330 | "text/plain": [ 331 | "Row(features=SparseVector(41, {1: 181.0, 2: 8.0, 6: 1.0, 7: 1.0, 19: 9.0, 20: 1.0, 24: 0.11, 26: 2.0, 28: 9.0, 31: 8.0, 33: 1.0, 34: 5450.0}), target_cat=2.0)" 332 | ] 333 | }, 334 | "execution_count": 13, 335 | "metadata": {}, 336 | "output_type": "execute_result" 337 | } 338 | ], 339 | "source": [ 340 | "Xy_train.first()" 341 | ] 342 | }, 343 | { 344 | "cell_type": "code", 345 | "execution_count": 14, 346 | "metadata": { 347 | "collapsed": false 348 | }, 349 | "outputs": [], 350 | "source": [ 351 | "from pyspark.ml.classification import RandomForestClassifier\n", 352 | "\n", 353 | "clf = RandomForestClassifier(\n", 354 | " labelCol=\"target_cat\", featuresCol=\"features\", \n", 355 | " maxBins=100, seed=101)\n", 356 | "fit_clf = clf.fit(Xy_train)" 357 | ] 358 | }, 359 | { 360 | "cell_type": "code", 361 | "execution_count": 15, 362 | "metadata": { 363 | "collapsed": false 364 | }, 365 | "outputs": [ 366 | { 367 | "name": "stdout", 368 | "output_type": "stream", 369 | "text": [ 370 | "RandomForestClassifier_40f9923cb13e74b28cbe\n", 371 | "RandomForestClassificationModel (uid=rfc_ac17a1f959a3) with 20 trees\n" 372 | ] 373 | } 374 | ], 375 | "source": [ 376 | "print clf\n", 377 | "print fit_clf" 378 | ] 379 | }, 380 | { 381 | "cell_type": "code", 382 | "execution_count": 16, 383 | "metadata": { 384 | "collapsed": false 385 | }, 386 | "outputs": [], 387 | "source": [ 388 | "Xy_pred_train = fit_clf.transform(Xy_train)\n", 389 | "Xy_pred_test = fit_clf.transform(Xy_test)" 390 | ] 391 | }, 392 | { 393 | "cell_type": "code", 394 | "execution_count": 17, 395 | "metadata": { 396 | "collapsed": false 397 | }, 398 | "outputs": [ 399 | { 400 | "name": "stdout", 401 | "output_type": "stream", 402 | "text": [ 403 | "First observation after classification stage:\n", 404 | "Row(features=SparseVector(41, {1: 105.0, 2: 1.0, 6: 2.0, 7: 1.0, 9: 0.01, 19: 254.0, 26: 1.0, 28: 255.0, 31: 1.0, 33: 1.0, 34: 146.0}), target_cat=2.0, rawPrediction=DenseVector([0.0283, 0.0112, 19.3474, 0.0677, 0.0251, 0.1414, 0.0357, 0.1194, 0.1309, 0.041, 0.0257, 0.0079, 0.0046, 0.0004, 0.0029, 0.0016, 0.002, 0.0023, 0.0013, 0.0008, 0.0012, 0.0006, 0.0006]), probability=DenseVector([0.0014, 0.0006, 0.9674, 0.0034, 0.0013, 0.0071, 0.0018, 0.006, 0.0065, 0.002, 0.0013, 0.0004, 0.0002, 0.0, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0, 0.0001, 0.0, 0.0]), prediction=2.0)\n" 405 | ] 406 | } 407 | ], 408 | "source": [ 409 | "print \"First observation after classification stage:\"\n", 410 | "print Xy_pred_test.first()" 411 | ] 412 | }, 413 | { 414 | "cell_type": "code", 415 | "execution_count": 18, 416 | "metadata": { 417 | "collapsed": false 418 | }, 419 | "outputs": [ 420 | { 421 | "name": "stdout", 422 | "output_type": "stream", 423 | "text": [ 424 | "F1-score train set: 0.991904372002\n", 425 | "F1-score test set: 0.966840043466\n" 426 | ] 427 | } 428 | ], 429 | "source": [ 430 | "from pyspark.ml.evaluation import MulticlassClassificationEvaluator\n", 431 | "\n", 432 | "evaluator = MulticlassClassificationEvaluator(\n", 433 | " labelCol=\"target_cat\", 
predictionCol=\"prediction\", \n", 434 | "    metricName=\"f1\")\n", 435 | "\n", 436 | "print \"F1-score train set:\", evaluator.evaluate(Xy_pred_train)\n", 437 | "print \"F1-score test set:\", evaluator.evaluate(Xy_pred_test)" 438 | ] 439 | }, 440 | {
441 | "cell_type": "code", 442 | "execution_count": 19, 443 | "metadata": { 444 | "collapsed": false 445 | }, 446 | "outputs": [ 447 | { 448 | "name": "stdout", 449 | "output_type": "stream", 450 | "text": [ 451 | "F1-score test set: 0.966840043466\n" 452 | ] 453 | } 454 | ], 455 | "source": [
456 | "# All in one\n", 457 | "\n", 458 | "full_stages = preproc_stages + [assembler, clf]\n", 459 | "full_pipeline = Pipeline(stages=full_stages)\n", 460 | "full_model = full_pipeline.fit(train_df)\n", 461 | "predictions = full_model.transform(test_df)\n", 462 | "print \"F1-score test set:\", evaluator.evaluate(predictions)" 463 | ] 464 | }, 465 | {
466 | "cell_type": "code", 467 | "execution_count": 20, 468 | "metadata": { 469 | "collapsed": false 470 | }, 471 | "outputs": [], 472 | "source": [
473 | "import matplotlib.pyplot as plt\n", 474 | "import numpy as np\n", 475 | "%matplotlib inline\n", 476 | " \n", 477 | "def plot_confusion_matrix(cm):\n", 478 | "    cm_normalized = \\\n", 479 | "        cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]\n", 480 | "    plt.imshow(\n", 481 | "        cm_normalized, interpolation='nearest', cmap=plt.cm.Blues)\n", 482 | "    plt.title('Normalized Confusion matrix')\n", 483 | "    plt.colorbar()\n", 484 | "    plt.tight_layout()\n", 485 | "    plt.ylabel('True label')\n", 486 | "    plt.xlabel('Predicted label')\n" 487 | ] 488 | }, 489 | {
490 | "cell_type": "code", 491 | "execution_count": 21, 492 | "metadata": { 493 | "collapsed": false 494 | }, 495 | "outputs": [ 496 | { 497 | "data": { 498 | "image/png": "<base64 PNG data omitted: 'Normalized Confusion matrix' plot, True label vs. Predicted label>",
499 | "text/plain": [ 500 | "" 501 | ] 502 | }, 503 | "metadata": {}, 504 | "output_type": "display_data" 505 | } 506 | ], 507 | "source": [
508 | "from pyspark.mllib.evaluation import MulticlassMetrics\n", 509 | "\n", 510 | "metrics = MulticlassMetrics(\n", 511 | "    predictions.select(\"prediction\", \"target_cat\").rdd)\n", 512 | "conf_matrix = metrics.confusionMatrix().toArray()\n", 513 | "plot_confusion_matrix(conf_matrix)" 514 | ] 515 | }, 516 | {
517 | "cell_type": "code", 518 | "execution_count": 22, 519 | "metadata": { 520 | "collapsed": false 521 | }, 522 | "outputs": [ 523 | { 524 | "data": { 525 | "text/plain": [
526 | "{u'back': 2203,\n", 527 | " u'buffer_overflow': 30,\n", 528 | " u'ftp_write': 8,\n", 529 | " u'guess_passwd': 53,\n", 530 | " u'imap': 12,\n", 531 | " u'ipsweep': 1247,\n", 532 | " u'land': 21,\n", 533 | " u'loadmodule': 9,\n", 534 | " u'multihop': 7,\n", 535 | " u'neptune': 107201,\n", 536 | " u'nmap': 231,\n", 537 | " u'normal': 97278,\n", 538 | " u'perl': 3,\n", 539 | " u'phf': 4,\n", 540 | " u'pod': 264,\n", 541 | " u'portsweep': 1040,\n", 542 | " u'rootkit': 10,\n", 543 | " u'satan': 1589,\n", 544 | " u'smurf': 280790,\n", 545 | " u'spy': 2,\n", 546 | " u'teardrop': 979,\n", 547 | " u'warezclient': 1020,\n", 548 | " u'warezmaster': 20}" 549 | ] 550 | },
551 | "execution_count": 22, 552 | "metadata": {}, 553 | "output_type": "execute_result" 554 | } 555 | ], 556 | "source": [
557 | "# Let's now improve the score: is the training dataset balanced?\n", 558 | "\n", 559 | "train_composition = train_df.groupBy(\"target\").count().rdd.collectAsMap()\n", 560 | "train_composition" 561 | ] 562 | }, 563 | {
564 | "cell_type": "code", 565 | "execution_count": 23, 566 | "metadata": { 567 | "collapsed": false 568 | }, 569 | "outputs": [ 570 | { 571 | "data": { 572 | "text/plain": [
573 | "{u'back': 1,\n", 574 | " u'buffer_overflow': 33.333333333333336,\n", 575 | " u'ftp_write': 125.0,\n", 576 | " u'guess_passwd': 18.867924528301888,\n", 577 | " u'imap': 83.33333333333333,\n", 578 | " u'ipsweep': 1,\n", 579 | " u'land': 47.61904761904762,\n", 580 | " u'loadmodule': 111.11111111111111,\n", 581 | " u'multihop': 142.85714285714286,\n", 582 | " u'neptune': 0.23320677978750198,\n", 583 | " u'nmap': 4.329004329004329,\n", 584 | " u'normal': 0.2569954152017928,\n", 585 | " u'perl': 333.3333333333333,\n", 586 | " u'phf': 250.0,\n", 587 | " u'pod': 3.787878787878788,\n", 588 | " u'portsweep': 1,\n", 589 | " u'rootkit': 100.0,\n", 590 | " u'satan': 1,\n", 591 | " u'smurf': 0.08903450977598917,\n", 592 | " u'spy': 500.0,\n", 593 | " u'teardrop': 1.0214504596527068,\n", 594 | " u'warezclient': 1,\n", 595 | " u'warezmaster': 50.0}" 596 | ] 597 | },
598 | "execution_count": 23, 599 | "metadata": {}, 600 | "output_type": "execute_result" 601 | } 602 | ], 603 | "source": [
604 | "def set_sample_rate_between_vals(cnt, the_min, the_max):\n", 605 | "    if the_min <= cnt <= the_max:\n", 606 | "        # no sampling\n", 607 | "        return 1\n", 608 | "    \n", 609 | "    elif cnt < the_min:\n", 610 | "        # 
Oversampling: return many times the same observation\n", 611 | "        return the_min/float(cnt)\n", 612 | "\n", 613 | "    else:\n", 614 | "        # Subsampling: sometimes don't return it\n", 615 | "        return the_max/float(cnt)\n", 616 | "    \n", 617 | "sample_rates = {k:set_sample_rate_between_vals(v, 1000, 25000) \n", 618 | "                for k,v in train_composition.iteritems()} \n", 619 | "sample_rates" 620 | ] 621 | }, 622 | {
623 | "cell_type": "code", 624 | "execution_count": 24, 625 | "metadata": { 626 | "collapsed": false 627 | }, 628 | "outputs": [], 629 | "source": [
630 | "bc_sample_rates = sc.broadcast(sample_rates)\n", 631 | "\n", 632 | "def map_and_sample(el, rates):\n", 633 | "    rate = rates.value[el['target']]\n", 634 | "    if rate > 1:\n", 635 | "        return [el]*int(rate)\n", 636 | "    else:\n", 637 | "        import random\n", 638 | "        return [el] if random.random() < rate else []\n", 639 | "    \n", 640 | "sampled_train_df = (train_df\n", 641 | "                    .flatMap(\n", 642 | "                        lambda x: map_and_sample(x, bc_sample_rates))\n", 643 | "                    .toDF()\n", 644 | "                    .cache())" 645 | ] 646 | }, 647 | {
648 | "cell_type": "code", 649 | "execution_count": 25, 650 | "metadata": { 651 | "collapsed": false 652 | }, 653 | "outputs": [ 654 | { 655 | "data": { 656 | "text/plain": [ 657 | "96755" 658 | ] 659 | }, 660 | "execution_count": 25, 661 | "metadata": {}, 662 | "output_type": "execute_result" 663 | } 664 | ], 665 | "source": [ 666 | "sampled_train_df.count()" 667 | ] 668 | }, 669 | {
670 | "cell_type": "code", 671 | "execution_count": 26, 672 | "metadata": { 673 | "collapsed": false, 674 | "scrolled": true 675 | }, 676 | "outputs": [ 677 | { 678 | "data": { 679 | "text/plain": [
680 | "Row(duration=0.0, protocol_type=u'tcp', service=u'http', flag=u'SF', src_bytes=181.0, dst_bytes=5450.0, land=0.0, wrong_fragment=0.0, urgent=0.0, hot=0.0, num_failed_logins=0.0, logged_in=1.0, num_compromised=0.0, root_shell=0.0, su_attempted=0.0, num_root=0.0, num_file_creations=0.0, num_shells=0.0, num_access_files=0.0, num_outbound_cmds=0.0, is_host_login=0.0, is_guest_login=0.0, count=8.0, srv_count=8.0, serror_rate=0.0, srv_serror_rate=0.0, rerror_rate=0.0, srv_rerror_rate=0.0, same_srv_rate=1.0, diff_srv_rate=0.0, srv_diff_host_rate=0.0, dst_host_count=9.0, dst_host_srv_count=9.0, dst_host_same_srv_rate=1.0, dst_host_diff_srv_rate=0.0, dst_host_same_src_port_rate=0.11, dst_host_srv_diff_host_rate=0.0, dst_host_serror_rate=0.0, dst_host_srv_serror_rate=0.0, dst_host_rerror_rate=0.0, dst_host_srv_rerror_rate=0.0, target=u'normal')" 681 | ] 682 | },
683 | "execution_count": 26, 684 | "metadata": {}, 685 | "output_type": "execute_result" 686 | } 687 | ], 688 | "source": [ 689 | "sampled_train_df.first()" 690 | ] 691 | }, 692 | {
693 | "cell_type": "code", 694 | "execution_count": 27, 695 | "metadata": { 696 | "collapsed": false 697 | }, 698 | "outputs": [ 699 | { 700 | "name": "stdout", 701 | "output_type": "stream", 702 | "text": [ 703 | "F1-score test set: 0.966865218179\n" 704 | ] 705 | } 706 | ], 707 | "source": [
708 | "full_model = full_pipeline.fit(sampled_train_df)\n", 709 | "predictions = full_model.transform(test_df)\n", 710 | "print \"F1-score test set:\", evaluator.evaluate(predictions)" 711 | ] 712 | }, 713 | {
714 | "cell_type": "code", 715 | "execution_count": 28, 716 | "metadata": { 717 | "collapsed": false, 718 | "scrolled": false 719 | }, 720 | "outputs": [ 721 | { 722 | "name": "stdout", 723 | "output_type": "stream", 724 | "text": [ 725 | "F1-score test set: 0.967669293816\n" 726 | ] 727 | } 728 | ], 729 | "source": [ 730 | "clf = 
RandomForestClassifier(\n", 731 | "    numTrees=50, maxBins=100, seed=101,\n", 732 | "    labelCol=\"target_cat\", featuresCol=\"features\")\n", 733 | "\n", 734 | "stages = full_pipeline.getStages()[:-1]\n", 735 | "stages.append(clf)\n", 736 | "\n", 737 | "refined_pipeline = Pipeline(stages=stages)\n", 738 | "\n", 739 | "refined_model = refined_pipeline.fit(sampled_train_df)\n", 740 | "predictions = refined_model.transform(test_df)\n", 741 | "print \"F1-score test set:\", evaluator.evaluate(predictions)" 742 | ] 743 | }, 744 | {
745 | "cell_type": "code", 746 | "execution_count": 29, 747 | "metadata": { 748 | "collapsed": false 749 | }, 750 | "outputs": [], 751 | "source": [
752 | "pipeline_to_clf = Pipeline(\n", 753 | "    stages=preproc_stages + [assembler]).fit(sampled_train_df)\n", 754 | "train = pipeline_to_clf.transform(sampled_train_df).cache()\n", 755 | "test = pipeline_to_clf.transform(test_df)" 756 | ] 757 | }, 758 | {
759 | "cell_type": "code", 760 | "execution_count": 30, 761 | "metadata": { 762 | "collapsed": false 763 | }, 764 | "outputs": [], 765 | "source": [
766 | "# May take some 10 minutes\n", 767 | "\n", 768 | "from pyspark.ml.tuning import ParamGridBuilder, CrossValidator\n", 769 | " \n", 770 | "rf = RandomForestClassifier(\n", 771 | "    cacheNodeIds=True, seed=101, labelCol=\"target_cat\", \n", 772 | "    featuresCol=\"features\", maxBins=100)\n", 773 | "\n", 774 | "grid = (ParamGridBuilder() \n", 775 | "        .addGrid(rf.maxDepth, [3, 6, 9, 12]) \n", 776 | "        .addGrid(rf.numTrees, [20, 50]) \n", 777 | "        .build())\n", 778 | "\n", 779 | "cv = CrossValidator(\n", 780 | "    estimator=rf, estimatorParamMaps=grid, \n", 781 | "    evaluator=evaluator, numFolds=3)\n", 782 | "cvModel = cv.fit(train)" 783 | ] 784 | }, 785 | {
786 | "cell_type": "code", 787 | "execution_count": 31, 788 | "metadata": { 789 | "collapsed": false 790 | }, 791 | "outputs": [ 792 | { 793 | "name": "stdout", 794 | "output_type": "stream", 795 | "text": [ 796 | "F1-score test set: 0.969948273422\n" 797 | ] 798 | } 799 | ], 800 | "source": [ 801 | "predictions = cvModel.transform(test)\n", 802 | "print \"F1-score test set:\", evaluator.evaluate(predictions)" 803 | ] 804 | }, 805 | {
806 | "cell_type": "code", 807 | "execution_count": 32, 808 | "metadata": { 809 | "collapsed": false 810 | }, 811 | "outputs": [ 812 | { 813 | "data": { 814 | "image/png": "<base64 PNG data omitted: 'Normalized Confusion matrix' plot for the cross-validated model, True label vs. Predicted label>",
815 | "text/plain": [ 816 | "" 817 | ] 818 | }, 819 | "metadata": {}, 820 | "output_type": "display_data" 821 | } 822 | ], 823 | "source": [
824 | "metrics = MulticlassMetrics(predictions.select(\n", 825 | "    \"prediction\", \"target_cat\").rdd)\n", 826 | "conf_matrix = metrics.confusionMatrix().toArray()\n", 827 | "plot_confusion_matrix(conf_matrix)" 828 | ] 829 | }, 830 | {
831 | "cell_type": "code", 832 | "execution_count": 33, 833 | "metadata": { 834 | "collapsed": false 835 | }, 836 | "outputs": [ 837 | { 838 | "data": { 839 | "text/plain": [
840 | "DataFrame[duration: double, protocol_type: string, service: string, flag: string, src_bytes: double, dst_bytes: double, land: double, wrong_fragment: double, urgent: double, hot: double, num_failed_logins: double, logged_in: double, num_compromised: double, root_shell: double, su_attempted: double, num_root: double, num_file_creations: double, num_shells: double, num_access_files: double, num_outbound_cmds: double, is_host_login: double, is_guest_login: double, count: double, srv_count: double, serror_rate: double, srv_serror_rate: double, rerror_rate: double, srv_rerror_rate: double, same_srv_rate: double, diff_srv_rate: double, srv_diff_host_rate: double, dst_host_count: double, dst_host_srv_count: double, dst_host_same_srv_rate: double, dst_host_diff_srv_rate: double, dst_host_same_src_port_rate: double, dst_host_srv_diff_host_rate: double, dst_host_serror_rate: double, dst_host_srv_serror_rate: double, dst_host_rerror_rate: double, dst_host_srv_rerror_rate: double, target: string, protocol_type_cat: double, service_cat: double, flag_cat: double, target_cat: double, features: vector]" 841 | ] 842 | },
843 | "execution_count": 33, 844 | "metadata": {}, 845 | "output_type": "execute_result" 846 | } 847 | ], 848 | "source": [ 849 | "#cleanup\n", 850 | "bc_sample_rates.unpersist()\n", 851 | "sampled_train_df.unpersist()\n", 852 | "train.unpersist()" 853 | ] 854 | }, 855 | {
856 | "cell_type": "code", 857 | "execution_count": null, 858 | "metadata": { 859 | "collapsed": true 860 | }, 861 | "outputs": [], 862 | "source": [] 863 | } 864 | ], 865 | "metadata": { 866 | "kernelspec": { 867 | "display_name": "Python 2", 868 | "language": "python", 869 | "name": "python2" 870 | }, 871 | "language_info": { 872 | "codemirror_mode": { 873 | "name": "ipython", 874 | "version": 2 875 | }, 876 | "file_extension": ".py", 877 | "mimetype": "text/x-python", 878 | "name": "python", 879 | "nbconvert_exporter": "python", 880 | "pygments_lexer": "ipython2", 881 | "version": "2.7.6" 882 | } 883 | }, 884 | "nbformat": 4, 885 | "nbformat_minor": 0 886 | } 887 | 
-------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2016 Packt Publishing 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | # Large Scale Machine Learning with Python 5 | This is the code repository for [Large Scale Machine Learning with Python](https://www.packtpub.com/big-data-and-business-intelligence/large-scale-machine-learning-python?utm_source=github&utm_medium=repository&utm_campaign=9781785887215), published by Packt. It contains all the supporting project files necessary to work through the book from start to finish. 6 | 7 | ## Instructions 8 | The execution of the code examples provided in this book requires an installation of Python 2.7 or higher versions on macOS, Linux, or Microsoft Windows. 9 | The examples throughout the book will make frequent use of Python's essential libraries, such as SciPy, NumPy, Scikit-learn, and StatsModels, and to a minor extent, matplotlib and pandas, for scientific and statistical computing. We will also make use of an out-of-core cloud computing application called H2O. 10 | This book is highly dependent on Jupyter and its Notebooks powered by the Python kernel. We will use its most recent version, 4.1, for this book. 11 | The first chapter will provide you with all the step-by-step instructions and some useful tips to set up your Python environment, these core libraries, and all the necessary tools. 12 | 13 | ## Related books 14 | - [R Machine Learning By Example](https://www.packtpub.com/big-data-and-business-intelligence/r-machine-learning-example?utm_source=github&utm_medium=repository&utm_campaign=9781784390846) 15 | - [R Machine Learning Essentials](https://www.packtpub.com/big-data-and-business-intelligence/r-machine-learning-essentials?utm_source=github&utm_medium=repository&utm_campaign=9781783987740) 16 | - [Machine Learning with R](https://www.packtpub.com/big-data-and-business-intelligence/machine-learning-r?utm_source=github&utm_medium=repository&utm_campaign=9781782162148) 17 | ### Download a free PDF 18 | 19 | If you have already purchased a print or Kindle version of this book, you can get a DRM-free PDF version at no cost.
Simply click on the link to claim your free PDF.
20 | https://packt.link/free-ebook/9781785887215

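A quick way to confirm that the environment described in the Instructions section above is in place is to print the versions of the core libraries. The snippet below is only an illustrative sketch: it assumes the listed packages are already importable, and it does not try to check H2O, Spark, or Vowpal Wabbit.

```python
# Illustrative environment check for the core Python libraries named in the README.
# Adjust the list to whatever you actually installed; this file is not part of the repository.
import sys
import numpy, scipy, sklearn, statsmodels, matplotlib, pandas

print("Python %s" % sys.version.split()[0])
for module in (numpy, scipy, sklearn, statsmodels, matplotlib, pandas):
    print("%s %s" % (module.__name__, module.__version__))
```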
-------------------------------------------------------------------------------- /vowpal_wabbit_for_windows/x64/vw.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Large-Scale-Machine-Learning-With-Python/681b476109470a04f354c9f4d152b8de40670eb7/vowpal_wabbit_for_windows/x64/vw.exe -------------------------------------------------------------------------------- /vowpal_wabbit_for_windows/x86/vw.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Large-Scale-Machine-Learning-With-Python/681b476109470a04f354c9f4d152b8de40670eb7/vowpal_wabbit_for_windows/x86/vw.exe --------------------------------------------------------------------------------
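For quick reference, the core Spark ML workflow from the Chapter 09 notebook reproduced above — indexing the categorical KDD99 columns, assembling a feature vector, and fitting a Random Forest evaluated with F1 — can be condensed into a single block. The sketch below is minimal and hedged: it assumes the notebook's own context (a Spark 1.5/1.6-style `sc` and `sqlContext`, plus `train_df`/`test_df` built from the KDD99 files exactly as in the notebook's opening cells), and the variable names introduced here are illustrative rather than part of the original code.

```python
# Condensed sketch of the notebook's Spark ML pipeline (Spark 1.5/1.6-era pyspark.ml API).
# Assumes train_df / test_df already exist with the KDD99 schema built in the notebook.
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

categorical = ["protocol_type", "service", "flag", "target"]

# Encode each string column into a numeric *_cat column.
indexers = [StringIndexer(inputCol=c, outputCol=c + "_cat", handleInvalid="skip")
            for c in categorical]

# Every original numeric column plus the indexed categorical features
# (but not the label) feeds a single "features" vector.
feature_cols = ([c for c in train_df.columns if c not in categorical] +
                ["protocol_type_cat", "service_cat", "flag_cat"])
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")

# maxBins=100 keeps the trees compatible with the high-cardinality indexed features.
rf = RandomForestClassifier(labelCol="target_cat", featuresCol="features",
                            maxBins=100, seed=101)

pipeline = Pipeline(stages=indexers + [assembler, rf])
model = pipeline.fit(train_df)

evaluator = MulticlassClassificationEvaluator(labelCol="target_cat",
                                              predictionCol="prediction",
                                              metricName="f1")
print("F1-score test set: %s" % evaluator.evaluate(model.transform(test_df)))
```

The class-rebalancing step (broadcast sampling rates plus `flatMap`) and the `CrossValidator`/`ParamGridBuilder` grid search from the notebook's later cells can be layered on top of this skeleton in exactly the way those cells show.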