├── Chapter 02 └── Chapter_2_code.ipynb ├── Chapter 03 └── Chapter_3_code.ipynb ├── Chapter 04 └── Chapter_4_code.ipynb ├── Chapter 05 └── Chapter_5_code.ipynb ├── Chapter 06 └── Chapter_6_code.ipynb ├── Chapter 07 └── Chapter_7_code.ipynb ├── Chapter 08 ├── Chapter_8_code_HDFS.ipynb ├── Chapter_8_code_MR.ipynb ├── Chapter_8_code_Spark.ipynb └── Chapter_8_code_Vagrantfile ├── Chapter 09 ├── Chapter_9_code_01.ipynb └── Chapter_9_code_02.ipynb ├── LICENSE ├── README.md └── vowpal_wabbit_for_windows ├── x64 └── vw.exe └── x86 └── vw.exe /Chapter 03/Chapter_3_code.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Datasets for experimenting yourself" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": { 14 | "collapsed": true 15 | }, 16 | "outputs": [], 17 | "source": [ 18 | "import urllib2 # import urllib.request as urllib2 in Python3\n", 19 | "import requests, io, os, StringIO\n", 20 | "import numpy as np\n", 21 | "import tarfile, zipfile, gzip\n", 22 | "\n", 23 | "\n", 24 | "def unzip_from_UCI(UCI_url, dest=''):\n", 25 | " \"\"\"\n", 26 | " Downloads and unpacks datasets from UCI in zip format\n", 27 | " \"\"\"\n", 28 | " response = requests.get(UCI_url)\n", 29 | " compressed_file = io.BytesIO(response.content)\n", 30 | " z = zipfile.ZipFile(compressed_file)\n", 31 | " print ('Extracting in %s' % os.getcwd()+'\\\\'+dest)\n", 32 | " for name in z.namelist():\n", 33 | " if '.csv' in name:\n", 34 | " print ('\\tunzipping %s' %name)\n", 35 | " z.extract(name, path=os.getcwd()+'\\\\'+dest)\n", 36 | "\n", 37 | "def gzip_from_UCI(UCI_url, dest=''):\n", 38 | " \"\"\"\n", 39 | " Downloads and unpacks datasets from UCI in gzip format\n", 40 | " \"\"\"\n", 41 | " response = urllib2.urlopen(UCI_url)\n", 42 | " compressed_file = io.BytesIO(response.read())\n", 43 | " decompressed_file = gzip.GzipFile(fileobj=compressed_file)\n", 44 | " filename = UCI_url.split('/')[-1][:-3]\n", 45 | " with open(os.getcwd()+'\\\\'+filename, 'wb') as outfile:\n", 46 | " outfile.write(decompressed_file.read())\n", 47 | " print ('File %s decompressed' % filename)\n", 48 | " \n", 49 | "def targzip_from_UCI(UCI_url, dest='.'):\n", 50 | " \"\"\"\n", 51 | " Downloads and unpacks datasets from UCI in tar.gz format\n", 52 | " \"\"\"\n", 53 | " response = urllib2.urlopen(UCI_url)\n", 54 | " compressed_file = StringIO.StringIO(response.read())\n", 55 | " tar = tarfile.open(mode=\"r:gz\", fileobj = compressed_file)\n", 56 | " tar.extractall(path=dest)\n", 57 | " datasets = tar.getnames()\n", 58 | " for dataset in datasets:\n", 59 | " size = os.path.getsize(dest+'\\\\'+dataset)\n", 60 | " print ('File %s is %i bytes' % (dataset,size))\n", 61 | " tar.close()\n", 62 | "\n", 63 | "def load_matrix(UCI_url):\n", 64 | " \"\"\"\n", 65 | " Downloads datasets from UCI in matrix form\n", 66 | " \"\"\"\n", 67 | " return np.loadtxt(urllib2.urlopen(UCI_url))" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": 2, 73 | "metadata": { 74 | "collapsed": false 75 | }, 76 | "outputs": [ 77 | { 78 | "name": "stdout", 79 | "output_type": "stream", 80 | "text": [ 81 | "Current directory is: \"C:\\scisoft\\WinPython-64bit-2.7.9.4\\notebooks\\Packt - Large Scale\"\n" 82 | ] 83 | } 84 | ], 85 | "source": [ 86 | "import os\n", 87 | "print \"Current directory is: \\\"%s\\\"\" % (os.getcwd())" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": 3, 93 
| "metadata": { 94 | "collapsed": true 95 | }, 96 | "outputs": [], 97 | "source": [ 98 | "import zlib\n", 99 | "from random import shuffle, seed\n", 100 | "\n", 101 | "def ram_shuffle(filename_in, filename_out, header=True, random_seed=0):\n", 102 | " with open(filename_in, 'rb') as f:\n", 103 | " zlines = [zlib.compress(line, 9) for line in f]\n", 104 | " if header:\n", 105 | " first_row = zlines.pop(0)\n", 106 | " seed(random_seed)\n", 107 | " shuffle(zlines)\n", 108 | " with open(filename_out, 'wb') as f:\n", 109 | " if header:\n", 110 | " f.write(zlib.decompress(first_row))\n", 111 | " for zline in zlines:\n", 112 | " f.write(zlib.decompress(zline))" 113 | ] 114 | }, 115 | { 116 | "cell_type": "markdown", 117 | "metadata": {}, 118 | "source": [ 119 | "###Bike Sharing Dataset Data Set" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": 8, 125 | "metadata": { 126 | "collapsed": false 127 | }, 128 | "outputs": [ 129 | { 130 | "name": "stdout", 131 | "output_type": "stream", 132 | "text": [ 133 | "Extracting in C:\\scisoft\\WinPython-64bit-2.7.9.4\\notebooks\\Packt - Large Scale\\bikesharing\n", 134 | "\tunzipping day.csv\n", 135 | "\tunzipping hour.csv\n" 136 | ] 137 | } 138 | ], 139 | "source": [ 140 | "UCI_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00275/Bike-Sharing-Dataset.zip'\n", 141 | "unzip_from_UCI(UCI_url, dest='bikesharing')" 142 | ] 143 | }, 144 | { 145 | "cell_type": "markdown", 146 | "metadata": {}, 147 | "source": [ 148 | "###Covertype Data Set " 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": 10, 154 | "metadata": { 155 | "collapsed": false 156 | }, 157 | "outputs": [ 158 | { 159 | "name": "stdout", 160 | "output_type": "stream", 161 | "text": [ 162 | "File covtype.data decompressed\n" 163 | ] 164 | } 165 | ], 166 | "source": [ 167 | "UCI_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/covtype/covtype.data.gz'\n", 168 | "gzip_from_UCI(UCI_url)" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": 3, 174 | "metadata": { 175 | "collapsed": true 176 | }, 177 | "outputs": [], 178 | "source": [ 179 | "import os\n", 180 | "from random import seed\n", 181 | "local_path = os.getcwd()\n", 182 | "source = 'covtype.data'\n", 183 | "ram_shuffle(filename_in=local_path+'\\\\'+source, \\\n", 184 | " filename_out=local_path+'\\\\shuffled_covtype.data', header=False)" 185 | ] 186 | }, 187 | { 188 | "cell_type": "markdown", 189 | "metadata": {}, 190 | "source": [ 191 | "#Non-linear & faster with Vowpal Wabbit " 192 | ] 193 | }, 194 | { 195 | "cell_type": "markdown", 196 | "metadata": {}, 197 | "source": [ 198 | "###Useful functions" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": 1, 204 | "metadata": { 205 | "collapsed": true 206 | }, 207 | "outputs": [], 208 | "source": [ 209 | "import numpy as np\n", 210 | "\n", 211 | "def sigmoid(x):\n", 212 | " return 1. / (1. 
+ np.exp(-x))\n", 213 | "\n", 214 | "def apply_log(x): \n", 215 | " return np.log(x + 1.0)\n", 216 | "\n", 217 | "def apply_exp(x): \n", 218 | " return np.exp(x) - 1.0" 219 | ] 220 | }, 221 | { 222 | "cell_type": "markdown", 223 | "metadata": {}, 224 | "source": [ 225 | "###Useful dataset examples" 226 | ] 227 | }, 228 | { 229 | "cell_type": "code", 230 | "execution_count": 37, 231 | "metadata": { 232 | "collapsed": false 233 | }, 234 | "outputs": [ 235 | { 236 | "name": "stdout", 237 | "output_type": "stream", 238 | "text": [ 239 | "0 | price:.23 sqft:.25 age:.05 2006\n", 240 | "1 2 'second_house | price:.18 sqft:.15 age:.35 1976\n", 241 | "0 1 0.5 'third_house | price:.53 sqft:.32 age:.87 1924\n" 242 | ] 243 | } 244 | ], 245 | "source": [ 246 | "with open('house_dataset','wb') as W:\n", 247 | " W.write(\"0 | price:.23 sqft:.25 age:.05 2006\\n\")\n", 248 | " W.write(\"1 2 'second_house | price:.18 sqft:.15 age:.35 1976\\n\")\n", 249 | " W.write(\"0 1 0.5 'third_house | price:.53 sqft:.32 age:.87 1924\\n\")\n", 250 | "\n", 251 | "with open('house_dataset','rb') as R:\n", 252 | " for line in R:\n", 253 | " print line.strip()" 254 | ] 255 | }, 256 | { 257 | "cell_type": "markdown", 258 | "metadata": {}, 259 | "source": [ 260 | "###A way to call VW from Python" 261 | ] 262 | }, 263 | { 264 | "cell_type": "code", 265 | "execution_count": 2, 266 | "metadata": { 267 | "collapsed": false 268 | }, 269 | "outputs": [ 270 | { 271 | "name": "stdout", 272 | "output_type": "stream", 273 | "text": [ 274 | "Num weight bits = 18\n", 275 | "learning rate = 0.5\n", 276 | "initial_t = 0\n", 277 | "power_t = 0.5\n", 278 | "using no cache\n", 279 | "Reading datafile = house_dataset\n", 280 | "num sources = 1\n", 281 | "average since example example current current current\n", 282 | "loss last counter weight label predict features\n", 283 | "0.000000 0.000000 1 1.0 0.0000 0.0000 5\n", 284 | "0.666667 1.000000 2 3.0 1.0000 0.0000 5\n", 285 | "\n", 286 | "finished run\n", 287 | "number of examples per pass = 3\n", 288 | "passes used = 1\n", 289 | "weighted example sum = 4.000000\n", 290 | "weighted label sum = 2.000000\n", 291 | "average loss = 0.750000\n", 292 | "best constant = 0.500000\n", 293 | "best constant's loss = 0.250000\n", 294 | "total feature number = 15\n", 295 | "------------ COMPLETED ------------\n", 296 | "\n" 297 | ] 298 | } 299 | ], 300 | "source": [ 301 | "import subprocess\n", 302 | "\n", 303 | "def execute_vw(parameters):\n", 304 | " execution = subprocess.Popen('vw '+parameters, shell=True, stderr=subprocess.PIPE)\n", 305 | " line = \"\"\n", 306 | " history = \"\"\n", 307 | " while True:\n", 308 | " out = execution.stderr.read(1)\n", 309 | " history += out\n", 310 | " if out == '' and execution.poll() != None:\n", 311 | " print '------------ COMPLETED ------------\\n'\n", 312 | " break\n", 313 | " if out != '':\n", 314 | " line += out\n", 315 | " if '\\n' in line[-2:]:\n", 316 | " print line[:-2]\n", 317 | " line = ''\n", 318 | " return history.split('\\r\\n')\n", 319 | "\n", 320 | "\n", 321 | "params = \"house_dataset\"\n", 322 | "results = execute_vw(params)" 323 | ] 324 | }, 325 | { 326 | "cell_type": "markdown", 327 | "metadata": {}, 328 | "source": [ 329 | "###Processing examples" 330 | ] 331 | }, 332 | { 333 | "cell_type": "code", 334 | "execution_count": 2, 335 | "metadata": { 336 | "collapsed": true 337 | }, 338 | "outputs": [], 339 | "source": [ 340 | "import csv\n", 341 | "\n", 342 | "def vw_convert(origin_file, target_file, binary_features, numeric_features, target, 
transform_target=lambda(x):x,\n", 343 | " separator=',', classification=True, multiclass=False, fieldnames= None, header=True, sparse=True):\n", 344 | " \"\"\"\n", 345 | " Reads a online style stream and returns a generator of normalized feature vectors\n", 346 | " \n", 347 | " Parameters\n", 348 | " ‐‐‐‐‐‐‐‐‐‐\n", 349 | " original_file = the csv file you are taken the data from \n", 350 | " target file = the file to stream from\n", 351 | " binary_features = the list of qualitative features to consider\n", 352 | " numeric_features = the list of numeric features to consider\n", 353 | " target = the label of the response variable\n", 354 | " transform_target = a function transforming the response\n", 355 | " separator = the field separator character\n", 356 | " classification = a Boolean indicating if it is classification\n", 357 | " multiclass = a Boolean indicating if it is multiclass classification\n", 358 | " fieldnames = the fields' labels (can be ommitted and read from file)\n", 359 | " header = a boolean indicating if the original file has an header\n", 360 | " sparse = if a sparse vector is to be returned from the generator\n", 361 | " \"\"\"\n", 362 | " with open(target_file, 'wb') as W:\n", 363 | " with open(origin_file, 'rb') as R:\n", 364 | " iterator = csv.DictReader(R, fieldnames, delimiter=separator)\n", 365 | " for n, row in enumerate(iterator):\n", 366 | " if not header or n>0:\n", 367 | " # DATA PROCESSING\n", 368 | " response = transform_target(float(row[target]))\n", 369 | " if classification and not multiclass:\n", 370 | " if response == 0:\n", 371 | " stream_row = '-1 '\n", 372 | " else:\n", 373 | " stream_row = '1 '\n", 374 | " else:\n", 375 | " stream_row = str(response)+' '\n", 376 | " quantitative = list()\n", 377 | " qualitative = list()\n", 378 | " for k,v in row.iteritems():\n", 379 | " if k in binary_features:\n", 380 | " qualitative.append(str(k)+'_'+str(v)+':1')\n", 381 | " else:\n", 382 | " if k in numeric_features and (float(v)!=0 or not sparse):\n", 383 | " quantitative.append(str(k)+':'+str(v))\n", 384 | " if quantitative:\n", 385 | " stream_row += '|n '+' '.join(quantitative)\n", 386 | " if qualitative:\n", 387 | " stream_row += '|q ' + ' '.join(qualitative)\n", 388 | " W.write(stream_row+'\\n')" 389 | ] 390 | }, 391 | { 392 | "cell_type": "markdown", 393 | "metadata": {}, 394 | "source": [ 395 | "###Examples with toys datasets" 396 | ] 397 | }, 398 | { 399 | "cell_type": "code", 400 | "execution_count": 210, 401 | "metadata": { 402 | "collapsed": true 403 | }, 404 | "outputs": [], 405 | "source": [ 406 | "import numpy as np\n", 407 | "from sklearn.datasets import load_iris, load_boston\n", 408 | "from random import seed\n", 409 | "iris = load_iris()\n", 410 | "seed(2)\n", 411 | "re_order = np.random.permutation(len(iris.target))\n", 412 | "with open('iris_versicolor.vw','wb') as W1:\n", 413 | " for k in re_order:\n", 414 | " y = iris.target[k]\n", 415 | " X = iris.values()[1][k,:]\n", 416 | " features = ' |f '+' '.join([a+':'+str(b) for a,b in zip(map(lambda(a): a[:-5].replace(' ','_'), iris.feature_names),X)])\n", 417 | " target = '1' if y==1 else '-1'\n", 418 | " W1.write(target+features+'\\n')" 419 | ] 420 | }, 421 | { 422 | "cell_type": "code", 423 | "execution_count": 203, 424 | "metadata": { 425 | "collapsed": false 426 | }, 427 | "outputs": [], 428 | "source": [ 429 | "boston = load_boston()\n", 430 | "seed(2)\n", 431 | "re_order = np.random.permutation(len(boston.target))\n", 432 | "with open('boston.vw','wb') as W1:\n", 433 | " for k in 
re_order:\n", 434 | " y = boston.target[k]\n", 435 | " X = boston.data[k,:]\n", 436 | " features = ' |f '+' '.join([a+':'+str(b) for a,b in zip(map(lambda(a): a[:-5].replace(' ','_'), iris.feature_names),X)])\n", 437 | " W1.write(str(y)+features+'\\n')" 438 | ] 439 | }, 440 | { 441 | "cell_type": "markdown", 442 | "metadata": {}, 443 | "source": [ 444 | "###Binary Iris" 445 | ] 446 | }, 447 | { 448 | "cell_type": "code", 449 | "execution_count": 197, 450 | "metadata": { 451 | "collapsed": false 452 | }, 453 | "outputs": [ 454 | { 455 | "name": "stdout", 456 | "output_type": "stream", 457 | "text": [ 458 | "using l2 regularization = 1e-006\n", 459 | "predictions = iris_bin.test\n", 460 | "Lambda = 1e-006\n", 461 | "Kernel = rbf\n", 462 | "bandwidth = 0.1\n", 463 | "Num weight bits = 18\n", 464 | "learning rate = 0.5\n", 465 | "initial_t = 0\n", 466 | "power_t = 0.5\n", 467 | "using no cache\n", 468 | "Reading datafile = iris_versicolor.vw\n", 469 | "num sources = 1\n", 470 | "average since example example current current current\n", 471 | "loss last counter weight label predict features\n", 472 | "1.000000 1.000000 1 1.0 -1.0000 0.0000 5\n", 473 | "0.960606 0.921212 2 2.0 -1.0000 -0.0788 5\n", 474 | "1.030685 1.100763 4 4.0 -1.0000 -0.7865 5\n", 475 | "0.790707 0.550729 8 8.0 -1.0000 -0.3755 5\n", 476 | "0.647808 0.504909 16 16.0 -1.0000 -1.2473 5\n", 477 | "0.477695 0.307582 32 32.0 1.0000 0.8621 5\n", 478 | "0.319804 0.161914 64 64.0 -1.0000 -1.7015 5\n", 479 | "0.272695 0.225585 128 128.0 -1.0000 -1.3150 5\n", 480 | "\n", 481 | "finished run\n", 482 | "number of examples = 150\n", 483 | "weighted example sum = 150.000000\n", 484 | "weighted label sum = -50.000000\n", 485 | "average loss = 0.248892\n", 486 | "best constant = -0.333333\n", 487 | "best constant's loss = 0.888889\n", 488 | "total feature number = 750\n", 489 | "Num support = 49\n", 490 | "Number of kernel evaluations = 8836 Number of cache queries = 18555\n", 491 | "Total loss = 37.333748\n", 492 | "Done freeing model\n", 493 | "Done freeing kernel params\n", 494 | "Done with finish \n", 495 | "------------ COMPLETED ------------\n", 496 | "\n" 497 | ] 498 | } 499 | ], 500 | "source": [ 501 | "params = '--ksvm --l2 0.000001 --reprocess 2 -b 18 --kernel rbf --bandwidth=0.1 -p iris_bin.test -d iris_versicolor.vw'\n", 502 | "results = execute_vw(params)" 503 | ] 504 | }, 505 | { 506 | "cell_type": "code", 507 | "execution_count": 198, 508 | "metadata": { 509 | "collapsed": false 510 | }, 511 | "outputs": [ 512 | { 513 | "name": "stdout", 514 | "output_type": "stream", 515 | "text": [ 516 | "holdout accuracy: 0.966\n" 517 | ] 518 | } 519 | ], 520 | "source": [ 521 | "import numpy as np\n", 522 | "def sigmoid(x):\n", 523 | " return 1. / (1. 
+ np.exp(-x))\n", 524 | "\n", 525 | "accuracy = 0\n", 526 | "with open('iris_bin.test', 'rb') as R:\n", 527 | " with open('iris_versicolor.vw', 'rb') as TRAIN:\n", 528 | " holdouts = 0.0\n", 529 | " for n,(line, example) in enumerate(zip(R,TRAIN)):\n", 530 | " if (n+1) % 10==0:\n", 531 | " predicted = float(line.strip())\n", 532 | " y = float(example.split('|')[0])\n", 533 | " accuracy += np.sign(predicted)==np.sign(y)\n", 534 | " holdouts += 1 \n", 535 | "print 'holdout accuracy: %0.3f' % ((accuracy / holdouts)**0.5)" 536 | ] 537 | }, 538 | { 539 | "cell_type": "markdown", 540 | "metadata": {}, 541 | "source": [ 542 | "###Boston" 543 | ] 544 | }, 545 | { 546 | "cell_type": "code", 547 | "execution_count": 211, 548 | "metadata": { 549 | "collapsed": false 550 | }, 551 | "outputs": [ 552 | { 553 | "name": "stdout", 554 | "output_type": "stream", 555 | "text": [ 556 | "final_regressor = boston.model\n", 557 | "using dropout for neural network training\n", 558 | "Num weight bits = 18\n", 559 | "learning rate = 0.5\n", 560 | "initial_t = 0\n", 561 | "power_t = 0.5\n", 562 | "decay_learning_rate = 1\n", 563 | "creating cache_file = cache_train.vw\n", 564 | "Reading datafile = boston.vw\n", 565 | "num sources = 1\n", 566 | "average since example example current current current\n", 567 | "loss last counter weight label predict features\n", 568 | "2500.000000 2500.000000 1 1.0 50.0000 0.0000 4\n", 569 | "1570.433136 640.866272 2 2.0 26.4000 1.0847 3\n", 570 | "945.682968 320.932800 4 4.0 21.0000 3.4834 3\n", 571 | "738.617393 531.551817 8 8.0 35.4000 6.9177 4\n", 572 | "559.106543 379.595694 16 16.0 23.1000 6.6911 3\n", 573 | "362.538769 165.970995 32 32.0 16.7000 12.2397 3\n", 574 | "301.716126 240.893483 64 64.0 19.7000 12.3789 3\n", 575 | "236.351873 170.987621 128 128.0 16.1000 15.3972 3\n", 576 | "180.695258 125.038643 256 256.0 26.5000 24.0065 3\n", 577 | "99.536619 99.536619 512 512.0 28.7000 18.4439 3 h\n", 578 | "83.688702 67.840785 1024 1024.0 50.0000 20.8653 4 h\n", 579 | "72.301786 60.914870 2048 2048.0 10.4000 0.0000 3 h\n", 580 | "59.041621 45.840391 4096 4096.0 20.6000 21.1746 4 h\n", 581 | "\n", 582 | "finished run\n", 583 | "number of examples per pass = 456\n", 584 | "passes used = 10\n", 585 | "weighted example sum = 4560.000000\n", 586 | "weighted label sum = 103341.001506\n", 587 | "average loss = 43.299850 h\n", 588 | "best constant = 22.662500\n", 589 | "total feature number = 15220\n", 590 | "------------ COMPLETED ------------\n", 591 | "\n" 592 | ] 593 | } 594 | ], 595 | "source": [ 596 | "params = 'boston.vw -f boston.model --loss_function squared -k --cache_file cache_train.vw --passes=20 --nn 5 --dropout'\n", 597 | "results = execute_vw(params)" 598 | ] 599 | }, 600 | { 601 | "cell_type": "code", 602 | "execution_count": 212, 603 | "metadata": { 604 | "collapsed": false 605 | }, 606 | "outputs": [ 607 | { 608 | "name": "stdout", 609 | "output_type": "stream", 610 | "text": [ 611 | "only testing\n", 612 | "predictions = boston.test\n", 613 | "using dropout for neural network testing\n", 614 | "Num weight bits = 18\n", 615 | "learning rate = 0.5\n", 616 | "initial_t = 0\n", 617 | "power_t = 0.5\n", 618 | "creating cache_file = cache_test.vw\n", 619 | "Reading datafile = boston.vw\n", 620 | "num sources = 1\n", 621 | "average since example example current current current\n", 622 | "loss last counter weight label predict features\n", 623 | "922.607483 922.607483 1 1.0 50.0000 19.6255 4\n", 624 | "464.302045 5.996608 2 2.0 26.4000 23.9512 3\n", 625 | "253.949617 43.597188 4 
4.0 21.0000 21.2530 3\n", 626 | "175.713928 97.478239 8 8.0 35.4000 25.5958 4\n", 627 | "130.466937 85.219947 16 16.0 15.2000 15.8726 3\n", 628 | "79.291346 28.115755 32 32.0 15.6000 19.7057 4\n", 629 | "85.270478 91.249610 64 64.0 22.8000 20.4866 3\n", 630 | "83.265921 81.261364 128 128.0 20.8000 18.1267 3\n", 631 | "70.838572 58.411224 256 256.0 27.5000 16.6386 3\n", 632 | "\n", 633 | "finished run\n", 634 | "number of examples per pass = 506\n", 635 | "passes used = 1\n", 636 | "weighted example sum = 506.000000\n", 637 | "weighted label sum = 11401.600174\n", 638 | "average loss = 65.960779\n", 639 | "best constant = 22.532808\n", 640 | "total feature number = 1687\n", 641 | "------------ COMPLETED ------------\n", 642 | "\n" 643 | ] 644 | } 645 | ], 646 | "source": [ 647 | "params = '-t boston.vw -i boston.model -k --cache_file cache_test.vw -p boston.test'\n", 648 | "results = execute_vw(params)" 649 | ] 650 | }, 651 | { 652 | "cell_type": "code", 653 | "execution_count": 214, 654 | "metadata": { 655 | "collapsed": false 656 | }, 657 | "outputs": [ 658 | { 659 | "name": "stdout", 660 | "output_type": "stream", 661 | "text": [ 662 | "holdout RMSE: 7.010\n" 663 | ] 664 | } 665 | ], 666 | "source": [ 667 | "val_rmse = 0\n", 668 | "with open('boston.test', 'rb') as R:\n", 669 | " with open('boston.vw', 'rb') as TRAIN:\n", 670 | " holdouts = 0.0\n", 671 | " for n,(line, example) in enumerate(zip(R,TRAIN)):\n", 672 | " if (n+1) % 10==0:\n", 673 | " predicted = float(line.strip())\n", 674 | " y = float(example.split('|')[0])\n", 675 | " val_rmse += (predicted - y)**2\n", 676 | " holdouts += 1 \n", 677 | "print 'holdout RMSE: %0.3f' % ((val_rmse / holdouts)**0.5)" 678 | ] 679 | }, 680 | { 681 | "cell_type": "markdown", 682 | "metadata": {}, 683 | "source": [ 684 | "###Bike sharing" 685 | ] 686 | }, 687 | { 688 | "cell_type": "code", 689 | "execution_count": 6, 690 | "metadata": { 691 | "collapsed": false 692 | }, 693 | "outputs": [], 694 | "source": [ 695 | "import os\n", 696 | "local_path = os.getcwd()\n", 697 | "b_vars = ['holiday','hr','mnth', 'season','weathersit','weekday','workingday','yr']\n", 698 | "n_vars = ['hum', 'temp', 'atemp', 'windspeed']\n", 699 | "source = '\\\\bikesharing\\\\hour.csv'\n", 700 | "origin = target_file=local_path+'\\\\'+source\n", 701 | "target = target_file=local_path+'\\\\'+'bike.vw'\n", 702 | "vw_convert(origin, target, binary_features=b_vars, numeric_features=n_vars, target = 'cnt', transform_target=apply_log,\n", 703 | " separator=',', classification=False, multiclass=False, fieldnames= None, header=True)" 704 | ] 705 | }, 706 | { 707 | "cell_type": "code", 708 | "execution_count": 45, 709 | "metadata": { 710 | "collapsed": false 711 | }, 712 | "outputs": [ 713 | { 714 | "name": "stdout", 715 | "output_type": "stream", 716 | "text": [ 717 | "final_regressor = regression.model\n", 718 | "Num weight bits = 18\n", 719 | "learning rate = 0.5\n", 720 | "initial_t = 0\n", 721 | "power_t = 0.5\n", 722 | "decay_learning_rate = 1\n", 723 | "creating cache_file = cache_train.vw\n", 724 | "Reading datafile = bike.vw\n", 725 | "num sources = 1\n", 726 | "average since example example current current current\n", 727 | "loss last counter weight label predict features\n", 728 | "8.027098 8.027098 1 1.0 2.8332 0.0000 12\n", 729 | "7.243733 6.460369 2 2.0 3.7136 1.1718 12\n", 730 | "4.184013 1.124293 4 4.0 2.6391 2.4762 12\n", 731 | "2.709537 1.235061 8 8.0 1.3863 1.5636 12\n", 732 | "2.265795 1.822052 16 16.0 4.7095 3.7598 13\n", 733 | "1.325281 0.384768 32 32.0 2.1972 
1.5774 13\n", 734 | "1.350559 1.375836 64 64.0 5.0626 3.8186 13\n", 735 | "1.395717 1.440876 128 128.0 4.2195 4.0547 13\n", 736 | "1.165618 0.935518 256 256.0 2.0794 3.3485 13\n", 737 | "0.952714 0.739810 512 512.0 4.0775 3.6438 13\n", 738 | "0.757944 0.563175 1024 1024.0 5.4116 4.0760 13\n", 739 | "0.583856 0.409769 2048 2048.0 1.0986 1.0007 13\n", 740 | "0.453590 0.323324 4096 4096.0 5.4027 5.5651 13\n", 741 | "0.393729 0.333867 8192 8192.0 3.8286 4.1227 12\n", 742 | "0.561750 0.561750 16384 16384.0 4.3944 4.0809 13 h\n", 743 | "0.509105 0.456460 32768 32768.0 4.4659 4.4656 13 h\n", 744 | "0.468332 0.427559 65536 65536.0 4.5951 4.4378 13 h\n", 745 | "\n", 746 | "finished run\n", 747 | "number of examples per pass = 15999\n", 748 | "passes used = 6\n", 749 | "weighted example sum = 95994.000000\n", 750 | "weighted label sum = 439183.191893\n", 751 | "average loss = 0.427485 h\n", 752 | "best constant = 4.575111\n", 753 | "total feature number = 1235898\n", 754 | "------------ COMPLETED ------------\n", 755 | "\n" 756 | ] 757 | } 758 | ], 759 | "source": [ 760 | "params = 'bike.vw -f regression.model -k --cache_file cache_train.vw --passes=1000 --hash strings --holdout_after 16000'\n", 761 | "results = execute_vw(params)" 762 | ] 763 | }, 764 | { 765 | "cell_type": "code", 766 | "execution_count": 47, 767 | "metadata": { 768 | "collapsed": false 769 | }, 770 | "outputs": [ 771 | { 772 | "name": "stdout", 773 | "output_type": "stream", 774 | "text": [ 775 | "only testing\n", 776 | "predictions = pred.test\n", 777 | "Num weight bits = 18\n", 778 | "learning rate = 0.5\n", 779 | "initial_t = 0\n", 780 | "power_t = 0.5\n", 781 | "creating cache_file = cache_test.vw\n", 782 | "Reading datafile = bike.vw\n", 783 | "num sources = 1\n", 784 | "average since example example current current current\n", 785 | "loss last counter weight label predict features\n", 786 | "0.127379 0.127379 1 1.0 2.8332 3.1901 12\n", 787 | "0.751745 1.376112 2 2.0 3.7136 2.5405 12\n", 788 | "1.210345 1.668944 4 4.0 2.6391 1.5334 12\n", 789 | "2.774795 4.339245 8 8.0 1.3863 4.3803 12\n", 790 | "2.276018 1.777242 16 16.0 4.7095 4.8526 13\n", 791 | "2.179675 2.083333 32 32.0 2.1972 4.6568 13\n", 792 | "1.411963 0.644251 64 64.0 5.0626 5.1554 13\n", 793 | "0.836451 0.260938 128 128.0 4.2195 4.6608 13\n", 794 | "0.677186 0.517921 256 256.0 2.0794 2.8816 13\n", 795 | "0.600932 0.524678 512 512.0 4.0775 4.0583 13\n", 796 | "0.512835 0.424738 1024 1024.0 5.4116 4.8593 13\n", 797 | "0.498590 0.484345 2048 2048.0 1.0986 1.0587 13\n", 798 | "0.422767 0.346943 4096 4096.0 5.4027 5.7840 13\n", 799 | "0.407376 0.391985 8192 8192.0 3.8286 3.9312 12\n", 800 | "0.374806 0.342236 16384 16384.0 5.7900 5.4536 12\n", 801 | "\n", 802 | "finished run\n", 803 | "number of examples per pass = 17379\n", 804 | "passes used = 1\n", 805 | "weighted example sum = 17379.000000\n", 806 | "weighted label sum = 79504.382239\n", 807 | "average loss = 0.380562\n", 808 | "best constant = 4.574739\n", 809 | "total feature number = 223723\n", 810 | "------------ COMPLETED ------------\n", 811 | "\n" 812 | ] 813 | } 814 | ], 815 | "source": [ 816 | "params = '-t bike.vw -i regression.model -k --cache_file cache_test.vw -p pred.test'\n", 817 | "results = execute_vw(params)" 818 | ] 819 | }, 820 | { 821 | "cell_type": "code", 822 | "execution_count": 10, 823 | "metadata": { 824 | "collapsed": false 825 | }, 826 | "outputs": [ 827 | { 828 | "name": "stdout", 829 | "output_type": "stream", 830 | "text": [ 831 | "holdout RMSE: 135.306\n", 832 | "holdout RMSLE: 
0.845\n" 833 | ] 834 | } 835 | ], 836 | "source": [ 837 | "val_rmse = 0\n", 838 | "val_rmsle = 0\n", 839 | "with open('pred.test', 'rb') as R:\n", 840 | " with open('bike.vw', 'rb') as TRAIN:\n", 841 | " holdouts = 0.0\n", 842 | " for n,(line, example) in enumerate(zip(R,TRAIN)):\n", 843 | " if n > 16000:\n", 844 | " predicted = float(line.strip())\n", 845 | " y_log = float(example.split('|')[0])\n", 846 | " y = apply_exp(y_log)\n", 847 | " val_rmse += (apply_exp(predicted) - y)**2\n", 848 | " val_rmsle += (predicted - y_log)**2\n", 849 | " holdouts += 1\n", 850 | " \n", 851 | "print 'holdout RMSE: %0.3f' % ((val_rmse / holdouts)**0.5)\n", 852 | "print 'holdout RMSLE: %0.3f' % ((val_rmsle / holdouts)**0.5)\n" 853 | ] 854 | }, 855 | { 856 | "cell_type": "markdown", 857 | "metadata": {}, 858 | "source": [ 859 | "###Covertype" 860 | ] 861 | }, 862 | { 863 | "cell_type": "code", 864 | "execution_count": 8, 865 | "metadata": { 866 | "collapsed": false 867 | }, 868 | "outputs": [], 869 | "source": [ 870 | "import os\n", 871 | "local_path = os.getcwd()\n", 872 | "n_vars = ['var_'+'0'*int(j<10)+str(j) for j in range(54)]\n", 873 | "source = 'shuffled_covtype.data'\n", 874 | "origin = target_file=local_path+'\\\\'+source\n", 875 | "target = target_file=local_path+'\\\\'+'covtype.vw'\n", 876 | "vw_convert(origin, target, binary_features=list(), fieldnames= n_vars+['covertype'], numeric_features=n_vars,\n", 877 | " target = 'covertype', separator=',', classification=True, multiclass=True, header=False, sparse=False)" 878 | ] 879 | }, 880 | { 881 | "cell_type": "code", 882 | "execution_count": 20, 883 | "metadata": { 884 | "collapsed": false 885 | }, 886 | "outputs": [ 887 | { 888 | "name": "stdout", 889 | "output_type": "stream", 890 | "text": [ 891 | "creating cubic features for triples: nnn \n", 892 | "final_regressor = multiclass.model\n", 893 | "Num weight bits = 18\n", 894 | "learning rate = 1\n", 895 | "initial_t = 0\n", 896 | "power_t = 0.5\n", 897 | "decay_learning_rate = 1\n", 898 | "creating cache_file = cache_train.vw\n", 899 | "Reading datafile = covtype.vw\n", 900 | "num sources = 1\n", 901 | "average since example example current current current\n", 902 | "loss last counter weight label predict features\n", 903 | "0.000000 0.000000 1 1.0 1 1 377\n", 904 | "0.000000 0.000000 2 2.0 1 1 377\n", 905 | "0.250000 0.500000 4 4.0 2 1 377\n", 906 | "0.375000 0.500000 8 8.0 1 2 377\n", 907 | "0.437500 0.500000 16 16.0 2 1 231\n", 908 | "0.531250 0.625000 32 32.0 1 2 377\n", 909 | "0.546875 0.562500 64 64.0 2 1 377\n", 910 | "0.500000 0.453125 128 128.0 1 1 377\n", 911 | "0.519531 0.539063 256 256.0 2 2 377\n", 912 | "0.484375 0.449219 512 512.0 2 2 377\n", 913 | "0.446289 0.408203 1024 1024.0 3 6 377\n", 914 | "0.416504 0.386719 2048 2048.0 2 2 377\n", 915 | "0.402100 0.387695 4096 4096.0 1 1 377\n", 916 | "0.372559 0.343018 8192 8192.0 1 1 298\n", 917 | "0.348694 0.324829 16384 16384.0 1 1 377\n", 918 | "0.319092 0.289490 32768 32768.0 2 2 377\n", 919 | "0.297256 0.275421 65536 65536.0 2 2 377\n", 920 | "0.278419 0.259583 131072 131072.0 2 2 377\n", 921 | "0.263660 0.248901 262144 262144.0 2 2 377\n", 922 | "0.253858 0.253858 524288 524288.0 1 1 377 h\n", 923 | "\n", 924 | "finished run\n", 925 | "number of examples per pass = 522911\n", 926 | "passes used = 2\n", 927 | "weighted example sum = 1045822.000000\n", 928 | "weighted label sum = 0.000000\n", 929 | "average loss = 0.235538 h\n", 930 | "total feature number = 384838154\n", 931 | "------------ COMPLETED ------------\n", 932 | "\n" 933 | 
] 934 | } 935 | ], 936 | "source": [ 937 | "params = 'covtype.vw --ect 7 -f multiclass.model -k --cache_file cache_train.vw --passes=2 -l 1.0 --cubic nnn'\n", 938 | "results = execute_vw(params)" 939 | ] 940 | }, 941 | { 942 | "cell_type": "code", 943 | "execution_count": 21, 944 | "metadata": { 945 | "collapsed": false 946 | }, 947 | "outputs": [ 948 | { 949 | "name": "stdout", 950 | "output_type": "stream", 951 | "text": [ 952 | "creating cubic features for triples: nnn \n", 953 | "only testing\n", 954 | "predictions = covertype.test\n", 955 | "Num weight bits = 18\n", 956 | "learning rate = 0.5\n", 957 | "initial_t = 0\n", 958 | "power_t = 0.5\n", 959 | "creating cache_file = cache_test.vw\n", 960 | "Reading datafile = covtype.vw\n", 961 | "num sources = 1\n", 962 | "average since example example current current current\n", 963 | "loss last counter weight label predict features\n", 964 | "0.000000 0.000000 1 1.0 1 1 377\n", 965 | "0.000000 0.000000 2 2.0 1 1 377\n", 966 | "0.000000 0.000000 4 4.0 2 2 377\n", 967 | "0.000000 0.000000 8 8.0 1 1 377\n", 968 | "0.187500 0.375000 16 16.0 1 2 377\n", 969 | "0.156250 0.125000 32 32.0 3 3 377\n", 970 | "0.156250 0.156250 64 64.0 2 1 377\n", 971 | "0.218750 0.281250 128 128.0 2 2 377\n", 972 | "0.222656 0.226563 256 256.0 2 2 377\n", 973 | "0.240234 0.257813 512 512.0 2 2 377\n", 974 | "0.234375 0.228516 1024 1024.0 2 2 377\n", 975 | "0.242676 0.250977 2048 2048.0 2 2 377\n", 976 | "0.242920 0.243164 4096 4096.0 1 1 377\n", 977 | "0.236328 0.229736 8192 8192.0 1 1 377\n", 978 | "0.231079 0.225830 16384 16384.0 1 1 298\n", 979 | "0.229858 0.228638 32768 32768.0 1 1 377\n", 980 | "0.232224 0.234589 65536 65536.0 1 1 377\n", 981 | "0.231529 0.230835 131072 131072.0 2 2 377\n", 982 | "0.231815 0.232101 262144 262144.0 2 2 377\n", 983 | "0.231606 0.231396 524288 524288.0 1 1 377\n", 984 | "\n", 985 | "finished run\n", 986 | "number of examples per pass = 581012\n", 987 | "passes used = 1\n", 988 | "weighted example sum = 581012.000000\n", 989 | "weighted label sum = 0.000000\n", 990 | "average loss = 0.231111\n", 991 | "total feature number = 213797603\n", 992 | "------------ COMPLETED ------------\n", 993 | "\n" 994 | ] 995 | } 996 | ], 997 | "source": [ 998 | "params = '-t covtype.vw -i multiclass.model -k --cache_file cache_test.vw -p covertype.test'\n", 999 | "results = execute_vw(params)" 1000 | ] 1001 | }, 1002 | { 1003 | "cell_type": "code", 1004 | "execution_count": 8, 1005 | "metadata": { 1006 | "collapsed": false 1007 | }, 1008 | "outputs": [ 1009 | { 1010 | "name": "stdout", 1011 | "output_type": "stream", 1012 | "text": [ 1013 | "holdout accuracy: 0.769\n" 1014 | ] 1015 | } 1016 | ], 1017 | "source": [ 1018 | "accuracy = 0\n", 1019 | "with open('covertype.test', 'rb') as R:\n", 1020 | " with open('covtype.vw', 'rb') as TRAIN:\n", 1021 | " holdouts = 0.0\n", 1022 | " for n,(line, example) in enumerate(zip(R,TRAIN)):\n", 1023 | " if (n+1) % 10==0:\n", 1024 | " predicted = float(line.strip())\n", 1025 | " y = float(example.split('|')[0])\n", 1026 | " accuracy += predicted ==y\n", 1027 | " holdouts += 1\n", 1028 | "print 'holdout accuracy: %0.3f' % (accuracy / holdouts)" 1029 | ] 1030 | } 1031 | ], 1032 | "metadata": { 1033 | "kernelspec": { 1034 | "display_name": "Python 2", 1035 | "language": "python", 1036 | "name": "python2" 1037 | }, 1038 | "language_info": { 1039 | "codemirror_mode": { 1040 | "name": "ipython", 1041 | "version": 2 1042 | }, 1043 | "file_extension": ".py", 1044 | "mimetype": "text/x-python", 1045 | "name": "python", 1046 
| "nbconvert_exporter": "python", 1047 | "pygments_lexer": "ipython2", 1048 | "version": "2.7.9" 1049 | } 1050 | }, 1051 | "nbformat": 4, 1052 | "nbformat_minor": 0 1053 | } 1054 | -------------------------------------------------------------------------------- /Chapter 08/Chapter_8_code_HDFS.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [ 10 | { 11 | "name": "stdout", 12 | "output_type": "stream", 13 | "text": [ 14 | "16/05/10 19:34:19 INFO fs.TrashPolicyDefault: Namenode trash configuration: Deletion interval = 0 minutes, Emptier interval = 0 minutes.\n", 15 | "Deleted /tmp\n", 16 | "16/05/10 19:34:22 INFO fs.TrashPolicyDefault: Namenode trash configuration: Deletion interval = 0 minutes, Emptier interval = 0 minutes.\n" 17 | ] 18 | } 19 | ], 20 | "source": [ 21 | "# Clean up\n", 22 | "!hdfs dfs -rm -r -f /datasets /tmp\n", 23 | "!rm -rf /tmp/hadoop_git_readme*\n", 24 | "!hdfs dfs -expunge" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": {}, 30 | "source": [ 31 | "## Command line" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 2, 37 | "metadata": { 38 | "collapsed": false 39 | }, 40 | "outputs": [ 41 | { 42 | "name": "stdout", 43 | "output_type": "stream", 44 | "text": [ 45 | "Configured Capacity: 42241163264 (39.34 GB)\r\n", 46 | "Present Capacity: 37536710656 (34.96 GB)\r\n", 47 | "DFS Remaining: 37346992128 (34.78 GB)\r\n", 48 | "DFS Used: 189718528 (180.93 MB)\r\n", 49 | "DFS Used%: 0.51%\r\n", 50 | "Under replicated blocks: 0\r\n", 51 | "Blocks with corrupt replicas: 0\r\n", 52 | "Missing blocks: 0\r\n", 53 | "\r\n", 54 | "-------------------------------------------------\r\n", 55 | "Live datanodes (1):\r\n", 56 | "\r\n", 57 | "Name: 127.0.0.1:50010 (localhost)\r\n", 58 | "Hostname: sparkbox\r\n", 59 | "Decommission Status : Normal\r\n", 60 | "Configured Capacity: 42241163264 (39.34 GB)\r\n", 61 | "DFS Used: 189718528 (180.93 MB)\r\n", 62 | "Non DFS Used: 4704452608 (4.38 GB)\r\n", 63 | "DFS Remaining: 37346992128 (34.78 GB)\r\n", 64 | "DFS Used%: 0.45%\r\n", 65 | "DFS Remaining%: 88.41%\r\n", 66 | "Configured Cache Capacity: 0 (0 B)\r\n", 67 | "Cache Used: 0 (0 B)\r\n", 68 | "Cache Remaining: 0 (0 B)\r\n", 69 | "Cache Used%: 100.00%\r\n", 70 | "Cache Remaining%: 0.00%\r\n", 71 | "Xceivers: 1\r\n", 72 | "Last contact: Tue May 10 19:34:23 UTC 2016\r\n", 73 | "\r\n", 74 | "\r\n" 75 | ] 76 | } 77 | ], 78 | "source": [ 79 | "!hdfs dfsadmin -report" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": 3, 85 | "metadata": { 86 | "collapsed": false 87 | }, 88 | "outputs": [ 89 | { 90 | "name": "stdout", 91 | "output_type": "stream", 92 | "text": [ 93 | "Found 2 items\r\n", 94 | "drwxr-xr-x - vagrant supergroup 0 2016-05-10 19:05 /spark\r\n", 95 | "drwxr-xr-x - vagrant supergroup 0 2016-05-10 18:48 /user\r\n" 96 | ] 97 | } 98 | ], 99 | "source": [ 100 | "!hdfs dfs -ls /" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": 4, 106 | "metadata": { 107 | "collapsed": false 108 | }, 109 | "outputs": [ 110 | { 111 | "name": "stdout", 112 | "output_type": "stream", 113 | "text": [ 114 | "Filesystem Size Used Available Use%\r\n", 115 | "hdfs://localhost:9000 39.3 G 180.9 M 34.8 G 0%\r\n" 116 | ] 117 | } 118 | ], 119 | "source": [ 120 | "!hdfs dfs -df -h /" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | 
"execution_count": 5, 126 | "metadata": { 127 | "collapsed": false 128 | }, 129 | "outputs": [ 130 | { 131 | "name": "stdout", 132 | "output_type": "stream", 133 | "text": [ 134 | "179.0 M /spark\r\n", 135 | "473.4 K /user\r\n" 136 | ] 137 | } 138 | ], 139 | "source": [ 140 | "!hdfs dfs -du -h /" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": 6, 146 | "metadata": { 147 | "collapsed": false 148 | }, 149 | "outputs": [], 150 | "source": [ 151 | "!hdfs dfs -mkdir /datasets" 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": 7, 157 | "metadata": { 158 | "collapsed": false 159 | }, 160 | "outputs": [], 161 | "source": [ 162 | "!wget -q http://www.gutenberg.org/cache/epub/100/pg100.txt \\\n", 163 | " -O ../datasets/shakespeare_all.txt" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": 8, 169 | "metadata": { 170 | "collapsed": false 171 | }, 172 | "outputs": [], 173 | "source": [ 174 | "!hdfs dfs -put ../datasets/shakespeare_all.txt \\\n", 175 | " /datasets/shakespeare_all.txt\n", 176 | "\n", 177 | "!hdfs dfs -put ../datasets/hadoop_git_readme.txt \\\n", 178 | " /datasets/hadoop_git_readme.txt" 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": 9, 184 | "metadata": { 185 | "collapsed": false 186 | }, 187 | "outputs": [ 188 | { 189 | "name": "stdout", 190 | "output_type": "stream", 191 | "text": [ 192 | "Found 2 items\r\n", 193 | "-rw-r--r-- 1 vagrant supergroup 1365 2016-05-10 19:34 /datasets/hadoop_git_readme.txt\r\n", 194 | "-rw-r--r-- 1 vagrant supergroup 5589889 2016-05-10 19:34 /datasets/shakespeare_all.txt\r\n" 195 | ] 196 | } 197 | ], 198 | "source": [ 199 | "!hdfs dfs -ls /datasets" 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": 10, 205 | "metadata": { 206 | "collapsed": false 207 | }, 208 | "outputs": [ 209 | { 210 | "name": "stdout", 211 | "output_type": "stream", 212 | "text": [ 213 | "30\r\n" 214 | ] 215 | } 216 | ], 217 | "source": [ 218 | "!hdfs dfs -cat /datasets/hadoop_git_readme.txt | wc -l" 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": 11, 224 | "metadata": { 225 | "collapsed": false 226 | }, 227 | "outputs": [ 228 | { 229 | "name": "stdout", 230 | "output_type": "stream", 231 | "text": [ 232 | "60\r\n" 233 | ] 234 | } 235 | ], 236 | "source": [ 237 | "!hdfs dfs -cat \\\n", 238 | " hdfs:///datasets/hadoop_git_readme.txt \\\n", 239 | " file:///home/vagrant/datasets/hadoop_git_readme.txt | wc -l" 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": 12, 245 | "metadata": { 246 | "collapsed": true 247 | }, 248 | "outputs": [], 249 | "source": [ 250 | "!hdfs dfs -cp /datasets/hadoop_git_readme.txt \\\n", 251 | " /datasets/copy_hadoop_git_readme.txt" 252 | ] 253 | }, 254 | { 255 | "cell_type": "code", 256 | "execution_count": 13, 257 | "metadata": { 258 | "collapsed": false 259 | }, 260 | "outputs": [ 261 | { 262 | "name": "stdout", 263 | "output_type": "stream", 264 | "text": [ 265 | "16/05/10 19:35:07 INFO fs.TrashPolicyDefault: Namenode trash configuration: Deletion interval = 0 minutes, Emptier interval = 0 minutes.\r\n", 266 | "Deleted /datasets/copy_hadoop_git_readme.txt\r\n" 267 | ] 268 | } 269 | ], 270 | "source": [ 271 | "!hdfs dfs -rm /datasets/copy_hadoop_git_readme.txt" 272 | ] 273 | }, 274 | { 275 | "cell_type": "code", 276 | "execution_count": 14, 277 | "metadata": { 278 | "collapsed": false 279 | }, 280 | "outputs": [ 281 | { 282 | "name": "stdout", 283 | "output_type": "stream", 
284 | "text": [ 285 | "16/05/10 19:35:09 INFO fs.TrashPolicyDefault: Namenode trash configuration: Deletion interval = 0 minutes, Emptier interval = 0 minutes.\r\n" 286 | ] 287 | } 288 | ], 289 | "source": [ 290 | "!hdfs dfs -expunge" 291 | ] 292 | }, 293 | { 294 | "cell_type": "code", 295 | "execution_count": 15, 296 | "metadata": { 297 | "collapsed": false 298 | }, 299 | "outputs": [], 300 | "source": [ 301 | "!hdfs dfs -get /datasets/hadoop_git_readme.txt \\\n", 302 | " /tmp/hadoop_git_readme.txt" 303 | ] 304 | }, 305 | { 306 | "cell_type": "code", 307 | "execution_count": 16, 308 | "metadata": { 309 | "collapsed": false 310 | }, 311 | "outputs": [ 312 | { 313 | "name": "stdout", 314 | "output_type": "stream", 315 | "text": [ 316 | "ntry, of \r\n", 317 | "encryption software. BEFORE using any encryption software, please \r\n", 318 | "check your country's laws, regulations and policies concerning the\r\n", 319 | "import, possession, or use, and re-export of encryption software, to \r\n", 320 | "see if this is permitted. See for more\r\n", 321 | "information.\r\n", 322 | "\r\n", 323 | "The U.S. Government Department of Commerce, Bureau of Industry and\r\n", 324 | "Security (BIS), has classified this software as Export Commodity \r\n", 325 | "Control Number (ECCN) 5D002.C.1, which includes information security\r\n", 326 | "software using or performing cryptographic functions with asymmetric\r\n", 327 | "algorithms. The form and manner of this Apache Software Foundation\r\n", 328 | "distribution makes it eligible for export under the License Exception\r\n", 329 | "ENC Technology Software Unrestricted (TSU) exception (see the BIS \r\n", 330 | "Export Administration Regulations, Section 740.13) for both object \r\n", 331 | "code and source code.\r\n", 332 | "\r\n", 333 | "The following provides more details on the included cryptographic\r\n", 334 | "software:\r\n", 335 | " Hadoop Core uses the SSL libraries from the Jetty project written \r\n", 336 | "by mortbay.org." 
337 | ] 338 | } 339 | ], 340 | "source": [ 341 | "!hdfs dfs -tail /datasets/hadoop_git_readme.txt" 342 | ] 343 | }, 344 | { 345 | "cell_type": "markdown", 346 | "metadata": { 347 | "collapsed": true 348 | }, 349 | "source": [ 350 | "## Snakebite" 351 | ] 352 | }, 353 | { 354 | "cell_type": "code", 355 | "execution_count": 17, 356 | "metadata": { 357 | "collapsed": false 358 | }, 359 | "outputs": [], 360 | "source": [ 361 | "from snakebite.client import Client\n", 362 | "client = Client(\"localhost\", 9000)" 363 | ] 364 | }, 365 | { 366 | "cell_type": "code", 367 | "execution_count": 18, 368 | "metadata": { 369 | "collapsed": false 370 | }, 371 | "outputs": [ 372 | { 373 | "data": { 374 | "text/plain": [ 375 | "{'blockSize': 134217728L,\n", 376 | " 'bytesPerChecksum': 512,\n", 377 | " 'checksumType': 2,\n", 378 | " 'encryptDataTransfer': False,\n", 379 | " 'fileBufferSize': 4096,\n", 380 | " 'replication': 1,\n", 381 | " 'trashInterval': 0L,\n", 382 | " 'writePacketSize': 65536}" 383 | ] 384 | }, 385 | "execution_count": 18, 386 | "metadata": {}, 387 | "output_type": "execute_result" 388 | } 389 | ], 390 | "source": [ 391 | "client.serverdefaults()" 392 | ] 393 | }, 394 | { 395 | "cell_type": "code", 396 | "execution_count": 19, 397 | "metadata": { 398 | "collapsed": false 399 | }, 400 | "outputs": [ 401 | { 402 | "name": "stdout", 403 | "output_type": "stream", 404 | "text": [ 405 | "/datasets\n", 406 | "/spark\n", 407 | "/user\n" 408 | ] 409 | } 410 | ], 411 | "source": [ 412 | "for x in client.ls(['/']):\n", 413 | " print x['path']" 414 | ] 415 | }, 416 | { 417 | "cell_type": "code", 418 | "execution_count": 20, 419 | "metadata": { 420 | "collapsed": false 421 | }, 422 | "outputs": [ 423 | { 424 | "data": { 425 | "text/plain": [ 426 | "{'capacity': 42241163264L,\n", 427 | " 'corrupt_blocks': 0L,\n", 428 | " 'filesystem': 'hdfs://localhost:9000',\n", 429 | " 'missing_blocks': 0L,\n", 430 | " 'remaining': 37341663232L,\n", 431 | " 'under_replicated': 0L,\n", 432 | " 'used': 195353480L}" 433 | ] 434 | }, 435 | "execution_count": 20, 436 | "metadata": {}, 437 | "output_type": "execute_result" 438 | } 439 | ], 440 | "source": [ 441 | "client.df()" 442 | ] 443 | }, 444 | { 445 | "cell_type": "code", 446 | "execution_count": 21, 447 | "metadata": { 448 | "collapsed": false 449 | }, 450 | "outputs": [ 451 | { 452 | "data": { 453 | "text/plain": [ 454 | "[{'length': 5591254L, 'path': '/datasets'},\n", 455 | " {'length': 187698038L, 'path': '/spark'},\n", 456 | " {'length': 484810L, 'path': '/user'}]" 457 | ] 458 | }, 459 | "execution_count": 21, 460 | "metadata": {}, 461 | "output_type": "execute_result" 462 | } 463 | ], 464 | "source": [ 465 | "list(client.du([\"/\"]))" 466 | ] 467 | }, 468 | { 469 | "cell_type": "code", 470 | "execution_count": 22, 471 | "metadata": { 472 | "collapsed": false 473 | }, 474 | "outputs": [], 475 | "source": [ 476 | "# Note:\n", 477 | "# put command is not yet available" 478 | ] 479 | }, 480 | { 481 | "cell_type": "code", 482 | "execution_count": 23, 483 | "metadata": { 484 | "collapsed": false 485 | }, 486 | "outputs": [ 487 | { 488 | "name": "stdout", 489 | "output_type": "stream", 490 | "text": [ 491 | "30\n" 492 | ] 493 | } 494 | ], 495 | "source": [ 496 | "for el in client.cat(['/datasets/hadoop_git_readme.txt']):\n", 497 | " print el.next().count(\"\\n\")" 498 | ] 499 | }, 500 | { 501 | "cell_type": "code", 502 | "execution_count": 24, 503 | "metadata": { 504 | "collapsed": false 505 | }, 506 | "outputs": [], 507 | "source": [ 508 | "# Note:\n", 509 | "# copy 
command is not yet available" 510 | ] 511 | }, 512 | { 513 | "cell_type": "code", 514 | "execution_count": 25, 515 | "metadata": { 516 | "collapsed": false 517 | }, 518 | "outputs": [ 519 | { 520 | "data": { 521 | "text/plain": [ 522 | "{'path': '/datasets/shakespeare_all.txt', 'result': True}" 523 | ] 524 | }, 525 | "execution_count": 25, 526 | "metadata": {}, 527 | "output_type": "execute_result" 528 | } 529 | ], 530 | "source": [ 531 | "client.delete(['/datasets/shakespeare_all.txt']).next()" 532 | ] 533 | }, 534 | { 535 | "cell_type": "code", 536 | "execution_count": 26, 537 | "metadata": { 538 | "collapsed": false 539 | }, 540 | "outputs": [ 541 | { 542 | "data": { 543 | "text/plain": [ 544 | "{'error': '',\n", 545 | " 'path': '/tmp/hadoop_git_readme_2.txt',\n", 546 | " 'result': True,\n", 547 | " 'source_path': '/datasets/hadoop_git_readme.txt'}" 548 | ] 549 | }, 550 | "execution_count": 26, 551 | "metadata": {}, 552 | "output_type": "execute_result" 553 | } 554 | ], 555 | "source": [ 556 | "(client\n", 557 | ".copyToLocal(['/datasets/hadoop_git_readme.txt'], \n", 558 | " '/tmp/hadoop_git_readme_2.txt')\n", 559 | ".next())" 560 | ] 561 | }, 562 | { 563 | "cell_type": "code", 564 | "execution_count": 27, 565 | "metadata": { 566 | "collapsed": false 567 | }, 568 | "outputs": [ 569 | { 570 | "data": { 571 | "text/plain": [ 572 | "[{'path': '/datasets_2', 'result': True}]" 573 | ] 574 | }, 575 | "execution_count": 27, 576 | "metadata": {}, 577 | "output_type": "execute_result" 578 | } 579 | ], 580 | "source": [ 581 | "list(client.mkdir(['/datasets_2']))" 582 | ] 583 | }, 584 | { 585 | "cell_type": "code", 586 | "execution_count": 28, 587 | "metadata": { 588 | "collapsed": false 589 | }, 590 | "outputs": [ 591 | { 592 | "data": { 593 | "text/plain": [ 594 | "[{'path': '/datasets', 'result': True},\n", 595 | " {'path': '/datasets_2', 'result': True}]" 596 | ] 597 | }, 598 | "execution_count": 28, 599 | "metadata": {}, 600 | "output_type": "execute_result" 601 | } 602 | ], 603 | "source": [ 604 | "list(client.delete(['/datasets*'], recurse=True))" 605 | ] 606 | }, 607 | { 608 | "cell_type": "code", 609 | "execution_count": null, 610 | "metadata": { 611 | "collapsed": true 612 | }, 613 | "outputs": [], 614 | "source": [] 615 | } 616 | ], 617 | "metadata": { 618 | "kernelspec": { 619 | "display_name": "Python 2", 620 | "language": "python", 621 | "name": "python2" 622 | }, 623 | "language_info": { 624 | "codemirror_mode": { 625 | "name": "ipython", 626 | "version": 2 627 | }, 628 | "file_extension": ".py", 629 | "mimetype": "text/x-python", 630 | "name": "python", 631 | "nbconvert_exporter": "python", 632 | "pygments_lexer": "ipython2", 633 | "version": "2.7.6" 634 | } 635 | }, 636 | "nbformat": 4, 637 | "nbformat_minor": 0 638 | } 639 | -------------------------------------------------------------------------------- /Chapter 08/Chapter_8_code_MR.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "Let's first insert some data in the HDFS" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": { 14 | "collapsed": false 15 | }, 16 | "outputs": [ 17 | { 18 | "name": "stdout", 19 | "output_type": "stream", 20 | "text": [ 21 | "Found 2 items\r\n", 22 | "-rw-r--r-- 1 vagrant supergroup 1365 2016-05-10 19:58 /datasets/hadoop_git_readme.txt\r\n", 23 | "-rw-r--r-- 1 vagrant supergroup 5589889 2016-05-10 19:58 
/datasets/shakespeare_all.txt\r\n" 24 | ] 25 | } 26 | ], 27 | "source": [ 28 | "!hdfs dfs -mkdir -p /datasets\n", 29 | "!wget -q http://www.gutenberg.org/cache/epub/100/pg100.txt \\\n", 30 | " -O ../datasets/shakespeare_all.txt\n", 31 | "!hdfs dfs -put -f ../datasets/shakespeare_all.txt /datasets/shakespeare_all.txt\n", 32 | "!hdfs dfs -put -f ../datasets/hadoop_git_readme.txt /datasets/hadoop_git_readme.txt\n", 33 | "!hdfs dfs -ls /datasets" 34 | ] 35 | }, 36 | { 37 | "cell_type": "markdown", 38 | "metadata": {}, 39 | "source": [ 40 | "## MR with Hadoop streaming" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 2, 46 | "metadata": { 47 | "collapsed": true 48 | }, 49 | "outputs": [], 50 | "source": [ 51 | "with open('mapper_hadoop.py', 'w') as fh:\n", 52 | " fh.write(\"\"\"#!/usr/bin/env python\n", 53 | "\n", 54 | "import sys\n", 55 | "\n", 56 | "for line in sys.stdin:\n", 57 | " print \"chars\", len(line.rstrip('\\\\n'))\n", 58 | " print \"words\", len(line.split())\n", 59 | " print \"lines\", 1\n", 60 | " \"\"\")\n", 61 | "\n", 62 | "\n", 63 | "with open('reducer_hadoop.py', 'w') as fh:\n", 64 | " fh.write(\"\"\"#!/usr/bin/env python\n", 65 | "\n", 66 | "import sys\n", 67 | "\n", 68 | "counts = {\"chars\": 0, \"words\":0, \"lines\":0}\n", 69 | "\n", 70 | "for line in sys.stdin:\n", 71 | " kv = line.rstrip().split()\n", 72 | " counts[kv[0]] += int(kv[1])\n", 73 | "\n", 74 | "for k,v in counts.items():\n", 75 | " print k, v\n", 76 | " \"\"\") " 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 3, 82 | "metadata": { 83 | "collapsed": true 84 | }, 85 | "outputs": [], 86 | "source": [ 87 | "!chmod a+x *_hadoop.py" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": 4, 93 | "metadata": { 94 | "collapsed": false 95 | }, 96 | "outputs": [ 97 | { 98 | "name": "stdout", 99 | "output_type": "stream", 100 | "text": [ 101 | "chars 1335\r\n", 102 | "lines 31\r\n", 103 | "words 179\r\n" 104 | ] 105 | } 106 | ], 107 | "source": [ 108 | "!cat ../datasets/hadoop_git_readme.txt | ./mapper_hadoop.py | sort -k1,1 | ./reducer_hadoop.py" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": 5, 114 | "metadata": { 115 | "collapsed": false 116 | }, 117 | "outputs": [ 118 | { 119 | "name": "stdout", 120 | "output_type": "stream", 121 | "text": [ 122 | "16/05/10 19:58:48 INFO fs.TrashPolicyDefault: Namenode trash configuration: Deletion interval = 0 minutes, Emptier interval = 0 minutes.\n", 123 | "Deleted /tmp/mr.out\n", 124 | "packageJobJar: [/tmp/hadoop-unjar5384590696382062055/] [] /tmp/streamjob1965588122940844531.jar tmpDir=null\n", 125 | "16/05/10 19:58:50 INFO client.RMProxy: Connecting to ResourceManager at /0.0.0.0:8032\n", 126 | "16/05/10 19:58:51 INFO client.RMProxy: Connecting to ResourceManager at /0.0.0.0:8032\n", 127 | "16/05/10 19:58:51 INFO mapred.FileInputFormat: Total input paths to process : 1\n", 128 | "16/05/10 19:58:51 INFO mapreduce.JobSubmitter: number of splits:2\n", 129 | "16/05/10 19:58:52 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1462906052477_0019\n", 130 | "16/05/10 19:58:52 INFO impl.YarnClientImpl: Submitted application application_1462906052477_0019\n", 131 | "16/05/10 19:58:52 INFO mapreduce.Job: The url to track the job: http://sparkbox:8088/proxy/application_1462906052477_0019/\n", 132 | "16/05/10 19:58:52 INFO mapreduce.Job: Running job: job_1462906052477_0019\n", 133 | "16/05/10 19:58:58 INFO mapreduce.Job: Job job_1462906052477_0019 running in uber mode : false\n", 134 
| "16/05/10 19:58:58 INFO mapreduce.Job: map 0% reduce 0%\n", 135 | "16/05/10 19:59:03 INFO mapreduce.Job: map 50% reduce 0%\n", 136 | "16/05/10 19:59:08 INFO mapreduce.Job: map 100% reduce 0%\n", 137 | "16/05/10 19:59:14 INFO mapreduce.Job: map 100% reduce 100%\n", 138 | "16/05/10 19:59:14 INFO mapreduce.Job: Job job_1462906052477_0019 completed successfully\n", 139 | "16/05/10 19:59:14 INFO mapreduce.Job: Counters: 49\n", 140 | "\tFile System Counters\n", 141 | "\t\tFILE: Number of bytes read=1060\n", 142 | "\t\tFILE: Number of bytes written=332854\n", 143 | "\t\tFILE: Number of read operations=0\n", 144 | "\t\tFILE: Number of large read operations=0\n", 145 | "\t\tFILE: Number of write operations=0\n", 146 | "\t\tHDFS: Number of bytes read=2256\n", 147 | "\t\tHDFS: Number of bytes written=33\n", 148 | "\t\tHDFS: Number of read operations=9\n", 149 | "\t\tHDFS: Number of large read operations=0\n", 150 | "\t\tHDFS: Number of write operations=2\n", 151 | "\tJob Counters \n", 152 | "\t\tLaunched map tasks=2\n", 153 | "\t\tLaunched reduce tasks=1\n", 154 | "\t\tData-local map tasks=2\n", 155 | "\t\tTotal time spent by all maps in occupied slots (ms)=6732\n", 156 | "\t\tTotal time spent by all reduces in occupied slots (ms)=3739\n", 157 | "\t\tTotal time spent by all map tasks (ms)=6732\n", 158 | "\t\tTotal time spent by all reduce tasks (ms)=3739\n", 159 | "\t\tTotal vcore-milliseconds taken by all map tasks=6732\n", 160 | "\t\tTotal vcore-milliseconds taken by all reduce tasks=3739\n", 161 | "\t\tTotal megabyte-milliseconds taken by all map tasks=6893568\n", 162 | "\t\tTotal megabyte-milliseconds taken by all reduce tasks=3828736\n", 163 | "\tMap-Reduce Framework\n", 164 | "\t\tMap input records=31\n", 165 | "\t\tMap output records=93\n", 166 | "\t\tMap output bytes=868\n", 167 | "\t\tMap output materialized bytes=1066\n", 168 | "\t\tInput split bytes=208\n", 169 | "\t\tCombine input records=0\n", 170 | "\t\tCombine output records=0\n", 171 | "\t\tReduce input groups=23\n", 172 | "\t\tReduce shuffle bytes=1066\n", 173 | "\t\tReduce input records=93\n", 174 | "\t\tReduce output records=3\n", 175 | "\t\tSpilled Records=186\n", 176 | "\t\tShuffled Maps =2\n", 177 | "\t\tFailed Shuffles=0\n", 178 | "\t\tMerged Map outputs=2\n", 179 | "\t\tGC time elapsed (ms)=78\n", 180 | "\t\tCPU time spent (ms)=1830\n", 181 | "\t\tPhysical memory (bytes) snapshot=699170816\n", 182 | "\t\tVirtual memory (bytes) snapshot=2495647744\n", 183 | "\t\tTotal committed heap usage (bytes)=512229376\n", 184 | "\tShuffle Errors\n", 185 | "\t\tBAD_ID=0\n", 186 | "\t\tCONNECTION=0\n", 187 | "\t\tIO_ERROR=0\n", 188 | "\t\tWRONG_LENGTH=0\n", 189 | "\t\tWRONG_MAP=0\n", 190 | "\t\tWRONG_REDUCE=0\n", 191 | "\tFile Input Format Counters \n", 192 | "\t\tBytes Read=2048\n", 193 | "\tFile Output Format Counters \n", 194 | "\t\tBytes Written=33\n", 195 | "16/05/10 19:59:14 INFO streaming.StreamJob: Output directory: /tmp/mr.out\n" 196 | ] 197 | } 198 | ], 199 | "source": [ 200 | "!hdfs dfs -mkdir -p /tmp\n", 201 | "!hdfs dfs -rm -f -r /tmp/mr.out\n", 202 | "\n", 203 | "!hadoop jar /usr/local/hadoop/share/hadoop/tools/lib/hadoop-streaming-2.6.4.jar \\\n", 204 | "-files mapper_hadoop.py,reducer_hadoop.py \\\n", 205 | "-mapper mapper_hadoop.py -reducer reducer_hadoop.py \\\n", 206 | "-input /datasets/hadoop_git_readme.txt -output /tmp/mr.out\n", 207 | "\n" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": 6, 213 | "metadata": { 214 | "collapsed": false 215 | }, 216 | "outputs": [ 217 | { 218 | "name": 
"stdout", 219 | "output_type": "stream", 220 | "text": [ 221 | "Found 2 items\r\n", 222 | "-rw-r--r-- 1 vagrant supergroup 0 2016-05-10 19:59 /tmp/mr.out/_SUCCESS\r\n", 223 | "-rw-r--r-- 1 vagrant supergroup 33 2016-05-10 19:59 /tmp/mr.out/part-00000\r\n" 224 | ] 225 | } 226 | ], 227 | "source": [ 228 | "!hdfs dfs -ls /tmp/mr.out" 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": 7, 234 | "metadata": { 235 | "collapsed": false 236 | }, 237 | "outputs": [ 238 | { 239 | "name": "stdout", 240 | "output_type": "stream", 241 | "text": [ 242 | "chars 1335\t\r\n", 243 | "lines 31\t\r\n", 244 | "words 179\t\r\n" 245 | ] 246 | } 247 | ], 248 | "source": [ 249 | "!hdfs dfs -cat /tmp/mr.out/part-00000" 250 | ] 251 | }, 252 | { 253 | "cell_type": "code", 254 | "execution_count": null, 255 | "metadata": { 256 | "collapsed": true 257 | }, 258 | "outputs": [], 259 | "source": [] 260 | }, 261 | { 262 | "cell_type": "markdown", 263 | "metadata": {}, 264 | "source": [ 265 | "## MR with Python MrJob library" 266 | ] 267 | }, 268 | { 269 | "cell_type": "code", 270 | "execution_count": 8, 271 | "metadata": { 272 | "collapsed": true 273 | }, 274 | "outputs": [], 275 | "source": [ 276 | "with open(\"MrJob_job1.py\", \"w\") as fh:\n", 277 | " fh.write(\"\"\"\n", 278 | "from mrjob.job import MRJob\n", 279 | "\n", 280 | "\n", 281 | "class MRWordFrequencyCount(MRJob):\n", 282 | "\n", 283 | " def mapper(self, _, line):\n", 284 | " yield \"chars\", len(line)\n", 285 | " yield \"words\", len(line.split())\n", 286 | " yield \"lines\", 1\n", 287 | "\n", 288 | " def reducer(self, key, values):\n", 289 | " yield key, sum(values)\n", 290 | "\n", 291 | "\n", 292 | "if __name__ == '__main__':\n", 293 | " MRWordFrequencyCount.run() \n", 294 | " \"\"\")" 295 | ] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "execution_count": 9, 300 | "metadata": { 301 | "collapsed": false 302 | }, 303 | "outputs": [ 304 | { 305 | "name": "stdout", 306 | "output_type": "stream", 307 | "text": [ 308 | "No configs found; falling back on auto-configuration\r\n", 309 | "Creating temp directory /tmp/MrJob_job1.vagrant.20160510.195920.590984\r\n", 310 | "Running step 1 of 1...\r\n", 311 | "Streaming final output from /tmp/MrJob_job1.vagrant.20160510.195920.590984/output...\r\n", 312 | "\"chars\"\t1335\r\n", 313 | "\"lines\"\t31\r\n", 314 | "\"words\"\t179\r\n", 315 | "Removing temp directory /tmp/MrJob_job1.vagrant.20160510.195920.590984...\r\n" 316 | ] 317 | } 318 | ], 319 | "source": [ 320 | "!python MrJob_job1.py ../datasets/hadoop_git_readme.txt" 321 | ] 322 | }, 323 | { 324 | "cell_type": "code", 325 | "execution_count": 10, 326 | "metadata": { 327 | "collapsed": false 328 | }, 329 | "outputs": [ 330 | { 331 | "name": "stdout", 332 | "output_type": "stream", 333 | "text": [ 334 | "No configs found; falling back on auto-configuration\n", 335 | "Looking for hadoop binary in /usr/local/hadoop/bin...\n", 336 | "Found hadoop binary: /usr/local/hadoop/bin/hadoop\n", 337 | "Creating temp directory /tmp/MrJob_job1.vagrant.20160510.195920.870616\n", 338 | "Using Hadoop version 2.6.4\n", 339 | "Copying local files to hdfs:///user/vagrant/tmp/mrjob/MrJob_job1.vagrant.20160510.195920.870616/files/...\n", 340 | "Looking for Hadoop streaming jar in /usr/local/hadoop...\n", 341 | "Found Hadoop streaming jar: /usr/local/hadoop/share/hadoop/tools/lib/hadoop-streaming-2.6.4.jar\n", 342 | "Running step 1 of 1...\n", 343 | " packageJobJar: [/tmp/hadoop-unjar7634308048659876233/] [] /tmp/streamjob5879999650692493094.jar tmpDir=null\n", 
344 | " Connecting to ResourceManager at /0.0.0.0:8032\n", 345 | " Connecting to ResourceManager at /0.0.0.0:8032\n", 346 | " Total input paths to process : 1\n", 347 | " number of splits:2\n", 348 | " Submitting tokens for job: job_1462906052477_0020\n", 349 | " Submitted application application_1462906052477_0020\n", 350 | " The url to track the job: http://sparkbox:8088/proxy/application_1462906052477_0020/\n", 351 | " Running job: job_1462906052477_0020\n", 352 | " Job job_1462906052477_0020 running in uber mode : false\n", 353 | " map 0% reduce 0%\n", 354 | " map 50% reduce 0%\n", 355 | " map 100% reduce 0%\n", 356 | " map 100% reduce 100%\n", 357 | " Job job_1462906052477_0020 completed successfully\n", 358 | " Output directory: hdfs:///user/vagrant/tmp/mrjob/MrJob_job1.vagrant.20160510.195920.870616/output\n", 359 | "Counters: 50\n", 360 | "\tFile Input Format Counters \n", 361 | "\t\tBytes Read=2048\n", 362 | "\tFile Output Format Counters \n", 363 | "\t\tBytes Written=36\n", 364 | "\tFile System Counters\n", 365 | "\t\tFILE: Number of bytes read=1153\n", 366 | "\t\tFILE: Number of bytes written=337717\n", 367 | "\t\tFILE: Number of large read operations=0\n", 368 | "\t\tFILE: Number of read operations=0\n", 369 | "\t\tFILE: Number of write operations=0\n", 370 | "\t\tHDFS: Number of bytes read=2256\n", 371 | "\t\tHDFS: Number of bytes written=36\n", 372 | "\t\tHDFS: Number of large read operations=0\n", 373 | "\t\tHDFS: Number of read operations=9\n", 374 | "\t\tHDFS: Number of write operations=2\n", 375 | "\tJob Counters \n", 376 | "\t\tData-local map tasks=2\n", 377 | "\t\tKilled map tasks=1\n", 378 | "\t\tLaunched map tasks=2\n", 379 | "\t\tLaunched reduce tasks=1\n", 380 | "\t\tTotal megabyte-milliseconds taken by all map tasks=7394304\n", 381 | "\t\tTotal megabyte-milliseconds taken by all reduce tasks=3846144\n", 382 | "\t\tTotal time spent by all map tasks (ms)=7221\n", 383 | "\t\tTotal time spent by all maps in occupied slots (ms)=7221\n", 384 | "\t\tTotal time spent by all reduce tasks (ms)=3756\n", 385 | "\t\tTotal time spent by all reduces in occupied slots (ms)=3756\n", 386 | "\t\tTotal vcore-milliseconds taken by all map tasks=7221\n", 387 | "\t\tTotal vcore-milliseconds taken by all reduce tasks=3756\n", 388 | "\tMap-Reduce Framework\n", 389 | "\t\tCPU time spent (ms)=1830\n", 390 | "\t\tCombine input records=0\n", 391 | "\t\tCombine output records=0\n", 392 | "\t\tFailed Shuffles=0\n", 393 | "\t\tGC time elapsed (ms)=66\n", 394 | "\t\tInput split bytes=208\n", 395 | "\t\tMap input records=31\n", 396 | "\t\tMap output bytes=961\n", 397 | "\t\tMap output materialized bytes=1159\n", 398 | "\t\tMap output records=93\n", 399 | "\t\tMerged Map outputs=2\n", 400 | "\t\tPhysical memory (bytes) snapshot=726175744\n", 401 | "\t\tReduce input groups=3\n", 402 | "\t\tReduce input records=93\n", 403 | "\t\tReduce output records=3\n", 404 | "\t\tReduce shuffle bytes=1159\n", 405 | "\t\tShuffled Maps =2\n", 406 | "\t\tSpilled Records=186\n", 407 | "\t\tTotal committed heap usage (bytes)=515899392\n", 408 | "\t\tVirtual memory (bytes) snapshot=2496479232\n", 409 | "\tShuffle Errors\n", 410 | "\t\tBAD_ID=0\n", 411 | "\t\tCONNECTION=0\n", 412 | "\t\tIO_ERROR=0\n", 413 | "\t\tWRONG_LENGTH=0\n", 414 | "\t\tWRONG_MAP=0\n", 415 | "\t\tWRONG_REDUCE=0\n", 416 | "Streaming final output from hdfs:///user/vagrant/tmp/mrjob/MrJob_job1.vagrant.20160510.195920.870616/output...\n", 417 | "\"chars\"\t1335\n", 418 | "\"lines\"\t31\n", 419 | "\"words\"\t179\n", 420 | "Removing HDFS temp directory 
hdfs:///user/vagrant/tmp/mrjob/MrJob_job1.vagrant.20160510.195920.870616...\n", 421 | "Removing temp directory /tmp/MrJob_job1.vagrant.20160510.195920.870616...\n" 422 | ] 423 | } 424 | ], 425 | "source": [ 426 | "!python MrJob_job1.py -r hadoop hdfs:///datasets/hadoop_git_readme.txt" 427 | ] 428 | }, 429 | { 430 | "cell_type": "code", 431 | "execution_count": 11, 432 | "metadata": { 433 | "collapsed": true 434 | }, 435 | "outputs": [], 436 | "source": [ 437 | "with open(\"MrJob_job2.py\", \"w\") as fh:\n", 438 | " fh.write(\"\"\"\n", 439 | "from mrjob.job import MRJob\n", 440 | "from mrjob.step import MRStep\n", 441 | "import re\n", 442 | "\n", 443 | "WORD_RE = re.compile(r\"[\\w']+\")\n", 444 | "\n", 445 | "\n", 446 | "class MRMostUsedWord(MRJob):\n", 447 | "\n", 448 | " def steps(self):\n", 449 | " return [\n", 450 | " MRStep(mapper=self.mapper_get_words,\n", 451 | " reducer=self.reducer_count_words),\n", 452 | " MRStep(mapper=self.mapper_word_count_one_key,\n", 453 | " reducer=self.reducer_find_max_word)\n", 454 | " ]\n", 455 | "\n", 456 | " def mapper_get_words(self, _, line):\n", 457 | " # yield each word in the line\n", 458 | " for word in WORD_RE.findall(line):\n", 459 | " yield (word.lower(), 1)\n", 460 | "\n", 461 | " def reducer_count_words(self, word, counts):\n", 462 | " # send all (num_occurrences, word) pairs to the same reducer.\n", 463 | " yield (word, sum(counts))\n", 464 | " \n", 465 | " def mapper_word_count_one_key(self, word, counts):\n", 466 | " # send all the tuples to same reducer\n", 467 | " yield None, (counts, word)\n", 468 | "\n", 469 | " def reducer_find_max_word(self, _, count_word_pairs):\n", 470 | " # each item of word_count_pairs is a tuple (count, word),\n", 471 | " yield max(count_word_pairs)\n", 472 | "\n", 473 | "\n", 474 | "if __name__ == '__main__':\n", 475 | " MRMostUsedWord.run()\n", 476 | "\"\"\")" 477 | ] 478 | }, 479 | { 480 | "cell_type": "code", 481 | "execution_count": 12, 482 | "metadata": { 483 | "collapsed": false 484 | }, 485 | "outputs": [ 486 | { 487 | "name": "stdout", 488 | "output_type": "stream", 489 | "text": [ 490 | "27801\t\"the\"\r\n" 491 | ] 492 | } 493 | ], 494 | "source": [ 495 | "# This time is running on a big dataset\n", 496 | "!python MrJob_job2.py --quiet ../datasets/shakespeare_all.txt" 497 | ] 498 | }, 499 | { 500 | "cell_type": "code", 501 | "execution_count": 13, 502 | "metadata": { 503 | "collapsed": false 504 | }, 505 | "outputs": [ 506 | { 507 | "name": "stdout", 508 | "output_type": "stream", 509 | "text": [ 510 | "27801\t\"the\"\r\n" 511 | ] 512 | } 513 | ], 514 | "source": [ 515 | "!python MrJob_job2.py -r hadoop --quiet hdfs:///datasets/shakespeare_all.txt" 516 | ] 517 | }, 518 | { 519 | "cell_type": "code", 520 | "execution_count": null, 521 | "metadata": { 522 | "collapsed": true 523 | }, 524 | "outputs": [], 525 | "source": [] 526 | } 527 | ], 528 | "metadata": { 529 | "kernelspec": { 530 | "display_name": "Python 2", 531 | "language": "python", 532 | "name": "python2" 533 | }, 534 | "language_info": { 535 | "codemirror_mode": { 536 | "name": "ipython", 537 | "version": 2 538 | }, 539 | "file_extension": ".py", 540 | "mimetype": "text/x-python", 541 | "name": "python", 542 | "nbconvert_exporter": "python", 543 | "pygments_lexer": "ipython2", 544 | "version": "2.7.6" 545 | } 546 | }, 547 | "nbformat": 4, 548 | "nbformat_minor": 0 549 | } 550 | -------------------------------------------------------------------------------- /Chapter 08/Chapter_8_code_Spark.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "Let's first insert some data in the HDFS" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": { 14 | "collapsed": false 15 | }, 16 | "outputs": [ 17 | { 18 | "name": "stdout", 19 | "output_type": "stream", 20 | "text": [ 21 | "Found 2 items\n", 22 | "-rw-r--r-- 1 vagrant supergroup 1365 2016-05-10 20:06 /datasets/hadoop_git_readme.txt\n", 23 | "-rw-r--r-- 1 vagrant supergroup 5589889 2016-05-10 20:06 /datasets/shakespeare_all.txt\n", 24 | "16/05/10 20:06:36 INFO fs.TrashPolicyDefault: Namenode trash configuration: Deletion interval = 0 minutes, Emptier interval = 0 minutes.\n", 25 | "Deleted /tmp\n" 26 | ] 27 | } 28 | ], 29 | "source": [ 30 | "!hdfs dfs -mkdir -p /datasets\n", 31 | "!wget -q http://www.gutenberg.org/cache/epub/100/pg100.txt \\\n", 32 | " -O ../datasets/shakespeare_all.txt\n", 33 | "!hdfs dfs -put -f ../datasets/shakespeare_all.txt /datasets/shakespeare_all.txt\n", 34 | "!hdfs dfs -put -f ../datasets/hadoop_git_readme.txt /datasets/hadoop_git_readme.txt\n", 35 | "!hdfs dfs -ls /datasets\n", 36 | "!hdfs dfs -rm -r /tmp" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": {}, 42 | "source": [ 43 | "## pySpark" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 2, 49 | "metadata": { 50 | "collapsed": false 51 | }, 52 | "outputs": [ 53 | { 54 | "data": { 55 | "text/plain": [ 56 | "[(u'spark.rdd.compress', u'True'),\n", 57 | " (u'spark.master', u'yarn-client'),\n", 58 | " (u'spark.serializer.objectStreamReset', u'100'),\n", 59 | " (u'spark.yarn.isPython', u'true'),\n", 60 | " (u'spark.submit.deployMode', u'client'),\n", 61 | " (u'spark.executor.cores', u'2'),\n", 62 | " (u'spark.app.name', u'PySparkShell')]" 63 | ] 64 | }, 65 | "execution_count": 2, 66 | "metadata": {}, 67 | "output_type": "execute_result" 68 | } 69 | ], 70 | "source": [ 71 | "sc._conf.getAll()" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 3, 77 | "metadata": { 78 | "collapsed": false 79 | }, 80 | "outputs": [ 81 | { 82 | "data": { 83 | "text/plain": [ 84 | "ParallelCollectionRDD[0] at parallelize at PythonRDD.scala:423" 85 | ] 86 | }, 87 | "execution_count": 3, 88 | "metadata": {}, 89 | "output_type": "execute_result" 90 | } 91 | ], 92 | "source": [ 93 | "numbers = range(10)\n", 94 | "numbers_rdd = sc.parallelize(numbers)\n", 95 | "\n", 96 | "numbers_rdd" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": 4, 102 | "metadata": { 103 | "collapsed": false 104 | }, 105 | "outputs": [ 106 | { 107 | "data": { 108 | "text/plain": [ 109 | "[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]" 110 | ] 111 | }, 112 | "execution_count": 4, 113 | "metadata": {}, 114 | "output_type": "execute_result" 115 | } 116 | ], 117 | "source": [ 118 | "numbers_rdd.collect()" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": 5, 124 | "metadata": { 125 | "collapsed": false 126 | }, 127 | "outputs": [ 128 | { 129 | "data": { 130 | "text/plain": [ 131 | "[0, 1, 2, 3]" 132 | ] 133 | }, 134 | "execution_count": 5, 135 | "metadata": {}, 136 | "output_type": "execute_result" 137 | } 138 | ], 139 | "source": [ 140 | "numbers_rdd.take(4)" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": 6, 146 | "metadata": { 147 | "collapsed": false 148 | }, 149 | "outputs": [ 150 | { 151 | "data": { 152 | "text/plain": [ 153 | 
"u'For the latest information about Hadoop, please visit our website at:'" 154 | ] 155 | }, 156 | "execution_count": 6, 157 | "metadata": {}, 158 | "output_type": "execute_result" 159 | } 160 | ], 161 | "source": [ 162 | "sc.textFile(\"hdfs:///datasets/hadoop_git_readme.txt\").first()" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": 7, 168 | "metadata": { 169 | "collapsed": false 170 | }, 171 | "outputs": [ 172 | { 173 | "data": { 174 | "text/plain": [ 175 | "u'For the latest information about Hadoop, please visit our website at:'" 176 | ] 177 | }, 178 | "execution_count": 7, 179 | "metadata": {}, 180 | "output_type": "execute_result" 181 | } 182 | ], 183 | "source": [ 184 | "sc.textFile(\"file:///home/vagrant/datasets/hadoop_git_readme.txt\").first()" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": 8, 190 | "metadata": { 191 | "collapsed": true 192 | }, 193 | "outputs": [], 194 | "source": [ 195 | "numbers_rdd.saveAsTextFile(\"hdfs:///tmp/numbers_1_10.txt\")" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": 9, 201 | "metadata": { 202 | "collapsed": false 203 | }, 204 | "outputs": [ 205 | { 206 | "name": "stdout", 207 | "output_type": "stream", 208 | "text": [ 209 | "Found 5 items\r\n", 210 | "-rw-r--r-- 1 vagrant supergroup 0 2016-05-10 20:06 /tmp/numbers_1_10.txt/_SUCCESS\r\n", 211 | "-rw-r--r-- 1 vagrant supergroup 4 2016-05-10 20:06 /tmp/numbers_1_10.txt/part-00000\r\n", 212 | "-rw-r--r-- 1 vagrant supergroup 4 2016-05-10 20:06 /tmp/numbers_1_10.txt/part-00001\r\n", 213 | "-rw-r--r-- 1 vagrant supergroup 4 2016-05-10 20:06 /tmp/numbers_1_10.txt/part-00002\r\n", 214 | "-rw-r--r-- 1 vagrant supergroup 8 2016-05-10 20:06 /tmp/numbers_1_10.txt/part-00003\r\n" 215 | ] 216 | } 217 | ], 218 | "source": [ 219 | "!hdfs dfs -ls /tmp/numbers_1_10.txt" 220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": 10, 225 | "metadata": { 226 | "collapsed": true 227 | }, 228 | "outputs": [], 229 | "source": [ 230 | "numbers_rdd.coalesce(1).saveAsTextFile(\"hdfs:///tmp/numbers_1_10_one_file.txt\")" 231 | ] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "execution_count": 11, 236 | "metadata": { 237 | "collapsed": false 238 | }, 239 | "outputs": [ 240 | { 241 | "name": "stdout", 242 | "output_type": "stream", 243 | "text": [ 244 | "Found 2 items\r\n", 245 | "-rw-r--r-- 1 vagrant supergroup 0 2016-05-10 20:06 /tmp/numbers_1_10_one_file.txt/_SUCCESS\r\n", 246 | "-rw-r--r-- 1 vagrant supergroup 20 2016-05-10 20:06 /tmp/numbers_1_10_one_file.txt/part-00000\r\n" 247 | ] 248 | } 249 | ], 250 | "source": [ 251 | "!hdfs dfs -ls /tmp/numbers_1_10_one_file.txt" 252 | ] 253 | }, 254 | { 255 | "cell_type": "code", 256 | "execution_count": 12, 257 | "metadata": { 258 | "collapsed": false 259 | }, 260 | "outputs": [ 261 | { 262 | "name": "stdout", 263 | "output_type": "stream", 264 | "text": [ 265 | "0\r\n", 266 | "1\r\n", 267 | "2\r\n", 268 | "3\r\n", 269 | "4\r\n", 270 | "5\r\n", 271 | "6\r\n", 272 | "7\r\n", 273 | "8\r\n", 274 | "9\r\n" 275 | ] 276 | } 277 | ], 278 | "source": [ 279 | "!hdfs dfs -cat /tmp/numbers_1_10_one_file.txt/part-00000" 280 | ] 281 | }, 282 | { 283 | "cell_type": "code", 284 | "execution_count": 13, 285 | "metadata": { 286 | "collapsed": true 287 | }, 288 | "outputs": [], 289 | "source": [ 290 | "numbers_rdd.saveAsTextFile(\"file:///tmp/numbers_1_10.txt\")" 291 | ] 292 | }, 293 | { 294 | "cell_type": "code", 295 | "execution_count": 14, 296 | "metadata": { 297 | "collapsed": false 298 | 
}, 299 | "outputs": [ 300 | { 301 | "name": "stdout", 302 | "output_type": "stream", 303 | "text": [ 304 | "part-00000 part-00001\tpart-00002 part-00003\t_SUCCESS\r\n" 305 | ] 306 | } 307 | ], 308 | "source": [ 309 | "!ls /tmp/numbers_1_10.txt" 310 | ] 311 | }, 312 | { 313 | "cell_type": "code", 314 | "execution_count": 15, 315 | "metadata": { 316 | "collapsed": false 317 | }, 318 | "outputs": [ 319 | { 320 | "data": { 321 | "text/plain": [ 322 | "[0, 1, 4, 9, 16, 25, 36, 49, 64, 81]" 323 | ] 324 | }, 325 | "execution_count": 15, 326 | "metadata": {}, 327 | "output_type": "execute_result" 328 | } 329 | ], 330 | "source": [ 331 | "def sq(x):\n", 332 | " return x**2\n", 333 | "\n", 334 | "numbers_rdd.map(sq).collect()" 335 | ] 336 | }, 337 | { 338 | "cell_type": "code", 339 | "execution_count": 16, 340 | "metadata": { 341 | "collapsed": false 342 | }, 343 | "outputs": [ 344 | { 345 | "data": { 346 | "text/plain": [ 347 | "[0, 1, 4, 9, 16, 25, 36, 49, 64, 81]" 348 | ] 349 | }, 350 | "execution_count": 16, 351 | "metadata": {}, 352 | "output_type": "execute_result" 353 | } 354 | ], 355 | "source": [ 356 | "numbers_rdd.map(lambda x: x**2).collect()" 357 | ] 358 | }, 359 | { 360 | "cell_type": "code", 361 | "execution_count": 17, 362 | "metadata": { 363 | "collapsed": false 364 | }, 365 | "outputs": [ 366 | { 367 | "data": { 368 | "text/plain": [ 369 | "285" 370 | ] 371 | }, 372 | "execution_count": 17, 373 | "metadata": {}, 374 | "output_type": "execute_result" 375 | } 376 | ], 377 | "source": [ 378 | "numbers_rdd.map(lambda x: x**2).reduce(lambda a,b: a+b)" 379 | ] 380 | }, 381 | { 382 | "cell_type": "code", 383 | "execution_count": 18, 384 | "metadata": { 385 | "collapsed": false 386 | }, 387 | "outputs": [ 388 | { 389 | "data": { 390 | "text/plain": [ 391 | "285" 392 | ] 393 | }, 394 | "execution_count": 18, 395 | "metadata": {}, 396 | "output_type": "execute_result" 397 | } 398 | ], 399 | "source": [ 400 | "numbers_rdd.map(lambda x: x**2).sum()" 401 | ] 402 | }, 403 | { 404 | "cell_type": "code", 405 | "execution_count": 19, 406 | "metadata": { 407 | "collapsed": false 408 | }, 409 | "outputs": [ 410 | { 411 | "data": { 412 | "text/plain": [ 413 | "[('even', 0),\n", 414 | " ('odd', 1),\n", 415 | " ('even', 2),\n", 416 | " ('odd', 3),\n", 417 | " ('even', 4),\n", 418 | " ('odd', 5),\n", 419 | " ('even', 6),\n", 420 | " ('odd', 7),\n", 421 | " ('even', 8),\n", 422 | " ('odd', 9)]" 423 | ] 424 | }, 425 | "execution_count": 19, 426 | "metadata": {}, 427 | "output_type": "execute_result" 428 | } 429 | ], 430 | "source": [ 431 | "def tag(x):\n", 432 | " return \"even\" if x%2==0 else \"odd\"\n", 433 | " \n", 434 | " \n", 435 | "numbers_rdd.map(lambda x: (tag(x), x) ).collect()" 436 | ] 437 | }, 438 | { 439 | "cell_type": "code", 440 | "execution_count": 20, 441 | "metadata": { 442 | "collapsed": false 443 | }, 444 | "outputs": [ 445 | { 446 | "data": { 447 | "text/plain": [ 448 | "[('even', 20), ('odd', 25)]" 449 | ] 450 | }, 451 | "execution_count": 20, 452 | "metadata": {}, 453 | "output_type": "execute_result" 454 | } 455 | ], 456 | "source": [ 457 | "numbers_rdd.map(lambda x: (tag(x), x) ).reduceByKey(lambda a,b: a+b).collect()" 458 | ] 459 | }, 460 | { 461 | "cell_type": "code", 462 | "execution_count": null, 463 | "metadata": { 464 | "collapsed": true 465 | }, 466 | "outputs": [], 467 | "source": [] 468 | }, 469 | { 470 | "cell_type": "code", 471 | "execution_count": null, 472 | "metadata": { 473 | "collapsed": true 474 | }, 475 | "outputs": [], 476 | "source": [] 477 | }, 478 | { 479 | 
"cell_type": "code", 480 | "execution_count": 21, 481 | "metadata": { 482 | "collapsed": false 483 | }, 484 | "outputs": [ 485 | { 486 | "name": "stdout", 487 | "output_type": "stream", 488 | "text": [ 489 | "{'chars': 1335, 'lines': 31, 'words': 179}\n" 490 | ] 491 | } 492 | ], 493 | "source": [ 494 | "def emit_feats(line):\n", 495 | " return [(\"chars\", len(line)), \\\n", 496 | " (\"words\", len(line.split())), \\\n", 497 | " (\"lines\", 1)]\n", 498 | "\n", 499 | "print (sc.textFile(\"/datasets/hadoop_git_readme.txt\")\n", 500 | " .flatMap(emit_feats)\n", 501 | " .reduceByKey(lambda a,b: a+b)\n", 502 | " .collectAsMap())" 503 | ] 504 | }, 505 | { 506 | "cell_type": "code", 507 | "execution_count": 22, 508 | "metadata": { 509 | "collapsed": false 510 | }, 511 | "outputs": [ 512 | { 513 | "name": "stdout", 514 | "output_type": "stream", 515 | "text": [ 516 | "[(27801, u'the')]\n" 517 | ] 518 | } 519 | ], 520 | "source": [ 521 | "import re\n", 522 | "WORD_RE = re.compile(r\"[\\w']+\")\n", 523 | "\n", 524 | "print (sc.textFile(\"/datasets/shakespeare_all.txt\")\n", 525 | " .flatMap(lambda line: WORD_RE.findall(line))\n", 526 | " .map(lambda word: (word.lower(), 1))\n", 527 | " .reduceByKey(lambda a,b: a+b)\n", 528 | " .map(lambda (k,v): (v,k))\n", 529 | " .takeOrdered(1, key = lambda x: -x[0]))" 530 | ] 531 | }, 532 | { 533 | "cell_type": "code", 534 | "execution_count": 23, 535 | "metadata": { 536 | "collapsed": false 537 | }, 538 | "outputs": [ 539 | { 540 | "name": "stdout", 541 | "output_type": "stream", 542 | "text": [ 543 | "[(u'the', 27801)]\n" 544 | ] 545 | } 546 | ], 547 | "source": [ 548 | "print (sc.textFile(\"/datasets/shakespeare_all.txt\")\n", 549 | " .flatMap(lambda line: [(word.lower(), 1) for word in WORD_RE.findall(line)])\n", 550 | " .reduceByKey(lambda a,b: a+b)\n", 551 | " .takeOrdered(1, key = lambda x: -x[1]))" 552 | ] 553 | }, 554 | { 555 | "cell_type": "code", 556 | "execution_count": null, 557 | "metadata": { 558 | "collapsed": true 559 | }, 560 | "outputs": [], 561 | "source": [] 562 | } 563 | ], 564 | "metadata": { 565 | "kernelspec": { 566 | "display_name": "Python 2", 567 | "language": "python", 568 | "name": "python2" 569 | }, 570 | "language_info": { 571 | "codemirror_mode": { 572 | "name": "ipython", 573 | "version": 2 574 | }, 575 | "file_extension": ".py", 576 | "mimetype": "text/x-python", 577 | "name": "python", 578 | "nbconvert_exporter": "python", 579 | "pygments_lexer": "ipython2", 580 | "version": "2.7.6" 581 | } 582 | }, 583 | "nbformat": 4, 584 | "nbformat_minor": 0 585 | } 586 | -------------------------------------------------------------------------------- /Chapter 08/Chapter_8_code_Vagrantfile: -------------------------------------------------------------------------------- 1 | Vagrant.configure("2") do |config| 2 | config.vm.box = "sparkpy/sparkbox_test_1" 3 | config.vm.hostname = "sparkbox" 4 | config.ssh.insert_key = false 5 | 6 | # Hadoop ResourceManager 7 | config.vm.network :forwarded_port, guest: 8088, host: 8088, auto_correct: true 8 | 9 | # Hadoop NameNode 10 | config.vm.network :forwarded_port, guest: 50070, host: 50070, auto_correct: true 11 | 12 | # Hadoop DataNode 13 | config.vm.network :forwarded_port, guest: 50075, host: 50075, auto_correct: true 14 | 15 | # Ipython notebooks (yarn and standalone) 16 | config.vm.network :forwarded_port, guest: 8888, host: 8888, auto_correct: true 17 | 18 | 19 | config.vm.provider "virtualbox" do |v| 20 | v.customize ["modifyvm", :id, "--natdnshostresolver1", "on"] 21 | v.customize ["modifyvm", 
:id, "--natdnsproxy1", "on"] 22 | v.customize ["modifyvm", :id, "--nictype1", "virtio"] 23 | 24 | v.name = "sparkbox_test" 25 | v.memory = "4096" 26 | v.cpus = "2" 27 | end 28 | 29 | end 30 | -------------------------------------------------------------------------------- /Chapter 09/Chapter_9_code_01.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Sharing data within the cluster" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "##### Read-only variables (broadcast)" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 1, 20 | "metadata": { 21 | "collapsed": false 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "# Example: let's encode the gender found in the demographic data\n", 26 | "# As a hot encode. Note: the association should be the same\n", 27 | "# on every machine in the cluster, requiring a shared mapping\n", 28 | "\n", 29 | "one_hot_encoding = {\"M\": (1, 0, 0),\n", 30 | " \"F\": (0, 1, 0),\n", 31 | " \"U\": (0, 0, 1)\n", 32 | " }" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 2, 38 | "metadata": { 39 | "collapsed": false 40 | }, 41 | "outputs": [ 42 | { 43 | "data": { 44 | "text/plain": [ 45 | "[(1, 0, 0), (0, 1, 0), (0, 0, 1), (0, 1, 0), (1, 0, 0), (0, 0, 1)]" 46 | ] 47 | }, 48 | "execution_count": 2, 49 | "metadata": {}, 50 | "output_type": "execute_result" 51 | } 52 | ], 53 | "source": [ 54 | "# Gender one-hot-encoding\n", 55 | "(sc.parallelize([\"M\", \"F\", \"U\", \"F\", \"M\", \"U\"])\n", 56 | " .map(lambda x: one_hot_encoding[x])\n", 57 | " .collect())\n", 58 | "\n", 59 | "# The command above works only in the single node configuration\n", 60 | "# since the variable \"one_hot_encoding\" is defined only on this machine\n", 61 | "# On a multi-node cluster, it will raise a Java error" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": 3, 67 | "metadata": { 68 | "collapsed": false 69 | }, 70 | "outputs": [ 71 | { 72 | "data": { 73 | "text/plain": [ 74 | "[(1, 0, 0), (0, 1, 0), (0, 0, 1), (0, 1, 0), (1, 0, 0), (0, 0, 1)]" 75 | ] 76 | }, 77 | "execution_count": 3, 78 | "metadata": {}, 79 | "output_type": "execute_result" 80 | } 81 | ], 82 | "source": [ 83 | "# Solution 1: include the encoding map in the .map() function \n", 84 | "# In this way, all the nodes will see it\n", 85 | "\n", 86 | "def map_ohe(x):\n", 87 | " ohe = {\"M\": (1, 0, 0),\n", 88 | " \"F\": (0, 1, 0),\n", 89 | " \"U\": (0, 0, 1)\n", 90 | " }\n", 91 | " return ohe[x]\n", 92 | "\n", 93 | "sc.parallelize([\"M\", \"F\", \"U\", \"F\", \"M\", \"U\"]).map(map_ohe).collect()\n", 94 | "\n" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": 4, 100 | "metadata": { 101 | "collapsed": false 102 | }, 103 | "outputs": [ 104 | { 105 | "data": { 106 | "text/plain": [ 107 | "[(1, 0, 0), (0, 1, 0), (0, 0, 1), (0, 1, 0), (1, 0, 0), (0, 0, 1)]" 108 | ] 109 | }, 110 | "execution_count": 4, 111 | "metadata": {}, 112 | "output_type": "execute_result" 113 | } 114 | ], 115 | "source": [ 116 | "# Solution 2: broadcast the map to all the nodes.\n", 117 | "# All of them will be able to read-only it\n", 118 | "\n", 119 | "bcast_map = sc.broadcast(one_hot_encoding)\n", 120 | "\n", 121 | "def bcast_map_ohe(x, shared_ohe):\n", 122 | " return shared_ohe[x]\n", 123 | "\n", 124 | "(sc.parallelize([\"M\", \"F\", \"U\", \"F\", \"M\", \"U\"])\n", 125 | " .map(lambda x: 
bcast_map_ohe(x, bcast_map.value))\n", 126 | " .collect())" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": 5, 132 | "metadata": { 133 | "collapsed": true 134 | }, 135 | "outputs": [], 136 | "source": [ 137 | "bcast_map.unpersist()" 138 | ] 139 | }, 140 | { 141 | "cell_type": "markdown", 142 | "metadata": {}, 143 | "source": [ 144 | "##### Write-only variables (accumulators)" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": 6, 150 | "metadata": { 151 | "collapsed": false 152 | }, 153 | "outputs": [ 154 | { 155 | "name": "stdout", 156 | "output_type": "stream", 157 | "text": [ 158 | "The number of empty lines is:\n" 159 | ] 160 | }, 161 | { 162 | "data": { 163 | "text/plain": [ 164 | "6" 165 | ] 166 | }, 167 | "execution_count": 6, 168 | "metadata": {}, 169 | "output_type": "execute_result" 170 | } 171 | ], 172 | "source": [ 173 | "# Let's count the empty lines in a file\n", 174 | "\n", 175 | "print \"The number of empty lines is:\"\n", 176 | "\n", 177 | "(sc.textFile('file:///home/vagrant/datasets/hadoop_git_readme.txt')\n", 178 | " .filter(lambda line: len(line) == 0)\n", 179 | " .count())" 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": 7, 185 | "metadata": { 186 | "collapsed": false 187 | }, 188 | "outputs": [ 189 | { 190 | "name": "stdout", 191 | "output_type": "stream", 192 | "text": [ 193 | "In the file there are 31 lines\n", 194 | "And 6 lines are empty\n" 195 | ] 196 | } 197 | ], 198 | "source": [ 199 | "# Let's count the lines in a file, and at the same time,\n", 200 | "# count the empty ones\n", 201 | "\n", 202 | "accum = sc.accumulator(0)\n", 203 | "\n", 204 | "def split_line(line): \n", 205 | " if len(line) == 0:\n", 206 | " accum.add(1)\n", 207 | " return 1\n", 208 | "\n", 209 | "tot_lines = (\n", 210 | " sc.textFile('file:///home/vagrant/datasets/hadoop_git_readme.txt')\n", 211 | " .map(split_line)\n", 212 | " .count())\n", 213 | "\n", 214 | "empty_lines = accum.value\n", 215 | "\n", 216 | "\n", 217 | "print \"In the file there are %d lines\" % tot_lines\n", 218 | "print \"And %d lines are empty\" % empty_lines" 219 | ] 220 | }, 221 | { 222 | "cell_type": "markdown", 223 | "metadata": {}, 224 | "source": [ 225 | "# Real-world example with broadcast and accumulator variables\n", 226 | "### Train multiple classifiers and select the best one, accumulating the errors" 227 | ] 228 | }, 229 | { 230 | "cell_type": "code", 231 | "execution_count": 8, 232 | "metadata": { 233 | "collapsed": true 234 | }, 235 | "outputs": [], 236 | "source": [ 237 | "# step 1: load the dataset\n", 238 | "# note: if the dataset is large, you should read the next section\n", 239 | "\n", 240 | "from sklearn.datasets import load_iris\n", 241 | "\n", 242 | "bcast_dataset = sc.broadcast(load_iris())" 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": 9, 248 | "metadata": { 249 | "collapsed": false 250 | }, 251 | "outputs": [], 252 | "source": [ 253 | "# step 2: create an accumulator that stores the errors in a list\n", 254 | "\n", 255 | "from pyspark import AccumulatorParam\n", 256 | "\n", 257 | "class ErrorAccumulator(AccumulatorParam):\n", 258 | " def zero(self, initialList):\n", 259 | " return initialList\n", 260 | "\n", 261 | " def addInPlace(self, v1, v2):\n", 262 | " if not isinstance(v1, list):\n", 263 | " v1 = [v1]\n", 264 | " if not isinstance(v2, list):\n", 265 | " v2 = [v2]\n", 266 | " return v1 + v2\n", 267 | "\n", 268 | "errAccum = sc.accumulator([], ErrorAccumulator())" 269 | ] 270 | }, 271 | { 272 
| "cell_type": "code", 273 | "execution_count": 10, 274 | "metadata": { 275 | "collapsed": true 276 | }, 277 | "outputs": [], 278 | "source": [ 279 | "# step 3: create mappers: each of them will use a classifier\n", 280 | "\n", 281 | "def apply_classifier(clf, dataset):\n", 282 | " \n", 283 | " clf_name = clf.__class__.__name__\n", 284 | " X = dataset.value.data\n", 285 | " y = dataset.value.target\n", 286 | " \n", 287 | " try:\n", 288 | " from sklearn.metrics import accuracy_score\n", 289 | " \n", 290 | " clf.fit(X, y)\n", 291 | " y_pred = clf.predict(X)\n", 292 | " acc = accuracy_score(y, y_pred)\n", 293 | "\n", 294 | " return [(clf_name, acc)]\n", 295 | "\n", 296 | " except Exception as e:\n", 297 | " errAccum.add((clf_name, str(e)))\n", 298 | " return []\n" 299 | ] 300 | }, 301 | { 302 | "cell_type": "code", 303 | "execution_count": 11, 304 | "metadata": { 305 | "collapsed": false 306 | }, 307 | "outputs": [ 308 | { 309 | "data": { 310 | "text/plain": [ 311 | "[('DummyClassifier', 0.33333333333333331),\n", 312 | " ('SGDClassifier', 0.66666666666666663)]" 313 | ] 314 | }, 315 | "execution_count": 11, 316 | "metadata": {}, 317 | "output_type": "execute_result" 318 | } 319 | ], 320 | "source": [ 321 | "from sklearn.linear_model import SGDClassifier\n", 322 | "from sklearn.dummy import DummyClassifier\n", 323 | "from sklearn.decomposition import PCA\n", 324 | "from sklearn.manifold import MDS\n", 325 | "\n", 326 | "classifiers = [DummyClassifier('most_frequent'), \n", 327 | " SGDClassifier(), \n", 328 | " PCA(), \n", 329 | " MDS()]\n", 330 | "\n", 331 | "(sc.parallelize(classifiers)\n", 332 | " .flatMap(lambda x: apply_classifier(x, bcast_dataset))\n", 333 | " .collect())" 334 | ] 335 | }, 336 | { 337 | "cell_type": "code", 338 | "execution_count": 12, 339 | "metadata": { 340 | "collapsed": false 341 | }, 342 | "outputs": [ 343 | { 344 | "name": "stdout", 345 | "output_type": "stream", 346 | "text": [ 347 | "The errors are:\n" 348 | ] 349 | }, 350 | { 351 | "data": { 352 | "text/plain": [ 353 | "[('PCA', \"'PCA' object has no attribute 'predict'\"),\n", 354 | " ('MDS',\n", 355 | " \"Proximity must be 'precomputed' or 'euclidean'. 
Got euclidean instead\")]" 356 | ] 357 | }, 358 | "execution_count": 12, 359 | "metadata": {}, 360 | "output_type": "execute_result" 361 | } 362 | ], 363 | "source": [ 364 | "print \"The errors are:\"\n", 365 | "errAccum.value" 366 | ] 367 | }, 368 | { 369 | "cell_type": "code", 370 | "execution_count": 13, 371 | "metadata": { 372 | "collapsed": false 373 | }, 374 | "outputs": [], 375 | "source": [ 376 | "bcast_dataset.unpersist()" 377 | ] 378 | }, 379 | { 380 | "cell_type": "markdown", 381 | "metadata": {}, 382 | "source": [ 383 | "# Load the data" 384 | ] 385 | }, 386 | { 387 | "cell_type": "code", 388 | "execution_count": 14, 389 | "metadata": { 390 | "collapsed": true 391 | }, 392 | "outputs": [], 393 | "source": [ 394 | "from pyspark.sql import SQLContext\n", 395 | "sqlContext = SQLContext(sc)" 396 | ] 397 | }, 398 | { 399 | "cell_type": "code", 400 | "execution_count": 15, 401 | "metadata": { 402 | "collapsed": false 403 | }, 404 | "outputs": [ 405 | { 406 | "name": "stdout", 407 | "output_type": "stream", 408 | "text": [ 409 | "{\"user_id\":0, \"balance\": 10.0}\r\n", 410 | "{\"user_id\":1, \"gender\":\"M\", \"balance\": 1.0}\r\n", 411 | "{\"user_id\":2, \"gender\":\"F\", \"balance\": -0.5}\r\n", 412 | "{\"user_id\":3, \"gender\":\"F\", \"balance\": 0.0}\r\n", 413 | "{\"user_id\":4, \"balance\": 5.0}\r\n", 414 | "{\"user_id\":5, \"gender\":\"M\", \"balance\": 3.0}" 415 | ] 416 | } 417 | ], 418 | "source": [ 419 | "!cat /home/vagrant/datasets/users.json" 420 | ] 421 | }, 422 | { 423 | "cell_type": "code", 424 | "execution_count": 16, 425 | "metadata": { 426 | "collapsed": false 427 | }, 428 | "outputs": [ 429 | { 430 | "name": "stdout", 431 | "output_type": "stream", 432 | "text": [ 433 | "+-------+------+-------+\n", 434 | "|balance|gender|user_id|\n", 435 | "+-------+------+-------+\n", 436 | "| 10.0| null| 0|\n", 437 | "| 1.0| M| 1|\n", 438 | "| -0.5| F| 2|\n", 439 | "| 0.0| F| 3|\n", 440 | "| 5.0| null| 4|\n", 441 | "| 3.0| M| 5|\n", 442 | "+-------+------+-------+\n", 443 | "\n" 444 | ] 445 | } 446 | ], 447 | "source": [ 448 | "df = sqlContext.read.json(\"file:///home/vagrant/datasets/users.json\")\n", 449 | "df.show()" 450 | ] 451 | }, 452 | { 453 | "cell_type": "code", 454 | "execution_count": 17, 455 | "metadata": { 456 | "collapsed": false 457 | }, 458 | "outputs": [ 459 | { 460 | "name": "stdout", 461 | "output_type": "stream", 462 | "text": [ 463 | "root\n", 464 | " |-- balance: double (nullable = true)\n", 465 | " |-- gender: string (nullable = true)\n", 466 | " |-- user_id: long (nullable = true)\n", 467 | "\n" 468 | ] 469 | } 470 | ], 471 | "source": [ 472 | "df.printSchema()" 473 | ] 474 | }, 475 | { 476 | "cell_type": "code", 477 | "execution_count": 18, 478 | "metadata": { 479 | "collapsed": false 480 | }, 481 | "outputs": [ 482 | { 483 | "name": "stdout", 484 | "output_type": "stream", 485 | "text": [ 486 | "+-------+------+-------+\n", 487 | "|balance|gender|user_id|\n", 488 | "+-------+------+-------+\n", 489 | "| 1.0| M| 1|\n", 490 | "| 3.0| M| 5|\n", 491 | "+-------+------+-------+\n", 492 | "\n" 493 | ] 494 | } 495 | ], 496 | "source": [ 497 | "(df.filter(df['gender'] != 'null')\n", 498 | " .filter(df['balance'] > 0)\n", 499 | " .select(['balance', 'gender', 'user_id'])\n", 500 | " .show())" 501 | ] 502 | }, 503 | { 504 | "cell_type": "code", 505 | "execution_count": 19, 506 | "metadata": { 507 | "collapsed": false 508 | }, 509 | "outputs": [ 510 | { 511 | "name": "stdout", 512 | "output_type": "stream", 513 | "text": [ 514 | "+-------+------+-------+\n", 515 | 
"|balance|gender|user_id|\n", 516 | "+-------+------+-------+\n", 517 | "| 1.0| M| 1|\n", 518 | "| 3.0| M| 5|\n", 519 | "+-------+------+-------+\n", 520 | "\n" 521 | ] 522 | } 523 | ], 524 | "source": [ 525 | "(df.filter('gender is not null')\n", 526 | " .filter('balance > 0').select(\"*\").show())" 527 | ] 528 | }, 529 | { 530 | "cell_type": "code", 531 | "execution_count": 20, 532 | "metadata": { 533 | "collapsed": false 534 | }, 535 | "outputs": [ 536 | { 537 | "name": "stdout", 538 | "output_type": "stream", 539 | "text": [ 540 | "+-------+------+-------+\n", 541 | "|balance|gender|user_id|\n", 542 | "+-------+------+-------+\n", 543 | "| 1.0| M| 1|\n", 544 | "| 3.0| M| 5|\n", 545 | "+-------+------+-------+\n", 546 | "\n" 547 | ] 548 | } 549 | ], 550 | "source": [ 551 | "df.filter('gender is not null and balance > 0').show()" 552 | ] 553 | }, 554 | { 555 | "cell_type": "code", 556 | "execution_count": 21, 557 | "metadata": { 558 | "collapsed": false 559 | }, 560 | "outputs": [ 561 | { 562 | "name": "stdout", 563 | "output_type": "stream", 564 | "text": [ 565 | "+-------+------+-------+\n", 566 | "|balance|gender|user_id|\n", 567 | "+-------+------+-------+\n", 568 | "| 1.0| M| 1|\n", 569 | "| -0.5| F| 2|\n", 570 | "| 0.0| F| 3|\n", 571 | "| 3.0| M| 5|\n", 572 | "+-------+------+-------+\n", 573 | "\n" 574 | ] 575 | } 576 | ], 577 | "source": [ 578 | "df.na.drop().show()" 579 | ] 580 | }, 581 | { 582 | "cell_type": "code", 583 | "execution_count": 22, 584 | "metadata": { 585 | "collapsed": false 586 | }, 587 | "outputs": [ 588 | { 589 | "name": "stdout", 590 | "output_type": "stream", 591 | "text": [ 592 | "+-------+------+-------+\n", 593 | "|balance|gender|user_id|\n", 594 | "+-------+------+-------+\n", 595 | "| 1.0| M| 1|\n", 596 | "| -0.5| F| 2|\n", 597 | "| 0.0| F| 3|\n", 598 | "| 3.0| M| 5|\n", 599 | "+-------+------+-------+\n", 600 | "\n" 601 | ] 602 | } 603 | ], 604 | "source": [ 605 | "df.na.drop(subset=[\"gender\"]).show()" 606 | ] 607 | }, 608 | { 609 | "cell_type": "code", 610 | "execution_count": 23, 611 | "metadata": { 612 | "collapsed": false 613 | }, 614 | "outputs": [ 615 | { 616 | "name": "stdout", 617 | "output_type": "stream", 618 | "text": [ 619 | "+-------+------+-------+\n", 620 | "|balance|gender|user_id|\n", 621 | "+-------+------+-------+\n", 622 | "| 10.0| U| 0|\n", 623 | "| 1.0| M| 1|\n", 624 | "| -0.5| F| 2|\n", 625 | "| 0.0| F| 3|\n", 626 | "| 5.0| U| 4|\n", 627 | "| 3.0| M| 5|\n", 628 | "+-------+------+-------+\n", 629 | "\n" 630 | ] 631 | } 632 | ], 633 | "source": [ 634 | "df.na.fill({'gender': \"U\", 'balance': 0.0}).show()" 635 | ] 636 | }, 637 | { 638 | "cell_type": "code", 639 | "execution_count": 24, 640 | "metadata": { 641 | "collapsed": false 642 | }, 643 | "outputs": [ 644 | { 645 | "name": "stdout", 646 | "output_type": "stream", 647 | "text": [ 648 | "+------+------------+\n", 649 | "|gender|avg(balance)|\n", 650 | "+------+------------+\n", 651 | "| F| -0.25|\n", 652 | "| M| 2.0|\n", 653 | "| U| 7.5|\n", 654 | "+------+------------+\n", 655 | "\n" 656 | ] 657 | } 658 | ], 659 | "source": [ 660 | "(df.na.fill({'gender': \"U\", 'balance': 0.0})\n", 661 | " .groupBy(\"gender\").avg('balance').show())" 662 | ] 663 | }, 664 | { 665 | "cell_type": "code", 666 | "execution_count": 25, 667 | "metadata": { 668 | "collapsed": true 669 | }, 670 | "outputs": [], 671 | "source": [ 672 | "df.registerTempTable(\"users\")" 673 | ] 674 | }, 675 | { 676 | "cell_type": "code", 677 | "execution_count": 26, 678 | "metadata": { 679 | "collapsed": false 680 | 
}, 681 | "outputs": [ 682 | { 683 | "name": "stdout", 684 | "output_type": "stream", 685 | "text": [ 686 | "+------+-----+\n", 687 | "|gender| _c1|\n", 688 | "+------+-----+\n", 689 | "| F|-0.25|\n", 690 | "| M| 2.0|\n", 691 | "+------+-----+\n", 692 | "\n" 693 | ] 694 | } 695 | ], 696 | "source": [ 697 | "sqlContext.sql(\"\"\"\n", 698 | " SELECT gender, AVG(balance) \n", 699 | " FROM users \n", 700 | " WHERE gender IS NOT NULL \n", 701 | " GROUP BY gender\"\"\").show()" 702 | ] 703 | }, 704 | { 705 | "cell_type": "code", 706 | "execution_count": 27, 707 | "metadata": { 708 | "collapsed": false 709 | }, 710 | "outputs": [ 711 | { 712 | "data": { 713 | "text/plain": [ 714 | "pyspark.sql.dataframe.DataFrame" 715 | ] 716 | }, 717 | "execution_count": 27, 718 | "metadata": {}, 719 | "output_type": "execute_result" 720 | } 721 | ], 722 | "source": [ 723 | "type(sqlContext.table(\"users\"))" 724 | ] 725 | }, 726 | { 727 | "cell_type": "code", 728 | "execution_count": 28, 729 | "metadata": { 730 | "collapsed": false 731 | }, 732 | "outputs": [ 733 | { 734 | "data": { 735 | "text/plain": [ 736 | "[Row(balance=10.0, gender=None, user_id=0),\n", 737 | " Row(balance=1.0, gender=u'M', user_id=1),\n", 738 | " Row(balance=-0.5, gender=u'F', user_id=2),\n", 739 | " Row(balance=0.0, gender=u'F', user_id=3),\n", 740 | " Row(balance=5.0, gender=None, user_id=4),\n", 741 | " Row(balance=3.0, gender=u'M', user_id=5)]" 742 | ] 743 | }, 744 | "execution_count": 28, 745 | "metadata": {}, 746 | "output_type": "execute_result" 747 | } 748 | ], 749 | "source": [ 750 | "sqlContext.table(\"users\").collect()" 751 | ] 752 | }, 753 | { 754 | "cell_type": "code", 755 | "execution_count": 29, 756 | "metadata": { 757 | "collapsed": false 758 | }, 759 | "outputs": [ 760 | { 761 | "data": { 762 | "text/plain": [ 763 | "Row(balance=10.0, gender=None, user_id=0)" 764 | ] 765 | }, 766 | "execution_count": 29, 767 | "metadata": {}, 768 | "output_type": "execute_result" 769 | } 770 | ], 771 | "source": [ 772 | "a_row = sqlContext.sql(\"SELECT * FROM users\").first()\n", 773 | "a_row" 774 | ] 775 | }, 776 | { 777 | "cell_type": "code", 778 | "execution_count": 30, 779 | "metadata": { 780 | "collapsed": false 781 | }, 782 | "outputs": [ 783 | { 784 | "name": "stdout", 785 | "output_type": "stream", 786 | "text": [ 787 | "10.0\n", 788 | "10.0\n" 789 | ] 790 | } 791 | ], 792 | "source": [ 793 | "print a_row['balance']\n", 794 | "print a_row.balance" 795 | ] 796 | }, 797 | { 798 | "cell_type": "code", 799 | "execution_count": 31, 800 | "metadata": { 801 | "collapsed": false 802 | }, 803 | "outputs": [ 804 | { 805 | "data": { 806 | "text/plain": [ 807 | "{'balance': 10.0, 'gender': None, 'user_id': 0}" 808 | ] 809 | }, 810 | "execution_count": 31, 811 | "metadata": {}, 812 | "output_type": "execute_result" 813 | } 814 | ], 815 | "source": [ 816 | "a_row.asDict()" 817 | ] 818 | }, 819 | { 820 | "cell_type": "code", 821 | "execution_count": 32, 822 | "metadata": { 823 | "collapsed": true 824 | }, 825 | "outputs": [], 826 | "source": [ 827 | "!rm -rf /tmp/complete_users*" 828 | ] 829 | }, 830 | { 831 | "cell_type": "code", 832 | "execution_count": 33, 833 | "metadata": { 834 | "collapsed": false 835 | }, 836 | "outputs": [], 837 | "source": [ 838 | "(df.na.drop().write\n", 839 | " .save(\"file:///tmp/complete_users.json\", format='json'))" 840 | ] 841 | }, 842 | { 843 | "cell_type": "code", 844 | "execution_count": 34, 845 | "metadata": { 846 | "collapsed": false 847 | }, 848 | "outputs": [ 849 | { 850 | "name": "stdout", 851 | 
"output_type": "stream", 852 | "text": [ 853 | "total 28\r\n", 854 | "4 drwxrwxr-x 2 vagrant vagrant 4096 May 10 20:36 .\r\n", 855 | "4 drwxrwxrwt 22 root root 4096 May 10 20:36 ..\r\n", 856 | "4 -rw-r--r-- 1 vagrant vagrant 83 May 10 20:36 part-r-00000-f5728f74-10d9-4c7a-8865-64cb80c7ca0a\r\n", 857 | "4 -rw-rw-r-- 1 vagrant vagrant 12 May 10 20:36 .part-r-00000-f5728f74-10d9-4c7a-8865-64cb80c7ca0a.crc\r\n", 858 | "4 -rw-r--r-- 1 vagrant vagrant 82 May 10 20:36 part-r-00001-f5728f74-10d9-4c7a-8865-64cb80c7ca0a\r\n", 859 | "4 -rw-rw-r-- 1 vagrant vagrant 12 May 10 20:36 .part-r-00001-f5728f74-10d9-4c7a-8865-64cb80c7ca0a.crc\r\n", 860 | "0 -rw-r--r-- 1 vagrant vagrant 0 May 10 20:36 _SUCCESS\r\n", 861 | "4 -rw-rw-r-- 1 vagrant vagrant 8 May 10 20:36 ._SUCCESS.crc\r\n" 862 | ] 863 | } 864 | ], 865 | "source": [ 866 | "!ls -als /tmp/complete_users.json" 867 | ] 868 | }, 869 | { 870 | "cell_type": "code", 871 | "execution_count": 35, 872 | "metadata": { 873 | "collapsed": false 874 | }, 875 | "outputs": [ 876 | { 877 | "name": "stdout", 878 | "output_type": "stream", 879 | "text": [ 880 | "+-------+------+-------+\n", 881 | "|balance|gender|user_id|\n", 882 | "+-------+------+-------+\n", 883 | "| 0.0| F| 3|\n", 884 | "| 3.0| M| 5|\n", 885 | "| 1.0| M| 1|\n", 886 | "| -0.5| F| 2|\n", 887 | "+-------+------+-------+\n", 888 | "\n" 889 | ] 890 | } 891 | ], 892 | "source": [ 893 | "sqlContext.sql(\n", 894 | " \"SELECT * FROM json.`file:///tmp/complete_users.json`\").show()" 895 | ] 896 | }, 897 | { 898 | "cell_type": "code", 899 | "execution_count": 36, 900 | "metadata": { 901 | "collapsed": true 902 | }, 903 | "outputs": [], 904 | "source": [ 905 | "df.na.drop().write.save(\n", 906 | " \"file:///tmp/complete_users.parquet\", format='parquet')" 907 | ] 908 | }, 909 | { 910 | "cell_type": "code", 911 | "execution_count": 37, 912 | "metadata": { 913 | "collapsed": false 914 | }, 915 | "outputs": [ 916 | { 917 | "name": "stdout", 918 | "output_type": "stream", 919 | "text": [ 920 | "total 44\r\n", 921 | "4 drwxrwxr-x 2 vagrant vagrant 4096 May 10 20:36 .\r\n", 922 | "4 drwxrwxrwt 23 root root 4096 May 10 20:36 ..\r\n", 923 | "4 -rw-r--r-- 1 vagrant vagrant 376 May 10 20:36 _common_metadata\r\n", 924 | "4 -rw-rw-r-- 1 vagrant vagrant 12 May 10 20:36 ._common_metadata.crc\r\n", 925 | "4 -rw-r--r-- 1 vagrant vagrant 1082 May 10 20:36 _metadata\r\n", 926 | "4 -rw-rw-r-- 1 vagrant vagrant 20 May 10 20:36 ._metadata.crc\r\n", 927 | "4 -rw-r--r-- 1 vagrant vagrant 750 May 10 20:36 part-r-00000-810195c2-ffa9-4a54-add7-61e6a7c92095.gz.parquet\r\n", 928 | "4 -rw-rw-r-- 1 vagrant vagrant 16 May 10 20:36 .part-r-00000-810195c2-ffa9-4a54-add7-61e6a7c92095.gz.parquet.crc\r\n", 929 | "4 -rw-r--r-- 1 vagrant vagrant 746 May 10 20:36 part-r-00001-810195c2-ffa9-4a54-add7-61e6a7c92095.gz.parquet\r\n", 930 | "4 -rw-rw-r-- 1 vagrant vagrant 16 May 10 20:36 .part-r-00001-810195c2-ffa9-4a54-add7-61e6a7c92095.gz.parquet.crc\r\n", 931 | "0 -rw-r--r-- 1 vagrant vagrant 0 May 10 20:36 _SUCCESS\r\n", 932 | "4 -rw-rw-r-- 1 vagrant vagrant 8 May 10 20:36 ._SUCCESS.crc\r\n" 933 | ] 934 | } 935 | ], 936 | "source": [ 937 | "!ls -als /tmp/complete_users.parquet/" 938 | ] 939 | }, 940 | { 941 | "cell_type": "code", 942 | "execution_count": 38, 943 | "metadata": { 944 | "collapsed": false 945 | }, 946 | "outputs": [], 947 | "source": [ 948 | "from pyspark.sql import Row\n", 949 | "\n", 950 | "rdd_gender = \\\n", 951 | " sc.parallelize([Row(short_gender=\"M\", long_gender=\"Male\"),\n", 952 | " Row(short_gender=\"F\", 
long_gender=\"Female\")])\n", 953 | "\n", 954 | "(sqlContext.createDataFrame(rdd_gender)\n", 955 | " .registerTempTable(\"gender_maps\"))" 956 | ] 957 | }, 958 | { 959 | "cell_type": "code", 960 | "execution_count": 39, 961 | "metadata": { 962 | "collapsed": false 963 | }, 964 | "outputs": [ 965 | { 966 | "name": "stdout", 967 | "output_type": "stream", 968 | "text": [ 969 | "+-----------+------------+\n", 970 | "|long_gender|short_gender|\n", 971 | "+-----------+------------+\n", 972 | "| Male| M|\n", 973 | "| Female| F|\n", 974 | "+-----------+------------+\n", 975 | "\n" 976 | ] 977 | } 978 | ], 979 | "source": [ 980 | "sqlContext.table(\"gender_maps\").show()" 981 | ] 982 | }, 983 | { 984 | "cell_type": "code", 985 | "execution_count": 40, 986 | "metadata": { 987 | "collapsed": false 988 | }, 989 | "outputs": [ 990 | { 991 | "name": "stdout", 992 | "output_type": "stream", 993 | "text": [ 994 | "+-------+-----------+-------+\n", 995 | "|balance|long_gender|user_id|\n", 996 | "+-------+-----------+-------+\n", 997 | "| 1.0| Male| 1|\n", 998 | "| 3.0| Male| 5|\n", 999 | "| -0.5| Female| 2|\n", 1000 | "| 0.0| Female| 3|\n", 1001 | "+-------+-----------+-------+\n", 1002 | "\n" 1003 | ] 1004 | } 1005 | ], 1006 | "source": [ 1007 | "sqlContext.sql(\"\"\"\n", 1008 | " SELECT balance, long_gender, user_id \n", 1009 | " FROM parquet.`file:///tmp/complete_users.parquet` \n", 1010 | " JOIN gender_maps ON gender=short_gender\"\"\").show()" 1011 | ] 1012 | }, 1013 | { 1014 | "cell_type": "code", 1015 | "execution_count": 41, 1016 | "metadata": { 1017 | "collapsed": false 1018 | }, 1019 | "outputs": [ 1020 | { 1021 | "data": { 1022 | "text/plain": [ 1023 | "[u'gender_maps', u'users']" 1024 | ] 1025 | }, 1026 | "execution_count": 41, 1027 | "metadata": {}, 1028 | "output_type": "execute_result" 1029 | } 1030 | ], 1031 | "source": [ 1032 | "sqlContext.tableNames()" 1033 | ] 1034 | }, 1035 | { 1036 | "cell_type": "code", 1037 | "execution_count": 42, 1038 | "metadata": { 1039 | "collapsed": true 1040 | }, 1041 | "outputs": [], 1042 | "source": [ 1043 | "for table in sqlContext.tableNames():\n", 1044 | " sqlContext.dropTempTable(table)" 1045 | ] 1046 | }, 1047 | { 1048 | "cell_type": "code", 1049 | "execution_count": null, 1050 | "metadata": { 1051 | "collapsed": true 1052 | }, 1053 | "outputs": [], 1054 | "source": [] 1055 | } 1056 | ], 1057 | "metadata": { 1058 | "kernelspec": { 1059 | "display_name": "Python 2", 1060 | "language": "python", 1061 | "name": "python2" 1062 | }, 1063 | "language_info": { 1064 | "codemirror_mode": { 1065 | "name": "ipython", 1066 | "version": 2 1067 | }, 1068 | "file_extension": ".py", 1069 | "mimetype": "text/x-python", 1070 | "name": "python", 1071 | "nbconvert_exporter": "python", 1072 | "pygments_lexer": "ipython2", 1073 | "version": "2.7.6" 1074 | } 1075 | }, 1076 | "nbformat": 4, 1077 | "nbformat_minor": 0 1078 | } 1079 | -------------------------------------------------------------------------------- /Chapter 09/Chapter_9_code_02.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "!rm -rf kdd*\n", 12 | "\n", 13 | "# !wget -q -O ../datasets/kddtrain.gz \\\n", 14 | "# http://kdd.ics.uci.edu/databases/kddcup99/kddcup.data.gz\n", 15 | "\n", 16 | "!wget -q -O ../datasets/kddtrain.gz \\\n", 17 | "http://kdd.ics.uci.edu/databases/kddcup99/kddcup.data_10_percent.gz\n", 18 
| "\n", 19 | "!wget -q -O ../datasets/kddtest.gz \\\n", 20 | "http://kdd.ics.uci.edu/databases/kddcup99/corrected.gz\n", 21 | " \n", 22 | "!wget -q -O ../datasets/kddnames \\\n", 23 | "http://kdd.ics.uci.edu/databases/kddcup99/kddcup.names\n", 24 | "\n", 25 | "!gunzip ../datasets/kdd*gz" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 2, 31 | "metadata": { 32 | "collapsed": false 33 | }, 34 | "outputs": [ 35 | { 36 | "name": "stdout", 37 | "output_type": "stream", 38 | "text": [ 39 | "0,tcp,http,SF,181,5450,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,8,8,0.00,0.00,0.00,0.00,1.00,0.00,0.00,9,9,1.00,0.00,0.11,0.00,0.00,0.00,0.00,0.00,normal.\r\n", 40 | "0,tcp,http,SF,239,486,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,8,8,0.00,0.00,0.00,0.00,1.00,0.00,0.00,19,19,1.00,0.00,0.05,0.00,0.00,0.00,0.00,0.00,normal.\r\n", 41 | "0,tcp,http,SF,235,1337,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,8,8,0.00,0.00,0.00,0.00,1.00,0.00,0.00,29,29,1.00,0.00,0.03,0.00,0.00,0.00,0.00,0.00,normal.\r\n" 42 | ] 43 | } 44 | ], 45 | "source": [ 46 | "!head -3 ../datasets/kddtrain" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 3, 52 | "metadata": { 53 | "collapsed": false 54 | }, 55 | "outputs": [ 56 | { 57 | "name": "stdout", 58 | "output_type": "stream", 59 | "text": [ 60 | "Num features: 41\n", 61 | "First 10: ['duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot']\n" 62 | ] 63 | } 64 | ], 65 | "source": [ 66 | "with open('../datasets/kddnames', 'r') as fh:\n", 67 | " header = [line.split(':')[0] \n", 68 | " for line in fh.read().splitlines()][1:]\n", 69 | "\n", 70 | "header.append('target')\n", 71 | "\n", 72 | "print \"Num features:\", len(header)-1\n", 73 | "print \"First 10:\", header[:10]" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": 4, 79 | "metadata": { 80 | "collapsed": false 81 | }, 82 | "outputs": [], 83 | "source": [ 84 | "train_rdd = sc.textFile('file:///home/vagrant/datasets/kddtrain')\n", 85 | "test_rdd = sc.textFile('file:///home/vagrant/datasets/kddtest')" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": 5, 91 | "metadata": { 92 | "collapsed": false, 93 | "scrolled": false 94 | }, 95 | "outputs": [], 96 | "source": [ 97 | "def line_parser(line):\n", 98 | "\n", 99 | " def piece_parser(piece):\n", 100 | " if \".\" in piece or piece.isdigit():\n", 101 | " return float(piece)\n", 102 | " else:\n", 103 | " return piece\n", 104 | "\n", 105 | " return [piece_parser(piece) for piece in line[:-1].split(',')]\n", 106 | " \n", 107 | "train_df = sqlContext.createDataFrame(\n", 108 | " train_rdd.map(line_parser), header)\n", 109 | "\n", 110 | "test_df = sqlContext.createDataFrame(\n", 111 | " test_rdd.map(line_parser), header)" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": 6, 117 | "metadata": { 118 | "collapsed": false 119 | }, 120 | "outputs": [ 121 | { 122 | "name": "stdout", 123 | "output_type": "stream", 124 | "text": [ 125 | "Train observations: 494021\n", 126 | "Test observations: 311029\n" 127 | ] 128 | } 129 | ], 130 | "source": [ 131 | "print \"Train observations:\", train_df.count()\n", 132 | "print \"Test observations:\", test_df.count()" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": 7, 138 | "metadata": { 139 | "collapsed": false 140 | }, 141 | "outputs": [ 142 | { 143 | "name": "stdout", 144 | "output_type": "stream", 145 | "text": [ 146 | "root\n", 147 | " |-- duration: double (nullable = true)\n", 148 | " |-- 
protocol_type: string (nullable = true)\n", 149 | " |-- service: string (nullable = true)\n", 150 | " |-- flag: string (nullable = true)\n", 151 | " |-- src_bytes: double (nullable = true)\n", 152 | " |-- dst_bytes: double (nullable = true)\n", 153 | " |-- land: double (nullable = true)\n", 154 | " |-- wrong_fragment: double (nullable = true)\n", 155 | " |-- urgent: double (nullable = true)\n", 156 | " |-- hot: double (nullable = true)\n", 157 | " |-- num_failed_logins: double (nullable = true)\n", 158 | " |-- logged_in: double (nullable = true)\n", 159 | " |-- num_compromised: double (nullable = true)\n", 160 | " |-- root_shell: double (nullable = true)\n", 161 | " |-- su_attempted: double (nullable = true)\n", 162 | " |-- num_root: double (nullable = true)\n", 163 | " |-- num_file_creations: double (nullable = true)\n", 164 | " |-- num_shells: double (nullable = true)\n", 165 | " |-- num_access_files: double (nullable = true)\n", 166 | " |-- num_outbound_cmds: double (nullable = true)\n", 167 | " |-- is_host_login: double (nullable = true)\n", 168 | " |-- is_guest_login: double (nullable = true)\n", 169 | " |-- count: double (nullable = true)\n", 170 | " |-- srv_count: double (nullable = true)\n", 171 | " |-- serror_rate: double (nullable = true)\n", 172 | " |-- srv_serror_rate: double (nullable = true)\n", 173 | " |-- rerror_rate: double (nullable = true)\n", 174 | " |-- srv_rerror_rate: double (nullable = true)\n", 175 | " |-- same_srv_rate: double (nullable = true)\n", 176 | " |-- diff_srv_rate: double (nullable = true)\n", 177 | " |-- srv_diff_host_rate: double (nullable = true)\n", 178 | " |-- dst_host_count: double (nullable = true)\n", 179 | " |-- dst_host_srv_count: double (nullable = true)\n", 180 | " |-- dst_host_same_srv_rate: double (nullable = true)\n", 181 | " |-- dst_host_diff_srv_rate: double (nullable = true)\n", 182 | " |-- dst_host_same_src_port_rate: double (nullable = true)\n", 183 | " |-- dst_host_srv_diff_host_rate: double (nullable = true)\n", 184 | " |-- dst_host_serror_rate: double (nullable = true)\n", 185 | " |-- dst_host_srv_serror_rate: double (nullable = true)\n", 186 | " |-- dst_host_rerror_rate: double (nullable = true)\n", 187 | " |-- dst_host_srv_rerror_rate: double (nullable = true)\n", 188 | " |-- target: string (nullable = true)\n", 189 | "\n" 190 | ] 191 | } 192 | ], 193 | "source": [ 194 | "train_df.printSchema()" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": 8, 200 | "metadata": { 201 | "collapsed": false 202 | }, 203 | "outputs": [], 204 | "source": [ 205 | "from pyspark.ml import Pipeline\n", 206 | "from pyspark.ml.feature import StringIndexer\n", 207 | "\n", 208 | "\n", 209 | "cols_categorical = [\"protocol_type\", \"service\", \"flag\",\"target\"]\n", 210 | "preproc_stages = []\n", 211 | "\n", 212 | "for col in cols_categorical:\n", 213 | " out_col = col + \"_cat\"\n", 214 | " preproc_stages.append(\n", 215 | " StringIndexer(\n", 216 | " inputCol=col, outputCol=out_col, handleInvalid=\"skip\"))\n", 217 | "\n", 218 | "pipeline = Pipeline(stages=preproc_stages)\n", 219 | "indexer = pipeline.fit(train_df)\n", 220 | "\n", 221 | "train_num_df = indexer.transform(train_df)\n", 222 | "test_num_df = indexer.transform(test_df)" 223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "execution_count": 9, 228 | "metadata": { 229 | "collapsed": false 230 | }, 231 | "outputs": [ 232 | { 233 | "name": "stdout", 234 | "output_type": "stream", 235 | "text": [ 236 | "[StringIndexer_46ae881ca7febd4a4e81, 
StringIndexer_49f6bbd151ce1e9bb5a7, StringIndexer_4cfcb173a161bbe6cd60, StringIndexer_4aa581cc25ad8d6eed7e]\n", 237 | "\n", 238 | "Pipeline_450a8f0d2083e96d03ca\n", 239 | "PipelineModel_475d9917035781236edb\n" 240 | ] 241 | } 242 | ], 243 | "source": [ 244 | "print pipeline.getStages()\n", 245 | "print\n", 246 | "print pipeline\n", 247 | "print indexer" 248 | ] 249 | }, 250 | { 251 | "cell_type": "code", 252 | "execution_count": 10, 253 | "metadata": { 254 | "collapsed": false 255 | }, 256 | "outputs": [ 257 | { 258 | "name": "stdout", 259 | "output_type": "stream", 260 | "text": [ 261 | "First observation, after the 4 StringIndexers:\n", 262 | "\n", 263 | "Row(duration=0.0, protocol_type=u'tcp', service=u'http', flag=u'SF', src_bytes=181.0, dst_bytes=5450.0, land=0.0, wrong_fragment=0.0, urgent=0.0, hot=0.0, num_failed_logins=0.0, logged_in=1.0, num_compromised=0.0, root_shell=0.0, su_attempted=0.0, num_root=0.0, num_file_creations=0.0, num_shells=0.0, num_access_files=0.0, num_outbound_cmds=0.0, is_host_login=0.0, is_guest_login=0.0, count=8.0, srv_count=8.0, serror_rate=0.0, srv_serror_rate=0.0, rerror_rate=0.0, srv_rerror_rate=0.0, same_srv_rate=1.0, diff_srv_rate=0.0, srv_diff_host_rate=0.0, dst_host_count=9.0, dst_host_srv_count=9.0, dst_host_same_srv_rate=1.0, dst_host_diff_srv_rate=0.0, dst_host_same_src_port_rate=0.11, dst_host_srv_diff_host_rate=0.0, dst_host_serror_rate=0.0, dst_host_srv_serror_rate=0.0, dst_host_rerror_rate=0.0, dst_host_srv_rerror_rate=0.0, target=u'normal', protocol_type_cat=1.0, service_cat=2.0, flag_cat=0.0, target_cat=2.0)\n" 264 | ] 265 | } 266 | ], 267 | "source": [ 268 | "print \"First observation, after the 4 StringIndexers:\\n\"\n", 269 | "print train_num_df.first()" 270 | ] 271 | }, 272 | { 273 | "cell_type": "code", 274 | "execution_count": 11, 275 | "metadata": { 276 | "collapsed": false 277 | }, 278 | "outputs": [ 279 | { 280 | "name": "stdout", 281 | "output_type": "stream", 282 | "text": [ 283 | "['num_access_files', 'src_bytes', 'srv_count', 'num_outbound_cmds', 'rerror_rate', 'urgent', 'protocol_type_cat', 'dst_host_same_srv_rate', 'duration', 'dst_host_diff_srv_rate', 'srv_serror_rate', 'is_host_login', 'wrong_fragment', 'serror_rate', 'num_compromised', 'is_guest_login', 'dst_host_rerror_rate', 'dst_host_srv_serror_rate', 'hot', 'dst_host_srv_count', 'logged_in', 'srv_rerror_rate', 'dst_host_srv_diff_host_rate', 'srv_diff_host_rate', 'dst_host_same_src_port_rate', 'root_shell', 'service_cat', 'su_attempted', 'dst_host_count', 'num_file_creations', 'flag_cat', 'count', 'land', 'same_srv_rate', 'dst_bytes', 'num_shells', 'dst_host_srv_rerror_rate', 'num_root', 'diff_srv_rate', 'num_failed_logins', 'dst_host_serror_rate']\n", 284 | "Total numerical features: 41\n" 285 | ] 286 | } 287 | ], 288 | "source": [ 289 | "features_header = set(header) \\\n", 290 | " - set(cols_categorical) \\\n", 291 | " | set([c + \"_cat\" for c in cols_categorical]) \\\n", 292 | " - set([\"target\", \"target_cat\"])\n", 293 | "features_header = list(features_header)\n", 294 | "print features_header\n", 295 | "print \"Total numerical features:\", len(features_header)" 296 | ] 297 | }, 298 | { 299 | "cell_type": "code", 300 | "execution_count": 12, 301 | "metadata": { 302 | "collapsed": false 303 | }, 304 | "outputs": [], 305 | "source": [ 306 | "from pyspark.mllib.linalg import Vectors\n", 307 | "from pyspark.ml.feature import VectorAssembler\n", 308 | "\n", 309 | "assembler = VectorAssembler(\n", 310 | " inputCols=features_header,\n", 311 | " 
outputCol=\"features\")\n", 312 | "\n", 313 | "Xy_train = (assembler\n", 314 | " .transform(train_num_df)\n", 315 | " .select(\"features\", \"target_cat\"))\n", 316 | "Xy_test = (assembler\n", 317 | " .transform(test_num_df)\n", 318 | " .select(\"features\", \"target_cat\"))" 319 | ] 320 | }, 321 | { 322 | "cell_type": "code", 323 | "execution_count": 13, 324 | "metadata": { 325 | "collapsed": false 326 | }, 327 | "outputs": [ 328 | { 329 | "data": { 330 | "text/plain": [ 331 | "Row(features=SparseVector(41, {1: 181.0, 2: 8.0, 6: 1.0, 7: 1.0, 19: 9.0, 20: 1.0, 24: 0.11, 26: 2.0, 28: 9.0, 31: 8.0, 33: 1.0, 34: 5450.0}), target_cat=2.0)" 332 | ] 333 | }, 334 | "execution_count": 13, 335 | "metadata": {}, 336 | "output_type": "execute_result" 337 | } 338 | ], 339 | "source": [ 340 | "Xy_train.first()" 341 | ] 342 | }, 343 | { 344 | "cell_type": "code", 345 | "execution_count": 14, 346 | "metadata": { 347 | "collapsed": false 348 | }, 349 | "outputs": [], 350 | "source": [ 351 | "from pyspark.ml.classification import RandomForestClassifier\n", 352 | "\n", 353 | "clf = RandomForestClassifier(\n", 354 | " labelCol=\"target_cat\", featuresCol=\"features\", \n", 355 | " maxBins=100, seed=101)\n", 356 | "fit_clf = clf.fit(Xy_train)" 357 | ] 358 | }, 359 | { 360 | "cell_type": "code", 361 | "execution_count": 15, 362 | "metadata": { 363 | "collapsed": false 364 | }, 365 | "outputs": [ 366 | { 367 | "name": "stdout", 368 | "output_type": "stream", 369 | "text": [ 370 | "RandomForestClassifier_40f9923cb13e74b28cbe\n", 371 | "RandomForestClassificationModel (uid=rfc_ac17a1f959a3) with 20 trees\n" 372 | ] 373 | } 374 | ], 375 | "source": [ 376 | "print clf\n", 377 | "print fit_clf" 378 | ] 379 | }, 380 | { 381 | "cell_type": "code", 382 | "execution_count": 16, 383 | "metadata": { 384 | "collapsed": false 385 | }, 386 | "outputs": [], 387 | "source": [ 388 | "Xy_pred_train = fit_clf.transform(Xy_train)\n", 389 | "Xy_pred_test = fit_clf.transform(Xy_test)" 390 | ] 391 | }, 392 | { 393 | "cell_type": "code", 394 | "execution_count": 17, 395 | "metadata": { 396 | "collapsed": false 397 | }, 398 | "outputs": [ 399 | { 400 | "name": "stdout", 401 | "output_type": "stream", 402 | "text": [ 403 | "First observation after classification stage:\n", 404 | "Row(features=SparseVector(41, {1: 105.0, 2: 1.0, 6: 2.0, 7: 1.0, 9: 0.01, 19: 254.0, 26: 1.0, 28: 255.0, 31: 1.0, 33: 1.0, 34: 146.0}), target_cat=2.0, rawPrediction=DenseVector([0.0283, 0.0112, 19.3474, 0.0677, 0.0251, 0.1414, 0.0357, 0.1194, 0.1309, 0.041, 0.0257, 0.0079, 0.0046, 0.0004, 0.0029, 0.0016, 0.002, 0.0023, 0.0013, 0.0008, 0.0012, 0.0006, 0.0006]), probability=DenseVector([0.0014, 0.0006, 0.9674, 0.0034, 0.0013, 0.0071, 0.0018, 0.006, 0.0065, 0.002, 0.0013, 0.0004, 0.0002, 0.0, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0, 0.0001, 0.0, 0.0]), prediction=2.0)\n" 405 | ] 406 | } 407 | ], 408 | "source": [ 409 | "print \"First observation after classification stage:\"\n", 410 | "print Xy_pred_test.first()" 411 | ] 412 | }, 413 | { 414 | "cell_type": "code", 415 | "execution_count": 18, 416 | "metadata": { 417 | "collapsed": false 418 | }, 419 | "outputs": [ 420 | { 421 | "name": "stdout", 422 | "output_type": "stream", 423 | "text": [ 424 | "F1-score train set: 0.991904372002\n", 425 | "F1-score test set: 0.966840043466\n" 426 | ] 427 | } 428 | ], 429 | "source": [ 430 | "from pyspark.ml.evaluation import MulticlassClassificationEvaluator\n", 431 | "\n", 432 | "evaluator = MulticlassClassificationEvaluator(\n", 433 | " labelCol=\"target_cat\", 
predictionCol=\"prediction\", \n", 434 | "    metricName=\"f1\")\n", 435 | "\n", 436 | "print \"F1-score train set:\", evaluator.evaluate(Xy_pred_train)\n", 437 | "print \"F1-score test set:\", evaluator.evaluate(Xy_pred_test)" 438 | ] 439 | }, 440 | {
441 | "cell_type": "code", 442 | "execution_count": 19, 443 | "metadata": { 444 | "collapsed": false 445 | }, 446 | "outputs": [ 447 | { 448 | "name": "stdout", 449 | "output_type": "stream", 450 | "text": [ 451 | "F1-score test set: 0.966840043466\n" 452 | ] 453 | } 454 | ], 455 | "source": [
456 | "# All in one\n", 457 | "\n", 458 | "full_stages = preproc_stages + [assembler, clf]\n", 459 | "full_pipeline = Pipeline(stages=full_stages)\n", 460 | "full_model = full_pipeline.fit(train_df)\n", 461 | "predictions = full_model.transform(test_df)\n", 462 | "print \"F1-score test set:\", evaluator.evaluate(predictions)" 463 | ] 464 | }, 465 | {
466 | "cell_type": "code", 467 | "execution_count": 20, 468 | "metadata": { 469 | "collapsed": false 470 | }, 471 | "outputs": [], 472 | "source": [
473 | "import matplotlib.pyplot as plt\n", 474 | "import numpy as np\n", 475 | "%matplotlib inline\n", 476 | " \n", 477 | "def plot_confusion_matrix(cm):\n", 478 | "    cm_normalized = \\\n", 479 | "        cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]\n", 480 | "    plt.imshow(\n", 481 | "        cm_normalized, interpolation='nearest', cmap=plt.cm.Blues)\n", 482 | "    plt.title('Normalized Confusion matrix')\n", 483 | "    plt.colorbar()\n", 484 | "    plt.tight_layout()\n", 485 | "    plt.ylabel('True label')\n", 486 | "    plt.xlabel('Predicted label')\n" 487 | ] 488 | }, 489 | {
490 | "cell_type": "code", 491 | "execution_count": 21, 492 | "metadata": { 493 | "collapsed": false 494 | }, 495 | "outputs": [ 496 | { 497 | "data": { 498 | "image/png": "<base64 PNG data omitted: 'Normalized Confusion matrix' plot, True label vs. Predicted label>",
499 | "text/plain": [ 500 | "" 501 | ] 502 | }, 503 | "metadata": {}, 504 | "output_type": "display_data" 505 | } 506 | ], 507 | "source": [
508 | "from pyspark.mllib.evaluation import MulticlassMetrics\n", 509 | "\n", 510 | "metrics = MulticlassMetrics(\n", 511 | "    predictions.select(\"prediction\", \"target_cat\").rdd)\n", 512 | "conf_matrix = metrics.confusionMatrix().toArray()\n", 513 | "plot_confusion_matrix(conf_matrix)" 514 | ] 515 | }, 516 | {
517 | "cell_type": "code", 518 | "execution_count": 22, 519 | "metadata": { 520 | "collapsed": false 521 | }, 522 | "outputs": [ 523 | { 524 | "data": { 525 | "text/plain": [
526 | "{u'back': 2203,\n", 527 | " u'buffer_overflow': 30,\n", 528 | " u'ftp_write': 8,\n", 529 | " u'guess_passwd': 53,\n", 530 | " u'imap': 12,\n", 531 | " u'ipsweep': 1247,\n", 532 | " u'land': 21,\n", 533 | " u'loadmodule': 9,\n", 534 | " u'multihop': 7,\n", 535 | " u'neptune': 107201,\n", 536 | " u'nmap': 231,\n", 537 | " u'normal': 97278,\n", 538 | " u'perl': 3,\n", 539 | " u'phf': 4,\n", 540 | " u'pod': 264,\n", 541 | " u'portsweep': 1040,\n", 542 | " u'rootkit': 10,\n", 543 | " u'satan': 1589,\n", 544 | " u'smurf': 280790,\n", 545 | " u'spy': 2,\n", 546 | " u'teardrop': 979,\n", 547 | " u'warezclient': 1020,\n", 548 | " u'warezmaster': 20}" 549 | ] 550 | },
551 | "execution_count": 22, 552 | "metadata": {}, 553 | "output_type": "execute_result" 554 | } 555 | ], 556 | "source": [
557 | "# Let's now improve the score: is the training dataset balanced?\n", 558 | "\n", 559 | "train_composition = train_df.groupBy(\"target\").count().rdd.collectAsMap()\n", 560 | "train_composition" 561 | ] 562 | }, 563 | {
564 | "cell_type": "code", 565 | "execution_count": 23, 566 | "metadata": { 567 | "collapsed": false 568 | }, 569 | "outputs": [ 570 | { 571 | "data": { 572 | "text/plain": [
573 | "{u'back': 1,\n", 574 | " u'buffer_overflow': 33.333333333333336,\n", 575 | " u'ftp_write': 125.0,\n", 576 | " u'guess_passwd': 18.867924528301888,\n", 577 | " u'imap': 83.33333333333333,\n", 578 | " u'ipsweep': 1,\n", 579 | " u'land': 47.61904761904762,\n", 580 | " u'loadmodule': 111.11111111111111,\n", 581 | " u'multihop': 142.85714285714286,\n", 582 | " u'neptune': 0.23320677978750198,\n", 583 | " u'nmap': 4.329004329004329,\n", 584 | " u'normal': 0.2569954152017928,\n", 585 | " u'perl': 333.3333333333333,\n", 586 | " u'phf': 250.0,\n", 587 | " u'pod': 3.787878787878788,\n", 588 | " u'portsweep': 1,\n", 589 | " u'rootkit': 100.0,\n", 590 | " u'satan': 1,\n", 591 | " u'smurf': 0.08903450977598917,\n", 592 | " u'spy': 500.0,\n", 593 | " u'teardrop': 1.0214504596527068,\n", 594 | " u'warezclient': 1,\n", 595 | " u'warezmaster': 50.0}" 596 | ] 597 | },
598 | "execution_count": 23, 599 | "metadata": {}, 600 | "output_type": "execute_result" 601 | } 602 | ], 603 | "source": [
604 | "def set_sample_rate_between_vals(cnt, the_min, the_max):\n", 605 | "    if the_min <= cnt <= the_max:\n", 606 | "        # no sampling\n", 607 | "        return 1\n", 608 | "    \n", 609 | "    elif cnt < the_min:\n", 610 | "        # 
Oversampling: return many times the same observation\n", 611 | "        return the_min/float(cnt)\n", 612 | "\n", 613 | "    else:\n", 614 | "        # Subsampling: sometimes don't return it\n", 615 | "        return the_max/float(cnt)\n", 616 | "    \n", 617 | "sample_rates = {k:set_sample_rate_between_vals(v, 1000, 25000) \n", 618 | "                for k,v in train_composition.iteritems()} \n", 619 | "sample_rates" 620 | ] 621 | }, 622 | {
623 | "cell_type": "code", 624 | "execution_count": 24, 625 | "metadata": { 626 | "collapsed": false 627 | }, 628 | "outputs": [], 629 | "source": [
630 | "bc_sample_rates = sc.broadcast(sample_rates)\n", 631 | "\n", 632 | "def map_and_sample(el, rates):\n", 633 | "    rate = rates.value[el['target']]\n", 634 | "    if rate > 1:\n", 635 | "        return [el]*int(rate)\n", 636 | "    else:\n", 637 | "        import random\n", 638 | "        return [el] if random.random() < rate else []\n", 639 | "    \n", 640 | "sampled_train_df = (train_df\n", 641 | "                    .flatMap(\n", 642 | "                        lambda x: map_and_sample(x, bc_sample_rates))\n", 643 | "                    .toDF()\n", 644 | "                    .cache())" 645 | ] 646 | }, 647 | {
648 | "cell_type": "code", 649 | "execution_count": 25, 650 | "metadata": { 651 | "collapsed": false 652 | }, 653 | "outputs": [ 654 | { 655 | "data": { 656 | "text/plain": [ 657 | "96755" 658 | ] 659 | }, 660 | "execution_count": 25, 661 | "metadata": {}, 662 | "output_type": "execute_result" 663 | } 664 | ], 665 | "source": [ 666 | "sampled_train_df.count()" 667 | ] 668 | }, 669 | {
670 | "cell_type": "code", 671 | "execution_count": 26, 672 | "metadata": { 673 | "collapsed": false, 674 | "scrolled": true 675 | }, 676 | "outputs": [ 677 | { 678 | "data": { 679 | "text/plain": [
680 | "Row(duration=0.0, protocol_type=u'tcp', service=u'http', flag=u'SF', src_bytes=181.0, dst_bytes=5450.0, land=0.0, wrong_fragment=0.0, urgent=0.0, hot=0.0, num_failed_logins=0.0, logged_in=1.0, num_compromised=0.0, root_shell=0.0, su_attempted=0.0, num_root=0.0, num_file_creations=0.0, num_shells=0.0, num_access_files=0.0, num_outbound_cmds=0.0, is_host_login=0.0, is_guest_login=0.0, count=8.0, srv_count=8.0, serror_rate=0.0, srv_serror_rate=0.0, rerror_rate=0.0, srv_rerror_rate=0.0, same_srv_rate=1.0, diff_srv_rate=0.0, srv_diff_host_rate=0.0, dst_host_count=9.0, dst_host_srv_count=9.0, dst_host_same_srv_rate=1.0, dst_host_diff_srv_rate=0.0, dst_host_same_src_port_rate=0.11, dst_host_srv_diff_host_rate=0.0, dst_host_serror_rate=0.0, dst_host_srv_serror_rate=0.0, dst_host_rerror_rate=0.0, dst_host_srv_rerror_rate=0.0, target=u'normal')" 681 | ] 682 | },
683 | "execution_count": 26, 684 | "metadata": {}, 685 | "output_type": "execute_result" 686 | } 687 | ], 688 | "source": [ 689 | "sampled_train_df.first()" 690 | ] 691 | }, 692 | {
693 | "cell_type": "code", 694 | "execution_count": 27, 695 | "metadata": { 696 | "collapsed": false 697 | }, 698 | "outputs": [ 699 | { 700 | "name": "stdout", 701 | "output_type": "stream", 702 | "text": [ 703 | "F1-score test set: 0.966865218179\n" 704 | ] 705 | } 706 | ], 707 | "source": [
708 | "full_model = full_pipeline.fit(sampled_train_df)\n", 709 | "predictions = full_model.transform(test_df)\n", 710 | "print \"F1-score test set:\", evaluator.evaluate(predictions)" 711 | ] 712 | }, 713 | {
714 | "cell_type": "code", 715 | "execution_count": 28, 716 | "metadata": { 717 | "collapsed": false, 718 | "scrolled": false 719 | }, 720 | "outputs": [ 721 | { 722 | "name": "stdout", 723 | "output_type": "stream", 724 | "text": [ 725 | "F1-score test set: 0.967669293816\n" 726 | ] 727 | } 728 | ], 729 | "source": [ 730 | "clf = 
RandomForestClassifier(\n", 731 | "    numTrees=50, maxBins=100, seed=101,\n", 732 | "    labelCol=\"target_cat\", featuresCol=\"features\")\n", 733 | "\n", 734 | "stages = full_pipeline.getStages()[:-1]\n", 735 | "stages.append(clf)\n", 736 | "\n", 737 | "refined_pipeline = Pipeline(stages=stages)\n", 738 | "\n", 739 | "refined_model = refined_pipeline.fit(sampled_train_df)\n", 740 | "predictions = refined_model.transform(test_df)\n", 741 | "print \"F1-score test set:\", evaluator.evaluate(predictions)" 742 | ] 743 | }, 744 | {
745 | "cell_type": "code", 746 | "execution_count": 29, 747 | "metadata": { 748 | "collapsed": false 749 | }, 750 | "outputs": [], 751 | "source": [
752 | "pipeline_to_clf = Pipeline(\n", 753 | "    stages=preproc_stages + [assembler]).fit(sampled_train_df)\n", 754 | "train = pipeline_to_clf.transform(sampled_train_df).cache()\n", 755 | "test = pipeline_to_clf.transform(test_df)" 756 | ] 757 | }, 758 | {
759 | "cell_type": "code", 760 | "execution_count": 30, 761 | "metadata": { 762 | "collapsed": false 763 | }, 764 | "outputs": [], 765 | "source": [
766 | "# May take some 10 minutes\n", 767 | "\n", 768 | "from pyspark.ml.tuning import ParamGridBuilder, CrossValidator\n", 769 | " \n", 770 | "rf = RandomForestClassifier(\n", 771 | "    cacheNodeIds=True, seed=101, labelCol=\"target_cat\", \n", 772 | "    featuresCol=\"features\", maxBins=100)\n", 773 | "\n", 774 | "grid = (ParamGridBuilder() \n", 775 | "        .addGrid(rf.maxDepth, [3, 6, 9, 12]) \n", 776 | "        .addGrid(rf.numTrees, [20, 50]) \n", 777 | "        .build())\n", 778 | "\n", 779 | "cv = CrossValidator(\n", 780 | "    estimator=rf, estimatorParamMaps=grid, \n", 781 | "    evaluator=evaluator, numFolds=3)\n", 782 | "cvModel = cv.fit(train)" 783 | ] 784 | }, 785 | {
786 | "cell_type": "code", 787 | "execution_count": 31, 788 | "metadata": { 789 | "collapsed": false 790 | }, 791 | "outputs": [ 792 | { 793 | "name": "stdout", 794 | "output_type": "stream", 795 | "text": [ 796 | "F1-score test set: 0.969948273422\n" 797 | ] 798 | } 799 | ], 800 | "source": [ 801 | "predictions = cvModel.transform(test)\n", 802 | "print \"F1-score test set:\", evaluator.evaluate(predictions)" 803 | ] 804 | }, 805 | {
806 | "cell_type": "code", 807 | "execution_count": 32, 808 | "metadata": { 809 | "collapsed": false 810 | }, 811 | "outputs": [ 812 | { 813 | "data": { 814 | "image/png": "<base64 PNG data omitted: 'Normalized Confusion matrix' plot for the cross-validated model, True label vs. Predicted label>",
815 | "text/plain": [ 816 | "" 817 | ] 818 | }, 819 | "metadata": {}, 820 | "output_type": "display_data" 821 | } 822 | ], 823 | "source": [
824 | "metrics = MulticlassMetrics(predictions.select(\n", 825 | "    \"prediction\", \"target_cat\").rdd)\n", 826 | "conf_matrix = metrics.confusionMatrix().toArray()\n", 827 | "plot_confusion_matrix(conf_matrix)" 828 | ] 829 | }, 830 | {
831 | "cell_type": "code", 832 | "execution_count": 33, 833 | "metadata": { 834 | "collapsed": false 835 | }, 836 | "outputs": [ 837 | { 838 | "data": { 839 | "text/plain": [
840 | "DataFrame[duration: double, protocol_type: string, service: string, flag: string, src_bytes: double, dst_bytes: double, land: double, wrong_fragment: double, urgent: double, hot: double, num_failed_logins: double, logged_in: double, num_compromised: double, root_shell: double, su_attempted: double, num_root: double, num_file_creations: double, num_shells: double, num_access_files: double, num_outbound_cmds: double, is_host_login: double, is_guest_login: double, count: double, srv_count: double, serror_rate: double, srv_serror_rate: double, rerror_rate: double, srv_rerror_rate: double, same_srv_rate: double, diff_srv_rate: double, srv_diff_host_rate: double, dst_host_count: double, dst_host_srv_count: double, dst_host_same_srv_rate: double, dst_host_diff_srv_rate: double, dst_host_same_src_port_rate: double, dst_host_srv_diff_host_rate: double, dst_host_serror_rate: double, dst_host_srv_serror_rate: double, dst_host_rerror_rate: double, dst_host_srv_rerror_rate: double, target: string, protocol_type_cat: double, service_cat: double, flag_cat: double, target_cat: double, features: vector]" 841 | ] 842 | },
843 | "execution_count": 33, 844 | "metadata": {}, 845 | "output_type": "execute_result" 846 | } 847 | ], 848 | "source": [ 849 | "#cleanup\n", 850 | "bc_sample_rates.unpersist()\n", 851 | "sampled_train_df.unpersist()\n", 852 | "train.unpersist()" 853 | ] 854 | }, 855 | {
856 | "cell_type": "code", 857 | "execution_count": null, 858 | "metadata": { 859 | "collapsed": true 860 | }, 861 | "outputs": [], 862 | "source": [] 863 | } 864 | ], 865 | "metadata": { 866 | "kernelspec": { 867 | "display_name": "Python 2", 868 | "language": "python", 869 | "name": "python2" 870 | }, 871 | "language_info": { 872 | "codemirror_mode": { 873 | "name": "ipython", 874 | "version": 2 875 | }, 876 | "file_extension": ".py", 877 | "mimetype": "text/x-python", 878 | "name": "python", 879 | "nbconvert_exporter": "python", 880 | "pygments_lexer": "ipython2", 881 | "version": "2.7.6" 882 | } 883 | }, 884 | "nbformat": 4, 885 | "nbformat_minor": 0 886 | } 887 | 
-------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2016 Packt Publishing 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | # Large Scale Machine Learning with Python 5 | This is the code repository for [Large Scale Machine Learning with Python](https://www.packtpub.com/big-data-and-business-intelligence/large-scale-machine-learning-python?utm_source=github&utm_medium=repository&utm_campaign=9781785887215), published by Packt. It contains all the supporting project files necessary to work through the book from start to finish. 6 | 7 | ## Instructions 8 | The execution of the code examples provided in this book requires an installation of Python 2.7 or higher versions on macOS, Linux, or Microsoft Windows. 9 | The examples throughout the book will make frequent use of Python's essential libraries, such as SciPy, NumPy, Scikit-learn, and StatsModels, and to a minor extent, matplotlib and pandas, for scientific and statistical computing. We will also make use of an out-of-core cloud computing application called H2O. 10 | This book is highly dependent on Jupyter and its Notebooks powered by the Python kernel. We will use its most recent version, 4.1, for this book. 11 | The first chapter will provide you with all the step-by-step instructions and some useful tips to set up your Python environment, these core libraries, and all the necessary tools. 12 | 13 | ## Related books 14 | - [R Machine Learning By Example](https://www.packtpub.com/big-data-and-business-intelligence/r-machine-learning-example?utm_source=github&utm_medium=repository&utm_campaign=9781784390846) 15 | - [R Machine Learning Essentials](https://www.packtpub.com/big-data-and-business-intelligence/r-machine-learning-essentials?utm_source=github&utm_medium=repository&utm_campaign=9781783987740) 16 | - [Machine Learning with R](https://www.packtpub.com/big-data-and-business-intelligence/machine-learning-r?utm_source=github&utm_medium=repository&utm_campaign=9781782162148) 17 | ### Download a free PDF 18 | 19 | If you have already purchased a print or Kindle version of this book, you can get a DRM-free PDF version at no cost.
Simply click on the link to claim your free PDF.
20 | https://packt.link/free-ebook/9781785887215

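A quick way to confirm that the environment described in the Instructions section above is in place is to print the versions of the core libraries. The snippet below is only an illustrative sketch: it assumes the listed packages are already importable, and it does not try to check H2O, Spark, or Vowpal Wabbit.

```python
# Illustrative environment check for the core Python libraries named in the README.
# Adjust the list to whatever you actually installed; this file is not part of the repository.
import sys
import numpy, scipy, sklearn, statsmodels, matplotlib, pandas

print("Python %s" % sys.version.split()[0])
for module in (numpy, scipy, sklearn, statsmodels, matplotlib, pandas):
    print("%s %s" % (module.__name__, module.__version__))
```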
-------------------------------------------------------------------------------- /vowpal_wabbit_for_windows/x64/vw.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Large-Scale-Machine-Learning-With-Python/681b476109470a04f354c9f4d152b8de40670eb7/vowpal_wabbit_for_windows/x64/vw.exe -------------------------------------------------------------------------------- /vowpal_wabbit_for_windows/x86/vw.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Large-Scale-Machine-Learning-With-Python/681b476109470a04f354c9f4d152b8de40670eb7/vowpal_wabbit_for_windows/x86/vw.exe --------------------------------------------------------------------------------
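For quick reference, the core Spark ML workflow from the Chapter 09 notebook reproduced above — indexing the categorical KDD99 columns, assembling a feature vector, and fitting a Random Forest evaluated with F1 — can be condensed into a single block. The sketch below is minimal and hedged: it assumes the notebook's own context (a Spark 1.5/1.6-style `sc` and `sqlContext`, plus `train_df`/`test_df` built from the KDD99 files exactly as in the notebook's opening cells), and the variable names introduced here are illustrative rather than part of the original code.

```python
# Condensed sketch of the notebook's Spark ML pipeline (Spark 1.5/1.6-era pyspark.ml API).
# Assumes train_df / test_df already exist with the KDD99 schema built in the notebook.
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

categorical = ["protocol_type", "service", "flag", "target"]

# Encode each string column into a numeric *_cat column.
indexers = [StringIndexer(inputCol=c, outputCol=c + "_cat", handleInvalid="skip")
            for c in categorical]

# Every original numeric column plus the indexed categorical features
# (but not the label) feeds a single "features" vector.
feature_cols = ([c for c in train_df.columns if c not in categorical] +
                ["protocol_type_cat", "service_cat", "flag_cat"])
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")

# maxBins=100 keeps the trees compatible with the high-cardinality indexed features.
rf = RandomForestClassifier(labelCol="target_cat", featuresCol="features",
                            maxBins=100, seed=101)

pipeline = Pipeline(stages=indexers + [assembler, rf])
model = pipeline.fit(train_df)

evaluator = MulticlassClassificationEvaluator(labelCol="target_cat",
                                              predictionCol="prediction",
                                              metricName="f1")
print("F1-score test set: %s" % evaluator.evaluate(model.transform(test_df)))
```

The class-rebalancing step (broadcast sampling rates plus `flatMap`) and the `CrossValidator`/`ParamGridBuilder` grid search from the notebook's later cells can be layered on top of this skeleton in exactly the way those cells show.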