├── vowpal_wabbit_for_windows
│   ├── x64
│   │   └── vw.exe
│   └── x86
│       └── vw.exe
├── Chapter 08
│   ├── Chapter_8_code_Vagrantfile
│   ├── Chapter_8_code_Spark.ipynb
│   ├── Chapter_8_code_HDFS.ipynb
│   └── Chapter_8_code_MR.ipynb
├── LICENSE
├── README.md
├── Chapter 09
│   ├── Chapter_9_code_01.ipynb
│   └── Chapter_9_code_02.ipynb
└── Chapter 03
    └── Chapter_3_code.ipynb
/vowpal_wabbit_for_windows/x64/vw.exe:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Large-Scale-Machine-Learning-With-Python/HEAD/vowpal_wabbit_for_windows/x64/vw.exe
--------------------------------------------------------------------------------
/vowpal_wabbit_for_windows/x86/vw.exe:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Large-Scale-Machine-Learning-With-Python/HEAD/vowpal_wabbit_for_windows/x86/vw.exe
--------------------------------------------------------------------------------
/Chapter 08/Chapter_8_code_Vagrantfile:
--------------------------------------------------------------------------------
1 | Vagrant.configure("2") do |config|
2 | config.vm.box = "sparkpy/sparkbox_test_1"
3 | config.vm.hostname = "sparkbox"
4 | config.ssh.insert_key = false
5 |
6 | # Hadoop ResourceManager
7 | config.vm.network :forwarded_port, guest: 8088, host: 8088, auto_correct: true
8 |
9 | # Hadoop NameNode
10 | config.vm.network :forwarded_port, guest: 50070, host: 50070, auto_correct: true
11 |
12 | # Hadoop DataNode
13 | config.vm.network :forwarded_port, guest: 50075, host: 50075, auto_correct: true
14 |
15 |   # IPython notebooks (YARN and standalone)
16 | config.vm.network :forwarded_port, guest: 8888, host: 8888, auto_correct: true
17 |
18 |
19 | config.vm.provider "virtualbox" do |v|
20 | v.customize ["modifyvm", :id, "--natdnshostresolver1", "on"]
21 | v.customize ["modifyvm", :id, "--natdnsproxy1", "on"]
22 | v.customize ["modifyvm", :id, "--nictype1", "virtio"]
23 |
24 | v.name = "sparkbox_test"
25 | v.memory = "4096"
26 | v.cpus = "2"
27 | end
28 |
29 | end
30 |
--------------------------------------------------------------------------------
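Usage note (an editorial addition, not part of the repository): once the virtual machine defined above has been brought up with Vagrant, the forwarded ports should become reachable from the host as soon as the corresponding guest services are running. The sketch below is a hypothetical helper, written against the port list in the Vagrantfile, that probes those ports on 127.0.0.1 using only Python's standard library.

# Hypothetical helper (not in the repo): probe the ports forwarded by the
# Vagrantfile above from the host machine. Assumes the VM is running and the
# corresponding services have been started inside the guest.
import socket

PORTS = {
    8088: "Hadoop ResourceManager UI",
    50070: "Hadoop NameNode UI",
    50075: "Hadoop DataNode UI",
    8888: "Jupyter/IPython Notebook",
}

for port, name in sorted(PORTS.items()):
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    sock.settimeout(2)
    status = "open" if sock.connect_ex(("127.0.0.1", port)) == 0 else "closed"
    sock.close()
    print("%-5d %-28s %s" % (port, name, status))
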
/LICENSE:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 |
3 | Copyright (c) 2016 Packt Publishing
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | # Large Scale Machine Learning with Python
5 | This is the code repository for [Large Scale Machine Learning with Python](https://www.packtpub.com/big-data-and-business-intelligence/large-scale-machine-learning-python?utm_source=github&utm_medium=repository&utm_campaign=9781785887215), published by Packt. It contains all the supporting project files necessary to work through the book from start to finish.
6 |
7 | ## Instructions
8 | Running the code examples provided in this book requires an installation of Python 2.7 or higher on macOS, Linux, or Microsoft Windows.
9 | The examples throughout the book will make frequent use of Python's essential libraries, such as SciPy, NumPy, Scikit-learn, and StatsModels, and to a minor extent, matplotlib and pandas, for scientific and statistical computing. We will also make use of an out-of-core cloud computing application called H2O.
10 | This book depends heavily on Jupyter and its Notebooks powered by the Python kernel; we will use its most recent version, 4.1, throughout.
11 | The first chapter will provide you with all the step-by-step instructions and some useful tips to set up your Python environment, these core libraries, and all the necessary tools.
12 |
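
As a quick, optional aid (an addition, not part of the original book code), the following minimal sketch checks that the core libraries listed above are importable and prints their versions; the import names are the standard ones for these packages, and the snippet runs on both Python 2.7 and 3.x.

```python
# Minimal environment sanity check (optional, not from the book):
# verify that the core libraries import correctly and report their versions.
from __future__ import print_function
import importlib

for pkg in ("numpy", "scipy", "sklearn", "statsmodels", "matplotlib", "pandas"):
    try:
        mod = importlib.import_module(pkg)
        print("%-12s %s" % (pkg, getattr(mod, "__version__", "unknown")))
    except ImportError:
        print("%-12s NOT INSTALLED" % pkg)
```

If any package is reported as missing, install it with your preferred package manager (for example, pip or conda) before running the notebooks.
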
13 | ## Related books
14 | - [R Machine Learning By Example](https://www.packtpub.com/big-data-and-business-intelligence/r-machine-learning-example?utm_source=github&utm_medium=repository&utm_campaign=9781784390846)
15 | - [R Machine Learning Essentials](https://www.packtpub.com/big-data-and-business-intelligence/r-machine-learning-essentials?utm_source=github&utm_medium=repository&utm_campaign=9781783987740)
16 | - [Machine Learning with R](https://www.packtpub.com/big-data-and-business-intelligence/machine-learning-r?utm_source=github&utm_medium=repository&utm_campaign=9781782162148)
17 | ### Download a free PDF
18 |
19 | If you have already purchased a print or Kindle version of this book, you can get a DRM-free PDF version at no cost. Simply click on the link to claim your free PDF.
20 |
21 | https://packt.link/free-ebook/9781785887215
--------------------------------------------------------------------------------
/Chapter 08/Chapter_8_code_Spark.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 |     "Let's first insert some data into HDFS"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 1,
13 | "metadata": {
14 | "collapsed": false
15 | },
16 | "outputs": [
17 | {
18 | "name": "stdout",
19 | "output_type": "stream",
20 | "text": [
21 | "Found 2 items\n",
22 | "-rw-r--r-- 1 vagrant supergroup 1365 2016-05-10 20:06 /datasets/hadoop_git_readme.txt\n",
23 | "-rw-r--r-- 1 vagrant supergroup 5589889 2016-05-10 20:06 /datasets/shakespeare_all.txt\n",
24 | "16/05/10 20:06:36 INFO fs.TrashPolicyDefault: Namenode trash configuration: Deletion interval = 0 minutes, Emptier interval = 0 minutes.\n",
25 | "Deleted /tmp\n"
26 | ]
27 | }
28 | ],
29 | "source": [
30 | "!hdfs dfs -mkdir -p /datasets\n",
31 | "!wget -q http://www.gutenberg.org/cache/epub/100/pg100.txt \\\n",
32 | " -O ../datasets/shakespeare_all.txt\n",
33 | "!hdfs dfs -put -f ../datasets/shakespeare_all.txt /datasets/shakespeare_all.txt\n",
34 | "!hdfs dfs -put -f ../datasets/hadoop_git_readme.txt /datasets/hadoop_git_readme.txt\n",
35 | "!hdfs dfs -ls /datasets\n",
36 | "!hdfs dfs -rm -r /tmp"
37 | ]
38 | },
39 | {
40 | "cell_type": "markdown",
41 | "metadata": {},
42 | "source": [
43 |     "## PySpark"
44 | ]
45 | },
46 | {
47 | "cell_type": "code",
48 | "execution_count": 2,
49 | "metadata": {
50 | "collapsed": false
51 | },
52 | "outputs": [
53 | {
54 | "data": {
55 | "text/plain": [
56 | "[(u'spark.rdd.compress', u'True'),\n",
57 | " (u'spark.master', u'yarn-client'),\n",
58 | " (u'spark.serializer.objectStreamReset', u'100'),\n",
59 | " (u'spark.yarn.isPython', u'true'),\n",
60 | " (u'spark.submit.deployMode', u'client'),\n",
61 | " (u'spark.executor.cores', u'2'),\n",
62 | " (u'spark.app.name', u'PySparkShell')]"
63 | ]
64 | },
65 | "execution_count": 2,
66 | "metadata": {},
67 | "output_type": "execute_result"
68 | }
69 | ],
70 | "source": [
71 | "sc._conf.getAll()"
72 | ]
73 | },
74 | {
75 | "cell_type": "code",
76 | "execution_count": 3,
77 | "metadata": {
78 | "collapsed": false
79 | },
80 | "outputs": [
81 | {
82 | "data": {
83 | "text/plain": [
84 | "ParallelCollectionRDD[0] at parallelize at PythonRDD.scala:423"
85 | ]
86 | },
87 | "execution_count": 3,
88 | "metadata": {},
89 | "output_type": "execute_result"
90 | }
91 | ],
92 | "source": [
93 | "numbers = range(10)\n",
94 | "numbers_rdd = sc.parallelize(numbers)\n",
95 | "\n",
96 | "numbers_rdd"
97 | ]
98 | },
99 | {
100 | "cell_type": "code",
101 | "execution_count": 4,
102 | "metadata": {
103 | "collapsed": false
104 | },
105 | "outputs": [
106 | {
107 | "data": {
108 | "text/plain": [
109 | "[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]"
110 | ]
111 | },
112 | "execution_count": 4,
113 | "metadata": {},
114 | "output_type": "execute_result"
115 | }
116 | ],
117 | "source": [
118 | "numbers_rdd.collect()"
119 | ]
120 | },
121 | {
122 | "cell_type": "code",
123 | "execution_count": 5,
124 | "metadata": {
125 | "collapsed": false
126 | },
127 | "outputs": [
128 | {
129 | "data": {
130 | "text/plain": [
131 | "[0, 1, 2, 3]"
132 | ]
133 | },
134 | "execution_count": 5,
135 | "metadata": {},
136 | "output_type": "execute_result"
137 | }
138 | ],
139 | "source": [
140 | "numbers_rdd.take(4)"
141 | ]
142 | },
143 | {
144 | "cell_type": "code",
145 | "execution_count": 6,
146 | "metadata": {
147 | "collapsed": false
148 | },
149 | "outputs": [
150 | {
151 | "data": {
152 | "text/plain": [
153 | "u'For the latest information about Hadoop, please visit our website at:'"
154 | ]
155 | },
156 | "execution_count": 6,
157 | "metadata": {},
158 | "output_type": "execute_result"
159 | }
160 | ],
161 | "source": [
162 | "sc.textFile(\"hdfs:///datasets/hadoop_git_readme.txt\").first()"
163 | ]
164 | },
165 | {
166 | "cell_type": "code",
167 | "execution_count": 7,
168 | "metadata": {
169 | "collapsed": false
170 | },
171 | "outputs": [
172 | {
173 | "data": {
174 | "text/plain": [
175 | "u'For the latest information about Hadoop, please visit our website at:'"
176 | ]
177 | },
178 | "execution_count": 7,
179 | "metadata": {},
180 | "output_type": "execute_result"
181 | }
182 | ],
183 | "source": [
184 | "sc.textFile(\"file:///home/vagrant/datasets/hadoop_git_readme.txt\").first()"
185 | ]
186 | },
187 | {
188 | "cell_type": "code",
189 | "execution_count": 8,
190 | "metadata": {
191 | "collapsed": true
192 | },
193 | "outputs": [],
194 | "source": [
195 | "numbers_rdd.saveAsTextFile(\"hdfs:///tmp/numbers_1_10.txt\")"
196 | ]
197 | },
198 | {
199 | "cell_type": "code",
200 | "execution_count": 9,
201 | "metadata": {
202 | "collapsed": false
203 | },
204 | "outputs": [
205 | {
206 | "name": "stdout",
207 | "output_type": "stream",
208 | "text": [
209 | "Found 5 items\r\n",
210 | "-rw-r--r-- 1 vagrant supergroup 0 2016-05-10 20:06 /tmp/numbers_1_10.txt/_SUCCESS\r\n",
211 | "-rw-r--r-- 1 vagrant supergroup 4 2016-05-10 20:06 /tmp/numbers_1_10.txt/part-00000\r\n",
212 | "-rw-r--r-- 1 vagrant supergroup 4 2016-05-10 20:06 /tmp/numbers_1_10.txt/part-00001\r\n",
213 | "-rw-r--r-- 1 vagrant supergroup 4 2016-05-10 20:06 /tmp/numbers_1_10.txt/part-00002\r\n",
214 | "-rw-r--r-- 1 vagrant supergroup 8 2016-05-10 20:06 /tmp/numbers_1_10.txt/part-00003\r\n"
215 | ]
216 | }
217 | ],
218 | "source": [
219 | "!hdfs dfs -ls /tmp/numbers_1_10.txt"
220 | ]
221 | },
222 | {
223 | "cell_type": "code",
224 | "execution_count": 10,
225 | "metadata": {
226 | "collapsed": true
227 | },
228 | "outputs": [],
229 | "source": [
230 | "numbers_rdd.coalesce(1).saveAsTextFile(\"hdfs:///tmp/numbers_1_10_one_file.txt\")"
231 | ]
232 | },
233 | {
234 | "cell_type": "code",
235 | "execution_count": 11,
236 | "metadata": {
237 | "collapsed": false
238 | },
239 | "outputs": [
240 | {
241 | "name": "stdout",
242 | "output_type": "stream",
243 | "text": [
244 | "Found 2 items\r\n",
245 | "-rw-r--r-- 1 vagrant supergroup 0 2016-05-10 20:06 /tmp/numbers_1_10_one_file.txt/_SUCCESS\r\n",
246 | "-rw-r--r-- 1 vagrant supergroup 20 2016-05-10 20:06 /tmp/numbers_1_10_one_file.txt/part-00000\r\n"
247 | ]
248 | }
249 | ],
250 | "source": [
251 | "!hdfs dfs -ls /tmp/numbers_1_10_one_file.txt"
252 | ]
253 | },
254 | {
255 | "cell_type": "code",
256 | "execution_count": 12,
257 | "metadata": {
258 | "collapsed": false
259 | },
260 | "outputs": [
261 | {
262 | "name": "stdout",
263 | "output_type": "stream",
264 | "text": [
265 | "0\r\n",
266 | "1\r\n",
267 | "2\r\n",
268 | "3\r\n",
269 | "4\r\n",
270 | "5\r\n",
271 | "6\r\n",
272 | "7\r\n",
273 | "8\r\n",
274 | "9\r\n"
275 | ]
276 | }
277 | ],
278 | "source": [
279 | "!hdfs dfs -cat /tmp/numbers_1_10_one_file.txt/part-00000"
280 | ]
281 | },
282 | {
283 | "cell_type": "code",
284 | "execution_count": 13,
285 | "metadata": {
286 | "collapsed": true
287 | },
288 | "outputs": [],
289 | "source": [
290 | "numbers_rdd.saveAsTextFile(\"file:///tmp/numbers_1_10.txt\")"
291 | ]
292 | },
293 | {
294 | "cell_type": "code",
295 | "execution_count": 14,
296 | "metadata": {
297 | "collapsed": false
298 | },
299 | "outputs": [
300 | {
301 | "name": "stdout",
302 | "output_type": "stream",
303 | "text": [
304 | "part-00000 part-00001\tpart-00002 part-00003\t_SUCCESS\r\n"
305 | ]
306 | }
307 | ],
308 | "source": [
309 | "!ls /tmp/numbers_1_10.txt"
310 | ]
311 | },
312 | {
313 | "cell_type": "code",
314 | "execution_count": 15,
315 | "metadata": {
316 | "collapsed": false
317 | },
318 | "outputs": [
319 | {
320 | "data": {
321 | "text/plain": [
322 | "[0, 1, 4, 9, 16, 25, 36, 49, 64, 81]"
323 | ]
324 | },
325 | "execution_count": 15,
326 | "metadata": {},
327 | "output_type": "execute_result"
328 | }
329 | ],
330 | "source": [
331 | "def sq(x):\n",
332 | " return x**2\n",
333 | "\n",
334 | "numbers_rdd.map(sq).collect()"
335 | ]
336 | },
337 | {
338 | "cell_type": "code",
339 | "execution_count": 16,
340 | "metadata": {
341 | "collapsed": false
342 | },
343 | "outputs": [
344 | {
345 | "data": {
346 | "text/plain": [
347 | "[0, 1, 4, 9, 16, 25, 36, 49, 64, 81]"
348 | ]
349 | },
350 | "execution_count": 16,
351 | "metadata": {},
352 | "output_type": "execute_result"
353 | }
354 | ],
355 | "source": [
356 | "numbers_rdd.map(lambda x: x**2).collect()"
357 | ]
358 | },
359 | {
360 | "cell_type": "code",
361 | "execution_count": 17,
362 | "metadata": {
363 | "collapsed": false
364 | },
365 | "outputs": [
366 | {
367 | "data": {
368 | "text/plain": [
369 | "285"
370 | ]
371 | },
372 | "execution_count": 17,
373 | "metadata": {},
374 | "output_type": "execute_result"
375 | }
376 | ],
377 | "source": [
378 | "numbers_rdd.map(lambda x: x**2).reduce(lambda a,b: a+b)"
379 | ]
380 | },
381 | {
382 | "cell_type": "code",
383 | "execution_count": 18,
384 | "metadata": {
385 | "collapsed": false
386 | },
387 | "outputs": [
388 | {
389 | "data": {
390 | "text/plain": [
391 | "285"
392 | ]
393 | },
394 | "execution_count": 18,
395 | "metadata": {},
396 | "output_type": "execute_result"
397 | }
398 | ],
399 | "source": [
400 | "numbers_rdd.map(lambda x: x**2).sum()"
401 | ]
402 | },
403 | {
404 | "cell_type": "code",
405 | "execution_count": 19,
406 | "metadata": {
407 | "collapsed": false
408 | },
409 | "outputs": [
410 | {
411 | "data": {
412 | "text/plain": [
413 | "[('even', 0),\n",
414 | " ('odd', 1),\n",
415 | " ('even', 2),\n",
416 | " ('odd', 3),\n",
417 | " ('even', 4),\n",
418 | " ('odd', 5),\n",
419 | " ('even', 6),\n",
420 | " ('odd', 7),\n",
421 | " ('even', 8),\n",
422 | " ('odd', 9)]"
423 | ]
424 | },
425 | "execution_count": 19,
426 | "metadata": {},
427 | "output_type": "execute_result"
428 | }
429 | ],
430 | "source": [
431 | "def tag(x):\n",
432 | " return \"even\" if x%2==0 else \"odd\"\n",
433 | " \n",
434 | " \n",
435 | "numbers_rdd.map(lambda x: (tag(x), x) ).collect()"
436 | ]
437 | },
438 | {
439 | "cell_type": "code",
440 | "execution_count": 20,
441 | "metadata": {
442 | "collapsed": false
443 | },
444 | "outputs": [
445 | {
446 | "data": {
447 | "text/plain": [
448 | "[('even', 20), ('odd', 25)]"
449 | ]
450 | },
451 | "execution_count": 20,
452 | "metadata": {},
453 | "output_type": "execute_result"
454 | }
455 | ],
456 | "source": [
457 | "numbers_rdd.map(lambda x: (tag(x), x) ).reduceByKey(lambda a,b: a+b).collect()"
458 | ]
459 | },
460 | {
461 | "cell_type": "code",
462 | "execution_count": null,
463 | "metadata": {
464 | "collapsed": true
465 | },
466 | "outputs": [],
467 | "source": []
468 | },
469 | {
470 | "cell_type": "code",
471 | "execution_count": null,
472 | "metadata": {
473 | "collapsed": true
474 | },
475 | "outputs": [],
476 | "source": []
477 | },
478 | {
479 | "cell_type": "code",
480 | "execution_count": 21,
481 | "metadata": {
482 | "collapsed": false
483 | },
484 | "outputs": [
485 | {
486 | "name": "stdout",
487 | "output_type": "stream",
488 | "text": [
489 | "{'chars': 1335, 'lines': 31, 'words': 179}\n"
490 | ]
491 | }
492 | ],
493 | "source": [
494 | "def emit_feats(line):\n",
495 | " return [(\"chars\", len(line)), \\\n",
496 | " (\"words\", len(line.split())), \\\n",
497 | " (\"lines\", 1)]\n",
498 | "\n",
499 | "print (sc.textFile(\"/datasets/hadoop_git_readme.txt\")\n",
500 | " .flatMap(emit_feats)\n",
501 | " .reduceByKey(lambda a,b: a+b)\n",
502 | " .collectAsMap())"
503 | ]
504 | },
505 | {
506 | "cell_type": "code",
507 | "execution_count": 22,
508 | "metadata": {
509 | "collapsed": false
510 | },
511 | "outputs": [
512 | {
513 | "name": "stdout",
514 | "output_type": "stream",
515 | "text": [
516 | "[(27801, u'the')]\n"
517 | ]
518 | }
519 | ],
520 | "source": [
521 | "import re\n",
522 | "WORD_RE = re.compile(r\"[\\w']+\")\n",
523 | "\n",
524 | "print (sc.textFile(\"/datasets/shakespeare_all.txt\")\n",
525 | " .flatMap(lambda line: WORD_RE.findall(line))\n",
526 | " .map(lambda word: (word.lower(), 1))\n",
527 | " .reduceByKey(lambda a,b: a+b)\n",
528 | " .map(lambda (k,v): (v,k))\n",
529 | " .takeOrdered(1, key = lambda x: -x[0]))"
530 | ]
531 | },
532 | {
533 | "cell_type": "code",
534 | "execution_count": 23,
535 | "metadata": {
536 | "collapsed": false
537 | },
538 | "outputs": [
539 | {
540 | "name": "stdout",
541 | "output_type": "stream",
542 | "text": [
543 | "[(u'the', 27801)]\n"
544 | ]
545 | }
546 | ],
547 | "source": [
548 | "print (sc.textFile(\"/datasets/shakespeare_all.txt\")\n",
549 | " .flatMap(lambda line: [(word.lower(), 1) for word in WORD_RE.findall(line)])\n",
550 | " .reduceByKey(lambda a,b: a+b)\n",
551 | " .takeOrdered(1, key = lambda x: -x[1]))"
552 | ]
553 | },
554 | {
555 | "cell_type": "code",
556 | "execution_count": null,
557 | "metadata": {
558 | "collapsed": true
559 | },
560 | "outputs": [],
561 | "source": []
562 | }
563 | ],
564 | "metadata": {
565 | "kernelspec": {
566 | "display_name": "Python 2",
567 | "language": "python",
568 | "name": "python2"
569 | },
570 | "language_info": {
571 | "codemirror_mode": {
572 | "name": "ipython",
573 | "version": 2
574 | },
575 | "file_extension": ".py",
576 | "mimetype": "text/x-python",
577 | "name": "python",
578 | "nbconvert_exporter": "python",
579 | "pygments_lexer": "ipython2",
580 | "version": "2.7.6"
581 | }
582 | },
583 | "nbformat": 4,
584 | "nbformat_minor": 0
585 | }
586 |
--------------------------------------------------------------------------------
/Chapter 08/Chapter_8_code_HDFS.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "collapsed": false
8 | },
9 | "outputs": [
10 | {
11 | "name": "stdout",
12 | "output_type": "stream",
13 | "text": [
14 | "16/05/10 19:34:19 INFO fs.TrashPolicyDefault: Namenode trash configuration: Deletion interval = 0 minutes, Emptier interval = 0 minutes.\n",
15 | "Deleted /tmp\n",
16 | "16/05/10 19:34:22 INFO fs.TrashPolicyDefault: Namenode trash configuration: Deletion interval = 0 minutes, Emptier interval = 0 minutes.\n"
17 | ]
18 | }
19 | ],
20 | "source": [
21 | "# Clean up\n",
22 | "!hdfs dfs -rm -r -f /datasets /tmp\n",
23 | "!rm -rf /tmp/hadoop_git_readme*\n",
24 | "!hdfs dfs -expunge"
25 | ]
26 | },
27 | {
28 | "cell_type": "markdown",
29 | "metadata": {},
30 | "source": [
31 | "## Command line"
32 | ]
33 | },
34 | {
35 | "cell_type": "code",
36 | "execution_count": 2,
37 | "metadata": {
38 | "collapsed": false
39 | },
40 | "outputs": [
41 | {
42 | "name": "stdout",
43 | "output_type": "stream",
44 | "text": [
45 | "Configured Capacity: 42241163264 (39.34 GB)\r\n",
46 | "Present Capacity: 37536710656 (34.96 GB)\r\n",
47 | "DFS Remaining: 37346992128 (34.78 GB)\r\n",
48 | "DFS Used: 189718528 (180.93 MB)\r\n",
49 | "DFS Used%: 0.51%\r\n",
50 | "Under replicated blocks: 0\r\n",
51 | "Blocks with corrupt replicas: 0\r\n",
52 | "Missing blocks: 0\r\n",
53 | "\r\n",
54 | "-------------------------------------------------\r\n",
55 | "Live datanodes (1):\r\n",
56 | "\r\n",
57 | "Name: 127.0.0.1:50010 (localhost)\r\n",
58 | "Hostname: sparkbox\r\n",
59 | "Decommission Status : Normal\r\n",
60 | "Configured Capacity: 42241163264 (39.34 GB)\r\n",
61 | "DFS Used: 189718528 (180.93 MB)\r\n",
62 | "Non DFS Used: 4704452608 (4.38 GB)\r\n",
63 | "DFS Remaining: 37346992128 (34.78 GB)\r\n",
64 | "DFS Used%: 0.45%\r\n",
65 | "DFS Remaining%: 88.41%\r\n",
66 | "Configured Cache Capacity: 0 (0 B)\r\n",
67 | "Cache Used: 0 (0 B)\r\n",
68 | "Cache Remaining: 0 (0 B)\r\n",
69 | "Cache Used%: 100.00%\r\n",
70 | "Cache Remaining%: 0.00%\r\n",
71 | "Xceivers: 1\r\n",
72 | "Last contact: Tue May 10 19:34:23 UTC 2016\r\n",
73 | "\r\n",
74 | "\r\n"
75 | ]
76 | }
77 | ],
78 | "source": [
79 | "!hdfs dfsadmin -report"
80 | ]
81 | },
82 | {
83 | "cell_type": "code",
84 | "execution_count": 3,
85 | "metadata": {
86 | "collapsed": false
87 | },
88 | "outputs": [
89 | {
90 | "name": "stdout",
91 | "output_type": "stream",
92 | "text": [
93 | "Found 2 items\r\n",
94 | "drwxr-xr-x - vagrant supergroup 0 2016-05-10 19:05 /spark\r\n",
95 | "drwxr-xr-x - vagrant supergroup 0 2016-05-10 18:48 /user\r\n"
96 | ]
97 | }
98 | ],
99 | "source": [
100 | "!hdfs dfs -ls /"
101 | ]
102 | },
103 | {
104 | "cell_type": "code",
105 | "execution_count": 4,
106 | "metadata": {
107 | "collapsed": false
108 | },
109 | "outputs": [
110 | {
111 | "name": "stdout",
112 | "output_type": "stream",
113 | "text": [
114 | "Filesystem Size Used Available Use%\r\n",
115 | "hdfs://localhost:9000 39.3 G 180.9 M 34.8 G 0%\r\n"
116 | ]
117 | }
118 | ],
119 | "source": [
120 | "!hdfs dfs -df -h /"
121 | ]
122 | },
123 | {
124 | "cell_type": "code",
125 | "execution_count": 5,
126 | "metadata": {
127 | "collapsed": false
128 | },
129 | "outputs": [
130 | {
131 | "name": "stdout",
132 | "output_type": "stream",
133 | "text": [
134 | "179.0 M /spark\r\n",
135 | "473.4 K /user\r\n"
136 | ]
137 | }
138 | ],
139 | "source": [
140 | "!hdfs dfs -du -h /"
141 | ]
142 | },
143 | {
144 | "cell_type": "code",
145 | "execution_count": 6,
146 | "metadata": {
147 | "collapsed": false
148 | },
149 | "outputs": [],
150 | "source": [
151 | "!hdfs dfs -mkdir /datasets"
152 | ]
153 | },
154 | {
155 | "cell_type": "code",
156 | "execution_count": 7,
157 | "metadata": {
158 | "collapsed": false
159 | },
160 | "outputs": [],
161 | "source": [
162 | "!wget -q http://www.gutenberg.org/cache/epub/100/pg100.txt \\\n",
163 | " -O ../datasets/shakespeare_all.txt"
164 | ]
165 | },
166 | {
167 | "cell_type": "code",
168 | "execution_count": 8,
169 | "metadata": {
170 | "collapsed": false
171 | },
172 | "outputs": [],
173 | "source": [
174 | "!hdfs dfs -put ../datasets/shakespeare_all.txt \\\n",
175 | " /datasets/shakespeare_all.txt\n",
176 | "\n",
177 | "!hdfs dfs -put ../datasets/hadoop_git_readme.txt \\\n",
178 | " /datasets/hadoop_git_readme.txt"
179 | ]
180 | },
181 | {
182 | "cell_type": "code",
183 | "execution_count": 9,
184 | "metadata": {
185 | "collapsed": false
186 | },
187 | "outputs": [
188 | {
189 | "name": "stdout",
190 | "output_type": "stream",
191 | "text": [
192 | "Found 2 items\r\n",
193 | "-rw-r--r-- 1 vagrant supergroup 1365 2016-05-10 19:34 /datasets/hadoop_git_readme.txt\r\n",
194 | "-rw-r--r-- 1 vagrant supergroup 5589889 2016-05-10 19:34 /datasets/shakespeare_all.txt\r\n"
195 | ]
196 | }
197 | ],
198 | "source": [
199 | "!hdfs dfs -ls /datasets"
200 | ]
201 | },
202 | {
203 | "cell_type": "code",
204 | "execution_count": 10,
205 | "metadata": {
206 | "collapsed": false
207 | },
208 | "outputs": [
209 | {
210 | "name": "stdout",
211 | "output_type": "stream",
212 | "text": [
213 | "30\r\n"
214 | ]
215 | }
216 | ],
217 | "source": [
218 | "!hdfs dfs -cat /datasets/hadoop_git_readme.txt | wc -l"
219 | ]
220 | },
221 | {
222 | "cell_type": "code",
223 | "execution_count": 11,
224 | "metadata": {
225 | "collapsed": false
226 | },
227 | "outputs": [
228 | {
229 | "name": "stdout",
230 | "output_type": "stream",
231 | "text": [
232 | "60\r\n"
233 | ]
234 | }
235 | ],
236 | "source": [
237 | "!hdfs dfs -cat \\\n",
238 | " hdfs:///datasets/hadoop_git_readme.txt \\\n",
239 | " file:///home/vagrant/datasets/hadoop_git_readme.txt | wc -l"
240 | ]
241 | },
242 | {
243 | "cell_type": "code",
244 | "execution_count": 12,
245 | "metadata": {
246 | "collapsed": true
247 | },
248 | "outputs": [],
249 | "source": [
250 | "!hdfs dfs -cp /datasets/hadoop_git_readme.txt \\\n",
251 | " /datasets/copy_hadoop_git_readme.txt"
252 | ]
253 | },
254 | {
255 | "cell_type": "code",
256 | "execution_count": 13,
257 | "metadata": {
258 | "collapsed": false
259 | },
260 | "outputs": [
261 | {
262 | "name": "stdout",
263 | "output_type": "stream",
264 | "text": [
265 | "16/05/10 19:35:07 INFO fs.TrashPolicyDefault: Namenode trash configuration: Deletion interval = 0 minutes, Emptier interval = 0 minutes.\r\n",
266 | "Deleted /datasets/copy_hadoop_git_readme.txt\r\n"
267 | ]
268 | }
269 | ],
270 | "source": [
271 | "!hdfs dfs -rm /datasets/copy_hadoop_git_readme.txt"
272 | ]
273 | },
274 | {
275 | "cell_type": "code",
276 | "execution_count": 14,
277 | "metadata": {
278 | "collapsed": false
279 | },
280 | "outputs": [
281 | {
282 | "name": "stdout",
283 | "output_type": "stream",
284 | "text": [
285 | "16/05/10 19:35:09 INFO fs.TrashPolicyDefault: Namenode trash configuration: Deletion interval = 0 minutes, Emptier interval = 0 minutes.\r\n"
286 | ]
287 | }
288 | ],
289 | "source": [
290 | "!hdfs dfs -expunge"
291 | ]
292 | },
293 | {
294 | "cell_type": "code",
295 | "execution_count": 15,
296 | "metadata": {
297 | "collapsed": false
298 | },
299 | "outputs": [],
300 | "source": [
301 | "!hdfs dfs -get /datasets/hadoop_git_readme.txt \\\n",
302 | " /tmp/hadoop_git_readme.txt"
303 | ]
304 | },
305 | {
306 | "cell_type": "code",
307 | "execution_count": 16,
308 | "metadata": {
309 | "collapsed": false
310 | },
311 | "outputs": [
312 | {
313 | "name": "stdout",
314 | "output_type": "stream",
315 | "text": [
316 | "ntry, of \r\n",
317 | "encryption software. BEFORE using any encryption software, please \r\n",
318 | "check your country's laws, regulations and policies concerning the\r\n",
319 | "import, possession, or use, and re-export of encryption software, to \r\n",
320 | "see if this is permitted. See for more\r\n",
321 | "information.\r\n",
322 | "\r\n",
323 | "The U.S. Government Department of Commerce, Bureau of Industry and\r\n",
324 | "Security (BIS), has classified this software as Export Commodity \r\n",
325 | "Control Number (ECCN) 5D002.C.1, which includes information security\r\n",
326 | "software using or performing cryptographic functions with asymmetric\r\n",
327 | "algorithms. The form and manner of this Apache Software Foundation\r\n",
328 | "distribution makes it eligible for export under the License Exception\r\n",
329 | "ENC Technology Software Unrestricted (TSU) exception (see the BIS \r\n",
330 | "Export Administration Regulations, Section 740.13) for both object \r\n",
331 | "code and source code.\r\n",
332 | "\r\n",
333 | "The following provides more details on the included cryptographic\r\n",
334 | "software:\r\n",
335 | " Hadoop Core uses the SSL libraries from the Jetty project written \r\n",
336 | "by mortbay.org."
337 | ]
338 | }
339 | ],
340 | "source": [
341 | "!hdfs dfs -tail /datasets/hadoop_git_readme.txt"
342 | ]
343 | },
344 | {
345 | "cell_type": "markdown",
346 | "metadata": {
347 | "collapsed": true
348 | },
349 | "source": [
350 | "## Snakebite"
351 | ]
352 | },
353 | {
354 | "cell_type": "code",
355 | "execution_count": 17,
356 | "metadata": {
357 | "collapsed": false
358 | },
359 | "outputs": [],
360 | "source": [
361 | "from snakebite.client import Client\n",
362 | "client = Client(\"localhost\", 9000)"
363 | ]
364 | },
365 | {
366 | "cell_type": "code",
367 | "execution_count": 18,
368 | "metadata": {
369 | "collapsed": false
370 | },
371 | "outputs": [
372 | {
373 | "data": {
374 | "text/plain": [
375 | "{'blockSize': 134217728L,\n",
376 | " 'bytesPerChecksum': 512,\n",
377 | " 'checksumType': 2,\n",
378 | " 'encryptDataTransfer': False,\n",
379 | " 'fileBufferSize': 4096,\n",
380 | " 'replication': 1,\n",
381 | " 'trashInterval': 0L,\n",
382 | " 'writePacketSize': 65536}"
383 | ]
384 | },
385 | "execution_count": 18,
386 | "metadata": {},
387 | "output_type": "execute_result"
388 | }
389 | ],
390 | "source": [
391 | "client.serverdefaults()"
392 | ]
393 | },
394 | {
395 | "cell_type": "code",
396 | "execution_count": 19,
397 | "metadata": {
398 | "collapsed": false
399 | },
400 | "outputs": [
401 | {
402 | "name": "stdout",
403 | "output_type": "stream",
404 | "text": [
405 | "/datasets\n",
406 | "/spark\n",
407 | "/user\n"
408 | ]
409 | }
410 | ],
411 | "source": [
412 | "for x in client.ls(['/']):\n",
413 | " print x['path']"
414 | ]
415 | },
416 | {
417 | "cell_type": "code",
418 | "execution_count": 20,
419 | "metadata": {
420 | "collapsed": false
421 | },
422 | "outputs": [
423 | {
424 | "data": {
425 | "text/plain": [
426 | "{'capacity': 42241163264L,\n",
427 | " 'corrupt_blocks': 0L,\n",
428 | " 'filesystem': 'hdfs://localhost:9000',\n",
429 | " 'missing_blocks': 0L,\n",
430 | " 'remaining': 37341663232L,\n",
431 | " 'under_replicated': 0L,\n",
432 | " 'used': 195353480L}"
433 | ]
434 | },
435 | "execution_count": 20,
436 | "metadata": {},
437 | "output_type": "execute_result"
438 | }
439 | ],
440 | "source": [
441 | "client.df()"
442 | ]
443 | },
444 | {
445 | "cell_type": "code",
446 | "execution_count": 21,
447 | "metadata": {
448 | "collapsed": false
449 | },
450 | "outputs": [
451 | {
452 | "data": {
453 | "text/plain": [
454 | "[{'length': 5591254L, 'path': '/datasets'},\n",
455 | " {'length': 187698038L, 'path': '/spark'},\n",
456 | " {'length': 484810L, 'path': '/user'}]"
457 | ]
458 | },
459 | "execution_count": 21,
460 | "metadata": {},
461 | "output_type": "execute_result"
462 | }
463 | ],
464 | "source": [
465 | "list(client.du([\"/\"]))"
466 | ]
467 | },
468 | {
469 | "cell_type": "code",
470 | "execution_count": 22,
471 | "metadata": {
472 | "collapsed": false
473 | },
474 | "outputs": [],
475 | "source": [
476 | "# Note:\n",
477 | "# put command is not yet available"
478 | ]
479 | },
480 | {
481 | "cell_type": "code",
482 | "execution_count": 23,
483 | "metadata": {
484 | "collapsed": false
485 | },
486 | "outputs": [
487 | {
488 | "name": "stdout",
489 | "output_type": "stream",
490 | "text": [
491 | "30\n"
492 | ]
493 | }
494 | ],
495 | "source": [
496 | "for el in client.cat(['/datasets/hadoop_git_readme.txt']):\n",
497 | " print el.next().count(\"\\n\")"
498 | ]
499 | },
500 | {
501 | "cell_type": "code",
502 | "execution_count": 24,
503 | "metadata": {
504 | "collapsed": false
505 | },
506 | "outputs": [],
507 | "source": [
508 | "# Note:\n",
509 | "# copy command is not yet available"
510 | ]
511 | },
512 | {
513 | "cell_type": "code",
514 | "execution_count": 25,
515 | "metadata": {
516 | "collapsed": false
517 | },
518 | "outputs": [
519 | {
520 | "data": {
521 | "text/plain": [
522 | "{'path': '/datasets/shakespeare_all.txt', 'result': True}"
523 | ]
524 | },
525 | "execution_count": 25,
526 | "metadata": {},
527 | "output_type": "execute_result"
528 | }
529 | ],
530 | "source": [
531 | "client.delete(['/datasets/shakespeare_all.txt']).next()"
532 | ]
533 | },
534 | {
535 | "cell_type": "code",
536 | "execution_count": 26,
537 | "metadata": {
538 | "collapsed": false
539 | },
540 | "outputs": [
541 | {
542 | "data": {
543 | "text/plain": [
544 | "{'error': '',\n",
545 | " 'path': '/tmp/hadoop_git_readme_2.txt',\n",
546 | " 'result': True,\n",
547 | " 'source_path': '/datasets/hadoop_git_readme.txt'}"
548 | ]
549 | },
550 | "execution_count": 26,
551 | "metadata": {},
552 | "output_type": "execute_result"
553 | }
554 | ],
555 | "source": [
556 | "(client\n",
557 | ".copyToLocal(['/datasets/hadoop_git_readme.txt'], \n",
558 | " '/tmp/hadoop_git_readme_2.txt')\n",
559 | ".next())"
560 | ]
561 | },
562 | {
563 | "cell_type": "code",
564 | "execution_count": 27,
565 | "metadata": {
566 | "collapsed": false
567 | },
568 | "outputs": [
569 | {
570 | "data": {
571 | "text/plain": [
572 | "[{'path': '/datasets_2', 'result': True}]"
573 | ]
574 | },
575 | "execution_count": 27,
576 | "metadata": {},
577 | "output_type": "execute_result"
578 | }
579 | ],
580 | "source": [
581 | "list(client.mkdir(['/datasets_2']))"
582 | ]
583 | },
584 | {
585 | "cell_type": "code",
586 | "execution_count": 28,
587 | "metadata": {
588 | "collapsed": false
589 | },
590 | "outputs": [
591 | {
592 | "data": {
593 | "text/plain": [
594 | "[{'path': '/datasets', 'result': True},\n",
595 | " {'path': '/datasets_2', 'result': True}]"
596 | ]
597 | },
598 | "execution_count": 28,
599 | "metadata": {},
600 | "output_type": "execute_result"
601 | }
602 | ],
603 | "source": [
604 | "list(client.delete(['/datasets*'], recurse=True))"
605 | ]
606 | },
607 | {
608 | "cell_type": "code",
609 | "execution_count": null,
610 | "metadata": {
611 | "collapsed": true
612 | },
613 | "outputs": [],
614 | "source": []
615 | }
616 | ],
617 | "metadata": {
618 | "kernelspec": {
619 | "display_name": "Python 2",
620 | "language": "python",
621 | "name": "python2"
622 | },
623 | "language_info": {
624 | "codemirror_mode": {
625 | "name": "ipython",
626 | "version": 2
627 | },
628 | "file_extension": ".py",
629 | "mimetype": "text/x-python",
630 | "name": "python",
631 | "nbconvert_exporter": "python",
632 | "pygments_lexer": "ipython2",
633 | "version": "2.7.6"
634 | }
635 | },
636 | "nbformat": 4,
637 | "nbformat_minor": 0
638 | }
639 |
--------------------------------------------------------------------------------
/Chapter 08/Chapter_8_code_MR.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 |     "Let's first insert some data into HDFS"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 1,
13 | "metadata": {
14 | "collapsed": false
15 | },
16 | "outputs": [
17 | {
18 | "name": "stdout",
19 | "output_type": "stream",
20 | "text": [
21 | "Found 2 items\r\n",
22 | "-rw-r--r-- 1 vagrant supergroup 1365 2016-05-10 19:58 /datasets/hadoop_git_readme.txt\r\n",
23 | "-rw-r--r-- 1 vagrant supergroup 5589889 2016-05-10 19:58 /datasets/shakespeare_all.txt\r\n"
24 | ]
25 | }
26 | ],
27 | "source": [
28 | "!hdfs dfs -mkdir -p /datasets\n",
29 | "!wget -q http://www.gutenberg.org/cache/epub/100/pg100.txt \\\n",
30 | " -O ../datasets/shakespeare_all.txt\n",
31 | "!hdfs dfs -put -f ../datasets/shakespeare_all.txt /datasets/shakespeare_all.txt\n",
32 | "!hdfs dfs -put -f ../datasets/hadoop_git_readme.txt /datasets/hadoop_git_readme.txt\n",
33 | "!hdfs dfs -ls /datasets"
34 | ]
35 | },
36 | {
37 | "cell_type": "markdown",
38 | "metadata": {},
39 | "source": [
40 | "## MR with Hadoop streaming"
41 | ]
42 | },
43 | {
44 | "cell_type": "code",
45 | "execution_count": 2,
46 | "metadata": {
47 | "collapsed": true
48 | },
49 | "outputs": [],
50 | "source": [
51 | "with open('mapper_hadoop.py', 'w') as fh:\n",
52 | " fh.write(\"\"\"#!/usr/bin/env python\n",
53 | "\n",
54 | "import sys\n",
55 | "\n",
56 | "for line in sys.stdin:\n",
57 | " print \"chars\", len(line.rstrip('\\\\n'))\n",
58 | " print \"words\", len(line.split())\n",
59 | " print \"lines\", 1\n",
60 | " \"\"\")\n",
61 | "\n",
62 | "\n",
63 | "with open('reducer_hadoop.py', 'w') as fh:\n",
64 | " fh.write(\"\"\"#!/usr/bin/env python\n",
65 | "\n",
66 | "import sys\n",
67 | "\n",
68 | "counts = {\"chars\": 0, \"words\":0, \"lines\":0}\n",
69 | "\n",
70 | "for line in sys.stdin:\n",
71 | " kv = line.rstrip().split()\n",
72 | " counts[kv[0]] += int(kv[1])\n",
73 | "\n",
74 | "for k,v in counts.items():\n",
75 | " print k, v\n",
76 | " \"\"\") "
77 | ]
78 | },
79 | {
80 | "cell_type": "code",
81 | "execution_count": 3,
82 | "metadata": {
83 | "collapsed": true
84 | },
85 | "outputs": [],
86 | "source": [
87 | "!chmod a+x *_hadoop.py"
88 | ]
89 | },
90 | {
91 | "cell_type": "code",
92 | "execution_count": 4,
93 | "metadata": {
94 | "collapsed": false
95 | },
96 | "outputs": [
97 | {
98 | "name": "stdout",
99 | "output_type": "stream",
100 | "text": [
101 | "chars 1335\r\n",
102 | "lines 31\r\n",
103 | "words 179\r\n"
104 | ]
105 | }
106 | ],
107 | "source": [
108 | "!cat ../datasets/hadoop_git_readme.txt | ./mapper_hadoop.py | sort -k1,1 | ./reducer_hadoop.py"
109 | ]
110 | },
111 | {
112 | "cell_type": "code",
113 | "execution_count": 5,
114 | "metadata": {
115 | "collapsed": false
116 | },
117 | "outputs": [
118 | {
119 | "name": "stdout",
120 | "output_type": "stream",
121 | "text": [
122 | "16/05/10 19:58:48 INFO fs.TrashPolicyDefault: Namenode trash configuration: Deletion interval = 0 minutes, Emptier interval = 0 minutes.\n",
123 | "Deleted /tmp/mr.out\n",
124 | "packageJobJar: [/tmp/hadoop-unjar5384590696382062055/] [] /tmp/streamjob1965588122940844531.jar tmpDir=null\n",
125 | "16/05/10 19:58:50 INFO client.RMProxy: Connecting to ResourceManager at /0.0.0.0:8032\n",
126 | "16/05/10 19:58:51 INFO client.RMProxy: Connecting to ResourceManager at /0.0.0.0:8032\n",
127 | "16/05/10 19:58:51 INFO mapred.FileInputFormat: Total input paths to process : 1\n",
128 | "16/05/10 19:58:51 INFO mapreduce.JobSubmitter: number of splits:2\n",
129 | "16/05/10 19:58:52 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1462906052477_0019\n",
130 | "16/05/10 19:58:52 INFO impl.YarnClientImpl: Submitted application application_1462906052477_0019\n",
131 | "16/05/10 19:58:52 INFO mapreduce.Job: The url to track the job: http://sparkbox:8088/proxy/application_1462906052477_0019/\n",
132 | "16/05/10 19:58:52 INFO mapreduce.Job: Running job: job_1462906052477_0019\n",
133 | "16/05/10 19:58:58 INFO mapreduce.Job: Job job_1462906052477_0019 running in uber mode : false\n",
134 | "16/05/10 19:58:58 INFO mapreduce.Job: map 0% reduce 0%\n",
135 | "16/05/10 19:59:03 INFO mapreduce.Job: map 50% reduce 0%\n",
136 | "16/05/10 19:59:08 INFO mapreduce.Job: map 100% reduce 0%\n",
137 | "16/05/10 19:59:14 INFO mapreduce.Job: map 100% reduce 100%\n",
138 | "16/05/10 19:59:14 INFO mapreduce.Job: Job job_1462906052477_0019 completed successfully\n",
139 | "16/05/10 19:59:14 INFO mapreduce.Job: Counters: 49\n",
140 | "\tFile System Counters\n",
141 | "\t\tFILE: Number of bytes read=1060\n",
142 | "\t\tFILE: Number of bytes written=332854\n",
143 | "\t\tFILE: Number of read operations=0\n",
144 | "\t\tFILE: Number of large read operations=0\n",
145 | "\t\tFILE: Number of write operations=0\n",
146 | "\t\tHDFS: Number of bytes read=2256\n",
147 | "\t\tHDFS: Number of bytes written=33\n",
148 | "\t\tHDFS: Number of read operations=9\n",
149 | "\t\tHDFS: Number of large read operations=0\n",
150 | "\t\tHDFS: Number of write operations=2\n",
151 | "\tJob Counters \n",
152 | "\t\tLaunched map tasks=2\n",
153 | "\t\tLaunched reduce tasks=1\n",
154 | "\t\tData-local map tasks=2\n",
155 | "\t\tTotal time spent by all maps in occupied slots (ms)=6732\n",
156 | "\t\tTotal time spent by all reduces in occupied slots (ms)=3739\n",
157 | "\t\tTotal time spent by all map tasks (ms)=6732\n",
158 | "\t\tTotal time spent by all reduce tasks (ms)=3739\n",
159 | "\t\tTotal vcore-milliseconds taken by all map tasks=6732\n",
160 | "\t\tTotal vcore-milliseconds taken by all reduce tasks=3739\n",
161 | "\t\tTotal megabyte-milliseconds taken by all map tasks=6893568\n",
162 | "\t\tTotal megabyte-milliseconds taken by all reduce tasks=3828736\n",
163 | "\tMap-Reduce Framework\n",
164 | "\t\tMap input records=31\n",
165 | "\t\tMap output records=93\n",
166 | "\t\tMap output bytes=868\n",
167 | "\t\tMap output materialized bytes=1066\n",
168 | "\t\tInput split bytes=208\n",
169 | "\t\tCombine input records=0\n",
170 | "\t\tCombine output records=0\n",
171 | "\t\tReduce input groups=23\n",
172 | "\t\tReduce shuffle bytes=1066\n",
173 | "\t\tReduce input records=93\n",
174 | "\t\tReduce output records=3\n",
175 | "\t\tSpilled Records=186\n",
176 | "\t\tShuffled Maps =2\n",
177 | "\t\tFailed Shuffles=0\n",
178 | "\t\tMerged Map outputs=2\n",
179 | "\t\tGC time elapsed (ms)=78\n",
180 | "\t\tCPU time spent (ms)=1830\n",
181 | "\t\tPhysical memory (bytes) snapshot=699170816\n",
182 | "\t\tVirtual memory (bytes) snapshot=2495647744\n",
183 | "\t\tTotal committed heap usage (bytes)=512229376\n",
184 | "\tShuffle Errors\n",
185 | "\t\tBAD_ID=0\n",
186 | "\t\tCONNECTION=0\n",
187 | "\t\tIO_ERROR=0\n",
188 | "\t\tWRONG_LENGTH=0\n",
189 | "\t\tWRONG_MAP=0\n",
190 | "\t\tWRONG_REDUCE=0\n",
191 | "\tFile Input Format Counters \n",
192 | "\t\tBytes Read=2048\n",
193 | "\tFile Output Format Counters \n",
194 | "\t\tBytes Written=33\n",
195 | "16/05/10 19:59:14 INFO streaming.StreamJob: Output directory: /tmp/mr.out\n"
196 | ]
197 | }
198 | ],
199 | "source": [
200 | "!hdfs dfs -mkdir -p /tmp\n",
201 | "!hdfs dfs -rm -f -r /tmp/mr.out\n",
202 | "\n",
203 | "!hadoop jar /usr/local/hadoop/share/hadoop/tools/lib/hadoop-streaming-2.6.4.jar \\\n",
204 | "-files mapper_hadoop.py,reducer_hadoop.py \\\n",
205 | "-mapper mapper_hadoop.py -reducer reducer_hadoop.py \\\n",
206 | "-input /datasets/hadoop_git_readme.txt -output /tmp/mr.out\n",
207 | "\n"
208 | ]
209 | },
210 | {
211 | "cell_type": "code",
212 | "execution_count": 6,
213 | "metadata": {
214 | "collapsed": false
215 | },
216 | "outputs": [
217 | {
218 | "name": "stdout",
219 | "output_type": "stream",
220 | "text": [
221 | "Found 2 items\r\n",
222 | "-rw-r--r-- 1 vagrant supergroup 0 2016-05-10 19:59 /tmp/mr.out/_SUCCESS\r\n",
223 | "-rw-r--r-- 1 vagrant supergroup 33 2016-05-10 19:59 /tmp/mr.out/part-00000\r\n"
224 | ]
225 | }
226 | ],
227 | "source": [
228 | "!hdfs dfs -ls /tmp/mr.out"
229 | ]
230 | },
231 | {
232 | "cell_type": "code",
233 | "execution_count": 7,
234 | "metadata": {
235 | "collapsed": false
236 | },
237 | "outputs": [
238 | {
239 | "name": "stdout",
240 | "output_type": "stream",
241 | "text": [
242 | "chars 1335\t\r\n",
243 | "lines 31\t\r\n",
244 | "words 179\t\r\n"
245 | ]
246 | }
247 | ],
248 | "source": [
249 | "!hdfs dfs -cat /tmp/mr.out/part-00000"
250 | ]
251 | },
252 | {
253 | "cell_type": "code",
254 | "execution_count": null,
255 | "metadata": {
256 | "collapsed": true
257 | },
258 | "outputs": [],
259 | "source": []
260 | },
261 | {
262 | "cell_type": "markdown",
263 | "metadata": {},
264 | "source": [
265 | "## MR with Python MrJob library"
266 | ]
267 | },
268 | {
269 | "cell_type": "code",
270 | "execution_count": 8,
271 | "metadata": {
272 | "collapsed": true
273 | },
274 | "outputs": [],
275 | "source": [
276 | "with open(\"MrJob_job1.py\", \"w\") as fh:\n",
277 | " fh.write(\"\"\"\n",
278 | "from mrjob.job import MRJob\n",
279 | "\n",
280 | "\n",
281 | "class MRWordFrequencyCount(MRJob):\n",
282 | "\n",
283 | " def mapper(self, _, line):\n",
284 | " yield \"chars\", len(line)\n",
285 | " yield \"words\", len(line.split())\n",
286 | " yield \"lines\", 1\n",
287 | "\n",
288 | " def reducer(self, key, values):\n",
289 | " yield key, sum(values)\n",
290 | "\n",
291 | "\n",
292 | "if __name__ == '__main__':\n",
293 | " MRWordFrequencyCount.run() \n",
294 | " \"\"\")"
295 | ]
296 | },
297 | {
298 | "cell_type": "code",
299 | "execution_count": 9,
300 | "metadata": {
301 | "collapsed": false
302 | },
303 | "outputs": [
304 | {
305 | "name": "stdout",
306 | "output_type": "stream",
307 | "text": [
308 | "No configs found; falling back on auto-configuration\r\n",
309 | "Creating temp directory /tmp/MrJob_job1.vagrant.20160510.195920.590984\r\n",
310 | "Running step 1 of 1...\r\n",
311 | "Streaming final output from /tmp/MrJob_job1.vagrant.20160510.195920.590984/output...\r\n",
312 | "\"chars\"\t1335\r\n",
313 | "\"lines\"\t31\r\n",
314 | "\"words\"\t179\r\n",
315 | "Removing temp directory /tmp/MrJob_job1.vagrant.20160510.195920.590984...\r\n"
316 | ]
317 | }
318 | ],
319 | "source": [
320 | "!python MrJob_job1.py ../datasets/hadoop_git_readme.txt"
321 | ]
322 | },
323 | {
324 | "cell_type": "code",
325 | "execution_count": 10,
326 | "metadata": {
327 | "collapsed": false
328 | },
329 | "outputs": [
330 | {
331 | "name": "stdout",
332 | "output_type": "stream",
333 | "text": [
334 | "No configs found; falling back on auto-configuration\n",
335 | "Looking for hadoop binary in /usr/local/hadoop/bin...\n",
336 | "Found hadoop binary: /usr/local/hadoop/bin/hadoop\n",
337 | "Creating temp directory /tmp/MrJob_job1.vagrant.20160510.195920.870616\n",
338 | "Using Hadoop version 2.6.4\n",
339 | "Copying local files to hdfs:///user/vagrant/tmp/mrjob/MrJob_job1.vagrant.20160510.195920.870616/files/...\n",
340 | "Looking for Hadoop streaming jar in /usr/local/hadoop...\n",
341 | "Found Hadoop streaming jar: /usr/local/hadoop/share/hadoop/tools/lib/hadoop-streaming-2.6.4.jar\n",
342 | "Running step 1 of 1...\n",
343 | " packageJobJar: [/tmp/hadoop-unjar7634308048659876233/] [] /tmp/streamjob5879999650692493094.jar tmpDir=null\n",
344 | " Connecting to ResourceManager at /0.0.0.0:8032\n",
345 | " Connecting to ResourceManager at /0.0.0.0:8032\n",
346 | " Total input paths to process : 1\n",
347 | " number of splits:2\n",
348 | " Submitting tokens for job: job_1462906052477_0020\n",
349 | " Submitted application application_1462906052477_0020\n",
350 | " The url to track the job: http://sparkbox:8088/proxy/application_1462906052477_0020/\n",
351 | " Running job: job_1462906052477_0020\n",
352 | " Job job_1462906052477_0020 running in uber mode : false\n",
353 | " map 0% reduce 0%\n",
354 | " map 50% reduce 0%\n",
355 | " map 100% reduce 0%\n",
356 | " map 100% reduce 100%\n",
357 | " Job job_1462906052477_0020 completed successfully\n",
358 | " Output directory: hdfs:///user/vagrant/tmp/mrjob/MrJob_job1.vagrant.20160510.195920.870616/output\n",
359 | "Counters: 50\n",
360 | "\tFile Input Format Counters \n",
361 | "\t\tBytes Read=2048\n",
362 | "\tFile Output Format Counters \n",
363 | "\t\tBytes Written=36\n",
364 | "\tFile System Counters\n",
365 | "\t\tFILE: Number of bytes read=1153\n",
366 | "\t\tFILE: Number of bytes written=337717\n",
367 | "\t\tFILE: Number of large read operations=0\n",
368 | "\t\tFILE: Number of read operations=0\n",
369 | "\t\tFILE: Number of write operations=0\n",
370 | "\t\tHDFS: Number of bytes read=2256\n",
371 | "\t\tHDFS: Number of bytes written=36\n",
372 | "\t\tHDFS: Number of large read operations=0\n",
373 | "\t\tHDFS: Number of read operations=9\n",
374 | "\t\tHDFS: Number of write operations=2\n",
375 | "\tJob Counters \n",
376 | "\t\tData-local map tasks=2\n",
377 | "\t\tKilled map tasks=1\n",
378 | "\t\tLaunched map tasks=2\n",
379 | "\t\tLaunched reduce tasks=1\n",
380 | "\t\tTotal megabyte-milliseconds taken by all map tasks=7394304\n",
381 | "\t\tTotal megabyte-milliseconds taken by all reduce tasks=3846144\n",
382 | "\t\tTotal time spent by all map tasks (ms)=7221\n",
383 | "\t\tTotal time spent by all maps in occupied slots (ms)=7221\n",
384 | "\t\tTotal time spent by all reduce tasks (ms)=3756\n",
385 | "\t\tTotal time spent by all reduces in occupied slots (ms)=3756\n",
386 | "\t\tTotal vcore-milliseconds taken by all map tasks=7221\n",
387 | "\t\tTotal vcore-milliseconds taken by all reduce tasks=3756\n",
388 | "\tMap-Reduce Framework\n",
389 | "\t\tCPU time spent (ms)=1830\n",
390 | "\t\tCombine input records=0\n",
391 | "\t\tCombine output records=0\n",
392 | "\t\tFailed Shuffles=0\n",
393 | "\t\tGC time elapsed (ms)=66\n",
394 | "\t\tInput split bytes=208\n",
395 | "\t\tMap input records=31\n",
396 | "\t\tMap output bytes=961\n",
397 | "\t\tMap output materialized bytes=1159\n",
398 | "\t\tMap output records=93\n",
399 | "\t\tMerged Map outputs=2\n",
400 | "\t\tPhysical memory (bytes) snapshot=726175744\n",
401 | "\t\tReduce input groups=3\n",
402 | "\t\tReduce input records=93\n",
403 | "\t\tReduce output records=3\n",
404 | "\t\tReduce shuffle bytes=1159\n",
405 | "\t\tShuffled Maps =2\n",
406 | "\t\tSpilled Records=186\n",
407 | "\t\tTotal committed heap usage (bytes)=515899392\n",
408 | "\t\tVirtual memory (bytes) snapshot=2496479232\n",
409 | "\tShuffle Errors\n",
410 | "\t\tBAD_ID=0\n",
411 | "\t\tCONNECTION=0\n",
412 | "\t\tIO_ERROR=0\n",
413 | "\t\tWRONG_LENGTH=0\n",
414 | "\t\tWRONG_MAP=0\n",
415 | "\t\tWRONG_REDUCE=0\n",
416 | "Streaming final output from hdfs:///user/vagrant/tmp/mrjob/MrJob_job1.vagrant.20160510.195920.870616/output...\n",
417 | "\"chars\"\t1335\n",
418 | "\"lines\"\t31\n",
419 | "\"words\"\t179\n",
420 | "Removing HDFS temp directory hdfs:///user/vagrant/tmp/mrjob/MrJob_job1.vagrant.20160510.195920.870616...\n",
421 | "Removing temp directory /tmp/MrJob_job1.vagrant.20160510.195920.870616...\n"
422 | ]
423 | }
424 | ],
425 | "source": [
426 | "!python MrJob_job1.py -r hadoop hdfs:///datasets/hadoop_git_readme.txt"
427 | ]
428 | },
429 | {
430 | "cell_type": "code",
431 | "execution_count": 11,
432 | "metadata": {
433 | "collapsed": true
434 | },
435 | "outputs": [],
436 | "source": [
437 | "with open(\"MrJob_job2.py\", \"w\") as fh:\n",
438 | " fh.write(\"\"\"\n",
439 | "from mrjob.job import MRJob\n",
440 | "from mrjob.step import MRStep\n",
441 | "import re\n",
442 | "\n",
443 | "WORD_RE = re.compile(r\"[\\w']+\")\n",
444 | "\n",
445 | "\n",
446 | "class MRMostUsedWord(MRJob):\n",
447 | "\n",
448 | " def steps(self):\n",
449 | " return [\n",
450 | " MRStep(mapper=self.mapper_get_words,\n",
451 | " reducer=self.reducer_count_words),\n",
452 | " MRStep(mapper=self.mapper_word_count_one_key,\n",
453 | " reducer=self.reducer_find_max_word)\n",
454 | " ]\n",
455 | "\n",
456 | " def mapper_get_words(self, _, line):\n",
457 | " # yield each word in the line\n",
458 | " for word in WORD_RE.findall(line):\n",
459 | " yield (word.lower(), 1)\n",
460 | "\n",
461 | " def reducer_count_words(self, word, counts):\n",
462 | "        # sum the occurrences of each word\n",
463 | " yield (word, sum(counts))\n",
464 | " \n",
465 | " def mapper_word_count_one_key(self, word, counts):\n",
466 | "        # send all the (count, word) tuples to the same reducer\n",
467 | " yield None, (counts, word)\n",
468 | "\n",
469 | " def reducer_find_max_word(self, _, count_word_pairs):\n",
470 | "        # each item of count_word_pairs is a (count, word) tuple\n",
471 | " yield max(count_word_pairs)\n",
472 | "\n",
473 | "\n",
474 | "if __name__ == '__main__':\n",
475 | " MRMostUsedWord.run()\n",
476 | "\"\"\")"
477 | ]
478 | },
479 | {
480 | "cell_type": "code",
481 | "execution_count": 12,
482 | "metadata": {
483 | "collapsed": false
484 | },
485 | "outputs": [
486 | {
487 | "name": "stdout",
488 | "output_type": "stream",
489 | "text": [
490 | "27801\t\"the\"\r\n"
491 | ]
492 | }
493 | ],
494 | "source": [
495 | "# This time is running on a big dataset\n",
496 | "!python MrJob_job2.py --quiet ../datasets/shakespeare_all.txt"
497 | ]
498 | },
499 | {
500 | "cell_type": "code",
501 | "execution_count": 13,
502 | "metadata": {
503 | "collapsed": false
504 | },
505 | "outputs": [
506 | {
507 | "name": "stdout",
508 | "output_type": "stream",
509 | "text": [
510 | "27801\t\"the\"\r\n"
511 | ]
512 | }
513 | ],
514 | "source": [
515 | "!python MrJob_job2.py -r hadoop --quiet hdfs:///datasets/shakespeare_all.txt"
516 | ]
517 | },
518 | {
519 | "cell_type": "code",
520 | "execution_count": null,
521 | "metadata": {
522 | "collapsed": true
523 | },
524 | "outputs": [],
525 | "source": []
526 | }
527 | ],
528 | "metadata": {
529 | "kernelspec": {
530 | "display_name": "Python 2",
531 | "language": "python",
532 | "name": "python2"
533 | },
534 | "language_info": {
535 | "codemirror_mode": {
536 | "name": "ipython",
537 | "version": 2
538 | },
539 | "file_extension": ".py",
540 | "mimetype": "text/x-python",
541 | "name": "python",
542 | "nbconvert_exporter": "python",
543 | "pygments_lexer": "ipython2",
544 | "version": "2.7.6"
545 | }
546 | },
547 | "nbformat": 4,
548 | "nbformat_minor": 0
549 | }
550 |
--------------------------------------------------------------------------------
/Chapter 09/Chapter_9_code_01.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "## Sharing data within the cluster"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "##### Read-only variables (broadcast)"
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": 1,
20 | "metadata": {
21 | "collapsed": false
22 | },
23 | "outputs": [],
24 | "source": [
25 | "# Example: let's encode the gender found in the demographic data\n",
26 | "# as a one-hot encoding. Note: the mapping should be the same\n",
27 | "# on every machine in the cluster, requiring a shared mapping\n",
28 | "\n",
29 | "one_hot_encoding = {\"M\": (1, 0, 0),\n",
30 | " \"F\": (0, 1, 0),\n",
31 | " \"U\": (0, 0, 1)\n",
32 | " }"
33 | ]
34 | },
35 | {
36 | "cell_type": "code",
37 | "execution_count": 2,
38 | "metadata": {
39 | "collapsed": false
40 | },
41 | "outputs": [
42 | {
43 | "data": {
44 | "text/plain": [
45 | "[(1, 0, 0), (0, 1, 0), (0, 0, 1), (0, 1, 0), (1, 0, 0), (0, 0, 1)]"
46 | ]
47 | },
48 | "execution_count": 2,
49 | "metadata": {},
50 | "output_type": "execute_result"
51 | }
52 | ],
53 | "source": [
54 | "# Gender one-hot-encoding\n",
55 | "(sc.parallelize([\"M\", \"F\", \"U\", \"F\", \"M\", \"U\"])\n",
56 | " .map(lambda x: one_hot_encoding[x])\n",
57 | " .collect())\n",
58 | "\n",
59 | "# The command above works only in the single node configuration\n",
60 | "# since the variable \"one_hot_encoding\" is defined only on this machine\n",
61 | "# On a multi-node cluster, it will raise a Java error"
62 | ]
63 | },
64 | {
65 | "cell_type": "code",
66 | "execution_count": 3,
67 | "metadata": {
68 | "collapsed": false
69 | },
70 | "outputs": [
71 | {
72 | "data": {
73 | "text/plain": [
74 | "[(1, 0, 0), (0, 1, 0), (0, 0, 1), (0, 1, 0), (1, 0, 0), (0, 0, 1)]"
75 | ]
76 | },
77 | "execution_count": 3,
78 | "metadata": {},
79 | "output_type": "execute_result"
80 | }
81 | ],
82 | "source": [
83 | "# Solution 1: include the encoding map in the .map() function \n",
84 | "# In this way, all the nodes will see it\n",
85 | "\n",
86 | "def map_ohe(x):\n",
87 | " ohe = {\"M\": (1, 0, 0),\n",
88 | " \"F\": (0, 1, 0),\n",
89 | " \"U\": (0, 0, 1)\n",
90 | " }\n",
91 | " return ohe[x]\n",
92 | "\n",
93 | "sc.parallelize([\"M\", \"F\", \"U\", \"F\", \"M\", \"U\"]).map(map_ohe).collect()\n",
94 | "\n"
95 | ]
96 | },
97 | {
98 | "cell_type": "code",
99 | "execution_count": 4,
100 | "metadata": {
101 | "collapsed": false
102 | },
103 | "outputs": [
104 | {
105 | "data": {
106 | "text/plain": [
107 | "[(1, 0, 0), (0, 1, 0), (0, 0, 1), (0, 1, 0), (1, 0, 0), (0, 0, 1)]"
108 | ]
109 | },
110 | "execution_count": 4,
111 | "metadata": {},
112 | "output_type": "execute_result"
113 | }
114 | ],
115 | "source": [
116 | "# Solution 2: broadcast the map to all the nodes.\n",
117 | "# All of them will be able to read it (read-only)\n",
118 | "\n",
119 | "bcast_map = sc.broadcast(one_hot_encoding)\n",
120 | "\n",
121 | "def bcast_map_ohe(x, shared_ohe):\n",
122 | " return shared_ohe[x]\n",
123 | "\n",
124 | "(sc.parallelize([\"M\", \"F\", \"U\", \"F\", \"M\", \"U\"])\n",
125 | " .map(lambda x: bcast_map_ohe(x, bcast_map.value))\n",
126 | " .collect())"
127 | ]
128 | },
129 | {
130 | "cell_type": "code",
131 | "execution_count": 5,
132 | "metadata": {
133 | "collapsed": true
134 | },
135 | "outputs": [],
136 | "source": [
137 | "bcast_map.unpersist()"
138 | ]
139 | },
140 | {
141 | "cell_type": "markdown",
142 | "metadata": {},
143 | "source": [
144 |     "##### Write-only variables (accumulators)"
145 | ]
146 | },
147 | {
148 | "cell_type": "code",
149 | "execution_count": 6,
150 | "metadata": {
151 | "collapsed": false
152 | },
153 | "outputs": [
154 | {
155 | "name": "stdout",
156 | "output_type": "stream",
157 | "text": [
158 | "The number of empty lines is:\n"
159 | ]
160 | },
161 | {
162 | "data": {
163 | "text/plain": [
164 | "6"
165 | ]
166 | },
167 | "execution_count": 6,
168 | "metadata": {},
169 | "output_type": "execute_result"
170 | }
171 | ],
172 | "source": [
173 | "# Let's count the empty lines in a file\n",
174 | "\n",
175 | "print \"The number of empty lines is:\"\n",
176 | "\n",
177 | "(sc.textFile('file:///home/vagrant/datasets/hadoop_git_readme.txt')\n",
178 | " .filter(lambda line: len(line) == 0)\n",
179 | " .count())"
180 | ]
181 | },
182 | {
183 | "cell_type": "code",
184 | "execution_count": 7,
185 | "metadata": {
186 | "collapsed": false
187 | },
188 | "outputs": [
189 | {
190 | "name": "stdout",
191 | "output_type": "stream",
192 | "text": [
193 | "In the file there are 31 lines\n",
194 | "And 6 lines are empty\n"
195 | ]
196 | }
197 | ],
198 | "source": [
199 | "# Let's count the lines in a file, and at the same time,\n",
200 | "# count the empty ones\n",
201 | "\n",
202 | "accum = sc.accumulator(0)\n",
203 | "\n",
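204 | "# Workers can only add() to the accumulator; its value can be read\n",
205 | "# back only on the driver, through accum.value\n",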
204 | "def split_line(line): \n",
205 | " if len(line) == 0:\n",
206 | " accum.add(1)\n",
207 | " return 1\n",
208 | "\n",
209 | "tot_lines = (\n",
210 | " sc.textFile('file:///home/vagrant/datasets/hadoop_git_readme.txt')\n",
211 | " .map(split_line)\n",
212 | " .count())\n",
213 | "\n",
214 | "empty_lines = accum.value\n",
215 | "\n",
216 | "\n",
217 | "print \"In the file there are %d lines\" % tot_lines\n",
218 | "print \"And %d lines are empty\" % empty_lines"
219 | ]
220 | },
221 | {
222 | "cell_type": "markdown",
223 | "metadata": {},
224 | "source": [
225 | "# Real-world example with broadcast and accumulator\n",
226 | "### Train multiple classifiers and select the best one, accumulating the errors"
227 | ]
228 | },
229 | {
230 | "cell_type": "code",
231 | "execution_count": 8,
232 | "metadata": {
233 | "collapsed": true
234 | },
235 | "outputs": [],
236 | "source": [
237 | "# step 1: load the dataset\n",
238 | "# note: if the dataset is large, you should read the next section\n",
239 | "\n",
240 | "from sklearn.datasets import load_iris\n",
241 | "\n",
242 | "bcast_dataset = sc.broadcast(load_iris())"
243 | ]
244 | },
245 | {
246 | "cell_type": "code",
247 | "execution_count": 9,
248 | "metadata": {
249 | "collapsed": false
250 | },
251 | "outputs": [],
252 | "source": [
253 | "# step 2: create an accumulator that stores the errors in a list\n",
254 | "\n",
255 | "from pyspark import AccumulatorParam\n",
256 | "\n",
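257 | "# A custom AccumulatorParam must define zero() (the initial value)\n",
258 | "# and addInPlace() (how two partial results are merged together)\n",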
257 | "class ErrorAccumulator(AccumulatorParam):\n",
258 | " def zero(self, initialList):\n",
259 | " return initialList\n",
260 | "\n",
261 | " def addInPlace(self, v1, v2):\n",
262 | " if not isinstance(v1, list):\n",
263 | " v1 = [v1]\n",
264 | " if not isinstance(v2, list):\n",
265 | " v2 = [v2]\n",
266 | " return v1 + v2\n",
267 | "\n",
268 | "errAccum = sc.accumulator([], ErrorAccumulator())"
269 | ]
270 | },
271 | {
272 | "cell_type": "code",
273 | "execution_count": 10,
274 | "metadata": {
275 | "collapsed": true
276 | },
277 | "outputs": [],
278 | "source": [
279 | "# step 3: create mappers: each of them will use a classifier\n",
280 | "\n",
281 | "def apply_classifier(clf, dataset):\n",
282 | " \n",
283 | " clf_name = clf.__class__.__name__\n",
284 | " X = dataset.value.data\n",
285 | " y = dataset.value.target\n",
286 | " \n",
287 | " try:\n",
288 | " from sklearn.metrics import accuracy_score\n",
289 | " \n",
290 | " clf.fit(X, y)\n",
291 | " y_pred = clf.predict(X)\n",
292 | " acc = accuracy_score(y, y_pred)\n",
293 | "\n",
294 | " return [(clf_name, acc)]\n",
295 | "\n",
296 | " except Exception as e:\n",
297 | " errAccum.add((clf_name, str(e)))\n",
298 | " return []\n"
299 | ]
300 | },
301 | {
302 | "cell_type": "code",
303 | "execution_count": 11,
304 | "metadata": {
305 | "collapsed": false
306 | },
307 | "outputs": [
308 | {
309 | "data": {
310 | "text/plain": [
311 | "[('DummyClassifier', 0.33333333333333331),\n",
312 | " ('SGDClassifier', 0.66666666666666663)]"
313 | ]
314 | },
315 | "execution_count": 11,
316 | "metadata": {},
317 | "output_type": "execute_result"
318 | }
319 | ],
320 | "source": [
321 | "from sklearn.linear_model import SGDClassifier\n",
322 | "from sklearn.dummy import DummyClassifier\n",
323 | "from sklearn.decomposition import PCA\n",
324 | "from sklearn.manifold import MDS\n",
325 | "\n",
326 | "classifiers = [DummyClassifier('most_frequent'), \n",
327 | " SGDClassifier(), \n",
328 | " PCA(), \n",
329 | " MDS()]\n",
330 | "\n",
331 | "(sc.parallelize(classifiers)\n",
332 | " .flatMap(lambda x: apply_classifier(x, bcast_dataset))\n",
333 | " .collect())"
334 | ]
335 | },
336 | {
337 | "cell_type": "code",
338 | "execution_count": 12,
339 | "metadata": {
340 | "collapsed": false
341 | },
342 | "outputs": [
343 | {
344 | "name": "stdout",
345 | "output_type": "stream",
346 | "text": [
347 | "The errors are:\n"
348 | ]
349 | },
350 | {
351 | "data": {
352 | "text/plain": [
353 | "[('PCA', \"'PCA' object has no attribute 'predict'\"),\n",
354 | " ('MDS',\n",
355 | " \"Proximity must be 'precomputed' or 'euclidean'. Got euclidean instead\")]"
356 | ]
357 | },
358 | "execution_count": 12,
359 | "metadata": {},
360 | "output_type": "execute_result"
361 | }
362 | ],
363 | "source": [
364 | "print \"The errors are:\"\n",
365 | "errAccum.value"
366 | ]
367 | },
368 | {
369 | "cell_type": "code",
370 | "execution_count": 13,
371 | "metadata": {
372 | "collapsed": false
373 | },
374 | "outputs": [],
375 | "source": [
376 | "bcast_dataset.unpersist()"
377 | ]
378 | },
379 | {
380 | "cell_type": "markdown",
381 | "metadata": {},
382 | "source": [
383 | "# Load the data"
384 | ]
385 | },
386 | {
387 | "cell_type": "code",
388 | "execution_count": 14,
389 | "metadata": {
390 | "collapsed": true
391 | },
392 | "outputs": [],
393 | "source": [
394 | "from pyspark.sql import SQLContext\n",
395 | "sqlContext = SQLContext(sc)"
396 | ]
397 | },
398 | {
399 | "cell_type": "code",
400 | "execution_count": 15,
401 | "metadata": {
402 | "collapsed": false
403 | },
404 | "outputs": [
405 | {
406 | "name": "stdout",
407 | "output_type": "stream",
408 | "text": [
409 | "{\"user_id\":0, \"balance\": 10.0}\r\n",
410 | "{\"user_id\":1, \"gender\":\"M\", \"balance\": 1.0}\r\n",
411 | "{\"user_id\":2, \"gender\":\"F\", \"balance\": -0.5}\r\n",
412 | "{\"user_id\":3, \"gender\":\"F\", \"balance\": 0.0}\r\n",
413 | "{\"user_id\":4, \"balance\": 5.0}\r\n",
414 | "{\"user_id\":5, \"gender\":\"M\", \"balance\": 3.0}"
415 | ]
416 | }
417 | ],
418 | "source": [
419 | "!cat /home/vagrant/datasets/users.json"
420 | ]
421 | },
422 | {
423 | "cell_type": "code",
424 | "execution_count": 16,
425 | "metadata": {
426 | "collapsed": false
427 | },
428 | "outputs": [
429 | {
430 | "name": "stdout",
431 | "output_type": "stream",
432 | "text": [
433 | "+-------+------+-------+\n",
434 | "|balance|gender|user_id|\n",
435 | "+-------+------+-------+\n",
436 | "| 10.0| null| 0|\n",
437 | "| 1.0| M| 1|\n",
438 | "| -0.5| F| 2|\n",
439 | "| 0.0| F| 3|\n",
440 | "| 5.0| null| 4|\n",
441 | "| 3.0| M| 5|\n",
442 | "+-------+------+-------+\n",
443 | "\n"
444 | ]
445 | }
446 | ],
447 | "source": [
448 | "df = sqlContext.read.json(\"file:///home/vagrant/datasets/users.json\")\n",
449 | "df.show()"
450 | ]
451 | },
452 | {
453 | "cell_type": "code",
454 | "execution_count": 17,
455 | "metadata": {
456 | "collapsed": false
457 | },
458 | "outputs": [
459 | {
460 | "name": "stdout",
461 | "output_type": "stream",
462 | "text": [
463 | "root\n",
464 | " |-- balance: double (nullable = true)\n",
465 | " |-- gender: string (nullable = true)\n",
466 | " |-- user_id: long (nullable = true)\n",
467 | "\n"
468 | ]
469 | }
470 | ],
471 | "source": [
472 | "df.printSchema()"
473 | ]
474 | },
475 | {
476 | "cell_type": "code",
477 | "execution_count": 18,
478 | "metadata": {
479 | "collapsed": false
480 | },
481 | "outputs": [
482 | {
483 | "name": "stdout",
484 | "output_type": "stream",
485 | "text": [
486 | "+-------+------+-------+\n",
487 | "|balance|gender|user_id|\n",
488 | "+-------+------+-------+\n",
489 | "| 1.0| M| 1|\n",
490 | "| 3.0| M| 5|\n",
491 | "+-------+------+-------+\n",
492 | "\n"
493 | ]
494 | }
495 | ],
496 | "source": [
497 | "(df.filter(df['gender'] != 'null')\n",
498 | " .filter(df['balance'] > 0)\n",
499 | " .select(['balance', 'gender', 'user_id'])\n",
500 | " .show())"
501 | ]
502 | },
503 | {
504 | "cell_type": "code",
505 | "execution_count": 19,
506 | "metadata": {
507 | "collapsed": false
508 | },
509 | "outputs": [
510 | {
511 | "name": "stdout",
512 | "output_type": "stream",
513 | "text": [
514 | "+-------+------+-------+\n",
515 | "|balance|gender|user_id|\n",
516 | "+-------+------+-------+\n",
517 | "| 1.0| M| 1|\n",
518 | "| 3.0| M| 5|\n",
519 | "+-------+------+-------+\n",
520 | "\n"
521 | ]
522 | }
523 | ],
524 | "source": [
525 | "(df.filter('gender is not null')\n",
526 | " .filter('balance > 0').select(\"*\").show())"
527 | ]
528 | },
529 | {
530 | "cell_type": "code",
531 | "execution_count": 20,
532 | "metadata": {
533 | "collapsed": false
534 | },
535 | "outputs": [
536 | {
537 | "name": "stdout",
538 | "output_type": "stream",
539 | "text": [
540 | "+-------+------+-------+\n",
541 | "|balance|gender|user_id|\n",
542 | "+-------+------+-------+\n",
543 | "| 1.0| M| 1|\n",
544 | "| 3.0| M| 5|\n",
545 | "+-------+------+-------+\n",
546 | "\n"
547 | ]
548 | }
549 | ],
550 | "source": [
551 | "df.filter('gender is not null and balance > 0').show()"
552 | ]
553 | },
554 | {
555 | "cell_type": "code",
556 | "execution_count": 21,
557 | "metadata": {
558 | "collapsed": false
559 | },
560 | "outputs": [
561 | {
562 | "name": "stdout",
563 | "output_type": "stream",
564 | "text": [
565 | "+-------+------+-------+\n",
566 | "|balance|gender|user_id|\n",
567 | "+-------+------+-------+\n",
568 | "| 1.0| M| 1|\n",
569 | "| -0.5| F| 2|\n",
570 | "| 0.0| F| 3|\n",
571 | "| 3.0| M| 5|\n",
572 | "+-------+------+-------+\n",
573 | "\n"
574 | ]
575 | }
576 | ],
577 | "source": [
578 | "df.na.drop().show()"
579 | ]
580 | },
581 | {
582 | "cell_type": "code",
583 | "execution_count": 22,
584 | "metadata": {
585 | "collapsed": false
586 | },
587 | "outputs": [
588 | {
589 | "name": "stdout",
590 | "output_type": "stream",
591 | "text": [
592 | "+-------+------+-------+\n",
593 | "|balance|gender|user_id|\n",
594 | "+-------+------+-------+\n",
595 | "| 1.0| M| 1|\n",
596 | "| -0.5| F| 2|\n",
597 | "| 0.0| F| 3|\n",
598 | "| 3.0| M| 5|\n",
599 | "+-------+------+-------+\n",
600 | "\n"
601 | ]
602 | }
603 | ],
604 | "source": [
605 | "df.na.drop(subset=[\"gender\"]).show()"
606 | ]
607 | },
608 | {
609 | "cell_type": "code",
610 | "execution_count": 23,
611 | "metadata": {
612 | "collapsed": false
613 | },
614 | "outputs": [
615 | {
616 | "name": "stdout",
617 | "output_type": "stream",
618 | "text": [
619 | "+-------+------+-------+\n",
620 | "|balance|gender|user_id|\n",
621 | "+-------+------+-------+\n",
622 | "| 10.0| U| 0|\n",
623 | "| 1.0| M| 1|\n",
624 | "| -0.5| F| 2|\n",
625 | "| 0.0| F| 3|\n",
626 | "| 5.0| U| 4|\n",
627 | "| 3.0| M| 5|\n",
628 | "+-------+------+-------+\n",
629 | "\n"
630 | ]
631 | }
632 | ],
633 | "source": [
634 | "df.na.fill({'gender': \"U\", 'balance': 0.0}).show()"
635 | ]
636 | },
637 | {
638 | "cell_type": "code",
639 | "execution_count": 24,
640 | "metadata": {
641 | "collapsed": false
642 | },
643 | "outputs": [
644 | {
645 | "name": "stdout",
646 | "output_type": "stream",
647 | "text": [
648 | "+------+------------+\n",
649 | "|gender|avg(balance)|\n",
650 | "+------+------------+\n",
651 | "| F| -0.25|\n",
652 | "| M| 2.0|\n",
653 | "| U| 7.5|\n",
654 | "+------+------------+\n",
655 | "\n"
656 | ]
657 | }
658 | ],
659 | "source": [
660 | "(df.na.fill({'gender': \"U\", 'balance': 0.0})\n",
661 | " .groupBy(\"gender\").avg('balance').show())"
662 | ]
663 | },
664 | {
665 | "cell_type": "code",
666 | "execution_count": 25,
667 | "metadata": {
668 | "collapsed": true
669 | },
670 | "outputs": [],
671 | "source": [
672 | "df.registerTempTable(\"users\")"
673 | ]
674 | },
675 | {
676 | "cell_type": "code",
677 | "execution_count": 26,
678 | "metadata": {
679 | "collapsed": false
680 | },
681 | "outputs": [
682 | {
683 | "name": "stdout",
684 | "output_type": "stream",
685 | "text": [
686 | "+------+-----+\n",
687 | "|gender| _c1|\n",
688 | "+------+-----+\n",
689 | "| F|-0.25|\n",
690 | "| M| 2.0|\n",
691 | "+------+-----+\n",
692 | "\n"
693 | ]
694 | }
695 | ],
696 | "source": [
697 | "sqlContext.sql(\"\"\"\n",
698 | " SELECT gender, AVG(balance) \n",
699 | " FROM users \n",
700 | " WHERE gender IS NOT NULL \n",
701 | " GROUP BY gender\"\"\").show()"
702 | ]
703 | },
704 | {
705 | "cell_type": "code",
706 | "execution_count": 27,
707 | "metadata": {
708 | "collapsed": false
709 | },
710 | "outputs": [
711 | {
712 | "data": {
713 | "text/plain": [
714 | "pyspark.sql.dataframe.DataFrame"
715 | ]
716 | },
717 | "execution_count": 27,
718 | "metadata": {},
719 | "output_type": "execute_result"
720 | }
721 | ],
722 | "source": [
723 | "type(sqlContext.table(\"users\"))"
724 | ]
725 | },
726 | {
727 | "cell_type": "code",
728 | "execution_count": 28,
729 | "metadata": {
730 | "collapsed": false
731 | },
732 | "outputs": [
733 | {
734 | "data": {
735 | "text/plain": [
736 | "[Row(balance=10.0, gender=None, user_id=0),\n",
737 | " Row(balance=1.0, gender=u'M', user_id=1),\n",
738 | " Row(balance=-0.5, gender=u'F', user_id=2),\n",
739 | " Row(balance=0.0, gender=u'F', user_id=3),\n",
740 | " Row(balance=5.0, gender=None, user_id=4),\n",
741 | " Row(balance=3.0, gender=u'M', user_id=5)]"
742 | ]
743 | },
744 | "execution_count": 28,
745 | "metadata": {},
746 | "output_type": "execute_result"
747 | }
748 | ],
749 | "source": [
750 | "sqlContext.table(\"users\").collect()"
751 | ]
752 | },
753 | {
754 | "cell_type": "code",
755 | "execution_count": 29,
756 | "metadata": {
757 | "collapsed": false
758 | },
759 | "outputs": [
760 | {
761 | "data": {
762 | "text/plain": [
763 | "Row(balance=10.0, gender=None, user_id=0)"
764 | ]
765 | },
766 | "execution_count": 29,
767 | "metadata": {},
768 | "output_type": "execute_result"
769 | }
770 | ],
771 | "source": [
772 | "a_row = sqlContext.sql(\"SELECT * FROM users\").first()\n",
773 | "a_row"
774 | ]
775 | },
776 | {
777 | "cell_type": "code",
778 | "execution_count": 30,
779 | "metadata": {
780 | "collapsed": false
781 | },
782 | "outputs": [
783 | {
784 | "name": "stdout",
785 | "output_type": "stream",
786 | "text": [
787 | "10.0\n",
788 | "10.0\n"
789 | ]
790 | }
791 | ],
792 | "source": [
793 | "print a_row['balance']\n",
794 | "print a_row.balance"
795 | ]
796 | },
797 | {
798 | "cell_type": "code",
799 | "execution_count": 31,
800 | "metadata": {
801 | "collapsed": false
802 | },
803 | "outputs": [
804 | {
805 | "data": {
806 | "text/plain": [
807 | "{'balance': 10.0, 'gender': None, 'user_id': 0}"
808 | ]
809 | },
810 | "execution_count": 31,
811 | "metadata": {},
812 | "output_type": "execute_result"
813 | }
814 | ],
815 | "source": [
816 | "a_row.asDict()"
817 | ]
818 | },
819 | {
820 | "cell_type": "code",
821 | "execution_count": 32,
822 | "metadata": {
823 | "collapsed": true
824 | },
825 | "outputs": [],
826 | "source": [
827 | "!rm -rf /tmp/complete_users*"
828 | ]
829 | },
830 | {
831 | "cell_type": "code",
832 | "execution_count": 33,
833 | "metadata": {
834 | "collapsed": false
835 | },
836 | "outputs": [],
837 | "source": [
838 | "(df.na.drop().write\n",
839 | " .save(\"file:///tmp/complete_users.json\", format='json'))"
840 | ]
841 | },
842 | {
843 | "cell_type": "code",
844 | "execution_count": 34,
845 | "metadata": {
846 | "collapsed": false
847 | },
848 | "outputs": [
849 | {
850 | "name": "stdout",
851 | "output_type": "stream",
852 | "text": [
853 | "total 28\r\n",
854 | "4 drwxrwxr-x 2 vagrant vagrant 4096 May 10 20:36 .\r\n",
855 | "4 drwxrwxrwt 22 root root 4096 May 10 20:36 ..\r\n",
856 | "4 -rw-r--r-- 1 vagrant vagrant 83 May 10 20:36 part-r-00000-f5728f74-10d9-4c7a-8865-64cb80c7ca0a\r\n",
857 | "4 -rw-rw-r-- 1 vagrant vagrant 12 May 10 20:36 .part-r-00000-f5728f74-10d9-4c7a-8865-64cb80c7ca0a.crc\r\n",
858 | "4 -rw-r--r-- 1 vagrant vagrant 82 May 10 20:36 part-r-00001-f5728f74-10d9-4c7a-8865-64cb80c7ca0a\r\n",
859 | "4 -rw-rw-r-- 1 vagrant vagrant 12 May 10 20:36 .part-r-00001-f5728f74-10d9-4c7a-8865-64cb80c7ca0a.crc\r\n",
860 | "0 -rw-r--r-- 1 vagrant vagrant 0 May 10 20:36 _SUCCESS\r\n",
861 | "4 -rw-rw-r-- 1 vagrant vagrant 8 May 10 20:36 ._SUCCESS.crc\r\n"
862 | ]
863 | }
864 | ],
865 | "source": [
866 | "!ls -als /tmp/complete_users.json"
867 | ]
868 | },
869 | {
870 | "cell_type": "code",
871 | "execution_count": 35,
872 | "metadata": {
873 | "collapsed": false
874 | },
875 | "outputs": [
876 | {
877 | "name": "stdout",
878 | "output_type": "stream",
879 | "text": [
880 | "+-------+------+-------+\n",
881 | "|balance|gender|user_id|\n",
882 | "+-------+------+-------+\n",
883 | "| 0.0| F| 3|\n",
884 | "| 3.0| M| 5|\n",
885 | "| 1.0| M| 1|\n",
886 | "| -0.5| F| 2|\n",
887 | "+-------+------+-------+\n",
888 | "\n"
889 | ]
890 | }
891 | ],
892 | "source": [
893 | "sqlContext.sql(\n",
894 | " \"SELECT * FROM json.`file:///tmp/complete_users.json`\").show()"
895 | ]
896 | },
897 | {
898 | "cell_type": "code",
899 | "execution_count": 36,
900 | "metadata": {
901 | "collapsed": true
902 | },
903 | "outputs": [],
904 | "source": [
905 | "df.na.drop().write.save(\n",
906 | " \"file:///tmp/complete_users.parquet\", format='parquet')"
907 | ]
908 | },
909 | {
910 | "cell_type": "code",
911 | "execution_count": 37,
912 | "metadata": {
913 | "collapsed": false
914 | },
915 | "outputs": [
916 | {
917 | "name": "stdout",
918 | "output_type": "stream",
919 | "text": [
920 | "total 44\r\n",
921 | "4 drwxrwxr-x 2 vagrant vagrant 4096 May 10 20:36 .\r\n",
922 | "4 drwxrwxrwt 23 root root 4096 May 10 20:36 ..\r\n",
923 | "4 -rw-r--r-- 1 vagrant vagrant 376 May 10 20:36 _common_metadata\r\n",
924 | "4 -rw-rw-r-- 1 vagrant vagrant 12 May 10 20:36 ._common_metadata.crc\r\n",
925 | "4 -rw-r--r-- 1 vagrant vagrant 1082 May 10 20:36 _metadata\r\n",
926 | "4 -rw-rw-r-- 1 vagrant vagrant 20 May 10 20:36 ._metadata.crc\r\n",
927 | "4 -rw-r--r-- 1 vagrant vagrant 750 May 10 20:36 part-r-00000-810195c2-ffa9-4a54-add7-61e6a7c92095.gz.parquet\r\n",
928 | "4 -rw-rw-r-- 1 vagrant vagrant 16 May 10 20:36 .part-r-00000-810195c2-ffa9-4a54-add7-61e6a7c92095.gz.parquet.crc\r\n",
929 | "4 -rw-r--r-- 1 vagrant vagrant 746 May 10 20:36 part-r-00001-810195c2-ffa9-4a54-add7-61e6a7c92095.gz.parquet\r\n",
930 | "4 -rw-rw-r-- 1 vagrant vagrant 16 May 10 20:36 .part-r-00001-810195c2-ffa9-4a54-add7-61e6a7c92095.gz.parquet.crc\r\n",
931 | "0 -rw-r--r-- 1 vagrant vagrant 0 May 10 20:36 _SUCCESS\r\n",
932 | "4 -rw-rw-r-- 1 vagrant vagrant 8 May 10 20:36 ._SUCCESS.crc\r\n"
933 | ]
934 | }
935 | ],
936 | "source": [
937 | "!ls -als /tmp/complete_users.parquet/"
938 | ]
939 | },
940 | {
941 | "cell_type": "code",
942 | "execution_count": 38,
943 | "metadata": {
944 | "collapsed": false
945 | },
946 | "outputs": [],
947 | "source": [
948 | "from pyspark.sql import Row\n",
949 | "\n",
950 | "rdd_gender = \\\n",
951 | " sc.parallelize([Row(short_gender=\"M\", long_gender=\"Male\"),\n",
952 | " Row(short_gender=\"F\", long_gender=\"Female\")])\n",
953 | "\n",
954 | "(sqlContext.createDataFrame(rdd_gender)\n",
955 | " .registerTempTable(\"gender_maps\"))"
956 | ]
957 | },
958 | {
959 | "cell_type": "code",
960 | "execution_count": 39,
961 | "metadata": {
962 | "collapsed": false
963 | },
964 | "outputs": [
965 | {
966 | "name": "stdout",
967 | "output_type": "stream",
968 | "text": [
969 | "+-----------+------------+\n",
970 | "|long_gender|short_gender|\n",
971 | "+-----------+------------+\n",
972 | "| Male| M|\n",
973 | "| Female| F|\n",
974 | "+-----------+------------+\n",
975 | "\n"
976 | ]
977 | }
978 | ],
979 | "source": [
980 | "sqlContext.table(\"gender_maps\").show()"
981 | ]
982 | },
983 | {
984 | "cell_type": "code",
985 | "execution_count": 40,
986 | "metadata": {
987 | "collapsed": false
988 | },
989 | "outputs": [
990 | {
991 | "name": "stdout",
992 | "output_type": "stream",
993 | "text": [
994 | "+-------+-----------+-------+\n",
995 | "|balance|long_gender|user_id|\n",
996 | "+-------+-----------+-------+\n",
997 | "| 1.0| Male| 1|\n",
998 | "| 3.0| Male| 5|\n",
999 | "| -0.5| Female| 2|\n",
1000 | "| 0.0| Female| 3|\n",
1001 | "+-------+-----------+-------+\n",
1002 | "\n"
1003 | ]
1004 | }
1005 | ],
1006 | "source": [
1007 | "sqlContext.sql(\"\"\"\n",
1008 | " SELECT balance, long_gender, user_id \n",
1009 | " FROM parquet.`file:///tmp/complete_users.parquet` \n",
1010 | " JOIN gender_maps ON gender=short_gender\"\"\").show()"
1011 | ]
1012 | },
1013 | {
1014 | "cell_type": "code",
1015 | "execution_count": 41,
1016 | "metadata": {
1017 | "collapsed": false
1018 | },
1019 | "outputs": [
1020 | {
1021 | "data": {
1022 | "text/plain": [
1023 | "[u'gender_maps', u'users']"
1024 | ]
1025 | },
1026 | "execution_count": 41,
1027 | "metadata": {},
1028 | "output_type": "execute_result"
1029 | }
1030 | ],
1031 | "source": [
1032 | "sqlContext.tableNames()"
1033 | ]
1034 | },
1035 | {
1036 | "cell_type": "code",
1037 | "execution_count": 42,
1038 | "metadata": {
1039 | "collapsed": true
1040 | },
1041 | "outputs": [],
1042 | "source": [
1043 | "for table in sqlContext.tableNames():\n",
1044 | " sqlContext.dropTempTable(table)"
1045 | ]
1046 | },
1047 | {
1048 | "cell_type": "code",
1049 | "execution_count": null,
1050 | "metadata": {
1051 | "collapsed": true
1052 | },
1053 | "outputs": [],
1054 | "source": []
1055 | }
1056 | ],
1057 | "metadata": {
1058 | "kernelspec": {
1059 | "display_name": "Python 2",
1060 | "language": "python",
1061 | "name": "python2"
1062 | },
1063 | "language_info": {
1064 | "codemirror_mode": {
1065 | "name": "ipython",
1066 | "version": 2
1067 | },
1068 | "file_extension": ".py",
1069 | "mimetype": "text/x-python",
1070 | "name": "python",
1071 | "nbconvert_exporter": "python",
1072 | "pygments_lexer": "ipython2",
1073 | "version": "2.7.6"
1074 | }
1075 | },
1076 | "nbformat": 4,
1077 | "nbformat_minor": 0
1078 | }
1079 |
--------------------------------------------------------------------------------
/Chapter 03/Chapter_3_code.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Datasets to experiment with yourself"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 1,
13 | "metadata": {
14 | "collapsed": true
15 | },
16 | "outputs": [],
17 | "source": [
18 | "import urllib2 # import urllib.request as urllib2 in Python3\n",
19 | "import requests, io, os, StringIO\n",
20 | "import numpy as np\n",
21 | "import tarfile, zipfile, gzip\n",
22 | "\n",
23 | "\n",
24 | "def unzip_from_UCI(UCI_url, dest=''):\n",
25 | " \"\"\"\n",
26 | " Downloads and unpacks datasets from UCI in zip format\n",
27 | " \"\"\"\n",
28 | " response = requests.get(UCI_url)\n",
29 | " compressed_file = io.BytesIO(response.content)\n",
30 | " z = zipfile.ZipFile(compressed_file)\n",
31 | " print ('Extracting in %s' % os.getcwd()+'\\\\'+dest)\n",
32 | " for name in z.namelist():\n",
33 | " if '.csv' in name:\n",
34 | " print ('\\tunzipping %s' %name)\n",
35 | " z.extract(name, path=os.getcwd()+'\\\\'+dest)\n",
36 | "\n",
37 | "def gzip_from_UCI(UCI_url, dest=''):\n",
38 | " \"\"\"\n",
39 | " Downloads and unpacks datasets from UCI in gzip format\n",
40 | " \"\"\"\n",
41 | " response = urllib2.urlopen(UCI_url)\n",
42 | " compressed_file = io.BytesIO(response.read())\n",
43 | " decompressed_file = gzip.GzipFile(fileobj=compressed_file)\n",
44 | " filename = UCI_url.split('/')[-1][:-3]\n",
45 | " with open(os.getcwd()+'\\\\'+filename, 'wb') as outfile:\n",
46 | " outfile.write(decompressed_file.read())\n",
47 | " print ('File %s decompressed' % filename)\n",
48 | " \n",
49 | "def targzip_from_UCI(UCI_url, dest='.'):\n",
50 | " \"\"\"\n",
51 | " Downloads and unpacks datasets from UCI in tar.gz format\n",
52 | " \"\"\"\n",
53 | " response = urllib2.urlopen(UCI_url)\n",
54 | " compressed_file = StringIO.StringIO(response.read())\n",
55 | " tar = tarfile.open(mode=\"r:gz\", fileobj = compressed_file)\n",
56 | " tar.extractall(path=dest)\n",
57 | " datasets = tar.getnames()\n",
58 | " for dataset in datasets:\n",
59 | " size = os.path.getsize(dest+'\\\\'+dataset)\n",
60 | " print ('File %s is %i bytes' % (dataset,size))\n",
61 | " tar.close()\n",
62 | "\n",
63 | "def load_matrix(UCI_url):\n",
64 | " \"\"\"\n",
65 | " Downloads datasets from UCI in matrix form\n",
66 | " \"\"\"\n",
67 | " return np.loadtxt(urllib2.urlopen(UCI_url))"
68 | ]
69 | },
70 | {
71 | "cell_type": "code",
72 | "execution_count": 2,
73 | "metadata": {
74 | "collapsed": false
75 | },
76 | "outputs": [
77 | {
78 | "name": "stdout",
79 | "output_type": "stream",
80 | "text": [
81 | "Current directory is: \"C:\\scisoft\\WinPython-64bit-2.7.9.4\\notebooks\\Packt - Large Scale\"\n"
82 | ]
83 | }
84 | ],
85 | "source": [
86 | "import os\n",
87 | "print \"Current directory is: \\\"%s\\\"\" % (os.getcwd())"
88 | ]
89 | },
90 | {
91 | "cell_type": "code",
92 | "execution_count": 3,
93 | "metadata": {
94 | "collapsed": true
95 | },
96 | "outputs": [],
97 | "source": [
98 | "import zlib\n",
99 | "from random import shuffle, seed\n",
100 | "\n",
101 | "def ram_shuffle(filename_in, filename_out, header=True, random_seed=0):\n",
102 | " with open(filename_in, 'rb') as f:\n",
103 | " zlines = [zlib.compress(line, 9) for line in f]\n",
104 | " if header:\n",
105 | " first_row = zlines.pop(0)\n",
106 | " seed(random_seed)\n",
107 | " shuffle(zlines)\n",
108 | " with open(filename_out, 'wb') as f:\n",
109 | " if header:\n",
110 | " f.write(zlib.decompress(first_row))\n",
111 | " for zline in zlines:\n",
112 | " f.write(zlib.decompress(zline))"
113 | ]
114 | },
115 | {
116 | "cell_type": "markdown",
117 | "metadata": {},
118 | "source": [
119 | "###Bike Sharing Data Set"
120 | ]
121 | },
122 | {
123 | "cell_type": "code",
124 | "execution_count": 8,
125 | "metadata": {
126 | "collapsed": false
127 | },
128 | "outputs": [
129 | {
130 | "name": "stdout",
131 | "output_type": "stream",
132 | "text": [
133 | "Extracting in C:\\scisoft\\WinPython-64bit-2.7.9.4\\notebooks\\Packt - Large Scale\\bikesharing\n",
134 | "\tunzipping day.csv\n",
135 | "\tunzipping hour.csv\n"
136 | ]
137 | }
138 | ],
139 | "source": [
140 | "UCI_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00275/Bike-Sharing-Dataset.zip'\n",
141 | "unzip_from_UCI(UCI_url, dest='bikesharing')"
142 | ]
143 | },
144 | {
145 | "cell_type": "markdown",
146 | "metadata": {},
147 | "source": [
148 | "###Covertype Data Set "
149 | ]
150 | },
151 | {
152 | "cell_type": "code",
153 | "execution_count": 10,
154 | "metadata": {
155 | "collapsed": false
156 | },
157 | "outputs": [
158 | {
159 | "name": "stdout",
160 | "output_type": "stream",
161 | "text": [
162 | "File covtype.data decompressed\n"
163 | ]
164 | }
165 | ],
166 | "source": [
167 | "UCI_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/covtype/covtype.data.gz'\n",
168 | "gzip_from_UCI(UCI_url)"
169 | ]
170 | },
171 | {
172 | "cell_type": "code",
173 | "execution_count": 3,
174 | "metadata": {
175 | "collapsed": true
176 | },
177 | "outputs": [],
178 | "source": [
179 | "import os\n",
180 | "from random import seed\n",
181 | "local_path = os.getcwd()\n",
182 | "source = 'covtype.data'\n",
183 | "ram_shuffle(filename_in=local_path+'\\\\'+source, \\\n",
184 | " filename_out=local_path+'\\\\shuffled_covtype.data', header=False)"
185 | ]
186 | },
187 | {
188 | "cell_type": "markdown",
189 | "metadata": {},
190 | "source": [
191 | "#Non-linear & faster with Vowpal Wabbit "
192 | ]
193 | },
194 | {
195 | "cell_type": "markdown",
196 | "metadata": {},
197 | "source": [
198 | "###Useful functions"
199 | ]
200 | },
201 | {
202 | "cell_type": "code",
203 | "execution_count": 1,
204 | "metadata": {
205 | "collapsed": true
206 | },
207 | "outputs": [],
208 | "source": [
209 | "import numpy as np\n",
210 | "\n",
211 | "def sigmoid(x):\n",
212 | " return 1. / (1. + np.exp(-x))\n",
213 | "\n",
214 | "def apply_log(x): \n",
215 | " return np.log(x + 1.0)\n",
216 | "\n",
217 | "def apply_exp(x): \n",
218 | " return np.exp(x) - 1.0"
219 | ]
220 | },
221 | {
222 | "cell_type": "markdown",
223 | "metadata": {},
224 | "source": [
225 | "###Useful dataset examples"
226 | ]
227 | },
228 | {
229 | "cell_type": "code",
230 | "execution_count": 37,
231 | "metadata": {
232 | "collapsed": false
233 | },
234 | "outputs": [
235 | {
236 | "name": "stdout",
237 | "output_type": "stream",
238 | "text": [
239 | "0 | price:.23 sqft:.25 age:.05 2006\n",
240 | "1 2 'second_house | price:.18 sqft:.15 age:.35 1976\n",
241 | "0 1 0.5 'third_house | price:.53 sqft:.32 age:.87 1924\n"
242 | ]
243 | }
244 | ],
245 | "source": [
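246 | "# Each VW example has the format:\n",
247 | "# label [importance] [base] ['tag] | feature:value ...\n",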
246 | "with open('house_dataset','wb') as W:\n",
247 | " W.write(\"0 | price:.23 sqft:.25 age:.05 2006\\n\")\n",
248 | " W.write(\"1 2 'second_house | price:.18 sqft:.15 age:.35 1976\\n\")\n",
249 | " W.write(\"0 1 0.5 'third_house | price:.53 sqft:.32 age:.87 1924\\n\")\n",
250 | "\n",
251 | "with open('house_dataset','rb') as R:\n",
252 | " for line in R:\n",
253 | " print line.strip()"
254 | ]
255 | },
256 | {
257 | "cell_type": "markdown",
258 | "metadata": {},
259 | "source": [
260 | "###A way to call VW from Python"
261 | ]
262 | },
263 | {
264 | "cell_type": "code",
265 | "execution_count": 2,
266 | "metadata": {
267 | "collapsed": false
268 | },
269 | "outputs": [
270 | {
271 | "name": "stdout",
272 | "output_type": "stream",
273 | "text": [
274 | "Num weight bits = 18\n",
275 | "learning rate = 0.5\n",
276 | "initial_t = 0\n",
277 | "power_t = 0.5\n",
278 | "using no cache\n",
279 | "Reading datafile = house_dataset\n",
280 | "num sources = 1\n",
281 | "average since example example current current current\n",
282 | "loss last counter weight label predict features\n",
283 | "0.000000 0.000000 1 1.0 0.0000 0.0000 5\n",
284 | "0.666667 1.000000 2 3.0 1.0000 0.0000 5\n",
285 | "\n",
286 | "finished run\n",
287 | "number of examples per pass = 3\n",
288 | "passes used = 1\n",
289 | "weighted example sum = 4.000000\n",
290 | "weighted label sum = 2.000000\n",
291 | "average loss = 0.750000\n",
292 | "best constant = 0.500000\n",
293 | "best constant's loss = 0.250000\n",
294 | "total feature number = 15\n",
295 | "------------ COMPLETED ------------\n",
296 | "\n"
297 | ]
298 | }
299 | ],
300 | "source": [
301 | "import subprocess\n",
302 | "\n",
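303 | "# VW writes its progress report to stderr, so we capture that stream\n",
304 | "# and print it line by line while the process runs\n",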
303 | "def execute_vw(parameters):\n",
304 | " execution = subprocess.Popen('vw '+parameters, shell=True, stderr=subprocess.PIPE)\n",
305 | " line = \"\"\n",
306 | " history = \"\"\n",
307 | " while True:\n",
308 | " out = execution.stderr.read(1)\n",
309 | " history += out\n",
310 | "        if out == '' and execution.poll() is not None:\n",
311 | " print '------------ COMPLETED ------------\\n'\n",
312 | " break\n",
313 | " if out != '':\n",
314 | " line += out\n",
315 | " if '\\n' in line[-2:]:\n",
316 | " print line[:-2]\n",
317 | " line = ''\n",
318 | " return history.split('\\r\\n')\n",
319 | "\n",
320 | "\n",
321 | "params = \"house_dataset\"\n",
322 | "results = execute_vw(params)"
323 | ]
324 | },
325 | {
326 | "cell_type": "markdown",
327 | "metadata": {},
328 | "source": [
329 | "###Processing examples"
330 | ]
331 | },
332 | {
333 | "cell_type": "code",
334 | "execution_count": 2,
335 | "metadata": {
336 | "collapsed": true
337 | },
338 | "outputs": [],
339 | "source": [
340 | "import csv\n",
341 | "\n",
342 | "def vw_convert(origin_file, target_file, binary_features, numeric_features, target, transform_target=lambda(x):x,\n",
343 | " separator=',', classification=True, multiclass=False, fieldnames= None, header=True, sparse=True):\n",
344 | " \"\"\"\n",
345 | "    Reads a csv file and writes it out, row by row, as a VW-format file\n",
346 | " \n",
347 | " Parameters\n",
348 | "    ----------\n",
349 | "    origin_file = the csv file you take the data from\n",
350 | "    target_file = the VW-format file the converted data is written to\n",
351 | " binary_features = the list of qualitative features to consider\n",
352 | " numeric_features = the list of numeric features to consider\n",
353 | " target = the label of the response variable\n",
354 | " transform_target = a function transforming the response\n",
355 | " separator = the field separator character\n",
356 | " classification = a Boolean indicating if it is classification\n",
357 | " multiclass = a Boolean indicating if it is multiclass classification\n",
358 | "    fieldnames = the fields' labels (can be omitted and read from the file)\n",
359 | "    header = a boolean indicating if the original file has a header\n",
360 | "    sparse = if True, zero-valued numeric features are omitted from the output\n",
361 | " \"\"\"\n",
362 | " with open(target_file, 'wb') as W:\n",
363 | " with open(origin_file, 'rb') as R:\n",
364 | " iterator = csv.DictReader(R, fieldnames, delimiter=separator)\n",
365 | " for n, row in enumerate(iterator):\n",
366 | " if not header or n>0:\n",
367 | " # DATA PROCESSING\n",
368 | " response = transform_target(float(row[target]))\n",
369 | " if classification and not multiclass:\n",
370 | " if response == 0:\n",
371 | " stream_row = '-1 '\n",
372 | " else:\n",
373 | " stream_row = '1 '\n",
374 | " else:\n",
375 | " stream_row = str(response)+' '\n",
376 | " quantitative = list()\n",
377 | " qualitative = list()\n",
378 | " for k,v in row.iteritems():\n",
379 | " if k in binary_features:\n",
380 | " qualitative.append(str(k)+'_'+str(v)+':1')\n",
381 | " else:\n",
382 | " if k in numeric_features and (float(v)!=0 or not sparse):\n",
383 | " quantitative.append(str(k)+':'+str(v))\n",
384 | " if quantitative:\n",
385 | " stream_row += '|n '+' '.join(quantitative)\n",
386 | " if qualitative:\n",
387 | " stream_row += '|q ' + ' '.join(qualitative)\n",
388 | " W.write(stream_row+'\\n')"
389 | ]
390 | },
391 | {
392 | "cell_type": "markdown",
393 | "metadata": {},
394 | "source": [
395 | "###Examples with toy datasets"
396 | ]
397 | },
398 | {
399 | "cell_type": "code",
400 | "execution_count": 210,
401 | "metadata": {
402 | "collapsed": true
403 | },
404 | "outputs": [],
405 | "source": [
406 | "import numpy as np\n",
407 | "from sklearn.datasets import load_iris, load_boston\n",
408 | "from random import seed\n",
409 | "iris = load_iris()\n",
410 | "seed(2)\n",
411 | "re_order = np.random.permutation(len(iris.target))\n",
412 | "with open('iris_versicolor.vw','wb') as W1:\n",
413 | " for k in re_order:\n",
414 | " y = iris.target[k]\n",
415 | "        X = iris.data[k,:]\n",
416 | " features = ' |f '+' '.join([a+':'+str(b) for a,b in zip(map(lambda(a): a[:-5].replace(' ','_'), iris.feature_names),X)])\n",
417 | " target = '1' if y==1 else '-1'\n",
418 | " W1.write(target+features+'\\n')"
419 | ]
420 | },
421 | {
422 | "cell_type": "code",
423 | "execution_count": 203,
424 | "metadata": {
425 | "collapsed": false
426 | },
427 | "outputs": [],
428 | "source": [
429 | "boston = load_boston()\n",
430 | "seed(2)\n",
431 | "re_order = np.random.permutation(len(boston.target))\n",
432 | "with open('boston.vw','wb') as W1:\n",
433 | " for k in re_order:\n",
434 | " y = boston.target[k]\n",
435 | " X = boston.data[k,:]\n",
436 | "        features = ' |f '+' '.join([a+':'+str(b) for a,b in zip(boston.feature_names, X)])\n",
437 | " W1.write(str(y)+features+'\\n')"
438 | ]
439 | },
440 | {
441 | "cell_type": "markdown",
442 | "metadata": {},
443 | "source": [
444 | "###Binary Iris"
445 | ]
446 | },
447 | {
448 | "cell_type": "code",
449 | "execution_count": 197,
450 | "metadata": {
451 | "collapsed": false
452 | },
453 | "outputs": [
454 | {
455 | "name": "stdout",
456 | "output_type": "stream",
457 | "text": [
458 | "using l2 regularization = 1e-006\n",
459 | "predictions = iris_bin.test\n",
460 | "Lambda = 1e-006\n",
461 | "Kernel = rbf\n",
462 | "bandwidth = 0.1\n",
463 | "Num weight bits = 18\n",
464 | "learning rate = 0.5\n",
465 | "initial_t = 0\n",
466 | "power_t = 0.5\n",
467 | "using no cache\n",
468 | "Reading datafile = iris_versicolor.vw\n",
469 | "num sources = 1\n",
470 | "average since example example current current current\n",
471 | "loss last counter weight label predict features\n",
472 | "1.000000 1.000000 1 1.0 -1.0000 0.0000 5\n",
473 | "0.960606 0.921212 2 2.0 -1.0000 -0.0788 5\n",
474 | "1.030685 1.100763 4 4.0 -1.0000 -0.7865 5\n",
475 | "0.790707 0.550729 8 8.0 -1.0000 -0.3755 5\n",
476 | "0.647808 0.504909 16 16.0 -1.0000 -1.2473 5\n",
477 | "0.477695 0.307582 32 32.0 1.0000 0.8621 5\n",
478 | "0.319804 0.161914 64 64.0 -1.0000 -1.7015 5\n",
479 | "0.272695 0.225585 128 128.0 -1.0000 -1.3150 5\n",
480 | "\n",
481 | "finished run\n",
482 | "number of examples = 150\n",
483 | "weighted example sum = 150.000000\n",
484 | "weighted label sum = -50.000000\n",
485 | "average loss = 0.248892\n",
486 | "best constant = -0.333333\n",
487 | "best constant's loss = 0.888889\n",
488 | "total feature number = 750\n",
489 | "Num support = 49\n",
490 | "Number of kernel evaluations = 8836 Number of cache queries = 18555\n",
491 | "Total loss = 37.333748\n",
492 | "Done freeing model\n",
493 | "Done freeing kernel params\n",
494 | "Done with finish \n",
495 | "------------ COMPLETED ------------\n",
496 | "\n"
497 | ]
498 | }
499 | ],
500 | "source": [
501 | "params = '--ksvm --l2 0.000001 --reprocess 2 -b 18 --kernel rbf --bandwidth=0.1 -p iris_bin.test -d iris_versicolor.vw'\n",
502 | "results = execute_vw(params)"
503 | ]
504 | },
505 | {
506 | "cell_type": "code",
507 | "execution_count": 198,
508 | "metadata": {
509 | "collapsed": false
510 | },
511 | "outputs": [
512 | {
513 | "name": "stdout",
514 | "output_type": "stream",
515 | "text": [
516 | "holdout accuracy: 0.966\n"
517 | ]
518 | }
519 | ],
520 | "source": [
521 | "import numpy as np\n",
522 | "def sigmoid(x):\n",
523 | " return 1. / (1. + np.exp(-x))\n",
524 | "\n",
525 | "accuracy = 0\n",
526 | "with open('iris_bin.test', 'rb') as R:\n",
527 | " with open('iris_versicolor.vw', 'rb') as TRAIN:\n",
528 | " holdouts = 0.0\n",
529 | " for n,(line, example) in enumerate(zip(R,TRAIN)):\n",
530 | " if (n+1) % 10==0:\n",
531 | " predicted = float(line.strip())\n",
532 | " y = float(example.split('|')[0])\n",
533 | " accuracy += np.sign(predicted)==np.sign(y)\n",
534 | " holdouts += 1 \n",
535 | "print 'holdout accuracy: %0.3f' % (accuracy / holdouts)"
536 | ]
537 | },
538 | {
539 | "cell_type": "markdown",
540 | "metadata": {},
541 | "source": [
542 | "###Boston"
543 | ]
544 | },
545 | {
546 | "cell_type": "code",
547 | "execution_count": 211,
548 | "metadata": {
549 | "collapsed": false
550 | },
551 | "outputs": [
552 | {
553 | "name": "stdout",
554 | "output_type": "stream",
555 | "text": [
556 | "final_regressor = boston.model\n",
557 | "using dropout for neural network training\n",
558 | "Num weight bits = 18\n",
559 | "learning rate = 0.5\n",
560 | "initial_t = 0\n",
561 | "power_t = 0.5\n",
562 | "decay_learning_rate = 1\n",
563 | "creating cache_file = cache_train.vw\n",
564 | "Reading datafile = boston.vw\n",
565 | "num sources = 1\n",
566 | "average since example example current current current\n",
567 | "loss last counter weight label predict features\n",
568 | "2500.000000 2500.000000 1 1.0 50.0000 0.0000 4\n",
569 | "1570.433136 640.866272 2 2.0 26.4000 1.0847 3\n",
570 | "945.682968 320.932800 4 4.0 21.0000 3.4834 3\n",
571 | "738.617393 531.551817 8 8.0 35.4000 6.9177 4\n",
572 | "559.106543 379.595694 16 16.0 23.1000 6.6911 3\n",
573 | "362.538769 165.970995 32 32.0 16.7000 12.2397 3\n",
574 | "301.716126 240.893483 64 64.0 19.7000 12.3789 3\n",
575 | "236.351873 170.987621 128 128.0 16.1000 15.3972 3\n",
576 | "180.695258 125.038643 256 256.0 26.5000 24.0065 3\n",
577 | "99.536619 99.536619 512 512.0 28.7000 18.4439 3 h\n",
578 | "83.688702 67.840785 1024 1024.0 50.0000 20.8653 4 h\n",
579 | "72.301786 60.914870 2048 2048.0 10.4000 0.0000 3 h\n",
580 | "59.041621 45.840391 4096 4096.0 20.6000 21.1746 4 h\n",
581 | "\n",
582 | "finished run\n",
583 | "number of examples per pass = 456\n",
584 | "passes used = 10\n",
585 | "weighted example sum = 4560.000000\n",
586 | "weighted label sum = 103341.001506\n",
587 | "average loss = 43.299850 h\n",
588 | "best constant = 22.662500\n",
589 | "total feature number = 15220\n",
590 | "------------ COMPLETED ------------\n",
591 | "\n"
592 | ]
593 | }
594 | ],
595 | "source": [
596 | "params = 'boston.vw -f boston.model --loss_function squared -k --cache_file cache_train.vw --passes=20 --nn 5 --dropout'\n",
597 | "results = execute_vw(params)"
598 | ]
599 | },
600 | {
601 | "cell_type": "code",
602 | "execution_count": 212,
603 | "metadata": {
604 | "collapsed": false
605 | },
606 | "outputs": [
607 | {
608 | "name": "stdout",
609 | "output_type": "stream",
610 | "text": [
611 | "only testing\n",
612 | "predictions = boston.test\n",
613 | "using dropout for neural network testing\n",
614 | "Num weight bits = 18\n",
615 | "learning rate = 0.5\n",
616 | "initial_t = 0\n",
617 | "power_t = 0.5\n",
618 | "creating cache_file = cache_test.vw\n",
619 | "Reading datafile = boston.vw\n",
620 | "num sources = 1\n",
621 | "average since example example current current current\n",
622 | "loss last counter weight label predict features\n",
623 | "922.607483 922.607483 1 1.0 50.0000 19.6255 4\n",
624 | "464.302045 5.996608 2 2.0 26.4000 23.9512 3\n",
625 | "253.949617 43.597188 4 4.0 21.0000 21.2530 3\n",
626 | "175.713928 97.478239 8 8.0 35.4000 25.5958 4\n",
627 | "130.466937 85.219947 16 16.0 15.2000 15.8726 3\n",
628 | "79.291346 28.115755 32 32.0 15.6000 19.7057 4\n",
629 | "85.270478 91.249610 64 64.0 22.8000 20.4866 3\n",
630 | "83.265921 81.261364 128 128.0 20.8000 18.1267 3\n",
631 | "70.838572 58.411224 256 256.0 27.5000 16.6386 3\n",
632 | "\n",
633 | "finished run\n",
634 | "number of examples per pass = 506\n",
635 | "passes used = 1\n",
636 | "weighted example sum = 506.000000\n",
637 | "weighted label sum = 11401.600174\n",
638 | "average loss = 65.960779\n",
639 | "best constant = 22.532808\n",
640 | "total feature number = 1687\n",
641 | "------------ COMPLETED ------------\n",
642 | "\n"
643 | ]
644 | }
645 | ],
646 | "source": [
647 | "params = '-t boston.vw -i boston.model -k --cache_file cache_test.vw -p boston.test'\n",
648 | "results = execute_vw(params)"
649 | ]
650 | },
651 | {
652 | "cell_type": "code",
653 | "execution_count": 214,
654 | "metadata": {
655 | "collapsed": false
656 | },
657 | "outputs": [
658 | {
659 | "name": "stdout",
660 | "output_type": "stream",
661 | "text": [
662 | "holdout RMSE: 7.010\n"
663 | ]
664 | }
665 | ],
666 | "source": [
667 | "val_rmse = 0\n",
668 | "with open('boston.test', 'rb') as R:\n",
669 | " with open('boston.vw', 'rb') as TRAIN:\n",
670 | " holdouts = 0.0\n",
671 | " for n,(line, example) in enumerate(zip(R,TRAIN)):\n",
672 | " if (n+1) % 10==0:\n",
673 | " predicted = float(line.strip())\n",
674 | " y = float(example.split('|')[0])\n",
675 | " val_rmse += (predicted - y)**2\n",
676 | " holdouts += 1 \n",
677 | "print 'holdout RMSE: %0.3f' % ((val_rmse / holdouts)**0.5)"
678 | ]
679 | },
680 | {
681 | "cell_type": "markdown",
682 | "metadata": {},
683 | "source": [
684 | "###Bike sharing"
685 | ]
686 | },
687 | {
688 | "cell_type": "code",
689 | "execution_count": 6,
690 | "metadata": {
691 | "collapsed": false
692 | },
693 | "outputs": [],
694 | "source": [
695 | "import os\n",
696 | "local_path = os.getcwd()\n",
697 | "b_vars = ['holiday','hr','mnth', 'season','weathersit','weekday','workingday','yr']\n",
698 | "n_vars = ['hum', 'temp', 'atemp', 'windspeed']\n",
699 | "source = '\\\\bikesharing\\\\hour.csv'\n",
700 | "origin = local_path+'\\\\'+source\n",
701 | "target = local_path+'\\\\'+'bike.vw'\n",
702 | "vw_convert(origin, target, binary_features=b_vars, numeric_features=n_vars, target = 'cnt', transform_target=apply_log,\n",
703 | " separator=',', classification=False, multiclass=False, fieldnames= None, header=True)"
704 | ]
705 | },
706 | {
707 | "cell_type": "code",
708 | "execution_count": 45,
709 | "metadata": {
710 | "collapsed": false
711 | },
712 | "outputs": [
713 | {
714 | "name": "stdout",
715 | "output_type": "stream",
716 | "text": [
717 | "final_regressor = regression.model\n",
718 | "Num weight bits = 18\n",
719 | "learning rate = 0.5\n",
720 | "initial_t = 0\n",
721 | "power_t = 0.5\n",
722 | "decay_learning_rate = 1\n",
723 | "creating cache_file = cache_train.vw\n",
724 | "Reading datafile = bike.vw\n",
725 | "num sources = 1\n",
726 | "average since example example current current current\n",
727 | "loss last counter weight label predict features\n",
728 | "8.027098 8.027098 1 1.0 2.8332 0.0000 12\n",
729 | "7.243733 6.460369 2 2.0 3.7136 1.1718 12\n",
730 | "4.184013 1.124293 4 4.0 2.6391 2.4762 12\n",
731 | "2.709537 1.235061 8 8.0 1.3863 1.5636 12\n",
732 | "2.265795 1.822052 16 16.0 4.7095 3.7598 13\n",
733 | "1.325281 0.384768 32 32.0 2.1972 1.5774 13\n",
734 | "1.350559 1.375836 64 64.0 5.0626 3.8186 13\n",
735 | "1.395717 1.440876 128 128.0 4.2195 4.0547 13\n",
736 | "1.165618 0.935518 256 256.0 2.0794 3.3485 13\n",
737 | "0.952714 0.739810 512 512.0 4.0775 3.6438 13\n",
738 | "0.757944 0.563175 1024 1024.0 5.4116 4.0760 13\n",
739 | "0.583856 0.409769 2048 2048.0 1.0986 1.0007 13\n",
740 | "0.453590 0.323324 4096 4096.0 5.4027 5.5651 13\n",
741 | "0.393729 0.333867 8192 8192.0 3.8286 4.1227 12\n",
742 | "0.561750 0.561750 16384 16384.0 4.3944 4.0809 13 h\n",
743 | "0.509105 0.456460 32768 32768.0 4.4659 4.4656 13 h\n",
744 | "0.468332 0.427559 65536 65536.0 4.5951 4.4378 13 h\n",
745 | "\n",
746 | "finished run\n",
747 | "number of examples per pass = 15999\n",
748 | "passes used = 6\n",
749 | "weighted example sum = 95994.000000\n",
750 | "weighted label sum = 439183.191893\n",
751 | "average loss = 0.427485 h\n",
752 | "best constant = 4.575111\n",
753 | "total feature number = 1235898\n",
754 | "------------ COMPLETED ------------\n",
755 | "\n"
756 | ]
757 | }
758 | ],
759 | "source": [
760 | "params = 'bike.vw -f regression.model -k --cache_file cache_train.vw --passes=1000 --hash strings --holdout_after 16000'\n",
761 | "results = execute_vw(params)"
762 | ]
763 | },
764 | {
765 | "cell_type": "code",
766 | "execution_count": 47,
767 | "metadata": {
768 | "collapsed": false
769 | },
770 | "outputs": [
771 | {
772 | "name": "stdout",
773 | "output_type": "stream",
774 | "text": [
775 | "only testing\n",
776 | "predictions = pred.test\n",
777 | "Num weight bits = 18\n",
778 | "learning rate = 0.5\n",
779 | "initial_t = 0\n",
780 | "power_t = 0.5\n",
781 | "creating cache_file = cache_test.vw\n",
782 | "Reading datafile = bike.vw\n",
783 | "num sources = 1\n",
784 | "average since example example current current current\n",
785 | "loss last counter weight label predict features\n",
786 | "0.127379 0.127379 1 1.0 2.8332 3.1901 12\n",
787 | "0.751745 1.376112 2 2.0 3.7136 2.5405 12\n",
788 | "1.210345 1.668944 4 4.0 2.6391 1.5334 12\n",
789 | "2.774795 4.339245 8 8.0 1.3863 4.3803 12\n",
790 | "2.276018 1.777242 16 16.0 4.7095 4.8526 13\n",
791 | "2.179675 2.083333 32 32.0 2.1972 4.6568 13\n",
792 | "1.411963 0.644251 64 64.0 5.0626 5.1554 13\n",
793 | "0.836451 0.260938 128 128.0 4.2195 4.6608 13\n",
794 | "0.677186 0.517921 256 256.0 2.0794 2.8816 13\n",
795 | "0.600932 0.524678 512 512.0 4.0775 4.0583 13\n",
796 | "0.512835 0.424738 1024 1024.0 5.4116 4.8593 13\n",
797 | "0.498590 0.484345 2048 2048.0 1.0986 1.0587 13\n",
798 | "0.422767 0.346943 4096 4096.0 5.4027 5.7840 13\n",
799 | "0.407376 0.391985 8192 8192.0 3.8286 3.9312 12\n",
800 | "0.374806 0.342236 16384 16384.0 5.7900 5.4536 12\n",
801 | "\n",
802 | "finished run\n",
803 | "number of examples per pass = 17379\n",
804 | "passes used = 1\n",
805 | "weighted example sum = 17379.000000\n",
806 | "weighted label sum = 79504.382239\n",
807 | "average loss = 0.380562\n",
808 | "best constant = 4.574739\n",
809 | "total feature number = 223723\n",
810 | "------------ COMPLETED ------------\n",
811 | "\n"
812 | ]
813 | }
814 | ],
815 | "source": [
816 | "params = '-t bike.vw -i regression.model -k --cache_file cache_test.vw -p pred.test'\n",
817 | "results = execute_vw(params)"
818 | ]
819 | },
820 | {
821 | "cell_type": "code",
822 | "execution_count": 10,
823 | "metadata": {
824 | "collapsed": false
825 | },
826 | "outputs": [
827 | {
828 | "name": "stdout",
829 | "output_type": "stream",
830 | "text": [
831 | "holdout RMSE: 135.306\n",
832 | "holdout RMSLE: 0.845\n"
833 | ]
834 | }
835 | ],
836 | "source": [
837 | "val_rmse = 0\n",
838 | "val_rmsle = 0\n",
839 | "with open('pred.test', 'rb') as R:\n",
840 | " with open('bike.vw', 'rb') as TRAIN:\n",
841 | " holdouts = 0.0\n",
842 | " for n,(line, example) in enumerate(zip(R,TRAIN)):\n",
843 | " if n > 16000:\n",
844 | " predicted = float(line.strip())\n",
845 | " y_log = float(example.split('|')[0])\n",
846 | " y = apply_exp(y_log)\n",
847 | " val_rmse += (apply_exp(predicted) - y)**2\n",
848 | " val_rmsle += (predicted - y_log)**2\n",
849 | " holdouts += 1\n",
850 | " \n",
851 | "print 'holdout RMSE: %0.3f' % ((val_rmse / holdouts)**0.5)\n",
852 | "print 'holdout RMSLE: %0.3f' % ((val_rmsle / holdouts)**0.5)\n"
853 | ]
854 | },
855 | {
856 | "cell_type": "markdown",
857 | "metadata": {},
858 | "source": [
859 | "###Covertype"
860 | ]
861 | },
862 | {
863 | "cell_type": "code",
864 | "execution_count": 8,
865 | "metadata": {
866 | "collapsed": false
867 | },
868 | "outputs": [],
869 | "source": [
870 | "import os\n",
871 | "local_path = os.getcwd()\n",
872 | "n_vars = ['var_'+'0'*int(j<10)+str(j) for j in range(54)]\n",
873 | "source = 'shuffled_covtype.data'\n",
874 | "origin = local_path+'\\\\'+source\n",
875 | "target = local_path+'\\\\'+'covtype.vw'\n",
876 | "vw_convert(origin, target, binary_features=list(), fieldnames= n_vars+['covertype'], numeric_features=n_vars,\n",
877 | " target = 'covertype', separator=',', classification=True, multiclass=True, header=False, sparse=False)"
878 | ]
879 | },
880 | {
881 | "cell_type": "code",
882 | "execution_count": 20,
883 | "metadata": {
884 | "collapsed": false
885 | },
886 | "outputs": [
887 | {
888 | "name": "stdout",
889 | "output_type": "stream",
890 | "text": [
891 | "creating cubic features for triples: nnn \n",
892 | "final_regressor = multiclass.model\n",
893 | "Num weight bits = 18\n",
894 | "learning rate = 1\n",
895 | "initial_t = 0\n",
896 | "power_t = 0.5\n",
897 | "decay_learning_rate = 1\n",
898 | "creating cache_file = cache_train.vw\n",
899 | "Reading datafile = covtype.vw\n",
900 | "num sources = 1\n",
901 | "average since example example current current current\n",
902 | "loss last counter weight label predict features\n",
903 | "0.000000 0.000000 1 1.0 1 1 377\n",
904 | "0.000000 0.000000 2 2.0 1 1 377\n",
905 | "0.250000 0.500000 4 4.0 2 1 377\n",
906 | "0.375000 0.500000 8 8.0 1 2 377\n",
907 | "0.437500 0.500000 16 16.0 2 1 231\n",
908 | "0.531250 0.625000 32 32.0 1 2 377\n",
909 | "0.546875 0.562500 64 64.0 2 1 377\n",
910 | "0.500000 0.453125 128 128.0 1 1 377\n",
911 | "0.519531 0.539063 256 256.0 2 2 377\n",
912 | "0.484375 0.449219 512 512.0 2 2 377\n",
913 | "0.446289 0.408203 1024 1024.0 3 6 377\n",
914 | "0.416504 0.386719 2048 2048.0 2 2 377\n",
915 | "0.402100 0.387695 4096 4096.0 1 1 377\n",
916 | "0.372559 0.343018 8192 8192.0 1 1 298\n",
917 | "0.348694 0.324829 16384 16384.0 1 1 377\n",
918 | "0.319092 0.289490 32768 32768.0 2 2 377\n",
919 | "0.297256 0.275421 65536 65536.0 2 2 377\n",
920 | "0.278419 0.259583 131072 131072.0 2 2 377\n",
921 | "0.263660 0.248901 262144 262144.0 2 2 377\n",
922 | "0.253858 0.253858 524288 524288.0 1 1 377 h\n",
923 | "\n",
924 | "finished run\n",
925 | "number of examples per pass = 522911\n",
926 | "passes used = 2\n",
927 | "weighted example sum = 1045822.000000\n",
928 | "weighted label sum = 0.000000\n",
929 | "average loss = 0.235538 h\n",
930 | "total feature number = 384838154\n",
931 | "------------ COMPLETED ------------\n",
932 | "\n"
933 | ]
934 | }
935 | ],
936 | "source": [
937 | "params = 'covtype.vw --ect 7 -f multiclass.model -k --cache_file cache_train.vw --passes=2 -l 1.0 --cubic nnn'\n",
938 | "results = execute_vw(params)"
939 | ]
940 | },
941 | {
942 | "cell_type": "code",
943 | "execution_count": 21,
944 | "metadata": {
945 | "collapsed": false
946 | },
947 | "outputs": [
948 | {
949 | "name": "stdout",
950 | "output_type": "stream",
951 | "text": [
952 | "creating cubic features for triples: nnn \n",
953 | "only testing\n",
954 | "predictions = covertype.test\n",
955 | "Num weight bits = 18\n",
956 | "learning rate = 0.5\n",
957 | "initial_t = 0\n",
958 | "power_t = 0.5\n",
959 | "creating cache_file = cache_test.vw\n",
960 | "Reading datafile = covtype.vw\n",
961 | "num sources = 1\n",
962 | "average since example example current current current\n",
963 | "loss last counter weight label predict features\n",
964 | "0.000000 0.000000 1 1.0 1 1 377\n",
965 | "0.000000 0.000000 2 2.0 1 1 377\n",
966 | "0.000000 0.000000 4 4.0 2 2 377\n",
967 | "0.000000 0.000000 8 8.0 1 1 377\n",
968 | "0.187500 0.375000 16 16.0 1 2 377\n",
969 | "0.156250 0.125000 32 32.0 3 3 377\n",
970 | "0.156250 0.156250 64 64.0 2 1 377\n",
971 | "0.218750 0.281250 128 128.0 2 2 377\n",
972 | "0.222656 0.226563 256 256.0 2 2 377\n",
973 | "0.240234 0.257813 512 512.0 2 2 377\n",
974 | "0.234375 0.228516 1024 1024.0 2 2 377\n",
975 | "0.242676 0.250977 2048 2048.0 2 2 377\n",
976 | "0.242920 0.243164 4096 4096.0 1 1 377\n",
977 | "0.236328 0.229736 8192 8192.0 1 1 377\n",
978 | "0.231079 0.225830 16384 16384.0 1 1 298\n",
979 | "0.229858 0.228638 32768 32768.0 1 1 377\n",
980 | "0.232224 0.234589 65536 65536.0 1 1 377\n",
981 | "0.231529 0.230835 131072 131072.0 2 2 377\n",
982 | "0.231815 0.232101 262144 262144.0 2 2 377\n",
983 | "0.231606 0.231396 524288 524288.0 1 1 377\n",
984 | "\n",
985 | "finished run\n",
986 | "number of examples per pass = 581012\n",
987 | "passes used = 1\n",
988 | "weighted example sum = 581012.000000\n",
989 | "weighted label sum = 0.000000\n",
990 | "average loss = 0.231111\n",
991 | "total feature number = 213797603\n",
992 | "------------ COMPLETED ------------\n",
993 | "\n"
994 | ]
995 | }
996 | ],
997 | "source": [
998 | "params = '-t covtype.vw -i multiclass.model -k --cache_file cache_test.vw -p covertype.test'\n",
999 | "results = execute_vw(params)"
1000 | ]
1001 | },
1002 | {
1003 | "cell_type": "code",
1004 | "execution_count": 8,
1005 | "metadata": {
1006 | "collapsed": false
1007 | },
1008 | "outputs": [
1009 | {
1010 | "name": "stdout",
1011 | "output_type": "stream",
1012 | "text": [
1013 | "holdout accuracy: 0.769\n"
1014 | ]
1015 | }
1016 | ],
1017 | "source": [
1018 | "accuracy = 0\n",
1019 | "with open('covertype.test', 'rb') as R:\n",
1020 | " with open('covtype.vw', 'rb') as TRAIN:\n",
1021 | " holdouts = 0.0\n",
1022 | " for n,(line, example) in enumerate(zip(R,TRAIN)):\n",
1023 | " if (n+1) % 10==0:\n",
1024 | " predicted = float(line.strip())\n",
1025 | " y = float(example.split('|')[0])\n",
1026 | "            accuracy += predicted == y\n",
1027 | " holdouts += 1\n",
1028 | "print 'holdout accuracy: %0.3f' % (accuracy / holdouts)"
1029 | ]
1030 | }
1031 | ],
1032 | "metadata": {
1033 | "kernelspec": {
1034 | "display_name": "Python 2",
1035 | "language": "python",
1036 | "name": "python2"
1037 | },
1038 | "language_info": {
1039 | "codemirror_mode": {
1040 | "name": "ipython",
1041 | "version": 2
1042 | },
1043 | "file_extension": ".py",
1044 | "mimetype": "text/x-python",
1045 | "name": "python",
1046 | "nbconvert_exporter": "python",
1047 | "pygments_lexer": "ipython2",
1048 | "version": "2.7.9"
1049 | }
1050 | },
1051 | "nbformat": 4,
1052 | "nbformat_minor": 0
1053 | }
1054 |
--------------------------------------------------------------------------------
/Chapter 09/Chapter_9_code_02.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "collapsed": false
8 | },
9 | "outputs": [],
10 | "source": [
11 | "!rm -rf ../datasets/kdd*\n",
12 | "\n",
13 | "# !wget -q -O ../datasets/kddtrain.gz \\\n",
14 | "# http://kdd.ics.uci.edu/databases/kddcup99/kddcup.data.gz\n",
15 | "\n",
16 | "!wget -q -O ../datasets/kddtrain.gz \\\n",
17 | "http://kdd.ics.uci.edu/databases/kddcup99/kddcup.data_10_percent.gz\n",
18 | "\n",
19 | "!wget -q -O ../datasets/kddtest.gz \\\n",
20 | "http://kdd.ics.uci.edu/databases/kddcup99/corrected.gz\n",
21 | " \n",
22 | "!wget -q -O ../datasets/kddnames \\\n",
23 | "http://kdd.ics.uci.edu/databases/kddcup99/kddcup.names\n",
24 | "\n",
25 | "!gunzip ../datasets/kdd*gz"
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": 2,
31 | "metadata": {
32 | "collapsed": false
33 | },
34 | "outputs": [
35 | {
36 | "name": "stdout",
37 | "output_type": "stream",
38 | "text": [
39 | "0,tcp,http,SF,181,5450,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,8,8,0.00,0.00,0.00,0.00,1.00,0.00,0.00,9,9,1.00,0.00,0.11,0.00,0.00,0.00,0.00,0.00,normal.\r\n",
40 | "0,tcp,http,SF,239,486,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,8,8,0.00,0.00,0.00,0.00,1.00,0.00,0.00,19,19,1.00,0.00,0.05,0.00,0.00,0.00,0.00,0.00,normal.\r\n",
41 | "0,tcp,http,SF,235,1337,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,8,8,0.00,0.00,0.00,0.00,1.00,0.00,0.00,29,29,1.00,0.00,0.03,0.00,0.00,0.00,0.00,0.00,normal.\r\n"
42 | ]
43 | }
44 | ],
45 | "source": [
46 | "!head -3 ../datasets/kddtrain"
47 | ]
48 | },
49 | {
50 | "cell_type": "code",
51 | "execution_count": 3,
52 | "metadata": {
53 | "collapsed": false
54 | },
55 | "outputs": [
56 | {
57 | "name": "stdout",
58 | "output_type": "stream",
59 | "text": [
60 | "Num features: 41\n",
61 | "First 10: ['duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot']\n"
62 | ]
63 | }
64 | ],
65 | "source": [
66 | "with open('../datasets/kddnames', 'r') as fh:\n",
67 | " header = [line.split(':')[0] \n",
68 | " for line in fh.read().splitlines()][1:]\n",
69 | "\n",
70 | "header.append('target')\n",
71 | "\n",
72 | "print \"Num features:\", len(header)-1\n",
73 | "print \"First 10:\", header[:10]"
74 | ]
75 | },
76 | {
77 | "cell_type": "code",
78 | "execution_count": 4,
79 | "metadata": {
80 | "collapsed": false
81 | },
82 | "outputs": [],
83 | "source": [
84 | "train_rdd = sc.textFile('file:///home/vagrant/datasets/kddtrain')\n",
85 | "test_rdd = sc.textFile('file:///home/vagrant/datasets/kddtest')"
86 | ]
87 | },
88 | {
89 | "cell_type": "code",
90 | "execution_count": 5,
91 | "metadata": {
92 | "collapsed": false,
93 | "scrolled": false
94 | },
95 | "outputs": [],
96 | "source": [
97 | "def line_parser(line):\n",
98 | "\n",
99 | " def piece_parser(piece):\n",
100 | " if \".\" in piece or piece.isdigit():\n",
101 | " return float(piece)\n",
102 | " else:\n",
103 | " return piece\n",
104 | "\n",
105 | " return [piece_parser(piece) for piece in line[:-1].split(',')]\n",
106 | " \n",
107 | "train_df = sqlContext.createDataFrame(\n",
108 | " train_rdd.map(line_parser), header)\n",
109 | "\n",
110 | "test_df = sqlContext.createDataFrame(\n",
111 | " test_rdd.map(line_parser), header)"
112 | ]
113 | },
114 | {
115 | "cell_type": "code",
116 | "execution_count": 6,
117 | "metadata": {
118 | "collapsed": false
119 | },
120 | "outputs": [
121 | {
122 | "name": "stdout",
123 | "output_type": "stream",
124 | "text": [
125 | "Train observations: 494021\n",
126 | "Test observations: 311029\n"
127 | ]
128 | }
129 | ],
130 | "source": [
131 | "print \"Train observations:\", train_df.count()\n",
132 | "print \"Test observations:\", test_df.count()"
133 | ]
134 | },
135 | {
136 | "cell_type": "code",
137 | "execution_count": 7,
138 | "metadata": {
139 | "collapsed": false
140 | },
141 | "outputs": [
142 | {
143 | "name": "stdout",
144 | "output_type": "stream",
145 | "text": [
146 | "root\n",
147 | " |-- duration: double (nullable = true)\n",
148 | " |-- protocol_type: string (nullable = true)\n",
149 | " |-- service: string (nullable = true)\n",
150 | " |-- flag: string (nullable = true)\n",
151 | " |-- src_bytes: double (nullable = true)\n",
152 | " |-- dst_bytes: double (nullable = true)\n",
153 | " |-- land: double (nullable = true)\n",
154 | " |-- wrong_fragment: double (nullable = true)\n",
155 | " |-- urgent: double (nullable = true)\n",
156 | " |-- hot: double (nullable = true)\n",
157 | " |-- num_failed_logins: double (nullable = true)\n",
158 | " |-- logged_in: double (nullable = true)\n",
159 | " |-- num_compromised: double (nullable = true)\n",
160 | " |-- root_shell: double (nullable = true)\n",
161 | " |-- su_attempted: double (nullable = true)\n",
162 | " |-- num_root: double (nullable = true)\n",
163 | " |-- num_file_creations: double (nullable = true)\n",
164 | " |-- num_shells: double (nullable = true)\n",
165 | " |-- num_access_files: double (nullable = true)\n",
166 | " |-- num_outbound_cmds: double (nullable = true)\n",
167 | " |-- is_host_login: double (nullable = true)\n",
168 | " |-- is_guest_login: double (nullable = true)\n",
169 | " |-- count: double (nullable = true)\n",
170 | " |-- srv_count: double (nullable = true)\n",
171 | " |-- serror_rate: double (nullable = true)\n",
172 | " |-- srv_serror_rate: double (nullable = true)\n",
173 | " |-- rerror_rate: double (nullable = true)\n",
174 | " |-- srv_rerror_rate: double (nullable = true)\n",
175 | " |-- same_srv_rate: double (nullable = true)\n",
176 | " |-- diff_srv_rate: double (nullable = true)\n",
177 | " |-- srv_diff_host_rate: double (nullable = true)\n",
178 | " |-- dst_host_count: double (nullable = true)\n",
179 | " |-- dst_host_srv_count: double (nullable = true)\n",
180 | " |-- dst_host_same_srv_rate: double (nullable = true)\n",
181 | " |-- dst_host_diff_srv_rate: double (nullable = true)\n",
182 | " |-- dst_host_same_src_port_rate: double (nullable = true)\n",
183 | " |-- dst_host_srv_diff_host_rate: double (nullable = true)\n",
184 | " |-- dst_host_serror_rate: double (nullable = true)\n",
185 | " |-- dst_host_srv_serror_rate: double (nullable = true)\n",
186 | " |-- dst_host_rerror_rate: double (nullable = true)\n",
187 | " |-- dst_host_srv_rerror_rate: double (nullable = true)\n",
188 | " |-- target: string (nullable = true)\n",
189 | "\n"
190 | ]
191 | }
192 | ],
193 | "source": [
194 | "train_df.printSchema()"
195 | ]
196 | },
197 | {
198 | "cell_type": "code",
199 | "execution_count": 8,
200 | "metadata": {
201 | "collapsed": false
202 | },
203 | "outputs": [],
204 | "source": [
205 | "from pyspark.ml import Pipeline\n",
206 | "from pyspark.ml.feature import StringIndexer\n",
207 | "\n",
208 | "\n",
209 | "cols_categorical = [\"protocol_type\", \"service\", \"flag\",\"target\"]\n",
210 | "preproc_stages = []\n",
211 | "\n",
212 | "for col in cols_categorical:\n",
213 | " out_col = col + \"_cat\"\n",
214 | " preproc_stages.append(\n",
215 | " StringIndexer(\n",
216 | " inputCol=col, outputCol=out_col, handleInvalid=\"skip\"))\n",
217 | "\n",
218 | "pipeline = Pipeline(stages=preproc_stages)\n",
219 | "indexer = pipeline.fit(train_df)\n",
220 | "\n",
221 | "train_num_df = indexer.transform(train_df)\n",
222 | "test_num_df = indexer.transform(test_df)"
223 | ]
224 | },
225 | {
226 | "cell_type": "code",
227 | "execution_count": 9,
228 | "metadata": {
229 | "collapsed": false
230 | },
231 | "outputs": [
232 | {
233 | "name": "stdout",
234 | "output_type": "stream",
235 | "text": [
236 | "[StringIndexer_46ae881ca7febd4a4e81, StringIndexer_49f6bbd151ce1e9bb5a7, StringIndexer_4cfcb173a161bbe6cd60, StringIndexer_4aa581cc25ad8d6eed7e]\n",
237 | "\n",
238 | "Pipeline_450a8f0d2083e96d03ca\n",
239 | "PipelineModel_475d9917035781236edb\n"
240 | ]
241 | }
242 | ],
243 | "source": [
244 | "print pipeline.getStages()\n",
245 | "print\n",
246 | "print pipeline\n",
247 | "print indexer"
248 | ]
249 | },
250 | {
251 | "cell_type": "code",
252 | "execution_count": 10,
253 | "metadata": {
254 | "collapsed": false
255 | },
256 | "outputs": [
257 | {
258 | "name": "stdout",
259 | "output_type": "stream",
260 | "text": [
261 | "First observation, after the 4 StringIndexers:\n",
262 | "\n",
263 | "Row(duration=0.0, protocol_type=u'tcp', service=u'http', flag=u'SF', src_bytes=181.0, dst_bytes=5450.0, land=0.0, wrong_fragment=0.0, urgent=0.0, hot=0.0, num_failed_logins=0.0, logged_in=1.0, num_compromised=0.0, root_shell=0.0, su_attempted=0.0, num_root=0.0, num_file_creations=0.0, num_shells=0.0, num_access_files=0.0, num_outbound_cmds=0.0, is_host_login=0.0, is_guest_login=0.0, count=8.0, srv_count=8.0, serror_rate=0.0, srv_serror_rate=0.0, rerror_rate=0.0, srv_rerror_rate=0.0, same_srv_rate=1.0, diff_srv_rate=0.0, srv_diff_host_rate=0.0, dst_host_count=9.0, dst_host_srv_count=9.0, dst_host_same_srv_rate=1.0, dst_host_diff_srv_rate=0.0, dst_host_same_src_port_rate=0.11, dst_host_srv_diff_host_rate=0.0, dst_host_serror_rate=0.0, dst_host_srv_serror_rate=0.0, dst_host_rerror_rate=0.0, dst_host_srv_rerror_rate=0.0, target=u'normal', protocol_type_cat=1.0, service_cat=2.0, flag_cat=0.0, target_cat=2.0)\n"
264 | ]
265 | }
266 | ],
267 | "source": [
268 | "print \"First observation, after the 4 StringIndexers:\\n\"\n",
269 | "print train_num_df.first()"
270 | ]
271 | },
272 | {
273 | "cell_type": "code",
274 | "execution_count": 11,
275 | "metadata": {
276 | "collapsed": false
277 | },
278 | "outputs": [
279 | {
280 | "name": "stdout",
281 | "output_type": "stream",
282 | "text": [
283 | "['num_access_files', 'src_bytes', 'srv_count', 'num_outbound_cmds', 'rerror_rate', 'urgent', 'protocol_type_cat', 'dst_host_same_srv_rate', 'duration', 'dst_host_diff_srv_rate', 'srv_serror_rate', 'is_host_login', 'wrong_fragment', 'serror_rate', 'num_compromised', 'is_guest_login', 'dst_host_rerror_rate', 'dst_host_srv_serror_rate', 'hot', 'dst_host_srv_count', 'logged_in', 'srv_rerror_rate', 'dst_host_srv_diff_host_rate', 'srv_diff_host_rate', 'dst_host_same_src_port_rate', 'root_shell', 'service_cat', 'su_attempted', 'dst_host_count', 'num_file_creations', 'flag_cat', 'count', 'land', 'same_srv_rate', 'dst_bytes', 'num_shells', 'dst_host_srv_rerror_rate', 'num_root', 'diff_srv_rate', 'num_failed_logins', 'dst_host_serror_rate']\n",
284 | "Total numerical features: 41\n"
285 | ]
286 | }
287 | ],
288 | "source": [
289 | "features_header = set(header) \\\n",
290 | " - set(cols_categorical) \\\n",
291 | " | set([c + \"_cat\" for c in cols_categorical]) \\\n",
292 | " - set([\"target\", \"target_cat\"])\n",
293 | "features_header = list(features_header)\n",
294 | "print features_header\n",
295 | "print \"Total numerical features:\", len(features_header)"
296 | ]
297 | },
298 | {
299 | "cell_type": "code",
300 | "execution_count": 12,
301 | "metadata": {
302 | "collapsed": false
303 | },
304 | "outputs": [],
305 | "source": [
306 | "from pyspark.mllib.linalg import Vectors\n",
307 | "from pyspark.ml.feature import VectorAssembler\n",
308 | "\n",
309 | "assembler = VectorAssembler(\n",
310 | " inputCols=features_header,\n",
311 | " outputCol=\"features\")\n",
312 | "\n",
313 | "Xy_train = (assembler\n",
314 | " .transform(train_num_df)\n",
315 | " .select(\"features\", \"target_cat\"))\n",
316 | "Xy_test = (assembler\n",
317 | " .transform(test_num_df)\n",
318 | " .select(\"features\", \"target_cat\"))"
319 | ]
320 | },
321 | {
322 | "cell_type": "code",
323 | "execution_count": 13,
324 | "metadata": {
325 | "collapsed": false
326 | },
327 | "outputs": [
328 | {
329 | "data": {
330 | "text/plain": [
331 | "Row(features=SparseVector(41, {1: 181.0, 2: 8.0, 6: 1.0, 7: 1.0, 19: 9.0, 20: 1.0, 24: 0.11, 26: 2.0, 28: 9.0, 31: 8.0, 33: 1.0, 34: 5450.0}), target_cat=2.0)"
332 | ]
333 | },
334 | "execution_count": 13,
335 | "metadata": {},
336 | "output_type": "execute_result"
337 | }
338 | ],
339 | "source": [
340 | "Xy_train.first()"
341 | ]
342 | },
343 | {
344 | "cell_type": "code",
345 | "execution_count": 14,
346 | "metadata": {
347 | "collapsed": false
348 | },
349 | "outputs": [],
350 | "source": [
351 | "from pyspark.ml.classification import RandomForestClassifier\n",
352 | "\n",
353 | "clf = RandomForestClassifier(\n",
354 | " labelCol=\"target_cat\", featuresCol=\"features\", \n",
355 | " maxBins=100, seed=101)\n",
356 | "fit_clf = clf.fit(Xy_train)"
357 | ]
358 | },
359 | {
360 | "cell_type": "code",
361 | "execution_count": 15,
362 | "metadata": {
363 | "collapsed": false
364 | },
365 | "outputs": [
366 | {
367 | "name": "stdout",
368 | "output_type": "stream",
369 | "text": [
370 | "RandomForestClassifier_40f9923cb13e74b28cbe\n",
371 | "RandomForestClassificationModel (uid=rfc_ac17a1f959a3) with 20 trees\n"
372 | ]
373 | }
374 | ],
375 | "source": [
376 | "print clf\n",
377 | "print fit_clf"
378 | ]
379 | },
380 | {
381 | "cell_type": "code",
382 | "execution_count": 16,
383 | "metadata": {
384 | "collapsed": false
385 | },
386 | "outputs": [],
387 | "source": [
388 | "Xy_pred_train = fit_clf.transform(Xy_train)\n",
389 | "Xy_pred_test = fit_clf.transform(Xy_test)"
390 | ]
391 | },
392 | {
393 | "cell_type": "code",
394 | "execution_count": 17,
395 | "metadata": {
396 | "collapsed": false
397 | },
398 | "outputs": [
399 | {
400 | "name": "stdout",
401 | "output_type": "stream",
402 | "text": [
403 | "First observation after classification stage:\n",
404 | "Row(features=SparseVector(41, {1: 105.0, 2: 1.0, 6: 2.0, 7: 1.0, 9: 0.01, 19: 254.0, 26: 1.0, 28: 255.0, 31: 1.0, 33: 1.0, 34: 146.0}), target_cat=2.0, rawPrediction=DenseVector([0.0283, 0.0112, 19.3474, 0.0677, 0.0251, 0.1414, 0.0357, 0.1194, 0.1309, 0.041, 0.0257, 0.0079, 0.0046, 0.0004, 0.0029, 0.0016, 0.002, 0.0023, 0.0013, 0.0008, 0.0012, 0.0006, 0.0006]), probability=DenseVector([0.0014, 0.0006, 0.9674, 0.0034, 0.0013, 0.0071, 0.0018, 0.006, 0.0065, 0.002, 0.0013, 0.0004, 0.0002, 0.0, 0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.0, 0.0001, 0.0, 0.0]), prediction=2.0)\n"
405 | ]
406 | }
407 | ],
408 | "source": [
409 | "print \"First observation after classification stage:\"\n",
410 | "print Xy_pred_test.first()"
411 | ]
412 | },
413 | {
414 | "cell_type": "code",
415 | "execution_count": 18,
416 | "metadata": {
417 | "collapsed": false
418 | },
419 | "outputs": [
420 | {
421 | "name": "stdout",
422 | "output_type": "stream",
423 | "text": [
424 | "F1-score train set: 0.991904372002\n",
425 | "F1-score test set: 0.966840043466\n"
426 | ]
427 | }
428 | ],
429 | "source": [
430 | "from pyspark.ml.evaluation import MulticlassClassificationEvaluator\n",
431 | "\n",
432 | "evaluator = MulticlassClassificationEvaluator(\n",
433 | " labelCol=\"target_cat\", predictionCol=\"prediction\", \n",
434 | " metricName=\"f1\")\n",
435 | "\n",
436 | "print \"F1-score train set:\", evaluator.evaluate(Xy_pred_train)\n",
437 | "print \"F1-score test set:\", evaluator.evaluate(Xy_pred_test)"
438 | ]
439 | },
440 | {
441 | "cell_type": "code",
442 | "execution_count": 19,
443 | "metadata": {
444 | "collapsed": false
445 | },
446 | "outputs": [
447 | {
448 | "name": "stdout",
449 | "output_type": "stream",
450 | "text": [
451 | "F1-score test set: 0.966840043466\n"
452 | ]
453 | }
454 | ],
455 | "source": [
456 | "# All in one\n",
457 | "\n",
458 | "full_stages = preproc_stages + [assembler, clf]\n",
459 | "full_pipeline = Pipeline(stages=full_stages)\n",
460 | "full_model = full_pipeline.fit(train_df)\n",
461 | "predictions = full_model.transform(test_df)\n",
462 | "print \"F1-score test set:\", evaluator.evaluate(predictions)"
463 | ]
464 | },
465 | {
466 | "cell_type": "code",
467 | "execution_count": 20,
468 | "metadata": {
469 | "collapsed": false
470 | },
471 | "outputs": [],
472 | "source": [
473 | "import matplotlib.pyplot as plt\n",
474 | "import numpy as np\n",
475 | "%matplotlib inline\n",
476 | " \n",
477 | "def plot_confusion_matrix(cm):\n",
478 | " cm_normalized = \\\n",
479 | " cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]\n",
480 | " plt.imshow(\n",
481 | " cm_normalized, interpolation='nearest', cmap=plt.cm.Blues)\n",
482 | " plt.title('Normalized Confusion matrix')\n",
483 | " plt.colorbar()\n",
484 | " plt.tight_layout()\n",
485 | " plt.ylabel('True label')\n",
486 | " plt.xlabel('Predicted label')\n"
487 | ]
488 | },
489 | {
490 | "cell_type": "code",
491 | "execution_count": 21,
492 | "metadata": {
493 | "collapsed": false
494 | },
495 | "outputs": [
496 | {
497 | "data": {
498 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAU4AAAEoCAYAAAA3/hguAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAH4dJREFUeJzt3Xu8VFXdx/HPnCMqclEUQyWVvJHaRTTFMOB4qcBXF+1m\naBfTNM1LT1qmPT2CaGVmpampaZpZWZqSWaFleTiYKKCAeMFLaaYgKpIKoiKc54/f2sw+c2b2XpuZ\nPbNmzvfNa17smdmzZ82Zw491/4GIiIiIiIiIiIiIiIiIiIiIiIj0cZ3A0e74COC2Gl9/BLAWaKvx\ndX30B24B/gv8torr5PFzaZSxwKJGF0IkzZPAUmCT2GNfBO5oSGl6uwM4KsfrjyA9cB4OzAVeARYD\nfwb2q8F7fxa4J+W9W8laYIdGF0L89ZVfzPXVBnylBtcpuFsrOQX4EXAO8BZgW+AS4CM1uPb2wKNY\nQOkrkn4/NqhbKUSq9ATwDWAZsKl7rLTGOQaYgzUpZwPvjT3XiQWVfwArgR2xQHA88BjwMjDVPT7L\nXeM3QD/3+s2APwLPAS9iTdfhsevHa5xHAjPd8WlYDTC6rQauds9tCvwMqx0+DZxN8T/PNuB84Hng\nn8AJVK5xbuqu/fEyz0U2Ai4AnnG3HwEbuuc63PufgtXqF7vPAHAW8DrwhnuPo4ApwLWxa48oKduR\nrswvA//CasLR4zNjr0v7vqYCd7rr3AZsUeGzReX/Ovb9LAYOAQ7GAv4y4PTY+ftg3/Fyd+5FFL/n\nLvdZVrjP+8nY9U8DlgDXuMf+416zo3uPUe7+Ntj3Nq5CeUXq5gngQOBGLMBAz8C5OfYP4QjsH/Cn\nsQA3xD3fiTX3d3XP98P+gUwDBgK7YQHi71ggGAw8CHwudv1DgY3d+de710YqBc64t2JB64Pu/jTg\nUqwPcUusOXyse+444GEsOA9x119D+cA5AQvISS2WqcBdwFB3+4d7DCwIrMYCYjswEfvPJfoPajLw\ni9i1JlM5cA4AXgJ2ds8Nw3620PPn4vN9PQbshP3M7wC+W+GzReX/liv/F4EXgF+58uwGvIrVnAH2\nxIJnm3vsIXq2ZEqb6tH1v4v93mxMz8CJe88Hse/yNuC8CmUVqasngAOA3bEaylB6Bs7PAneXvOYu\n4PPu+A4sMMStpWctZy5Wa4mcj9XMytkD+4ceSQuc/YF7Y9cfBryG/SOMTMICN+7vY2PPvZ/KNc4j\nsJpQksexABv5APYzBQsCr5ZceykWXKB3DbP0/gh6Bs7lwMewzxx3JMWfi8/39c3Yc8cD00s/VEn5\no+b1IFeevWPnzAU+WuH1/wPcFLtfLnC+TrGGHj0WD5wANwMLgfkUa7BSB+rjTPcg1mQ+HeiOPb4N\n8FTJuf92j0dKf9HBAkRkVZn7A93xJsDlWK31JWAGViPz7Sv9GVaD/L67vz32j2sJFmiWA5dhNU+A\nrUvKW/rZ4pZh/5Ek/f5sg/084teL/2yW0bMP81WKnz2LlcBhWI15MfZdjaxQnrTv69nYcfy7KGcZ\nxd+HVe7v0u9ygDvexZVrCfZdfpvK3QCR57HuiiRXYv+xX4TVUKVOFDj9TAaOoWcf4zMUm2KR7d3j\nkW7W36nYP7h9sIA5Hv9BptOxJufRscf+g9VitsCap0Pcdd/pnl8CbBc7P35capa71qEJ5yzGaobx\n6y1OL3pZK+g5u2Grkuf/gtVot8Km7FxR5ho+31deLsWa5zthP/P/Jf3fXtrvzkCsD/lKrF94SPLp\nUksKnH7+ic0njPdLTccC2yRs1PMw4O1YzSLiE+QKFY4HYrWWl7D+ucmeZZ0InIQ1XV+PPb4ECzA/\nxJqWbdggQzSgcD1wMsU+zvjgRqmXgDOxUfSPYkGtn3vv77lzrsP6AKM+zjPp2dzOYr4r57ZY4Dkj\n9txbXBkGYLWulVjfbKlafV/rYyA28POqe8/jS55fin0XWVyIDXAdC/wJaz1InShw+puKBYioJrAM\n+BBWM3wB+Jq7H++HLK01lKtFdJccR/cvwPrsXsD64qZXeH3p6z6FBaqHKY6s/8Q99zms3+whV84b\nKNbersAGGRZg/XM3JrwfWAA+BQuOz2HN4C9THMA6x13nfneb6x4r97mTPg/A7dh/XPdjo+K3xJ5v\nA76K1RyXYRPFjy9znazfV2kZypUx6X7c17CR/peBn2KzJ+LnT8FGzpcDn0h47+ixj2I17OhznoIN\nQE1KKIOIiIiXq7Aa/cKEc36MzahYQHGKl4hInzUWC4aVAufB2Io3gNH0nnkhItInjaBy4LwM6++O\nLMKm7iVSH6eI9GXD6TkN72ls4UiiINfAFgZs0929cn1nrohIo4wdN56ZXTNqNzuhfaNu1ryefl7R\nK9gqvCxKy5s6jTDIwNm9cjEb73FCr8dXL5lNv6336fHY8jkX16tYdXHO1Cl868wpjS5Grlr9M7b6\n54PKn7F/vxrP6FrzetlYUMlr8y8ZlPEdnsGmuUWiZcqJGtVUn4D1JTyGbaQhIlJeoc3/lt0fKO4P\nsS+2vHpp5dNNI2qc7cDFwEFYZJ+DFf7hBpRFREJXqKoWex226m4o1pc5meK6/suxEfWDsb0VVgJf\n8LloIwLnPlghn3T3f4NN6E0NnG0Dh6ed0vTGje9odBFy1+qfsdU/H9T5M65fTTLisyjgxKwXbUTg\nLDeKNdrnhe2DFDhbQat/xlb/fFDnz9jWXr/38tSIwOm18cXqJbPXHbcNHN4ngqZIs+ma0UnXjM58\n36S6pnouGhE4S0extsVqnT2Ujp6LSHjGje/oUfv89tln1f5Nqmuq56IRJZqL7dY9Attw4jBscEhE\npLdCwf9WJ42ocb6Jdcbeho2wRxvuioj0FmCNs1ET4KdTOS0B4D+xfcje2QbEWm3CvEjLUx+niEhG\nqnGKiGSkGqeISEaqcYqIZKTAKSKSUZua6iIi2ajGKSKSkQaHREQy0iYfIiIZBdhUD69EIiJx1a9V\nT8s4MQSYhuVVvwfYPa1ICpwiErbqUmdEGScmALthGxvvWnLON4H7gHdjaTQuTCtS0zfVs649z7K2\nXevaRQJQ3eCQT8aJXYFz3fEj2M5tWwLPV7qoapwiErbqapzlMk6U7oq+APiYO94H2J6U3OpNX+MU\nkRaXUONcs+wx1i57POnVPhknzsWa5/OAhe7vNUkvUOAUkbAljKq3Dx1J+9CR6+6vefzW0lN8Mk68\nAhwVu/8E8K+kIqmpLiJhq66p7pNxYlP3HMAxwAxgRVKRVOMUkbBVNzhUKePEl9zzl2Oj7T/HmvUP\nAEenXVSBU0TCVv0E+HIZJy6PHc8CRpKBAqeIhE1r1UVEMgpwyaUCp4iETTVOEZFs2tpU42y4p2de\n4H3ukAPPynTt5X+bnLU4IpImvApn3wucItJcCmqqi4hko8ApIpKRAqeISEYKnCIiWYUXNxU4RSRs\nqnGKiGSkwCkikpECp4hIRgqcI
iJZhRc3tQO8iIStUCh43ypIy6s+FLgVmI9tZHxkWpmavsbZ3e2T\ni6lo5RuJOZh6yLr2fMjHL/O/9o3HZbq2SF9VZVM9yqt+EJZ/aA6WOiOeHvhELEHbGVgQfQT4JbZ7\nfFmqcYpI0AptBe9bGfG86qsp5lWPWwIMdseDgWUkBE1ogRqniLS2Kmuc5fKqjy455wrg78BiYBDw\nqbSLKnCKSNCSAucbix/kjSUPJr3cpy/vm1j/ZgewI/BX4N1Y2uCyFDhFJGhJgXOj4e9go+HvWHd/\n5bwbSk/xyas+Bvi2O/4nlld9JJZauCz1cYpI0KocVffJq74IGzwCGIYFzX8llUk1ThEJW3XzOH3y\nqn8HuBpYgFUmTwNeTLqoAqeIBK0GK4fS8qq/AHw4ywUVOEUkaFpyKSKSkQKniEhW4cXNcAPn2rV+\nSynbyq8WqGjzAf3WpzhesiyjHPK+07Jd+87zshZHpCWoxikikpECp4hIRgqcPT0JvAyswRbf79PA\nsohIoCps3tFQjQyc3dja0MSJpiLSt6nG2Vt4PxERCUqIgbORa9W7gduxtaTHNLAcIhKwQsH/Vi+N\nrHHuh20guiW2jdMiYGb05LfPnrLuxLHjOhg3vqO+pRORVF0zOuma0Znre4RY4wylRJOBFcAP3P3u\nla+v9Xph1nmcb67xuy7ABu35Vcg1j1NaUf9+BahtXOne+euly8wre+z7E2v9/mU1qqm+CbbTMsAA\n4APAwgaVRUQCVoNkbTXXqKb6MGBarAy/Av7SoLKISMACbKk3LHA+AezRoPcWkSaStTuuHho9Haki\n3x/WGs817ZEr73nS+9zjxuyQ6dpZypK1z/LIX83zPvfnR4zKdG2RkNWgxjkBuADbyPhK4Hslz38N\nOMIdbwDsiqUJ/m+lCyp1hogErco+ziiv+gRgN2ASFhjjzgdGudsZQCcJQRMUOEUkcFXO4/TJqx53\nOHBdWpkUOEUkaFXWOMvlVR9e4a02AT4I3JhWpmD7OEVEIHkC/Ion57PiyQVJL88yCPJh4E5Smumg\nwCkigUsaKB68wygG71AcDH2u69rSU3zyqkc+jUczHdRUF5HAVdnH6ZNXHWBTYBxws0+ZVOMUkaBV\nuSLIJ686wCHunFU+F1XgFJGg1WAeZ1pedYBr3M2LAqeIBC3E3ZEUOEUkaAHGzeYPnO0Z17Ge8ZUf\nep973JyLcy1LFlpGKX2VapwiIhkFGDcVOEUkbKpxiohkFGDcVOAUkbCpxikiklGAcVOBU0TCphqn\niEhGCpwiIhkp55CISEYBVjgVOEUkbGqqB+Cyn57W6CIEZeIld2U6f/oJY3IqiUh5AcbNvhc4RaS5\ntAUYObUDvIgErcod4MFSAy8CHgO+UeGcDmAe8ACWHjiRapwiErQq+zijvOoHYfmH5mCpMx6OnbMZ\ncAmW4fJpYGjaRVXjFJGgtRX8b2X45FU/HEsJHCVxeyG1TOv5WURE6qIOedV3BjYH7sCSu302rUxq\nqotI0JJa6sseuZdlj96X9HKfvOr9gD2BA4FNgFnA3VifaFlJgfOilMKc7FEgEZGqFKgcOYeOfA9D\nR75n3f3H/3Rl6Sk+edX/gzXPV7lbF/Bu1jNw3ksxWkcl73bHPlFcRKRqVa64jOdVX4zlVZ9Ucs7N\n2ABSO7ARMBpIzLGTFDh/XnJ/ALDSt7QiIrVQh7zqi4BbgfuBtcAVwENJF/Xp4xwDXAkMwqq5ewDH\nAl/O+glERLKqQRJEn7zq57ubF59R9QuwCaTREP18YLzvG4iIVKMGE+BrzndU/amS+2/WuiD1ctio\n7RpdhKBkXXs+ZO8Tvc9dnjG9skg5zbrJx1PAfu54Q2w0/eHKp4uI1E6AcdMrcB4PXIhNGn0G+Atw\nQp6FEhGJhLjJh0/gfB5bkiQiUnfhhU2/waEdgVuwwaHnsTlPO+RZKBGRSJVLLnPhEzh/DVwPbA1s\nA9wAXJdnoUREIlVu8pFPmTzO6Q9ci+0sshr4JbBxnoUSEYmEWONM6uPcHOtemA6cQbGWeRi9J5OK\niOQiwLGhxMB5Hz3XpB/r/o7Wqp+eV6FERCLNNo9zRL0KISJSSYBp1b1XDr0D2I2efZu/qH1xRER6\narYaZ2QKtjZ9d+BPwETgTpo0cGrJYHX0M5F6Cy9s+gXOT2Cbet4HfAEYBvwqz0KJiERqsDtSzflM\nR1oFrME29tgUeI6eOyonuQpYCiyMPbY58FfgUWz55ma+hRWRvqcG05HS0gN3AC9h6YHnAd9KK5NP\n4JwDDME295zrLnyXx+sArsYKHXc6Fjh3Af6GRudFJEGV28pF6YEnYOM0k4Bdy5w3Axjlbueklcmn\nqR5tWHwZtovyYGCBx+sAZtJ7dP4jFPfzvAZL/q7gKSJlVbnJRzw9MBTTA5fu8JbpTZIC515Uzi20\nJ9bnuT6GYc133N/D1vM6ItIHVDmoXi498OiSc7qxTBcLsB3gvkYVqTN+QHJStv2TLuypO+U9RKSP\nq3I6kk98uQ8bt3kVmzX0e6wrsaKkwNnhW7KMlgJbAc9iG4c8V+6kc6ZOWXc8bnwH48bnVRwRWV9d\nMzrpmtGZ63skDcQ8/cBsnnlgdtLLfdIDvxI7ng78BBvEfrHSResxzj8C25bune7+ecAy4HtY3+Zm\n9O7j7F61Op+KqOZxiuSnf78C1DaudJ80zT/hxEWH7lr6/hsAjwAHYumBZ2MDRPGLDsMqcN1Yn+j1\npKyc9F05tL6uwwaChmL9DGcC57qCHY112H4q5zKISBOrchqnT3rgT2CZLt7EmuufTrto3oGzNPF7\n5KCc31dEWkQN5r+npQe+xN28+czjbAM+i9UWAbbDqrMiIrlrtv04Iz8B1gIHAFOBFe6x9+RYrvwM\n8F+otOqNNZku3X/D9qylEZEUAa649Aqco7HZ9PPc/ReBfrmVSEQkJsDNkbwC5xtYp2pkS6wGKiKS\nu2ZND3wRMA14C/AdbAQqdRG8iEgttIcXN70C5y+Be7F5UFB+naeISC6atca5HbASm8QONkl0O+Cp\nvAolIhIJMG56Bc4/U1zvuTHwNmwm/u55FUpEJNKso+rvKLm/J3BCDmUREemlWZvqpe6j97ZMIiK5\nCDBuegXOU2PHbViN85l8iiMi0lOzNtUHxo7fBP4I3JhPcUREeioEmOcyLXC2Y6kyTk05r3ms/K/3\nqVpCKdJ4zVbj3ACrYe6H7W+nndpFpO6aLXDOxvoz5wM3Azdge9WBBdGb8i2aiEjVqTNykbStXFTa\njbEd2w8APuRuH865XCIigNU4fW8VpOVVj+yNtbI/llampBrnlsApwMK0i4iI5KXKCmeUV/0gbDbQ\nHOAP9F423o6l87kVj9QfSYGzHRi0PiUVEamVDarr5PTNq34S8Dus1plepoTnngXOylREEZEaq0Ne\n9eFYMD0AC5ypA+F55xwSEalKW0LL+dH77uaxeXcnvdxnNtAFWKbdbqyZXlVTXQnVRKThkm
qcI/fa\nl5F77bvu/vSrLyw9xSev+l5YEx4sI+9EYDXWF1pWUuBclvCciEhdVDmPcy6wM5YnfTFwGL2z7+4Q\nO74a20KzYtAENdVFJHBV7o7kk1c9MwVOEQlaDea/p+VVj/uCzwUVOEUkaK2yH6eISN0EGDcVOEUk\nbEnrwhtFgVNEghbiJh8KnCIStPDCpgKniAROg0MiIhmFFzYVOEUkcG0BbgGvwCkiQdOouohIRhpV\nFxHJKLywqcApIoFTjVNEJCP1cYqIZKQap4hIRuGFzTBrwSIi6xQK/rcK0vKqfxRYAMwD7sWStiVS\njVNEgpaUrM2DT17124Gb3fE7gWnATsllEhEJWJU1znhe9dUU86rHrYwdDwReSCuTAqeIBK2Q4U8Z\n5fKqDy9z3iFYLXQ6cHJamdRUF5GgJQ2q3z/7Hyycc1fSy33yqgP83t3GAtcCI5NOVuAUkaC1J0TO\nUaPfx6jR71t3/9eXnl96ik9e9biZWFzcgoQU6Wqqi0jQquzjjOdV3xDLq16aM31HirOe9nR/Vwya\noBqniASuQt+lL5+86h8HPocNHq0APp12UQVOEQlaDbbjTMurfp67ecu7qX4VsBRYGHtsCtbHMM/d\nJuRcBhFpYlWOquci78B5Nb0DYzfwQ2CUu92acxlEpInVYOVQzeXdVJ+JdcqWCnH5qYgEqJ41SV+N\nGlU/CVsb+jNgswaVQUSaQFvB/1YvjRgcuhSY6o7PBn4AHF160jlTp6w7Hje+g3HjO+pQNBHJomtG\nJ10zOnN9jxBrnPUo0QjgFmzxvO9z3atW+074z2bI3id6n7t8zsW5lEGkVfXvV4DaxpXumY++6H3y\n2F02r/X7l9WIpvrWseND6TniLiLSQyHDrV7ybqpfB4wHhmIL7ScDHcAe2Oj6ExQnooqI9NLWB3eA\nn1Tmsatyfk8RaSHhhU2tHBKR0AUYORU4RSRofbGpLiJSlfDCpgKniIQuwMipwCkiQQtxArwCp4gE\nLcAuTu0ALyJhq8EE+LS86kdge2fcD/wDeFdamVTjFJGwVVfj9Mmr/i9gHPASFmR/CuybdFHVOEUk\naFVuZOyTV30WFjQB7gHemlYmBU4RCVqVGxn75lWPHA38Oa1MaqqLSNCSWupzZs1k7t0zk16eZZu1\n/YGjgP3STlTgFJGwJUTOvceMZe8xY9fdv/yCc0tP8c2r/i7gCqyPc3lakdRUF5GgVdnH6ZNXfTvg\nJuAzWH9oKtU4RSRoVc7j9MmrfiYwBMtOATaItE/SRRU4RSRoNZj/npZX/Yvu5k2BU0SCVghw6ZAC\np4gELcC4qcApImELMG4qcIpI4AKMnAqcIhI0bSsnIpKR+jhFRDIKMG4qcIpI4AKMnAqcIhI09XGK\niGSkPk4RkYwCjJsKnCISuAAjpwKniARNfZwiIhm1hRc3FThFJHABBk7tAC8iQatyB3hIz6v+dizT\n5WvAqT5lUo1TRIJW5XQkn7zqy4CTgEN8L6oap4gErZDhVoZPXvXnsdxEq33LpMApIkGrc151L2qq\ni0jgKrfVZ905g1l3diW9OEtedW8KnCIStKQ+zjFjxzNm7Ph193903jmlp/jmVc9ETXURCVqVfZw+\nedXjb+VFNU4RCVod8qpvhY22DwbWAl8BdgNWVLqoAqeIBK0GSy7T8qo/S8/mfCoFThEJW4ArhxQ4\nRSRoAcZNBU4RCZs2MhYRyagQYORU4BSRoIUXNhU4RSRwAVY4FThFJGzaAV5EJKMQa5xacikiklGe\ngXNb4A7gQeAB4GT3+ObAX4FHgb8Am+VYBhFpclVuK5eLPAPnauCrwO7AvsAJwK7A6Vjg3AX4m7sv\nIlJWDVJn1FyegfNZYL47XoEtrB8OfAS4xj1+DRm2qxeRvifEGme9BodGAKOAe4BhwFL3+FJ3X0Sk\nrADHhuoSOAcCN2JbNb1S8lw3FXZoPmfqlHXH48Z3MG58Rz6lE5H11jWjk64Znfm+SYCRM+8i9QP+\niG3pdIF7bBHQgTXlt8YGkN5e8rruVatz2fGeIXuf6H3u8jkX51IGkVbVv18BahtXul95ba33yYM2\nbqv1+5eVZx9nAds09CGKQRNs9+XPu+PPA7/3vWDu/7MFQJ+x+bX654P6fsYa9HGm5VUH+LF7fgHW\nrZgoz8C5H/AZYH9gnrtNAM4F3o9NRzrA3feiX8jW0OqfsdU/H9Q5cGa4lRHlVZ+A7eo+CZvdE3cw\nsBOWYuNY4NK0MuXZx3knlQPzQTm+r4i0kCp3R4rnVYdiXvWHY+fEZ/rcg80tjw9i96KVQyIStDrk\nVS93zltr+RnqpZPiiLtuuunWPLdOaivr+79c8vqPA1fE7n8GuKjknFuwrsXI7cCeSYUKdZOPjkYX\nQESCUO0IuU9e9dJz3uoeExHpkzYA/kkxr/p8yg8O/dkd7wvcXa/CiYiEaiLwCDZIdIZ77EsUc6uD\njbw/jk1HSmymi4hIi/OZxNrsngTux+a8zm5sUWriKmxKx8LYY622rWC5zzgF60eLz19uZtoiskm1\nY9XoEdgyznL9FK3gCeyXsVWMxVZhxIPKecBp7vgbZFgAEahyn3EycEpjipOLrYA93PFArNm7K633\nXbac9wK3xu6fTmvu4/kEsEWjC1FjI+gZVBZR3BFrK3e/2Y2gd+A8tTFFqYvfY4tYWvG79NIsE+B9\nJrG2gm5sDtlc4JgGlyUvfWVbwZOwgYaf0VpN2BFoi8imCZzdjS5AneyH/VJOxHbMH9vY4uQumrTc\nai4F3oY1b5cAP2hscWpmvbaIbEXNEjh9JrG2giXu7+eBadg621azFGvWgW0r+FwDy5KX5ygGkitp\nje+xHxY0r6W4o1lf+C7LapbAORfbuWQENon1MGx7ulayCTDIHQ8APkDPfrNWsd7bCjaRrWPHh9L8\n32PNt4iU+ik3ibWVvA2bLTAfm/LRCp/xOmAx8AbWR/0FbNbA7bTOFJbSz3gU8AtsWtkCLJg0e9/f\n+4C12O9mfIpVq32XIiIiIiIiIiIiIiIiIiIiIiIiInlYg829WwhcD/Sv4lo/x/K4gOVySdqlajy2\nQUtWT1J+l6hKj8etyPheU2jtDTkkB82yckiq8yq2Bv6d2ETt40qez5J7Kr4m+Rh6plkttT8wJsO1\n4++R5fGs51RzvogCZx80E9gJqw3OBG7GViq1Ad/HNlBeABzrzi9gaQUWYZvWviV2rU5gL3c8AbgX\nW13yV2B7LDXBV7Ha7n7AlsDv3HvMphhUt8BWnjyA1WJ9EnRNw5biPkDvnaR+6B6/HRjqHtsRmO5e\n0wWM9HgPEenDop1sNsAC5ZewwLkCC3BggfJ/3fFGwBxsb4CPYUGtgK3BXu4eA9sVfE8sID4Vu1a0\n9K50Q99fU0zDuh229hngx8C33PHB2PK+ck3y+EbPQ9zf/bEuiOj+WmCSO/4/iqlg/4b9hwEw2t2P\nyqimumQSanpgqa3+WK0PrLZ1FRbAZgP/do9/A
GvKf8LdH4xtrDIWC3jd2O5Nfy+5dgHLDNgVu9Z/\nS56PHETPPtFB2IYmY7HNMMCyDS73+ExfAQ5xx9u6ss7GAudv3eO/BG5y7zEGuCH2+g093kOkLAXO\nvmEV1sdZamXJ/ROxZnbcwaQ3nX37CQtYbe+NCs/56gAOxAL2a1jNd+MK1+zGuiGWU/5nIJKZ+jgl\nchvwZYr/me6CbXXXhW3j14Y11fcveV03lod6HNa0h2Jz+hWKW+WBNflPjt1/t/u7CzjcHU+k2Oyu\nZDAWCF8D3o4F0Egb8El3fDjWj/sK1syPatMF4F0p7yFSkQJn31CuRli6Y/eVWJ/jfVif4aVYkrxp\nWGbRh4BrgLvKXOsFrI/0Jmxw6Dr3+C1YEzwaHDoZeA82+PQgxbzWZ2GB9wF3ftTkr/Q5bsUC/EPA\nd4FZsXNWYhsHL8RqplPd40cAR1Pctu8jZa4rIiIiIiIiIiIiIiIiIiIiIiIiIiIiAft/wkVYtYwN\n5aYAAAAASUVORK5CYII=\n",
499 | "text/plain": [
500 | ""
501 | ]
502 | },
503 | "metadata": {},
504 | "output_type": "display_data"
505 | }
506 | ],
507 | "source": [
508 | "from pyspark.mllib.evaluation import MulticlassMetrics\n",
509 | "\n",
510 | "metrics = MulticlassMetrics(\n",
511 | " predictions.select(\"prediction\", \"target_cat\").rdd)\n",
512 | "conf_matrix = metrics.confusionMatrix().toArray()\n",
513 | "plot_confusion_matrix(conf_matrix)"
514 | ]
515 | },
516 | {
517 | "cell_type": "code",
518 | "execution_count": 22,
519 | "metadata": {
520 | "collapsed": false
521 | },
522 | "outputs": [
523 | {
524 | "data": {
525 | "text/plain": [
526 | "{u'back': 2203,\n",
527 | " u'buffer_overflow': 30,\n",
528 | " u'ftp_write': 8,\n",
529 | " u'guess_passwd': 53,\n",
530 | " u'imap': 12,\n",
531 | " u'ipsweep': 1247,\n",
532 | " u'land': 21,\n",
533 | " u'loadmodule': 9,\n",
534 | " u'multihop': 7,\n",
535 | " u'neptune': 107201,\n",
536 | " u'nmap': 231,\n",
537 | " u'normal': 97278,\n",
538 | " u'perl': 3,\n",
539 | " u'phf': 4,\n",
540 | " u'pod': 264,\n",
541 | " u'portsweep': 1040,\n",
542 | " u'rootkit': 10,\n",
543 | " u'satan': 1589,\n",
544 | " u'smurf': 280790,\n",
545 | " u'spy': 2,\n",
546 | " u'teardrop': 979,\n",
547 | " u'warezclient': 1020,\n",
548 | " u'warezmaster': 20}"
549 | ]
550 | },
551 | "execution_count": 22,
552 | "metadata": {},
553 | "output_type": "execute_result"
554 | }
555 | ],
556 | "source": [
557 | "# Let's now improve the score: is the training dataset balanced?\n",
558 | "\n",
559 | "train_composition = train_df.groupBy(\"target\").count().rdd.collectAsMap()\n",
560 | "train_composition"
561 | ]
562 | },
563 | {
564 | "cell_type": "code",
565 | "execution_count": 23,
566 | "metadata": {
567 | "collapsed": false
568 | },
569 | "outputs": [
570 | {
571 | "data": {
572 | "text/plain": [
573 | "{u'back': 1,\n",
574 | " u'buffer_overflow': 33.333333333333336,\n",
575 | " u'ftp_write': 125.0,\n",
576 | " u'guess_passwd': 18.867924528301888,\n",
577 | " u'imap': 83.33333333333333,\n",
578 | " u'ipsweep': 1,\n",
579 | " u'land': 47.61904761904762,\n",
580 | " u'loadmodule': 111.11111111111111,\n",
581 | " u'multihop': 142.85714285714286,\n",
582 | " u'neptune': 0.23320677978750198,\n",
583 | " u'nmap': 4.329004329004329,\n",
584 | " u'normal': 0.2569954152017928,\n",
585 | " u'perl': 333.3333333333333,\n",
586 | " u'phf': 250.0,\n",
587 | " u'pod': 3.787878787878788,\n",
588 | " u'portsweep': 1,\n",
589 | " u'rootkit': 100.0,\n",
590 | " u'satan': 1,\n",
591 | " u'smurf': 0.08903450977598917,\n",
592 | " u'spy': 500.0,\n",
593 | " u'teardrop': 1.0214504596527068,\n",
594 | " u'warezclient': 1,\n",
595 | " u'warezmaster': 50.0}"
596 | ]
597 | },
598 | "execution_count": 23,
599 | "metadata": {},
600 | "output_type": "execute_result"
601 | }
602 | ],
603 | "source": [
604 | "def set_sample_rate_between_vals(cnt, the_min, the_max):\n",
605 | " if the_min <= cnt <= the_max:\n",
606 | " # no sampling\n",
607 | " return 1\n",
608 | " \n",
609 | " elif cnt < the_min:\n",
610 | " # Oversampling: return many times the same observation\n",
611 | " return the_min/float(cnt)\n",
612 | "\n",
613 | " else:\n",
614 | " # Subsampling: sometime don't retunt it\n",
615 | " return the_max/float(cnt)\n",
616 | " \n",
617 | "sample_rates = {k:set_sample_rate_between_vals(v, 1000, 25000) \n",
618 | " for k,v in train_composition.iteritems()} \n",
619 | "sample_rates"
620 | ]
621 | },
622 | {
623 | "cell_type": "code",
624 | "execution_count": 24,
625 | "metadata": {
626 | "collapsed": false
627 | },
628 | "outputs": [],
629 | "source": [
630 | "bc_sample_rates = sc.broadcast(sample_rates)\n",
631 | "\n",
632 | "def map_and_sample(el, rates):\n",
633 | " rate = rates.value[el['target']]\n",
634 | " if rate > 1:\n",
635 | " return [el]*int(rate)\n",
636 | " else:\n",
637 | " import random\n",
638 | " return [el] if random.random() < rate else []\n",
639 | " \n",
640 | "sampled_train_df = (train_df\n",
641 | " .flatMap(\n",
642 | " lambda x: map_and_sample(x, bc_sample_rates))\n",
643 | " .toDF()\n",
644 | " .cache())"
645 | ]
646 | },
647 | {
648 | "cell_type": "code",
649 | "execution_count": 25,
650 | "metadata": {
651 | "collapsed": false
652 | },
653 | "outputs": [
654 | {
655 | "data": {
656 | "text/plain": [
657 | "96755"
658 | ]
659 | },
660 | "execution_count": 25,
661 | "metadata": {},
662 | "output_type": "execute_result"
663 | }
664 | ],
665 | "source": [
666 | "sampled_train_df.count()"
667 | ]
668 | },
669 | {
670 | "cell_type": "code",
671 | "execution_count": 26,
672 | "metadata": {
673 | "collapsed": false,
674 | "scrolled": true
675 | },
676 | "outputs": [
677 | {
678 | "data": {
679 | "text/plain": [
680 | "Row(duration=0.0, protocol_type=u'tcp', service=u'http', flag=u'SF', src_bytes=181.0, dst_bytes=5450.0, land=0.0, wrong_fragment=0.0, urgent=0.0, hot=0.0, num_failed_logins=0.0, logged_in=1.0, num_compromised=0.0, root_shell=0.0, su_attempted=0.0, num_root=0.0, num_file_creations=0.0, num_shells=0.0, num_access_files=0.0, num_outbound_cmds=0.0, is_host_login=0.0, is_guest_login=0.0, count=8.0, srv_count=8.0, serror_rate=0.0, srv_serror_rate=0.0, rerror_rate=0.0, srv_rerror_rate=0.0, same_srv_rate=1.0, diff_srv_rate=0.0, srv_diff_host_rate=0.0, dst_host_count=9.0, dst_host_srv_count=9.0, dst_host_same_srv_rate=1.0, dst_host_diff_srv_rate=0.0, dst_host_same_src_port_rate=0.11, dst_host_srv_diff_host_rate=0.0, dst_host_serror_rate=0.0, dst_host_srv_serror_rate=0.0, dst_host_rerror_rate=0.0, dst_host_srv_rerror_rate=0.0, target=u'normal')"
681 | ]
682 | },
683 | "execution_count": 26,
684 | "metadata": {},
685 | "output_type": "execute_result"
686 | }
687 | ],
688 | "source": [
689 | "sampled_train_df.first()"
690 | ]
691 | },
692 | {
693 | "cell_type": "code",
694 | "execution_count": 27,
695 | "metadata": {
696 | "collapsed": false
697 | },
698 | "outputs": [
699 | {
700 | "name": "stdout",
701 | "output_type": "stream",
702 | "text": [
703 | "F1-score test set: 0.966865218179\n"
704 | ]
705 | }
706 | ],
707 | "source": [
708 | "full_model = full_pipeline.fit(sampled_train_df)\n",
709 | "predictions = full_model.transform(test_df)\n",
710 | "print \"F1-score test set:\", evaluator.evaluate(predictions)"
711 | ]
712 | },
713 | {
714 | "cell_type": "code",
715 | "execution_count": 28,
716 | "metadata": {
717 | "collapsed": false,
718 | "scrolled": false
719 | },
720 | "outputs": [
721 | {
722 | "name": "stdout",
723 | "output_type": "stream",
724 | "text": [
725 | "F1-score test set: 0.967669293816\n"
726 | ]
727 | }
728 | ],
729 | "source": [
730 | "clf = RandomForestClassifier(\n",
731 | " numTrees=50, maxBins=100, seed=101,\n",
732 | " labelCol=\"target_cat\", featuresCol=\"features\")\n",
733 | "\n",
734 | "stages = full_pipeline.getStages()[:-1]\n",
735 | "stages.append(clf)\n",
736 | "\n",
737 | "refined_pipeline = Pipeline(stages=stages)\n",
738 | "\n",
739 | "refined_model = refined_pipeline.fit(sampled_train_df)\n",
740 | "predictions = refined_model.transform(test_df)\n",
741 | "print \"F1-score test set:\", evaluator.evaluate(predictions)"
742 | ]
743 | },
744 | {
745 | "cell_type": "code",
746 | "execution_count": 29,
747 | "metadata": {
748 | "collapsed": false
749 | },
750 | "outputs": [],
751 | "source": [
752 | "pipeline_to_clf = Pipeline(\n",
753 | " stages=preproc_stages + [assembler]).fit(sampled_train_df)\n",
754 | "train = pipeline_to_clf.transform(sampled_train_df).cache()\n",
755 | "test = pipeline_to_clf.transform(test_df)"
756 | ]
757 | },
758 | {
759 | "cell_type": "code",
760 | "execution_count": 30,
761 | "metadata": {
762 | "collapsed": false
763 | },
764 | "outputs": [],
765 | "source": [
766 | "# May take some 10 minutes\n",
767 | "\n",
768 | "from pyspark.ml.tuning import ParamGridBuilder, CrossValidator\n",
769 | " \n",
770 | "rf = RandomForestClassifier(\n",
771 | " cacheNodeIds=True, seed=101, labelCol=\"target_cat\", \n",
772 | " featuresCol=\"features\", maxBins=100)\n",
773 | "\n",
774 | "grid = (ParamGridBuilder() \n",
775 | " .addGrid(rf.maxDepth, [3, 6, 9, 12]) \n",
776 | " .addGrid(rf.numTrees, [20, 50]) \n",
777 | " .build())\n",
778 | "\n",
779 | "cv = CrossValidator(\n",
780 | " estimator=rf, estimatorParamMaps=grid, \n",
781 | " evaluator=evaluator, numFolds=3)\n",
782 | "cvModel = cv.fit(train)"
783 | ]
784 | },
785 | {
786 | "cell_type": "code",
787 | "execution_count": 31,
788 | "metadata": {
789 | "collapsed": false
790 | },
791 | "outputs": [
792 | {
793 | "name": "stdout",
794 | "output_type": "stream",
795 | "text": [
796 | "F1-score test set: 0.969948273422\n"
797 | ]
798 | }
799 | ],
800 | "source": [
801 | "predictions = cvModel.transform(test)\n",
802 | "print \"F1-score test set:\", evaluator.evaluate(predictions)"
803 | ]
804 | },
805 | {
806 | "cell_type": "code",
807 | "execution_count": 32,
808 | "metadata": {
809 | "collapsed": false
810 | },
811 | "outputs": [
812 | {
813 | "data": {
814 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAU4AAAEoCAYAAAA3/hguAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzt3X28VVWdx/HPuRcU5EFRFBUfbj6RWika4IA8+DANOpNp\nT4b2qGljPjRpU9o4QmSTlTaWljmaZlbO6BipU2hZwsVEAQXEZzTNBxAVSQFREe788Vubs++55+y9\nNufsc9Y59/vmdV7ss88+e69zz+XHWmuvtX4gIiIiIiIiIiIiIiIiIiIiIiIivdws4GS3fSJwR43P\n3wFsBNpqfF4f/YHbgL8B/1PFefL4uTTKeOCxRhdCJM0zwApgq9i+zwN3NaQ0Pd0FnJTj+TtID5wn\nAAuA1cAy4HfAuBpc+1PAfSnXbiUbgT0aXQjx11t+MTdXG/ClGpyn4B6t5GzgP4ELgR2AXYEfAcfU\n4Ny7A09gAaW3SPr96FO3UohU6Wnga8BKYGu3r7TGORaYjzUp5wF/F3ttFhZU/gysBfbEAsFpwFLg\ndWC62z/XneO/gb7u/dsA/we8BLyKNV2Hx84fr3F+Fpjjtr+K1QCjx3rgWvfa1sBPsdrh88A3Kf7n\n2QZcDLwMPAWcTuUa59bu3B8p81pkS+BS4AX3+E9gC/faJHf9s7Fa/TL3GQC+AbwFvO2ucRIwDbg+\ndu6OkrJ91pX5deAvWE042j8n9r6072s6cLc7zx3AdhU+W1T+f8W+n2XAscDRWMBfCZwbO3409h2v\ncsdeRvF77nSfZY37vB+Lnf+rwHLgOrfvOfeePd01RrrnO2Pf24QK5RWpm6eBI4CbsQAD3QPnttg/\nhBOxf8CfwALcEPf6LKy5v697vS/2D2QGMBDYDwsQf8ICwWDgYeDTsfMfB/Rzx9/o3hupFDjjdsGC\n1j+45zOAK7A+xO2x5vCp7rV/Bh7FgvMQd/4NlA+ck7GAnNRimQ7cAwx1jz+7fWBBYD0WENuBo7D/\nXKL/oKYCP4+dayqVA+cA4DVgb/faMOxnC91/Lj7f11JgL+xnfhfw7QqfLSr/+a78nwdeAX7pyrMf\n8AZWcwY4CAuebW7fI3RvyZQ21aPzfxv7velH98CJu+bD2Hd5B/DdCmUVqaungcOB/bEaylC6B85P\nAfeWvOce4DNu+y4sMMRtpHstZwFWa4lcjNXMyjkQ+4ceSQuc/YH7Y+cfBryJ/SOMTMECN+7vU2Ov\n/T2Va5wnYjWhJE9iATbyAexnChYE3ig59wosuEDPGmbp8w66B85VwIexzxz3WYo/F5/v6+ux104D\nZpZ+qJLyR83rQa48o2LHLAA+VOH9/wL8Ova8XOB8i2INPdoXD5wAtwBLgEUUa7BSB+rjTPcw1mQ+\nF+iK7d8ZeLbk2L+6/ZHSX3SwABFZV+b5QLe9FXAlVmt9DZiN1ch8+0p/itUgv+ee747941qOBZpV\nwE+wmifATiXlLf1scSux/0iSfn92xn4e8fPFfzYr6d6H+QbFz57FWuB4rMa8DPuuRlQoT9r39WJs\nO/5dlLOS4u/DOvd36Xc5wG3v48q1HPsuv0XlboDIy1h3RZKrsf/YL8NqqFInCpx+pgKn0L2P8QWK\nTbHI7m5/pIvNdw72D240FjAn4n+T6VysyXlybN9zWC1mO6x5OsSd973u9eXAbrHj49ul5rpzHZdw\nzDKsZhg/37L0ope1hu6jG3Ysef33WI12R2zIzlVlzuHzfeXlCqx5vhf2M/830v/tpf3uDMT6kK/G\n+oWHJB8utaTA6ecpbDxhvF9qJhbYpmB3PY8H3o3VLCI+Qa5QYXsgVmt5Deufm+pZ1qOAM7Gm61ux\n/cuxAPN9rGnZht1kiG4o3AicRbGPM35zo9RrwAXYXfQPYUGtr7v2d9wxN2B9gFEf5wV0b25nsciV\nc1cs8JwXe20HV4YBWK1rLdY3W6pW39fmGIjd+HnDXfO0ktdXYN9FFj/AbnCdCvwWaz1InShw+puO\nBYioJrAS+CesZvgK8BX3PN4PWVprKFeL6CrZjp5fivXZvYL1xc2s8P7S930cC1SPUryz/mP32qex\nfrNHXDlvolh7uwq7ybAY65+7OeF6YAH4bCw4voQ1g79I8QbWhe48D7rHArev3OdO+jwAd2L/cT2I\n3RW/LfZ6G/BlrOa4EhsoflqZ82T9vkrLUK6MSc/jvoLd6X8d+C9s9ET8+GnYnfNVwEcTrh3t+xBW\nw44+59nYDagpCWUQERHxcg1Wo1+ScMwPsREViykO8RIR6bXGY8GwUuA8GpvxBjCGniMvRER6pQ4q\nB86fYP3dkcewoXuJ1McpIr3ZcLoPw3semziSKMg5sIUBO3d1rd3ckSsi0ijjJ0xkTufs2o1OaN+y\niw1vpR9XtBqbhZdFaXlThxEGGTi71i6j34Gn99i/fvk8+u40utu+VfMvr1ex6uLC6dM4/4JpjS5G\nrlr9M7b654PKn7F/3xqP6NrwVtlYUMmbi340KOMVXsCGuUWiacqJGtVUn4z1JSzFFtIQESmv0Ob/\nyO5WiutDHIJNr15R+XDTiBpnO3A5cCQW2edjhX+0AWURkdAVqqrF3oDNuhuK9WVOpTiv/0rsjvrR\n2NoKa4HP+Zy0EYFzNFbIZ9zz/8YG9KYGzraBw9MOaXoTJk5qdBFy1+qfsdU/H9T5M25eTTLiMyng\njKwnbUTgLHcXa4zPG9sHKXC2glb/jK3++aDOn7GtvX7X8tSIwOm18MX65fM2bbcNHN4rgqZIs+mc\nPYvO2bPyvUh1TfVcNCJwlt7F2hWrdXZTevdcRMIzYeKkbrXPb33zG7W/SHVN9Vw0okQLsNW6O7AF\nJ47Hbg6JiPRUKPg/6qQRNc53sM7YO7A77NGCuyIiPQVY42zUAPiZVE5LAPgPbB8yKtsNsVYbMC/S\n8tTHKSKSkWqcIiIZqcYpIpKRapwiIhkpcIqIZNSmprqISDaqcYqIZKSbQyIiGWmRDxGRjAJsqodX\nIhGRuOrnqqdlnBgCzMDyqt8H7J9WJAVOEQlbdakzoowTk4H9sIWN9y055uvAA8ABWBqNH6QVqemb\n6lnnng8Zfab/ueddlrU4IlJr1d0c8sk4sS9wkdt+HFu5bXvg5UonVY1TRMJWXY2zXMaJ0lXRFwMf\ndtujgd1Jya3e9DVOEWlxCTXODSuXsnHlk0nv9sk4cRHWPF8ILHF/b0h6gwKniIQt4a56+9ARtA8d\nsen5hidvLz3EJ+PEauCk2POngb8kFUlNdREJW3VNdZ+ME1u71wBOAWYDa5KKpBqniIStuptDlTJO\nfMG9fiV2t/1nWLP+IeDktJMqcIpI2KofAF8u48SVse25wAgyUOAUkbBprrqISEYBTrlU4BSRsKnG\nKSKSTVubapwNl2UapVIPiwQgvApn7wucItJcCmqqi4hko8ApIpKRAqeISEYKnCIiWYUXNxU4RSRs\nqnGKiGSkwCkikpECp4hIRgqcI
iJZhRc3tQK8iIStUCh4PypIy6s+FLgdWIQtZPzZtDKpxplAqYdF\nGq/KpnqUV/1ILP/QfCx1Rjw98BlYgrbzsCD6OPALbPX4slTjFJGgFdoK3o8y4nnV11PMqx63HBjs\ntgcDK0kImqAap4gErsoaZ7m86mNKjrkK+BOwDBgEfDztpAqcIhK0pMD59rKHeXv5w0lv98mr/nWs\nf3MSsCfwB+AALG1wWQqcIhK0pMC55fD3sOXw92x6vnbhTaWH+ORVHwt8y20/heVVH4GlFi5LfZwi\nErQq76r75FV/DLt5BDAMC5p/SSqTapwiErbqxnH65FX/D+BaYDFWmfwq8GrSSRU4RSRoNZg5lJZX\n/RXgg1lOqMApIkHTlEsRkYwUOEVEsgovbjZ/4Ny40WeY1uZpKz8ToaJMqYfHnpPt3Pdckul4kVah\nGqeISEYKnCIiGSlwdvcM8DqwAZt8P7qBZRGRQFVYvKOhGhk4u7C5oYkDTUWkd1ONs6fwfiIiEpQQ\nA2cj56p3AXdic0lPaWA5RCRghYL/o14aWeMchy0guj22jNNjwJzoxQunT9t04ISJk5gwcVJ9Syci\nqTpnz6Jz9qxcrxFijTOUEk0F1gDRYMWudev9xmeGNI4zC43jlFbUv28BahtXuvb+19Jp5pUt/d5R\ntb5+WY1qqm+FrbQMMAD4ALCkQWURkYDVIFlbzTWqqT4MmBErwy+B3zeoLCISsABb6g0LnE8DBzbo\n2iLSRPLsMttcjR6OVLWQfqgbMvS3Zu2zHHLIv/if+95LM51bJGQ1qHFOBi7FFjK+GvhOyetfAU50\n232AfbE0wX+rdEKlzhCRoFXZxxnlVZ8M7AdMwQJj3MXASPc4D5hFQtAEBU4RCVyV4zh98qrHnQDc\nkFYmBU4RCVqVNc5yedWHV7jUVsA/ADenlanp+zhFpLUlDTNa88wi1jyzOOntWQZ6fxC4m5RmOihw\nikjgkm4AD95jJIP3GLnp+Uud15ce4pNXPfIJPJrpoKa6iASuyj5On7zqAFsDE4BbfMqkGqeIBK3K\nGUE+edUBjnXHrPM5qQKniAStBuM40/KqA1znHl4UOEUkaCGujqTAKSJBCzBu9r7Aec6tj3gfe8kx\n+2U6d3uO0z+zTKPM8zOK1JtqnCIiGQUYNxU4RSRsqnGKiGQUYNxU4BSRsKnGKSKSUYBxU4FTRMKm\nGqeISEYKnCIiGYWUHieiwCkiQQuwwqnAKSJhU1M9ANf8+DbvY5t1OmKzlrsZvfbG+kzHb71V35xK\n0roCjJu9L3CKSHNpCzByagV4EQlalSvAg6UGfgxYCnytwjGTgIXAQ1h64ESqcYpI0Krs44zyqh+J\n5R+aj6XOeDR2zDbAj7AMl88DQ9NOqhqniAStreD/KMMnr/oJWErgKInbK6ll2szPIiJSF3XIq743\nsC1wF5bc7VNpZVJTXUSCltRSX/n4/ax84oGkt/vkVe8LHAQcAWwFzAXuxfpEy0oKnJelFOYsjwKJ\niFSlQOXIOXTE+xk64v2bnj/526tLD/HJq/4c1jxf5x6dwAFsZuC8n2K0jkre5bZ9oriISNWqnHEZ\nz6u+DMurPqXkmFuwG0jtwJbAGOD7SSdNCpw/K3k+AFjrW1oRkVqoQ171x4DbgQeBjcBVQGLiLp8+\nzrHA1cAgrJp7IHAq8MWsn0BEJKsaJEH0yat+sXt48bmrfik2gDS6Rb8ImOh7ARGRatRgAHzN+d5V\nf7bk+Tu1Lki9jP7HQxtdhF7j7qWpw+G6OXTv1HHHdZFl/rnmnuevWRf5eBYY57a3wO6mP1r5cBGR\n2gkwbnoFztOAH2CDRl8Afg+cnmehREQiIS7y4RM4X8amJImI1F14YdPv5tCewG3YzaGXsTFPe+RZ\nKBGRSJVTLnPhEzh/BdwI7ATsDNwE3JBnoUREIlUu8pFPmTyO6Q9cj60ssh74BdAvz0KJiERCrHEm\n9XFui3UvzATOo1jLPJ6eg0lFRHIR4L2hxMD5AN3npJ/q/o7mqp+bV6FERCLNNo6zo16FEBGpJMC0\n6t4zh94D7Ef3vs2f1744IiLdNVuNMzINm5u+P/Bb4Cjgbpo0cM77eYYBAWeOSz9GKgplCmVWmkYZ\nlvDCpl/g/Ci2qOcDwOeAYcAv8yyUiEikBqsj1ZzPcKR1wAZsYY+tgZfovqJykmuAFcCS2L5tgT8A\nT2DTN7fxLayI9D41GI6Ulh54EvAalh54IXB+Wpl8Aud8YAi2uOcCd+J7PN4HcC1W6LhzscC5D/BH\ndHdeRBJUuaxclB54MnafZgqwb5njZgMj3ePCtDL5NNWjBYt/gq2iPBhY7PE+gDn0vDt/DMX1PK/D\nkr8reIpIWVUu8hFPDwzF9MClK7xlukhS4DyYyrmFDsL6PDfHMKz5jvt72GaeR0R6gSpvqpdLDzym\n5JguLNPFYmwFuK9QReqMS0hOynZY0ok9daVcQ0R6uSqHI/nElwew+zZvYKOGfoN1JVaUFDgn+ZYs\noxXAjsCL2MIhL5U76MLp0zZtT5g4iQkT8yqOiGyuztmz6Jw9K9drJN2Ief6hebzw0Lykt/ukB14d\n254J/Bi7if1qpZPW4z5/B7Ys3Xvd8+8CK4HvYH2b29Czj7Nr3fp8KqJDRp3hfeyq+ZfnUgaRVtW/\nbwFqG1e6zpzhn3DisuP2Lb1+H+Bx4AgsPfA87AZR/KTDsApcF9YneiMpMyd9Zw5trhuwG0FDsX6G\nC4CLXMFOxjpsP55zGUSkiVU5jNMnPfBHsUwX72DN9U+knTTvwFma+D1yZM7XFZEWUYPx72npgX/k\nHt58xnG2AZ/CaosAu2HVWRGR3DXbepyRHwMbgcOB6cAat+/9OZYrNyf9+xfTD5KGOOfWxBEg3Vxy\nzH45lkRCEuCMS6/AOQYbTb/QPX8V0CoIIlIXAS6O5BU438Y6VSPbYzVQEZHcNWt64MuAGcAOwH9g\nd6BSJ8GLiNRCe3hx0ytw/gK4HxsHBeXneYqI5KJZa5y7AWuxQexgg0R3A57Nq1AiIpEA46ZX4Pwd\nxfme/YB3YSPx98+rUCIikWa9q/6ekucHAafnUBYRkR6atale6gF6LsskIpKLAOOmV+A8J7bdhtU4\nX8inOCIi3TVrU31gbPsd4P+Am/MpjohId4UA81ymBc52LFXGOSnHiYjkotlqnH2wGuY4bH07rdQu\nInXXbIFzHtafuQi4BbgJW6sOLIj+Ot+iiYhUnTojF0nLykWl7Yet2H448E/u8cGcyyUiAliN0/dR\nQVpe9cgorJX94bQyJdU4twfOBpaknUREJC9VVjijvOpHYqOB5gO30nPaeDuWzud2PFJ/JAXOdmDQ\n5pRURKRW+lTXyembV/1M4H+xWmd6mRJeexH4RqYiiojUWB3yqg/HgunhWOBMvRGed84hEZGqtCW0\nnJ944F6WLrw36e0+o4EuxTLtdmHN9Kqa6kqoJiINl1TjHHHwIYw4+JBNz2de+4PSQ3zyqh
+MNeHB\nMvIeBazH+kLLSgqcKxNeExGpiyrHcS4A9sbypC8Djqdn9t09YtvXYktoVgyaoKa6iASuytWRfPKq\nZ6bAKSJBq8H497S86nGf8zlheEPyTde69fnM8Bwy6gzvY1fNvzyXMoi0qv59C1DbuNJ19X1/9T74\n82N2r/X1y1KNU0SCFuCMSwVOEQlb0rzwRlHgFJGghbjIhwKniAQtvLCpwCkigWuVZG0iInUTXthU\n4BSRwLUFuAS8AqeIBE131UVEMtJddRGRjMILm70wcHZMVrqkuJdeezPT8Tts3c/72LVvvpO1ON4G\n9AvjVzfrZwyl3M1ENU4RkYzUxykikpFqnCIiGYUXNsOsBYuIbFIo+D8qSMur/iFgMbAQuB9L2pZI\nNU4RCVpSsjYPPnnV7wRucdvvBWYAeyWXSUQkYFXWOON51ddTzKsetza2PRB4Ja1MCpwiErRChj9l\nlMurPrzMccditdCZwFlpZVJTXUSClnRT/cF5f2bJ/HuS3u6bg+c37jEeuB4YkXSwAqeIBK09IXKO\nHHMoI8ccuun5r664uPQQn7zqcXOwuLgdCSnS1VQXkaBV2ccZz6u+BZZXvTRn+p4URz0d5P6uGDSh\nF9Y493rXto0uQlDWb8iWTfSFV9d5Hzt82/6Zzr3u7Q2Zjg/Bm+9szHT8gJzK0coq9F368smr/hHg\n09jNozXAJ9JO2usCp4g0lxosx5mWV/277uEt76b6NcAKYEls3zSsj2Ghe0zOuQwi0sSqvKuei7wD\n57X0DIxdwPeBke5xe85lEJEmVoOZQzWXd1N9DtYpWyrE6aciEqB61iR9Nequ+pnY3NCfAts0qAwi\n0gTaCv6PemnEzaErgOlu+5vAJcDJpQddOH3apu0JEycxYeKkOhRNRLLonD2Lztmzcr1GiDXOepSo\nA7gNmzzv+1rXuvXZhsn4+tg1872PvemkUbmUISRZhhdlledwpP5btGctTi5Wrnk70/HbDdwip5KE\noX/fAtQ2rnTNeeJV74PH77Ntra9fViOa6jvFto+j+x13EZFuChke9ZJ3U/0GYCIwFJtoPxWYBByI\n3V1/muJAVBGRHtp64QrwU8rsuybna4pICwkvbGrmkIiELsDIGWCRAOj66yt+Ny2ypKsVabQhx13h\nfeyqGaflWJJ85HFz6L6n/uZ98Jg9t6n19ctSjVNEghZi7U6BU0TCFmDkVOAUkaCFOABegVNEghbg\naCStAC8iYavBAPi0vOonYmtnPAj8GXhfWplU4xSRsFVX4/TJq/4XYALwGhZk/ws4JOmkqnGKSNCq\nXMjYJ6/6XCxoAtwH7JJWJgVOEQlalQsZ++ZVj5wM/C6tTGqqi0jQklrq8+fOYcG9c5LenmWZtcOA\nk4BxaQcqcIpI2BIi56ix4xk1dvym51deelHpIb551d8HXIX1ca6qokgNldt6nCLNYsioMzIdv2r+\n5TmVxF8eUy4XP7va++ADdhtUev0+wOPAEcAyYB62+FD85tBuwJ+ATwL3+lxHNU4RCVqV4zh98qpf\nAAzBslOA3UQanXRSBU4RCVoNqq9pedU/7x7eFDhFJGiFAKcOKXCKSNACjJsKnCIStgDjpgKniAQu\nwMipwCkiQdOyciIiGamPU0QkowDjpgKniAQuwMipwCkiQQuxjzO8Epnc5qpnmf8bwtxfkWaSx1z1\npSve8D5472Fb1fr6ZanGKSJBC7F2p8ApImELMHIqcIpI0ELs41TgFJGgtYUXNxU4RSRwAQZOJWsT\nkaBVmeUS0vOqvxvLdPkmcI5PmVTjFJGgVTnl0iev+krgTOBY35OqxikiQStkeJThk1f9ZWCBe92L\nAqeIBK3OedW9qKkuIoGr3Fafe/ds5t7dmfTmXKYgBni/ClB6YJFc5ZV6OI8pl8+vesv74F2GbFl6\n/UOAadgNIoDzgI3Ad8q8fSqwBrgk7TpqqotI0Krs41wA7A10AFsAx2M3hypdyoua6iIStDrkVd8R\nu9s+GKuNfgnYD6t9lqXAKSJBq8GUy7S86i8Cu2Y5oQKniIQtwDsxCpwiErQA46YCp4iETcnaREQy\nKgQYORU4RSRo4YVNBU4RCVyAFU4FThEJm1aAFxHJSDXOALyzYaP3sX3aNSNVWlPW1NdDxnqt79tr\n5BkZdgXuAh4GHgLOcvu3Bf4APAH8HtgmxzKISJOrclm5XOQZONcDXwb2x1YoOR3YFzgXC5z7AH90\nz0VEyqpB6oyayzNwvggscttrsIn1w4FjgOvc/uvIsFy9iPQ+IdY469XH2QGMBO4DhgEr3P4V7rmI\nSFkB3huqS+AcCNyMLdW0uuS1Liqs0Hzh9GmbtidMnMSEiZPyKZ2IbLYNrz/HxtXPpR9YjQAjZ96B\nsy8WNK8HfuP2rcDWv3sR2Al4qdwbz79gWs5FE5FqtQ/elfbBxRXZNiy7t+bXCHEcZ559nAVs0dBH\ngEtj+28FPuO2P0MxoKbqnD2rVmULlj5j82v1zwdW06yXGvRxpuVVB/ihe30x1q2YKM/AOQ74JHAY\nsNA9JgMXAX+PDUc63D330ht+IfUZm1+rfz4g/+Z5TJWpM6K86pOxVd2nYKN74o4G9sJSbJwKXJFW\npjyb6ndTOTAfmeN1RaSFVLk6UjyvOhTzqj8aOyY+0uc+bGx5/CZ2D5oaIyJBq0Ne9XLH7FLLz1Av\nsyjecddDDz2a5zGL2sp6/ddL3v8R4KrY808Cl5UccxvWtRi5EzgoqVChzlWf1OgCiEgQqr2l/gLd\nE7HtitUok47Zxe0TEemV+gBPUcyrvojyN4d+57YPAWo/pkpEpMkcBTyO3SQ6z+37AsXc6mB33p/E\nhiMlNtNFRKTF+QxibXbPAA9iY17nNbYoNXENNqRjSWxfqy0rWO4zTsP60eLjl5uZlohsUu1YNboD\nm8ZZrp+iFTyN/TK2ivHYLIx4UPku8FW3/TUyTIAIVLnPOBU4uzHFycWOwIFueyDW7N2X1vsuW87f\nAbfHnp9La67j+TSwXaMLUWMddA8qj1FcEWtH97zZddAzcLbykum/wSaxtOJ36aVZBsD7DGJtBV3Y\nGLIFwCkNLkteesuygmdiNxp+Sms1YTvQEpFNEzi7Gl2AOhmH/VIeha2YP76xxcldNGi51VwBvAtr\n3i4HLmlscWpms5aIbEXNEjh9BrG2guXu75eBGdg821YTLSsICcsKNrmXKAaSq2mN7zFpiUho3e+y\nrGYJnAuwlUs6sEGsx2PL07WSrYBBbnsA8AG695u1is1eVrCJ7BTbPo7m/x5rvkSk1E+5Qayt5F3Y\naIFF2JCPVviMNwDLgLexPurPYaMG7qR1hrCUfsaTgJ9jw8oWY8Gk2fv+DgU2Yr+b8SFWrfZdioiI\niIiIiIiIiIiIiIiIiIiIiORhAzb2bglwI9C/inP9DMvjApbLJWmVqonYAi1ZPUP5VaIq7Y9bk/Fa\n02jtBTkkB80yc0iq8wY2B/692EDtfy55PUvuq
fic5FPonma11GHA2Aznjl8jy/6sx1RzvIgCZy80\nB9gLqw3OAW7BZiq1Ad/DFlBeDJzqji9gaQUewxat3SF2rlnAwW57MnA/NrvkD8DuWGqCL2O13XHA\n9sD/umvMoxhUt8NmnjyE1WJ9EnTNwKbiPkTPlaS+7/bfCQx1+/YEZrr3dAIjPK4hIr1YtJJNHyxQ\nfgELnGuwAAcWKP/NbW8JzMfWBvgwFtQK2BzsVW4f2KrgB2EB8dnYuaKpd6UL+v6KYhrW3bC5zwA/\nBM5320dj0/vKNcnjCz0PcX/3x7ogoucbgSlu+98ppoL9I/YfBsAY9zwqo5rqkkmo6YGltvpjtT6w\n2tY1WACbB/zV7f8A1pT/qHs+GFtYZTwW8Lqw1Zv+VHLuApYZsDN2rr+VvB45ku59ooOwBU3GY4th\ngGUbXOXxmb4EHOu2d3VlnYcFzv9x+38B/NpdYyxwU+z9W3hcQ6QsBc7eYR3Wx1lqbcnzM7BmdtzR\npDedffsJC1ht7+0Kr/maBByBBew3sZpvvwrn7MK6IVZR/mcgkpn6OCVyB/BFiv+Z7oMtddeJLePX\nhjXVDyt5XxeWh3oC1rSHYnN6NcWl8sCa/GfFnh/g/u4ETnDbR1FsdlcyGAuEbwLvxgJopA34mNs+\nAevHXY3m0qtDAAAAlElEQVQ186PadAF4X8o1RCpS4OwdytUIS1fsvhrrc3wA6zO8AkuSNwPLLPoI\ncB1wT5lzvYL1kf4auzl0g9t/G9YEj24OnQW8H7v59DDFvNbfwALvQ+74qMlf6XPcjgX4R4BvA3Nj\nx6zFFg5egtVMp7v9JwInU1y275gy5xUREREREREREREREREREREREREREZGA/T9G7auTAnZNiwAA\nAABJRU5ErkJggg==\n",
815 | "text/plain": [
816 | ""
817 | ]
818 | },
819 | "metadata": {},
820 | "output_type": "display_data"
821 | }
822 | ],
823 | "source": [
824 | "metrics = MulticlassMetrics(predictions.select(\n",
825 | " \"prediction\", \"target_cat\").rdd)\n",
826 | "conf_matrix = metrics.confusionMatrix().toArray()\n",
827 | "plot_confusion_matrix(conf_matrix)"
828 | ]
829 | },
830 | {
831 | "cell_type": "code",
832 | "execution_count": 33,
833 | "metadata": {
834 | "collapsed": false
835 | },
836 | "outputs": [
837 | {
838 | "data": {
839 | "text/plain": [
840 | "DataFrame[duration: double, protocol_type: string, service: string, flag: string, src_bytes: double, dst_bytes: double, land: double, wrong_fragment: double, urgent: double, hot: double, num_failed_logins: double, logged_in: double, num_compromised: double, root_shell: double, su_attempted: double, num_root: double, num_file_creations: double, num_shells: double, num_access_files: double, num_outbound_cmds: double, is_host_login: double, is_guest_login: double, count: double, srv_count: double, serror_rate: double, srv_serror_rate: double, rerror_rate: double, srv_rerror_rate: double, same_srv_rate: double, diff_srv_rate: double, srv_diff_host_rate: double, dst_host_count: double, dst_host_srv_count: double, dst_host_same_srv_rate: double, dst_host_diff_srv_rate: double, dst_host_same_src_port_rate: double, dst_host_srv_diff_host_rate: double, dst_host_serror_rate: double, dst_host_srv_serror_rate: double, dst_host_rerror_rate: double, dst_host_srv_rerror_rate: double, target: string, protocol_type_cat: double, service_cat: double, flag_cat: double, target_cat: double, features: vector]"
841 | ]
842 | },
843 | "execution_count": 33,
844 | "metadata": {},
845 | "output_type": "execute_result"
846 | }
847 | ],
848 | "source": [
849 | "#cleanup\n",
850 | "bc_sample_rates.unpersist()\n",
851 | "sampled_train_df.unpersist()\n",
852 | "train.unpersist()"
853 | ]
854 | },
855 | {
856 | "cell_type": "code",
857 | "execution_count": null,
858 | "metadata": {
859 | "collapsed": true
860 | },
861 | "outputs": [],
862 | "source": []
863 | }
864 | ],
865 | "metadata": {
866 | "kernelspec": {
867 | "display_name": "Python 2",
868 | "language": "python",
869 | "name": "python2"
870 | },
871 | "language_info": {
872 | "codemirror_mode": {
873 | "name": "ipython",
874 | "version": 2
875 | },
876 | "file_extension": ".py",
877 | "mimetype": "text/x-python",
878 | "name": "python",
879 | "nbconvert_exporter": "python",
880 | "pygments_lexer": "ipython2",
881 | "version": "2.7.6"
882 | }
883 | },
884 | "nbformat": 4,
885 | "nbformat_minor": 0
886 | }
887 |
--------------------------------------------------------------------------------
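A closing note on the notebook above: the confusion matrices plotted in its last cells cover 23 heavily imbalanced classes, so a single F1 score hides how the rare attack types behave. As a small illustrative follow-up (not part of the original notebook), per-class precision and recall can be derived directly from the `conf_matrix` array computed above with plain NumPy:

    import numpy as np

    def per_class_report(cm):
        cm = np.asarray(cm, dtype=float)
        correct = np.diag(cm)                          # correctly classified examples per class
        true_totals = np.maximum(cm.sum(axis=1), 1.0)  # actual examples per class (guards empty rows)
        pred_totals = np.maximum(cm.sum(axis=0), 1.0)  # predicted examples per class (guards empty columns)
        recall = correct / true_totals
        precision = correct / pred_totals
        return precision, recall

    # precision, recall = per_class_report(conf_matrix)
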