├── Data ├── Products │ └── sample_metadata.json ├── Reviews │ ├── electronics.json │ ├── fashion.json │ └── sports.json └── classifiers │ └── classifier.pkl ├── Images └── reducebykey.png ├── README.md ├── Spark_Tutorial.ipynb ├── requirements.txt └── spark_tutorial.py /Images/reducebykey.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mrm1001/spark_tutorial/9fdabdc63e7e664cb022e89257a32cbb8f854d50/Images/reducebykey.png -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # spark_tutorial 2 | Code for the Spark tutorial presented by Maria Mestre, Sahan Bulathwela and Erik Pazos 3 | 4 | 5 | ## How to set up the local environment for the tutorial 6 | 7 | Instructions to set up your local environment to run this Jupyter notebook can be found in the [following blog post](https://in4maniac.wordpress.com/2016/10/09/spark-tutorial/) 8 | 9 | 10 | ## Data 11 | 12 | Special thanks to J. McAuley, R. Pandey, J. Leskovec, et al. at Stanford University for allowing us to use a sample of the Amazon product dataset for this tutorial. 13 | 14 | ## Files 15 | 16 | Spark_Tutorial.ipynb : The main file that contains the code for the tutorial 17 | 18 | Data 19 | 20 | |_ classifiers 21 | 22 | | |_ classifier.pkl : a scikit-learn logistic regression classifier used as a Python classifier in the tutorial 23 | 24 | | 25 | 26 | |_ Products 27 | 28 | | |_ sample_metadata.json : This file contains details about products found on Amazon (e.g. price, asin, category) 29 | 30 | | 31 | 32 | |_ Reviews 33 | 34 | |_ electronics.json : This file contains reviews about electronic products 35 | 36 | |_ fashion.json : This file contains reviews about fashion products 37 | 38 | |_ sports.json : This file contains reviews about sports equipment 39 | 40 | -------------------------------------------------------------------------------- /Spark_Tutorial.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "

1. Creating an RDD

" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "We start by creating the 3 RDDs from the different datasets from Amazon product reviews. Note that it does not move the data at this stage due to the lazy evaluation nature." 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 2, 20 | "metadata": { 21 | "collapsed": false 22 | }, 23 | "outputs": [ 24 | { 25 | "data": { 26 | "text/plain": [ 27 | "" 28 | ] 29 | }, 30 | "execution_count": 2, 31 | "metadata": {}, 32 | "output_type": "execute_result" 33 | } 34 | ], 35 | "source": [ 36 | "sc" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": {}, 42 | "source": [ 43 | "We load the data using the Spark context." 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 3, 49 | "metadata": { 50 | "collapsed": false 51 | }, 52 | "outputs": [], 53 | "source": [ 54 | "fashion = sc.textFile('Data/Reviews/fashion.json')\n", 55 | "electronics = sc.textFile('Data/Reviews/electronics.json')\n", 56 | "sports = sc.textFile('Data/Reviews/sports.json')" 57 | ] 58 | }, 59 | { 60 | "cell_type": "markdown", 61 | "metadata": {}, 62 | "source": [ 63 | "Nothing has happened, why is that?\n", 64 | "In Spark, some operations are *transformations*, which are lazily evaluated and others are *actions*.\n", 65 | "\n", 66 | "Read more here: http://spark.apache.org/docs/latest/programming-guide.html#transformations" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": 4, 72 | "metadata": { 73 | "collapsed": false 74 | }, 75 | "outputs": [ 76 | { 77 | "name": "stdout", 78 | "output_type": "stream", 79 | "text": [ 80 | "Result 1: PythonRDD[6] at RDD at PythonRDD.scala:43\n", 81 | "Result 2: 10000\n" 82 | ] 83 | } 84 | ], 85 | "source": [ 86 | "# Example of a basic transformation\n", 87 | "print \"Result 1: \", fashion.map(lambda x: len(x))\n", 88 | "\n", 89 | "# Example of an action:\n", 90 | "print \"Result 2: \", fashion.count()" 91 | ] 92 | }, 93 | { 94 | "cell_type": "markdown", 95 | "metadata": {}, 96 | "source": [ 97 | "We do some basic data exploration." 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": 5, 103 | "metadata": { 104 | "collapsed": false 105 | }, 106 | "outputs": [ 107 | { 108 | "name": "stdout", 109 | "output_type": "stream", 110 | "text": [ 111 | "fashion has 10000 rows, electronics 10000 rows and sports 10000 rows\n", 112 | "\n", 113 | "fashion first row:\n" 114 | ] 115 | }, 116 | { 117 | "data": { 118 | "text/plain": [ 119 | "u'{\"reviewerID\": \"A2XVJBSRI3SWDI\", \"asin\": \"0000031887\", \"reviewerName\": \"abigail\", \"helpful\": [0, 0], \"reviewText\": \"Perfect red tutu for the price. I baught it as part of my daughters Halloween costume and it looked great on her.\", \"overall\": 5.0, \"summary\": \"Nice tutu\", \"unixReviewTime\": 1383523200, \"reviewTime\": \"11 4, 2013\"}'" 120 | ] 121 | }, 122 | "execution_count": 5, 123 | "metadata": {}, 124 | "output_type": "execute_result" 125 | } 126 | ], 127 | "source": [ 128 | "print \"fashion has {0} rows, electronics {1} rows and sports {2} rows\\n\".format(fashion.count(), electronics.count(), sports.count())\n", 129 | "print \"fashion first row:\"\n", 130 | "fashion.first()" 131 | ] 132 | }, 133 | { 134 | "cell_type": "markdown", 135 | "metadata": {}, 136 | "source": [ 137 | "We can union them." 
138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": 6, 143 | "metadata": { 144 | "collapsed": false 145 | }, 146 | "outputs": [ 147 | { 148 | "name": "stdout", 149 | "output_type": "stream", 150 | "text": [ 151 | "{\"reviewerID\": \"A2XVJBSRI3SWDI\", \"asin\": \"0000031887\", \"reviewerName\": \"abigail\", \"helpful\": [0, 0], \"reviewText\": \"Perfect red tutu for the price. I baught it as part of my daughters Halloween costume and it looked great on her.\", \"overall\": 5.0, \"summary\": \"Nice tutu\", \"unixReviewTime\": 1383523200, \"reviewTime\": \"11 4, 2013\"}\n" 152 | ] 153 | } 154 | ], 155 | "source": [ 156 | "union_of_rdds = fashion.union(electronics).union(sports)\n", 157 | "print union_of_rdds.first()" 158 | ] 159 | }, 160 | { 161 | "cell_type": "markdown", 162 | "metadata": {}, 163 | "source": [ 164 | "We can now parse the file using the json library." 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": 7, 170 | "metadata": { 171 | "collapsed": false, 172 | "scrolled": true 173 | }, 174 | "outputs": [ 175 | { 176 | "data": { 177 | "text/plain": [ 178 | "{u'asin': u'0000031887',\n", 179 | " u'helpful': [0, 0],\n", 180 | " u'overall': 5.0,\n", 181 | " u'reviewText': u'Perfect red tutu for the price. I baught it as part of my daughters Halloween costume and it looked great on her.',\n", 182 | " u'reviewTime': u'11 4, 2013',\n", 183 | " u'reviewerID': u'A2XVJBSRI3SWDI',\n", 184 | " u'reviewerName': u'abigail',\n", 185 | " u'summary': u'Nice tutu',\n", 186 | " u'unixReviewTime': 1383523200}" 187 | ] 188 | }, 189 | "execution_count": 7, 190 | "metadata": {}, 191 | "output_type": "execute_result" 192 | } 193 | ], 194 | "source": [ 195 | "import json\n", 196 | "parsed_fashion = fashion.map(lambda x: json.loads(x))\n", 197 | "parsed_fashion.first()" 198 | ] 199 | }, 200 | { 201 | "cell_type": "markdown", 202 | "metadata": {}, 203 | "source": [ 204 | "Another way of loading files is by using a list of comma-separated file paths or a wildcard." 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": 8, 210 | "metadata": { 211 | "collapsed": false 212 | }, 213 | "outputs": [ 214 | { 215 | "data": { 216 | "text/plain": [ 217 | "3" 218 | ] 219 | }, 220 | "execution_count": 8, 221 | "metadata": {}, 222 | "output_type": "execute_result" 223 | } 224 | ], 225 | "source": [ 226 | "data = sc.textFile('Data/Reviews/fashion.json,Data/Reviews/electronics.json,Data/Reviews/sports.json').map(lambda x: json.loads(x))\n", 227 | "\n", 228 | "# QUESTION: How many partitions does the rdd have?\n", 229 | "data.getNumPartitions()" 230 | ] 231 | }, 232 | { 233 | "cell_type": "markdown", 234 | "metadata": {}, 235 | "source": [ 236 | "Now let's imagine we want to know the number of lines in each partition. For that, we need to access the data in each single partition and run operations on them instead of on each row.\n", 237 | "\n", 238 | "For this, we will use mapPartitionsWithIndex which takes a partition index and an iterator over the data as arguments. 
Each function in the API is documented in: https://spark.apache.org/docs/latest/api/python/pyspark.html#pyspark.RDD.mapPartitionsWithIndex" 239 | ] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "execution_count": 9, 244 | "metadata": { 245 | "collapsed": false 246 | }, 247 | "outputs": [ 248 | { 249 | "name": "stdout", 250 | "output_type": "stream", 251 | "text": [ 252 | "partition 0 has 10000 rows\n", 253 | "partition 1 has 10000 rows\n", 254 | "partition 2 has 10000 rows\n" 255 | ] 256 | } 257 | ], 258 | "source": [ 259 | "indexed_data = data.mapPartitionsWithIndex(lambda splitIndex, it: [(splitIndex, len([x for x in it]))])\n", 260 | "\n", 261 | "for num_partition, count_partition in indexed_data.collect():\n", 262 | " print \"partition {0} has {1} rows\".format(num_partition, count_partition)" 263 | ] 264 | }, 265 | { 266 | "cell_type": "markdown", 267 | "metadata": {}, 268 | "source": [ 269 | "

2. Reducers

" 270 | ] 271 | }, 272 | { 273 | "cell_type": "markdown", 274 | "metadata": {}, 275 | "source": [ 276 | "The next thing we have been tasked to do is **to get the minimum and maximum number of reviews per product**." 277 | ] 278 | }, 279 | { 280 | "cell_type": "code", 281 | "execution_count": 10, 282 | "metadata": { 283 | "collapsed": false 284 | }, 285 | "outputs": [ 286 | { 287 | "name": "stdout", 288 | "output_type": "stream", 289 | "text": [ 290 | "Max number of reviews is 2033, min number of reviews is 1\n" 291 | ] 292 | } 293 | ], 294 | "source": [ 295 | "product_num = data.map(lambda x: (x['asin'], 1)).reduceByKey(lambda x,y: x+y)\n", 296 | "# The rdd product_num will contain (product_asin, total_number_reviews)\n", 297 | "\n", 298 | "# What are the maximum and minimum number of reviews?\n", 299 | "max_num = product_num.map(lambda x: x[1]).max()\n", 300 | "min_num = product_num.map(lambda x: x[1]).min()\n", 301 | "\n", 302 | "print \"Max number of reviews is {0}, min number of reviews is {1}\".format(max_num, min_num)" 303 | ] 304 | }, 305 | { 306 | "cell_type": "markdown", 307 | "metadata": {}, 308 | "source": [ 309 | "![Alt text](Images/reducebykey.png)" 310 | ] 311 | }, 312 | { 313 | "cell_type": "markdown", 314 | "metadata": {}, 315 | "source": [ 316 | "**EXERCISE**: what is the max score for each product?" 317 | ] 318 | }, 319 | { 320 | "cell_type": "markdown", 321 | "metadata": {}, 322 | "source": [ 323 | "

3. Joining multiple sources

" 324 | ] 325 | }, 326 | { 327 | "cell_type": "markdown", 328 | "metadata": {}, 329 | "source": [ 330 | "We want to join the product reviews by users to the product metadata." 331 | ] 332 | }, 333 | { 334 | "cell_type": "markdown", 335 | "metadata": {}, 336 | "source": [ 337 | "" 338 | ] 339 | }, 340 | { 341 | "cell_type": "code", 342 | "execution_count": 11, 343 | "metadata": { 344 | "collapsed": false 345 | }, 346 | "outputs": [ 347 | { 348 | "name": "stdout", 349 | "output_type": "stream", 350 | "text": [ 351 | "{u'asin': u'0000037214', u'title': u'Purple Sequin Tiny Dancer Tutu Ballet Dance Fairy Princess Costume Accessory', u'price': 6.99, u'imUrl': u'http://ecx.images-amazon.com/images/I/31mCncNuAZL.jpg', u'related': {u'also_viewed': [u'B00JO8II76', u'B00DGN4R1Q', u'B00E1YRI4C']}, u'salesRank': {u'Clothing': 1233557}, u'brand': u'Big Dreams', u'categories': [[u'Clothing, Shoes & Jewelry', u'Girls'], [u'Clothing, Shoes & Jewelry', u'Novelty, Costumes & More', u'Costumes & Accessories', u'More Accessories', u'Kids & Baby']]}\n" 352 | ] 353 | } 354 | ], 355 | "source": [ 356 | "product_metadata = sc.textFile('Data/Products/sample_metadata.json').map(lambda x: json.loads(x))\n", 357 | "print product_metadata.first()" 358 | ] 359 | }, 360 | { 361 | "cell_type": "code", 362 | "execution_count": 12, 363 | "metadata": { 364 | "collapsed": false 365 | }, 366 | "outputs": [], 367 | "source": [ 368 | "def flatten_categories(line):\n", 369 | " old_cats = line['categories']\n", 370 | " line['categories'] = [item for sublist in old_cats for item in sublist]\n", 371 | " return line\n", 372 | "\n", 373 | "product_metadata = product_metadata.map(flatten_categories)" 374 | ] 375 | }, 376 | { 377 | "cell_type": "markdown", 378 | "metadata": {}, 379 | "source": [ 380 | "We want to join the review data to the metadata about the product. We can use the ASIN for that, which is a unique identifier for each product. In order to do a join, we need to turn each structure into key-value pairs." 381 | ] 382 | }, 383 | { 384 | "cell_type": "code", 385 | "execution_count": 13, 386 | "metadata": { 387 | "collapsed": false 388 | }, 389 | "outputs": [ 390 | { 391 | "name": "stdout", 392 | "output_type": "stream", 393 | "text": [ 394 | "We are joining 30000 product reviews to 2469 rows of metadata information about the products.\n", 395 | "\n", 396 | "First row of key_val_data:\n", 397 | "(u'0000031887', {u'reviewerID': u'A2XVJBSRI3SWDI', u'asin': u'0000031887', u'reviewerName': u'abigail', u'helpful': [0, 0], u'reviewText': u'Perfect red tutu for the price. 
I baught it as part of my daughters Halloween costume and it looked great on her.', u'overall': 5.0, u'summary': u'Nice tutu', u'unixReviewTime': 1383523200, u'reviewTime': u'11 4, 2013'})\n" 398 | ] 399 | } 400 | ], 401 | "source": [ 402 | "key_val_data = data.map(lambda x: (x['asin'], x))\n", 403 | "key_val_metadata = product_metadata.map(lambda x: (x['asin'], x))\n", 404 | "\n", 405 | "print \"We are joining {0} product reviews to {1} rows of metadata information about the products.\\n\".format(key_val_data.count(),key_val_metadata.count())\n", 406 | "print \"First row of key_val_data:\"\n", 407 | "print key_val_data.first()" 408 | ] 409 | }, 410 | { 411 | "cell_type": "code", 412 | "execution_count": 14, 413 | "metadata": { 414 | "collapsed": false 415 | }, 416 | "outputs": [ 417 | { 418 | "name": "stdout", 419 | "output_type": "stream", 420 | "text": [ 421 | "number partitions key_val_data: 3\n", 422 | "number partitions key_val_metadata: 2\n", 423 | "\n", 424 | "For key 8179050874:\n", 425 | "\n", 426 | "the review is {u'reviewerID': u'A1IQJSHCMW69O5', u'asin': u'8179050874', u'reviewerName': u'Jose Perez', u'helpful': [0, 0], u'reviewText': u\"I bought this item because of the description that is for the Blackberry bold, to my surprise is for the curve it doesn't fit the screen there is like one inch of screen not protected by the screen, also it reflects sunlight making the screen virtually unusable when outdoors, and looks ugly..\", u'overall': 1.0, u'summary': u'This is not for Bold is for Curve', u'unixReviewTime': 1242518400, u'reviewTime': u'05 17, 2009'}\n", 427 | "\n", 428 | "the product metadata is {u'asin': u'8179050874', u'salesRank': {u'Electronics': 324466}, u'imUrl': u'http://ecx.images-amazon.com/images/I/41f2QHnWYNL._SY300_.jpg', u'categories': [u'Electronics', u'Computers & Accessories', u'Laptop & Netbook Computer Accessories', u'Batteries'], u'title': u'PRIVACY Screen Saver for your BLACKBERRY Bold 9000 ! Shield and Prevent others from viewing your information while protecting your phone!'}.\n", 429 | "\n" 430 | ] 431 | } 432 | ], 433 | "source": [ 434 | "print \"number partitions key_val_data: \", \n", 435 | "print key_val_data.getNumPartitions()\n", 436 | "print \"number partitions key_val_metadata: \", \n", 437 | "print key_val_metadata.getNumPartitions()\n", 438 | "print\n", 439 | "\n", 440 | "joined = key_val_data.join(key_val_metadata)\n", 441 | "\n", 442 | "key, (review, product) = joined.first()\n", 443 | "print \"For key {0}:\\n\\nthe review is {1}\\n\\nthe product metadata is {2}.\\n\".format(key, review, product)" 444 | ] 445 | }, 446 | { 447 | "cell_type": "markdown", 448 | "metadata": {}, 449 | "source": [ 450 | "What is the number of output partitions of the join? 
To understand this, the best is to refer back to the Pyspark source code: https://github.com/apache/spark/blob/branch-1.3/python/pyspark/join.py" 451 | ] 452 | }, 453 | { 454 | "cell_type": "code", 455 | "execution_count": 15, 456 | "metadata": { 457 | "collapsed": false 458 | }, 459 | "outputs": [ 460 | { 461 | "name": "stdout", 462 | "output_type": "stream", 463 | "text": [ 464 | "There are 5 partitions\n" 465 | ] 466 | } 467 | ], 468 | "source": [ 469 | "# QUESTION: what is the number of partitions of the joined dataset?\n", 470 | "\n", 471 | "print \"There are {0} partitions\".format(joined.getNumPartitions())" 472 | ] 473 | }, 474 | { 475 | "cell_type": "markdown", 476 | "metadata": {}, 477 | "source": [ 478 | "To make it easier to manipulate, we will change the structure of the joined rdd to be a single dictionary." 479 | ] 480 | }, 481 | { 482 | "cell_type": "code", 483 | "execution_count": 16, 484 | "metadata": { 485 | "collapsed": false 486 | }, 487 | "outputs": [ 488 | { 489 | "name": "stdout", 490 | "output_type": "stream", 491 | "text": [ 492 | "row 0:\n", 493 | "\n", 494 | "{u'reviewerID': u'A1IQJSHCMW69O5', u'asin': u'8179050874', u'reviewerName': u'Jose Perez', u'helpful': [0, 0], u'title': u'PRIVACY Screen Saver for your BLACKBERRY Bold 9000 ! Shield and Prevent others from viewing your information while protecting your phone!', u'imUrl': u'http://ecx.images-amazon.com/images/I/41f2QHnWYNL._SY300_.jpg', u'reviewText': u\"I bought this item because of the description that is for the Blackberry bold, to my surprise is for the curve it doesn't fit the screen there is like one inch of screen not protected by the screen, also it reflects sunlight making the screen virtually unusable when outdoors, and looks ugly..\", u'overall': 1.0, u'summary': u'This is not for Bold is for Curve', u'unixReviewTime': 1242518400, u'salesRank': {u'Electronics': 324466}, u'reviewTime': u'05 17, 2009', u'categories': [u'Electronics', u'Computers & Accessories', u'Laptop & Netbook Computer Accessories', u'Batteries']}\n", 495 | "\n", 496 | "row 1:\n", 497 | "\n", 498 | "{u'reviewerID': u'A2HC8YQVZ4HMF5', u'asin': u'8179050874', u'reviewerName': u'Wowbagger the Infinitely Prolonged', u'helpful': [0, 0], u'title': u'PRIVACY Screen Saver for your BLACKBERRY Bold 9000 ! Shield and Prevent others from viewing your information while protecting your phone!', u'imUrl': u'http://ecx.images-amazon.com/images/I/41f2QHnWYNL._SY300_.jpg', u'reviewText': u'Despite being sold specifically for the Blackberry Bold 9000, it simply doesn\\'t fit a Blackberry Bold.The screen protector is about a third of a millimetre too wide. As a result, the chrome trim around the outside of the Blackberry prevents it from lying flat on the edges of the screen so it does not attach to the screen properly: there is always a 2-3 millimetres of \"air margin\" down either one or both sides.The problems are therefore:1. It looks ugly2. It will fill with dust3. Case-mate support have been messing me around for over a month now and I\\'m beginning to suspect they are just hoping that I\\'ll go away and stop annoying them. 
In other words, the tech support is as useless as the product...', u'overall': 1.0, u'summary': u\"Doesn't even fit the screen...\", u'unixReviewTime': 1238025600, u'salesRank': {u'Electronics': 324466}, u'reviewTime': u'03 26, 2009', u'categories': [u'Electronics', u'Computers & Accessories', u'Laptop & Netbook Computer Accessories', u'Batteries']}\n", 499 | "\n" 500 | ] 501 | } 502 | ], 503 | "source": [ 504 | "def merge_dictionaries(metadata_line, review_line):\n", 505 | " new_dict = review_line\n", 506 | " new_dict.update(metadata_line)\n", 507 | " return new_dict\n", 508 | "\n", 509 | "nice_joined = joined.map(lambda x: merge_dictionaries(x[1][0], x[1][1]))\n", 510 | "row0, row1 = nice_joined.take(2)\n", 511 | "\n", 512 | "print \"row 0:\\n\\n{0}\\n\\nrow 1:\\n\\n{1}\\n\".format(row0, row1)" 513 | ] 514 | }, 515 | { 516 | "cell_type": "markdown", 517 | "metadata": {}, 518 | "source": [ 519 | "

4. GroupByKey

" 520 | ] 521 | }, 522 | { 523 | "cell_type": "markdown", 524 | "metadata": {}, 525 | "source": [ 526 | "Now that we have joined two data sources, we can start doing some ad-hoc analysis of the data! Now the task is **to get the average product review length for each category**. The categories are encoded as a list of categories, so we first need to 'flatten them out'." 527 | ] 528 | }, 529 | { 530 | "cell_type": "code", 531 | "execution_count": 17, 532 | "metadata": { 533 | "collapsed": false 534 | }, 535 | "outputs": [ 536 | { 537 | "data": { 538 | "text/plain": [ 539 | "30000" 540 | ] 541 | }, 542 | "execution_count": 17, 543 | "metadata": {}, 544 | "output_type": "execute_result" 545 | } 546 | ], 547 | "source": [ 548 | "nice_joined.cache()\n", 549 | "nice_joined.count()" 550 | ] 551 | }, 552 | { 553 | "cell_type": "code", 554 | "execution_count": 18, 555 | "metadata": { 556 | "collapsed": false 557 | }, 558 | "outputs": [ 559 | { 560 | "name": "stdout", 561 | "output_type": "stream", 562 | "text": [ 563 | "original_categories.take(5):\n", 564 | "\n", 565 | "[u'Electronics', u'Computers & Accessories', u'Laptop & Netbook Computer Accessories', u'Batteries']\n", 566 | "[u'Electronics', u'Computers & Accessories', u'Laptop & Netbook Computer Accessories', u'Batteries']\n", 567 | "[u'Clothing, Shoes & Jewelry', u'Novelty, Costumes & More', u'Costumes & Accessories', u'Costumes', u'Kids & Baby', u'Infants & Toddlers', u'Baby Boys']\n", 568 | "[u'Clothing, Shoes & Jewelry', u'Novelty, Costumes & More', u'Costumes & Accessories', u'Costumes', u'Kids & Baby', u'Infants & Toddlers', u'Baby Boys']\n", 569 | "[u'Sports & Outdoors', u'Outdoor Gear', u'Camping & Hiking', u'Camp Bedding', u'Sleeping Pads', u'Foam Pads']\n", 570 | "\n", 571 | "flat_categories.take(5):\n", 572 | "\n", 573 | "Electronics\n", 574 | "Computers & Accessories\n", 575 | "Laptop & Netbook Computer Accessories\n", 576 | "Batteries\n", 577 | "Electronics\n", 578 | "\n", 579 | "There are 925 distinct categories.\n" 580 | ] 581 | } 582 | ], 583 | "source": [ 584 | "original_categories = nice_joined.map(lambda x: x['categories'])\n", 585 | "flat_categories = nice_joined.flatMap(lambda x: x['categories'])\n", 586 | "\n", 587 | "print \"original_categories.take(5):\\n\"\n", 588 | "print '\\n'.join([str(x) for x in original_categories.take(5)]) + '\\n'\n", 589 | "\n", 590 | "print \"flat_categories.take(5):\\n\"\n", 591 | "print '\\n'.join([str(x) for x in flat_categories.take(5)]) + '\\n'\n", 592 | "\n", 593 | "num_categories = flat_categories.distinct().count()\n", 594 | "print \"There are {0} distinct categories.\".format(num_categories)" 595 | ] 596 | }, 597 | { 598 | "cell_type": "markdown", 599 | "metadata": {}, 600 | "source": [ 601 | "Next, in order to get the average review length across all categories, we will use a new function: groupByKey!" 
602 | ] 603 | }, 604 | { 605 | "cell_type": "code", 606 | "execution_count": 19, 607 | "metadata": { 608 | "collapsed": false 609 | }, 610 | "outputs": [ 611 | { 612 | "name": "stdout", 613 | "output_type": "stream", 614 | "text": [ 615 | "After the flatMap: (u'Electronics', 293)\n", 616 | "After the groupByKey: (u'Screen Protectors', [191, 135, 135, 782, 782, 782, 446, 446, 446, 85])\n", 617 | "\n", 618 | "grouped_category_review.first(): (u'Screen Protectors', 423.0)\n", 619 | "\n", 620 | "The top 10 categories are: [(u'Photos', 7570.0), (u'Bags, Packs & Accessories', 6411.0), (u'Rifles', 5079.888888888889), (u'Motets', 3404.0), (u'Free-Weight Racks', 3404.0), (u'Weight Racks', 3404.0), (u'Magnificats', 3404.0), (u'Sonatinas', 3239.2), (u'Sonatas', 3239.2), (u'Rugby', 3156.0)]\n" 621 | ] 622 | } 623 | ], 624 | "source": [ 625 | "category_review = nice_joined.flatMap(lambda x: [(y, len(x['reviewText'])) for y in x['categories']])\n", 626 | "print \"After the flatMap: \" + str(category_review.first())\n", 627 | "print \"After the groupByKey: \" + str(category_review.groupByKey().map(lambda x: (x[0], list(x[1]))).first())\n", 628 | "print\n", 629 | "\n", 630 | "grouped_category_review = category_review.groupByKey().map(lambda x: (x[0], sum(x[1])/float(len(x[1]))))\n", 631 | "print \"grouped_category_review.first(): \" + str(grouped_category_review.first()) + '\\n'\n", 632 | "\n", 633 | "### Now we can sort the categories by average product review length\n", 634 | "print \"The top 10 categories are: \" + str(sorted(grouped_category_review.collect(), key=lambda x: x[1], reverse=True)[:10])" 635 | ] 636 | }, 637 | { 638 | "cell_type": "markdown", 639 | "metadata": {}, 640 | "source": [ 641 | "**EXERCISE**: Do the same thing, but this time you are not allowed to use groupByKey()!" 642 | ] 643 | }, 644 | { 645 | "cell_type": "markdown", 646 | "metadata": {}, 647 | "source": [ 648 | "

Optional: Data skewness

" 649 | ] 650 | }, 651 | { 652 | "cell_type": "code", 653 | "execution_count": 20, 654 | "metadata": { 655 | "collapsed": false 656 | }, 657 | "outputs": [ 658 | { 659 | "name": "stdout", 660 | "output_type": "stream", 661 | "text": [ 662 | "***Creating the large rdd***\n", 663 | "first 5 items:[(0, 0), (1, 0), (1, 1), (2, 0), (2, 1)]\n", 664 | "num rows: 5171502\n", 665 | "num partitions: 16\n", 666 | "The distribution of elements per partition is [(0, 1), (1, 2), (2, 7), (3, 20), (4, 54), (5, 148), (6, 403), (7, 1096), (8, 2980), (9, 8103), (10, 22026), (11, 59874), (12, 162754), (13, 442413), (14, 1202604), (15, 3269017)]\n", 667 | "\n", 668 | "***Creating the small rdd***\n", 669 | "first 5 items:[(0, 0), (1, 1), (2, 2), (3, 3), (4, 4)]\n", 670 | "num rows: 16\n", 671 | "num partitions: 16\n", 672 | "The distribution of elements per partition is [(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1)]\n", 673 | "\n", 674 | "Joining them\n", 675 | "The direct join takes 0:00:37.139197\n", 676 | "The joined rdd has 32 partitions and 5171502 rows\n" 677 | ] 678 | } 679 | ], 680 | "source": [ 681 | "from math import exp\n", 682 | "from datetime import datetime\n", 683 | "\n", 684 | "def get_part_index(splitIndex, iterator):\n", 685 | " for it in iterator:\n", 686 | " yield (splitIndex, it)\n", 687 | " \n", 688 | "def count_elements(splitIndex, iterator):\n", 689 | " n = sum(1 for _ in iterator)\n", 690 | " yield (splitIndex, n)\n", 691 | " \n", 692 | "print \"***Creating the large rdd***\"\n", 693 | "num_parts = 16\n", 694 | "# create the large skewed rdd\n", 695 | "skewed_large_rdd = sc.parallelize(range(0,num_parts), num_parts).flatMap(lambda x: range(0, int(exp(x)))).mapPartitionsWithIndex(lambda ind, x: get_part_index(ind, x)).cache()\n", 696 | "print \"first 5 items:\" + str(skewed_large_rdd.take(5))\n", 697 | "print \"num rows: \" + str(skewed_large_rdd.count())\n", 698 | "print \"num partitions: \" + str(skewed_large_rdd.getNumPartitions())\n", 699 | "print \"The distribution of elements per partition is \" + str(skewed_large_rdd.mapPartitionsWithIndex(lambda ind, x: count_elements(ind, x)).collect())\n", 700 | "print\n", 701 | "\n", 702 | "print \"***Creating the small rdd***\"\n", 703 | "small_rdd = sc.parallelize(range(0,num_parts), num_parts).map(lambda x: (x, x))\n", 704 | "print \"first 5 items:\" + str(small_rdd.take(5))\n", 705 | "print \"num rows: \" + str(small_rdd.count())\n", 706 | "print \"num partitions: \" + str(small_rdd.getNumPartitions())\n", 707 | "print \"The distribution of elements per partition is \" + str(small_rdd.mapPartitionsWithIndex(lambda ind, x: count_elements(ind, x)).collect())\n", 708 | "\n", 709 | "print\n", 710 | "\n", 711 | "print \"Joining them\"\n", 712 | "t0 = datetime.now()\n", 713 | "result = skewed_large_rdd.leftOuterJoin(small_rdd)\n", 714 | "result.count() \n", 715 | "print \"The direct join takes %s\"%(str(datetime.now() - t0))\n", 716 | "print \"The joined rdd has {0} partitions and {1} rows\".format(result.getNumPartitions(), result.count())" 717 | ] 718 | }, 719 | { 720 | "cell_type": "markdown", 721 | "metadata": {}, 722 | "source": [ 723 | "

Optional: Integrating Spark with popular Python libraries

" 724 | ] 725 | }, 726 | { 727 | "cell_type": "code", 728 | "execution_count": 21, 729 | "metadata": { 730 | "collapsed": false 731 | }, 732 | "outputs": [ 733 | { 734 | "data": { 735 | "text/plain": [ 736 | "('Perfect red tutu for the price. I baught it as part of my daughters Halloween costume and it looked great on her.',\n", 737 | " 'fashion')" 738 | ] 739 | }, 740 | "execution_count": 21, 741 | "metadata": {}, 742 | "output_type": "execute_result" 743 | } 744 | ], 745 | "source": [ 746 | "import sklearn\n", 747 | "import pickle\n", 748 | "\n", 749 | "model = pickle.load(open('Data/classifiers/classifier.pkl', 'r'))\n", 750 | "model\n", 751 | "bla = fashion.map(lambda x: eval(x)['reviewText']).first()\n", 752 | "model_b = sc.broadcast(model)\n", 753 | "fashion.map(lambda x: eval(x)['reviewText']).map(lambda x: (x, model_b.value.predict([x])[0])).first()" 754 | ] 755 | }, 756 | { 757 | "cell_type": "markdown", 758 | "metadata": {}, 759 | "source": [ 760 | "

Part 2: Spark DataFrame API and Spark SQL

" 761 | ] 762 | }, 763 | { 764 | "cell_type": "markdown", 765 | "metadata": {}, 766 | "source": [ 767 | "

Introduction

\n", 768 | "\n", 769 | "This is the latter part of the tutorial. The main focus will be on Spark DataFrames and Spark SQL." 770 | ] 771 | }, 772 | { 773 | "cell_type": "code", 774 | "execution_count": 22, 775 | "metadata": { 776 | "collapsed": false 777 | }, 778 | "outputs": [ 779 | { 780 | "name": "stdout", 781 | "output_type": "stream", 782 | "text": [ 783 | "number of reviews : 30000\n", 784 | "sample row : \n", 785 | "{\"reviewerID\": \"AKM1MP6P0OYPR\", \"asin\": \"0132793040\", \"reviewerName\": \"Vicki Gibson \\\"momo4\\\"\", \"helpful\": [1, 1], \"reviewText\": \"Corey Barker does a great job of explaining Blend Modes in this DVD. All of the Kelby training videos are great but pricey to buy individually. If you really want bang for your buck just subscribe to Kelby Training online.\", \"overall\": 5.0, \"summary\": \"Very thorough\", \"unixReviewTime\": 1365811200, \"reviewTime\": \"04 13, 2013\"}\n" 786 | ] 787 | } 788 | ], 789 | "source": [ 790 | "review_filepaths = 'Data/Reviews/*'\n", 791 | "textRDD = sc.textFile(review_filepaths)\n", 792 | "\n", 793 | "print 'number of reviews : {0}'.format(textRDD.count())\n", 794 | "\n", 795 | "print 'sample row : \\n{0}'.format(textRDD.first())" 796 | ] 797 | }, 798 | { 799 | "cell_type": "markdown", 800 | "metadata": {}, 801 | "source": [ 802 | "

5. Loading Data into a DataFrame

\n", 803 | "\n", 804 | "A DataFrame requires schema. There are two main functions that can be used to assign schema into an RDD. \n", 805 | "+ Inferring Schema : This functions infers the schema of the RDD by observing it\n", 806 | "+ Applying Schema : This function applies a manually defined schema an RDD" 807 | ] 808 | }, 809 | { 810 | "cell_type": "code", 811 | "execution_count": 23, 812 | "metadata": { 813 | "collapsed": false 814 | }, 815 | "outputs": [ 816 | { 817 | "data": { 818 | "text/plain": [ 819 | "" 820 | ] 821 | }, 822 | "execution_count": 23, 823 | "metadata": {}, 824 | "output_type": "execute_result" 825 | } 826 | ], 827 | "source": [ 828 | "# You need SQL context do \n", 829 | "from pyspark.sql import SQLContext\n", 830 | "\n", 831 | "# # Instantiate SQL Context\n", 832 | "sqlContext = SQLContext(sc)\n", 833 | "sqlContext\n", 834 | "# sqlContext\n", 835 | "\n", 836 | "# print sqc" 837 | ] 838 | }, 839 | { 840 | "cell_type": "markdown", 841 | "metadata": {}, 842 | "source": [ 843 | "

Inferring the Schema Using Reflection

" 844 | ] 845 | }, 846 | { 847 | "cell_type": "code", 848 | "execution_count": 24, 849 | "metadata": { 850 | "collapsed": false 851 | }, 852 | "outputs": [ 853 | { 854 | "data": { 855 | "text/plain": [ 856 | "Row(asin=u'0132793040', helpful=[1, 1], overall=5.0, reviewText=u'Corey Barker does a great job of explaining Blend Modes in this DVD. All of the Kelby training videos are great but pricey to buy individually. If you really want bang for your buck just subscribe to Kelby Training online.', reviewTime=u'04 13, 2013', reviewerID=u'AKM1MP6P0OYPR', reviewerName=u'Vicki Gibson \"momo4\"', summary=u'Very thorough', unixReviewTime=1365811200)" 857 | ] 858 | }, 859 | "execution_count": 24, 860 | "metadata": {}, 861 | "output_type": "execute_result" 862 | } 863 | ], 864 | "source": [ 865 | "inferredDF = sqlContext.read.json(review_filepaths)\n", 866 | "inferredDF.first()" 867 | ] 868 | }, 869 | { 870 | "cell_type": "code", 871 | "execution_count": 25, 872 | "metadata": { 873 | "collapsed": false 874 | }, 875 | "outputs": [ 876 | { 877 | "name": "stdout", 878 | "output_type": "stream", 879 | "text": [ 880 | "root\n", 881 | " |-- asin: string (nullable = true)\n", 882 | " |-- helpful: array (nullable = true)\n", 883 | " | |-- element: long (containsNull = true)\n", 884 | " |-- overall: double (nullable = true)\n", 885 | " |-- reviewText: string (nullable = true)\n", 886 | " |-- reviewTime: string (nullable = true)\n", 887 | " |-- reviewerID: string (nullable = true)\n", 888 | " |-- reviewerName: string (nullable = true)\n", 889 | " |-- summary: string (nullable = true)\n", 890 | " |-- unixReviewTime: long (nullable = true)\n", 891 | "\n" 892 | ] 893 | } 894 | ], 895 | "source": [ 896 | "inferredDF.printSchema()" 897 | ] 898 | }, 899 | { 900 | "cell_type": "markdown", 901 | "metadata": {}, 902 | "source": [ 903 | "

Manually Specifying the Schema

\n", 904 | "\n", 905 | "The Documentation about different data types can be found at [Spark SQL DataTypes section](https://spark.apache.org/docs/latest/sql-programming-guide.html#data-types \"Spark SQL DataTypes Documentation\") \n", 906 | "+ Defining the schema can be useful" 907 | ] 908 | }, 909 | { 910 | "cell_type": "code", 911 | "execution_count": 26, 912 | "metadata": { 913 | "collapsed": false 914 | }, 915 | "outputs": [ 916 | { 917 | "name": "stdout", 918 | "output_type": "stream", 919 | "text": [ 920 | "StructType(List(StructField(reviewerID,StringType,true),StructField(asin,StringType,true),StructField(reviewerName,StringType,true),StructField(helpful,ArrayType(IntegerType,true),true),StructField(reviewText,StringType,true),StructField(reviewTime,StringType,true),StructField(overall,DoubleType,true),StructField(summary,StringType,true),StructField(unixReviewTime,LongType,true)))\n" 921 | ] 922 | } 923 | ], 924 | "source": [ 925 | "# Export the modules\n", 926 | "from pyspark.sql.types import *\n", 927 | "\n", 928 | "# Define Schema\n", 929 | "REVIEWS_SCHEMA_DEF = StructType([\n", 930 | " StructField('reviewerID', StringType(), True),\n", 931 | " StructField('asin', StringType(), True),\n", 932 | " StructField('reviewerName', StringType(), True),\n", 933 | " StructField('helpful', ArrayType(\n", 934 | " IntegerType(), True), \n", 935 | " True),\n", 936 | " StructField('reviewText', StringType(), True),\n", 937 | " StructField('reviewTime', StringType(), True),\n", 938 | " StructField('overall', DoubleType(), True),\n", 939 | " StructField('summary', StringType(), True),\n", 940 | " StructField('unixReviewTime', LongType(), True)\n", 941 | " ])\n", 942 | "\n", 943 | "print REVIEWS_SCHEMA_DEF" 944 | ] 945 | }, 946 | { 947 | "cell_type": "markdown", 948 | "metadata": {}, 949 | "source": [ 950 | "*QUESTION*: What do you think will happen if *QUESTION*: What do you think will happen if we remove some fields from this schema?\n", 951 | "\n", 952 | "1. The schema fails\n", 953 | "2. The schema works fine\n", 954 | "\n", 955 | "ANSWER???" 956 | ] 957 | }, 958 | { 959 | "cell_type": "code", 960 | "execution_count": 27, 961 | "metadata": { 962 | "collapsed": false 963 | }, 964 | "outputs": [ 965 | { 966 | "data": { 967 | "text/plain": [ 968 | "Row(reviewerID=u'AKM1MP6P0OYPR', asin=u'0132793040', reviewerName=u'Vicki Gibson \"momo4\"', helpful=[1, 1], reviewText=u'Corey Barker does a great job of explaining Blend Modes in this DVD. All of the Kelby training videos are great but pricey to buy individually. If you really want bang for your buck just subscribe to Kelby Training online.', reviewTime=u'04 13, 2013', overall=5.0, summary=u'Very thorough', unixReviewTime=1365811200)" 969 | ] 970 | }, 971 | "execution_count": 27, 972 | "metadata": {}, 973 | "output_type": "execute_result" 974 | } 975 | ], 976 | "source": [ 977 | "# Using a handcrafted schema with to create a DataFrame\n", 978 | "appliedDF = sqlContext.read.json(review_filepaths,schema=REVIEWS_SCHEMA_DEF)\n", 979 | "appliedDF.first()" 980 | ] 981 | }, 982 | { 983 | "cell_type": "markdown", 984 | "metadata": {}, 985 | "source": [ 986 | "

6. DataFrame operations

\n", 987 | "\n", 988 | "Spark DataFrame API allow you to do multiple operations on the Data. The primary advantage of using the DataFrame API is that you can do data transoformations with the high level API without having to use Python. Using the high level API has its advantages which will be explained later in the tutorial.\n", 989 | "\n", 990 | "DataFrame API have functionality similar to that of Core RDD API. For example: \n", 991 | "+ map : foreach, Select\n", 992 | "+ mapPartition : foreachPartition\n", 993 | "+ filter : filter\n", 994 | "+ groupByKey, reduceByKey : groupBy \n", 995 | "\n", 996 | "

6.1. Selecting Columns

\n", 997 | "\n", 998 | "You can use SELECT statement to select columns from your dataframe" 999 | ] 1000 | }, 1001 | { 1002 | "cell_type": "code", 1003 | "execution_count": 28, 1004 | "metadata": { 1005 | "collapsed": false 1006 | }, 1007 | "outputs": [ 1008 | { 1009 | "name": "stdout", 1010 | "output_type": "stream", 1011 | "text": [ 1012 | "+----------+-------+--------------------+------------------+--------------+--------------+\n", 1013 | "| asin|overall| reviewText| helpful| reviewerID|unixReviewTime|\n", 1014 | "+----------+-------+--------------------+------------------+--------------+--------------+\n", 1015 | "|0132793040| 5.0|Corey Barker does...| 1.0| AKM1MP6P0OYPR| 1365811200|\n", 1016 | "|0321732944| 5.0|While many beginn...| null|A2CX7LUOHB2NDG| 1341100800|\n", 1017 | "|0439886341| 1.0|It never worked. ...| 1.0|A2NWSAGRHCP8N5| 1367193600|\n", 1018 | "|0439886341| 3.0|Some of the funct...| 1.0|A2WNBOD3WNDNKT| 1374451200|\n", 1019 | "|0439886341| 1.0|Do not waste your...| 1.0|A1GI0U4ZRJA8WN| 1334707200|\n", 1020 | "|0511189877| 5.0|Dog got the old r...| null|A1QGNMC6O1VW39| 1397433600|\n", 1021 | "|0511189877| 2.0|This remote, for ...| 1.0|A3J3BRHTDRFJ2G| 1397433600|\n", 1022 | "|0511189877| 5.0|We had an old Tim...| 0.0|A2TY0BTJOTENPG| 1395878400|\n", 1023 | "|0511189877| 5.0|This unit works j...| null|A34ATBPOK6HCHY| 1395532800|\n", 1024 | "|0511189877| 5.0|It is an exact du...| null| A89DO69P0XZ27| 1395446400|\n", 1025 | "|0511189877| 5.0|Works on my t.v. ...| 0.0| AZYNQZ94U6VDB| 1401321600|\n", 1026 | "|0528881469| 5.0|Love it has every...| null|A1DA3W4GTFXP6O| 1405641600|\n", 1027 | "|0528881469| 1.0|I have owned two ...| null|A29LPQQDG7LD5J| 1352073600|\n", 1028 | "|0528881469| 5.0|We got this GPS f...| null| AO94DHGC771SJ| 1370131200|\n", 1029 | "|0528881469| 1.0|I'm a professiona...| 0.8| AMO214LNFCEI4| 1290643200|\n", 1030 | "|0528881469| 4.0|This is a great t...|0.9545454545454546|A28B1G1MSJ6OO1| 1280016000|\n", 1031 | "|0528881469| 3.0|Well, what can I ...|0.9555555555555556|A3N7T0DY83Y4IG| 1283990400|\n", 1032 | "|0528881469| 2.0|Not going to writ...| 0.9|A1H8PY3QHMQQA0| 1290556800|\n", 1033 | "|0528881469| 2.0|My brother is a t...| 0.71875| A2CPBQ5W4OGBX| 1277078400|\n", 1034 | "|0528881469| 4.0|This unit is a fa...| 1.0|A265MKAR2WEH3Y| 1294790400|\n", 1035 | "+----------+-------+--------------------+------------------+--------------+--------------+\n", 1036 | "only showing top 20 rows\n", 1037 | "\n" 1038 | ] 1039 | } 1040 | ], 1041 | "source": [ 1042 | "columnDF = appliedDF.select(appliedDF.asin,\n", 1043 | " appliedDF.overall,\n", 1044 | " appliedDF.reviewText,\n", 1045 | " appliedDF.helpful[0]/appliedDF.helpful[1],\n", 1046 | " appliedDF.reviewerID,\n", 1047 | " appliedDF.unixReviewTime).\\\n", 1048 | " withColumnRenamed('(helpful[0] / helpful[1])','helpful')\n", 1049 | "columnDF.show()" 1050 | ] 1051 | }, 1052 | { 1053 | "cell_type": "markdown", 1054 | "metadata": {}, 1055 | "source": [ 1056 | "

6.2. Missing Values

\n", 1057 | "\n", 1058 | "Similar to Pandas, DataFrames come equipped with functions to address missing data.\n", 1059 | "+ dropna function: can be used to remove observations with missing values\n", 1060 | "+ fillna function: can be used to fill missing values with a default value" 1061 | ] 1062 | }, 1063 | { 1064 | "cell_type": "code", 1065 | "execution_count": 29, 1066 | "metadata": { 1067 | "collapsed": false 1068 | }, 1069 | "outputs": [ 1070 | { 1071 | "name": "stdout", 1072 | "output_type": "stream", 1073 | "text": [ 1074 | "+----------+-------+--------------------+------------------+--------------+--------------+\n", 1075 | "| asin|overall| reviewText| helpful| reviewerID|unixReviewTime|\n", 1076 | "+----------+-------+--------------------+------------------+--------------+--------------+\n", 1077 | "|0132793040| 5.0|Corey Barker does...| 1.0| AKM1MP6P0OYPR| 1365811200|\n", 1078 | "|0321732944| 5.0|While many beginn...| 0.0|A2CX7LUOHB2NDG| 1341100800|\n", 1079 | "|0439886341| 1.0|It never worked. ...| 1.0|A2NWSAGRHCP8N5| 1367193600|\n", 1080 | "|0439886341| 3.0|Some of the funct...| 1.0|A2WNBOD3WNDNKT| 1374451200|\n", 1081 | "|0439886341| 1.0|Do not waste your...| 1.0|A1GI0U4ZRJA8WN| 1334707200|\n", 1082 | "|0511189877| 5.0|Dog got the old r...| 0.0|A1QGNMC6O1VW39| 1397433600|\n", 1083 | "|0511189877| 2.0|This remote, for ...| 1.0|A3J3BRHTDRFJ2G| 1397433600|\n", 1084 | "|0511189877| 5.0|We had an old Tim...| 0.0|A2TY0BTJOTENPG| 1395878400|\n", 1085 | "|0511189877| 5.0|This unit works j...| 0.0|A34ATBPOK6HCHY| 1395532800|\n", 1086 | "|0511189877| 5.0|It is an exact du...| 0.0| A89DO69P0XZ27| 1395446400|\n", 1087 | "|0511189877| 5.0|Works on my t.v. ...| 0.0| AZYNQZ94U6VDB| 1401321600|\n", 1088 | "|0528881469| 5.0|Love it has every...| 0.0|A1DA3W4GTFXP6O| 1405641600|\n", 1089 | "|0528881469| 1.0|I have owned two ...| 0.0|A29LPQQDG7LD5J| 1352073600|\n", 1090 | "|0528881469| 5.0|We got this GPS f...| 0.0| AO94DHGC771SJ| 1370131200|\n", 1091 | "|0528881469| 1.0|I'm a professiona...| 0.8| AMO214LNFCEI4| 1290643200|\n", 1092 | "|0528881469| 4.0|This is a great t...|0.9545454545454546|A28B1G1MSJ6OO1| 1280016000|\n", 1093 | "|0528881469| 3.0|Well, what can I ...|0.9555555555555556|A3N7T0DY83Y4IG| 1283990400|\n", 1094 | "|0528881469| 2.0|Not going to writ...| 0.9|A1H8PY3QHMQQA0| 1290556800|\n", 1095 | "|0528881469| 2.0|My brother is a t...| 0.71875| A2CPBQ5W4OGBX| 1277078400|\n", 1096 | "|0528881469| 4.0|This unit is a fa...| 1.0|A265MKAR2WEH3Y| 1294790400|\n", 1097 | "+----------+-------+--------------------+------------------+--------------+--------------+\n", 1098 | "only showing top 20 rows\n", 1099 | "\n" 1100 | ] 1101 | } 1102 | ], 1103 | "source": [ 1104 | "# get null observations out\n", 1105 | "densedDF=columnDF.dropna(subset=[\"overall\"]).fillna(0.0,subset=[\"helpful\"]) \n", 1106 | "densedDF.show()" 1107 | ] 1108 | }, 1109 | { 1110 | "cell_type": "markdown", 1111 | "metadata": {}, 1112 | "source": [ 1113 | "

6.3. Filtering rows

\n", 1114 | "\n", 1115 | "Filtering lets you select rows based on arguments. The implementation pattern is similar to filtering RDDs, But simpler. " 1116 | ] 1117 | }, 1118 | { 1119 | "cell_type": "code", 1120 | "execution_count": 30, 1121 | "metadata": { 1122 | "collapsed": false 1123 | }, 1124 | "outputs": [ 1125 | { 1126 | "name": "stdout", 1127 | "output_type": "stream", 1128 | "text": [ 1129 | "+----------+-------+--------------------+------------------+--------------+--------------+\n", 1130 | "| asin|overall| reviewText| helpful| reviewerID|unixReviewTime|\n", 1131 | "+----------+-------+--------------------+------------------+--------------+--------------+\n", 1132 | "|0132793040| 5.0|Corey Barker does...| 1.0| AKM1MP6P0OYPR| 1365811200|\n", 1133 | "|0321732944| 5.0|While many beginn...| 0.0|A2CX7LUOHB2NDG| 1341100800|\n", 1134 | "|0439886341| 3.0|Some of the funct...| 1.0|A2WNBOD3WNDNKT| 1374451200|\n", 1135 | "|0511189877| 5.0|Dog got the old r...| 0.0|A1QGNMC6O1VW39| 1397433600|\n", 1136 | "|0511189877| 5.0|We had an old Tim...| 0.0|A2TY0BTJOTENPG| 1395878400|\n", 1137 | "|0511189877| 5.0|This unit works j...| 0.0|A34ATBPOK6HCHY| 1395532800|\n", 1138 | "|0511189877| 5.0|It is an exact du...| 0.0| A89DO69P0XZ27| 1395446400|\n", 1139 | "|0511189877| 5.0|Works on my t.v. ...| 0.0| AZYNQZ94U6VDB| 1401321600|\n", 1140 | "|0528881469| 5.0|Love it has every...| 0.0|A1DA3W4GTFXP6O| 1405641600|\n", 1141 | "|0528881469| 5.0|We got this GPS f...| 0.0| AO94DHGC771SJ| 1370131200|\n", 1142 | "|0528881469| 4.0|This is a great t...|0.9545454545454546|A28B1G1MSJ6OO1| 1280016000|\n", 1143 | "|0528881469| 3.0|Well, what can I ...|0.9555555555555556|A3N7T0DY83Y4IG| 1283990400|\n", 1144 | "|0528881469| 4.0|This unit is a fa...| 1.0|A265MKAR2WEH3Y| 1294790400|\n", 1145 | "|0528881469| 5.0|I did a lot of co...| 1.0|A37K02NKUIT68K| 1293235200|\n", 1146 | "|0528881469| 4.0|I purchased this ...| 0.5|A2AW1SSVUIYV9Y| 1289001600|\n", 1147 | "|0528881469| 5.0|EXCELLENT. BEST T...|0.7142857142857143|A2AEHUKOV014BP| 1284249600|\n", 1148 | "|0528881469| 4.0|Well as one of th...| 1.0|A2O8FIJR9EBU56| 1278547200|\n", 1149 | "|0528881469| 4.0|Was fast and what...| 0.0| AYTBGUX49LF3W| 1398470400|\n", 1150 | "|0528881469| 5.0|We had the GPS fo...| 0.0|A1E4WG8HRWWK4R| 1390867200|\n", 1151 | "|0528881469| 5.0|Back in the old d...| 0.5|A2AOEW5UGXFOOQ| 1294790400|\n", 1152 | "+----------+-------+--------------------+------------------+--------------+--------------+\n", 1153 | "only showing top 20 rows\n", 1154 | "\n" 1155 | ] 1156 | } 1157 | ], 1158 | "source": [ 1159 | "filteredDF=densedDF.filter(densedDF.overall>=3)\n", 1160 | "filteredDF.show()" 1161 | ] 1162 | }, 1163 | { 1164 | "cell_type": "markdown", 1165 | "metadata": {}, 1166 | "source": [ 1167 | "

6.4. Grouping by overall scores

\n", 1168 | "\n", 1169 | "Grouping is equivalent to the groupByKey in the core RDD API. You can transform the grouped values using a summary action such as:\n", 1170 | "+ count\n", 1171 | "+ sum\n", 1172 | "+ average\n", 1173 | "+ max and so on ..." 1174 | ] 1175 | }, 1176 | { 1177 | "cell_type": "code", 1178 | "execution_count": 31, 1179 | "metadata": { 1180 | "collapsed": false 1181 | }, 1182 | "outputs": [ 1183 | { 1184 | "name": "stdout", 1185 | "output_type": "stream", 1186 | "text": [ 1187 | "+-------+-----+\n", 1188 | "|overall|count|\n", 1189 | "+-------+-----+\n", 1190 | "| 3.0| 2128|\n", 1191 | "| 5.0|18503|\n", 1192 | "| 4.0| 5324|\n", 1193 | "+-------+-----+\n", 1194 | "\n" 1195 | ] 1196 | } 1197 | ], 1198 | "source": [ 1199 | "grouped = filteredDF.groupBy(\"overall\").count()\n", 1200 | "grouped.show()" 1201 | ] 1202 | }, 1203 | { 1204 | "cell_type": "markdown", 1205 | "metadata": {}, 1206 | "source": [ 1207 | "

6.5. Joining DataFrames together

\n", 1208 | "\n", 1209 | "You can join two DataFrames together by using a common key." 1210 | ] 1211 | }, 1212 | { 1213 | "cell_type": "code", 1214 | "execution_count": 32, 1215 | "metadata": { 1216 | "collapsed": false 1217 | }, 1218 | "outputs": [ 1219 | { 1220 | "data": { 1221 | "text/plain": [ 1222 | "u'{\"asin\": \"0000037214\", \"title\": \"Purple Sequin Tiny Dancer Tutu Ballet Dance Fairy Princess Costume Accessory\", \"price\": 6.9900000000000002, \"imUrl\": \"http://ecx.images-amazon.com/images/I/31mCncNuAZL.jpg\", \"related\": {\"also_viewed\": [\"B00JO8II76\", \"B00DGN4R1Q\", \"B00E1YRI4C\"]}, \"salesRank\": {\"Clothing\": 1233557}, \"brand\": \"Big Dreams\", \"categories\": [[\"Clothing, Shoes & Jewelry\", \"Girls\"], [\"Clothing, Shoes & Jewelry\", \"Novelty, Costumes & More\", \"Costumes & Accessories\", \"More Accessories\", \"Kids & Baby\"]]}'" 1223 | ] 1224 | }, 1225 | "execution_count": 32, 1226 | "metadata": {}, 1227 | "output_type": "execute_result" 1228 | } 1229 | ], 1230 | "source": [ 1231 | "product_filepaths = 'Data/Products/*'\n", 1232 | "productRDD = sc.textFile(product_filepaths)\n", 1233 | "productRDD.first()" 1234 | ] 1235 | }, 1236 | { 1237 | "cell_type": "code", 1238 | "execution_count": 33, 1239 | "metadata": { 1240 | "collapsed": false 1241 | }, 1242 | "outputs": [ 1243 | { 1244 | "name": "stdout", 1245 | "output_type": "stream", 1246 | "text": [ 1247 | "+----------+--------------------+-----+--------------------+\n", 1248 | "| asin| title|price| categories|\n", 1249 | "+----------+--------------------+-----+--------------------+\n", 1250 | "|0000037214|Purple Sequin Tin...| 6.99|[WrappedArray(Clo...|\n", 1251 | "|0000032069|Adult Ballet Tutu...| 7.89|[WrappedArray(Spo...|\n", 1252 | "|0000031909|Girls Ballet Tutu...| 7.0|[WrappedArray(Spo...|\n", 1253 | "|0000032034|Adult Ballet Tutu...| 7.87|[WrappedArray(Spo...|\n", 1254 | "|0000031852|Girls Ballet Tutu...| 3.17|[WrappedArray(Spo...|\n", 1255 | "|0000032050|Adult Ballet Tutu...|12.85|[WrappedArray(Spo...|\n", 1256 | "|0000031887|Ballet Dress-Up F...| 6.79|[WrappedArray(Clo...|\n", 1257 | "|0000031895|Girls Ballet Tutu...| 2.99|[WrappedArray(Spo...|\n", 1258 | "|0123456479|SHINING IMAGE HUG...|64.98|[WrappedArray(Clo...|\n", 1259 | "|0132793040|Kelby Training DV...| null|[WrappedArray(Ele...|\n", 1260 | "|0188477284|Klean Kanteen Cla...| null|[WrappedArray(Spo...|\n", 1261 | "|0321732944|Kelby Training DV...| null|[WrappedArray(Ele...|\n", 1262 | "|0439886341|Digital Organizer...| 8.15|[WrappedArray(Ele...|\n", 1263 | "|0456844570|RiZ Women's Beaut...| null|[WrappedArray(Clo...|\n", 1264 | "|0456808574|Lantin White Viso...| null|[WrappedArray(Clo...|\n", 1265 | "|0456830197|NVC Unisex Light ...| null|[WrappedArray(Clo...|\n", 1266 | "|0456856293|Kismeth Eyewear C...| null|[WrappedArray(Clo...|\n", 1267 | "|0456840532|Max-MPH Black - L...| null|[WrappedArray(Clo...|\n", 1268 | "|0456787283|FX1 Small Adult A...| null|[WrappedArray(Clo...|\n", 1269 | "|0456838384|Riz Small Unisex ...| null|[WrappedArray(Clo...|\n", 1270 | "+----------+--------------------+-----+--------------------+\n", 1271 | "only showing top 20 rows\n", 1272 | "\n" 1273 | ] 1274 | } 1275 | ], 1276 | "source": [ 1277 | "# Load Dataset2 : Amazon Product information\n", 1278 | "# First, define Schema for second Dataset\n", 1279 | "PRODUCTS_SCHEMA_DEF = StructType([\n", 1280 | " StructField('asin', StringType(), True),\n", 1281 | " StructField('title', StringType(), True),\n", 1282 | " StructField('price', DoubleType(), True),\n", 1283 | " 
StructField('categories', ArrayType(ArrayType(\n", 1284 | " StringType(), True),True),True)\n", 1285 | " ])\n", 1286 | "\n", 1287 | "# Load the dataset\n", 1288 | "productDF = sqlContext.read.json(product_filepaths,PRODUCTS_SCHEMA_DEF)\n", 1289 | "productDF.show()\n", 1290 | "# productDF.first()" 1291 | ] 1292 | }, 1293 | { 1294 | "cell_type": "code", 1295 | "execution_count": 34, 1296 | "metadata": { 1297 | "collapsed": false 1298 | }, 1299 | "outputs": [ 1300 | { 1301 | "data": { 1302 | "text/plain": [ 1303 | "25566" 1304 | ] 1305 | }, 1306 | "execution_count": 34, 1307 | "metadata": {}, 1308 | "output_type": "execute_result" 1309 | } 1310 | ], 1311 | "source": [ 1312 | "enrichedReviews = filteredDF.join(productDF, productDF.asin==filteredDF.asin).dropna(subset=\"title\")\n", 1313 | "enrichedReviews.count()" 1314 | ] 1315 | }, 1316 | { 1317 | "cell_type": "markdown", 1318 | "metadata": {}, 1319 | "source": [ 1320 | "When you join two RDDs, you have to restructure the data into (k,V) pairs where the key is the join key. This may involve two additional map transformations. This is not necessary in DataFrames. " 1321 | ] 1322 | }, 1323 | { 1324 | "cell_type": "code", 1325 | "execution_count": 35, 1326 | "metadata": { 1327 | "collapsed": false 1328 | }, 1329 | "outputs": [ 1330 | { 1331 | "data": { 1332 | "text/plain": [ 1333 | "DataFrame[asin: string, overall: double, reviewText: string, helpful: double, reviewerID: string, unixReviewTime: bigint, asin: string, title: string, price: double, categories: array>]" 1334 | ] 1335 | }, 1336 | "execution_count": 35, 1337 | "metadata": {}, 1338 | "output_type": "execute_result" 1339 | } 1340 | ], 1341 | "source": [ 1342 | "enrichedReviews" 1343 | ] 1344 | }, 1345 | { 1346 | "cell_type": "code", 1347 | "execution_count": 36, 1348 | "metadata": { 1349 | "collapsed": false 1350 | }, 1351 | "outputs": [ 1352 | { 1353 | "name": "stdout", 1354 | "output_type": "stream", 1355 | "text": [ 1356 | "+----------+-------+--------------------+------------------+--------------+--------------+----------+--------------------+------+--------------------+\n", 1357 | "| asin|overall| reviewText| helpful| reviewerID|unixReviewTime| asin| title| price| categories|\n", 1358 | "+----------+-------+--------------------+------------------+--------------+--------------+----------+--------------------+------+--------------------+\n", 1359 | "|0132793040| 5.0|Corey Barker does...| 1.0| AKM1MP6P0OYPR| 1365811200|0132793040|Kelby Training DV...| null|[WrappedArray(Ele...|\n", 1360 | "|0321732944| 5.0|While many beginn...| 0.0|A2CX7LUOHB2NDG| 1341100800|0321732944|Kelby Training DV...| null|[WrappedArray(Ele...|\n", 1361 | "|0439886341| 3.0|Some of the funct...| 1.0|A2WNBOD3WNDNKT| 1374451200|0439886341|Digital Organizer...| 8.15|[WrappedArray(Ele...|\n", 1362 | "|0511189877| 5.0|Dog got the old r...| 0.0|A1QGNMC6O1VW39| 1397433600|0511189877|CLIKR-5 Time Warn...| 23.36|[WrappedArray(Ele...|\n", 1363 | "|0511189877| 5.0|We had an old Tim...| 0.0|A2TY0BTJOTENPG| 1395878400|0511189877|CLIKR-5 Time Warn...| 23.36|[WrappedArray(Ele...|\n", 1364 | "|0511189877| 5.0|This unit works j...| 0.0|A34ATBPOK6HCHY| 1395532800|0511189877|CLIKR-5 Time Warn...| 23.36|[WrappedArray(Ele...|\n", 1365 | "|0511189877| 5.0|It is an exact du...| 0.0| A89DO69P0XZ27| 1395446400|0511189877|CLIKR-5 Time Warn...| 23.36|[WrappedArray(Ele...|\n", 1366 | "|0511189877| 5.0|Works on my t.v. 
...| 0.0| AZYNQZ94U6VDB| 1401321600|0511189877|CLIKR-5 Time Warn...| 23.36|[WrappedArray(Ele...|\n", 1367 | "|0528881469| 5.0|Love it has every...| 0.0|A1DA3W4GTFXP6O| 1405641600|0528881469|Rand McNally 5288...|299.99|[WrappedArray(Ele...|\n", 1368 | "|0528881469| 5.0|We got this GPS f...| 0.0| AO94DHGC771SJ| 1370131200|0528881469|Rand McNally 5288...|299.99|[WrappedArray(Ele...|\n", 1369 | "|0528881469| 4.0|This is a great t...|0.9545454545454546|A28B1G1MSJ6OO1| 1280016000|0528881469|Rand McNally 5288...|299.99|[WrappedArray(Ele...|\n", 1370 | "|0528881469| 3.0|Well, what can I ...|0.9555555555555556|A3N7T0DY83Y4IG| 1283990400|0528881469|Rand McNally 5288...|299.99|[WrappedArray(Ele...|\n", 1371 | "|0528881469| 4.0|This unit is a fa...| 1.0|A265MKAR2WEH3Y| 1294790400|0528881469|Rand McNally 5288...|299.99|[WrappedArray(Ele...|\n", 1372 | "|0528881469| 5.0|I did a lot of co...| 1.0|A37K02NKUIT68K| 1293235200|0528881469|Rand McNally 5288...|299.99|[WrappedArray(Ele...|\n", 1373 | "|0528881469| 4.0|I purchased this ...| 0.5|A2AW1SSVUIYV9Y| 1289001600|0528881469|Rand McNally 5288...|299.99|[WrappedArray(Ele...|\n", 1374 | "|0528881469| 5.0|EXCELLENT. BEST T...|0.7142857142857143|A2AEHUKOV014BP| 1284249600|0528881469|Rand McNally 5288...|299.99|[WrappedArray(Ele...|\n", 1375 | "|0528881469| 4.0|Well as one of th...| 1.0|A2O8FIJR9EBU56| 1278547200|0528881469|Rand McNally 5288...|299.99|[WrappedArray(Ele...|\n", 1376 | "|0528881469| 4.0|Was fast and what...| 0.0| AYTBGUX49LF3W| 1398470400|0528881469|Rand McNally 5288...|299.99|[WrappedArray(Ele...|\n", 1377 | "|0528881469| 5.0|We had the GPS fo...| 0.0|A1E4WG8HRWWK4R| 1390867200|0528881469|Rand McNally 5288...|299.99|[WrappedArray(Ele...|\n", 1378 | "|0528881469| 5.0|Back in the old d...| 0.5|A2AOEW5UGXFOOQ| 1294790400|0528881469|Rand McNally 5288...|299.99|[WrappedArray(Ele...|\n", 1379 | "+----------+-------+--------------------+------------------+--------------+--------------+----------+--------------------+------+--------------------+\n", 1380 | "only showing top 20 rows\n", 1381 | "\n" 1382 | ] 1383 | } 1384 | ], 1385 | "source": [ 1386 | "enrichedReviews.show()" 1387 | ] 1388 | }, 1389 | { 1390 | "cell_type": "markdown", 1391 | "metadata": {}, 1392 | "source": [ 1393 | "

7. Saving your DataFrame

\n", 1394 | "\n", 1395 | "Now that we have done some operations on the data, we can save the file for later use. Standard data formats are a great way to opening up valuable data to your entire organization. Spark DataFrames can be saved in many different formats including and not limited to JSON, parquet, Hive and etc... " 1396 | ] 1397 | }, 1398 | { 1399 | "cell_type": "code", 1400 | "execution_count": 37, 1401 | "metadata": { 1402 | "collapsed": false 1403 | }, 1404 | "outputs": [ 1405 | { 1406 | "name": "stdout", 1407 | "output_type": "stream", 1408 | "text": [ 1409 | "ERROR !!\n" 1410 | ] 1411 | } 1412 | ], 1413 | "source": [ 1414 | "try:\n", 1415 | " columnDF.write.parquet('Data/Outputs/reviews_filtered.parquet')\n", 1416 | " print \"Saved as parquet successfully\"\n", 1417 | "except:\n", 1418 | " print \"ERROR !!\"\n", 1419 | "\n" 1420 | ] 1421 | }, 1422 | { 1423 | "cell_type": "markdown", 1424 | "metadata": {}, 1425 | "source": [ 1426 | "

8. Using Spark SQL

\n", 1427 | "\n", 1428 | "Spark DataFrames also allow you to use Spark SQL to query petabytes of data. Spark comes with a SQL-like query language which can be used to query distributed DataFrames. A key advantage of using Spark SQL is that the [Catalyst query optimizer](https://databricks.com/blog/2015/04/13/deep-dive-into-spark-sqls-catalyst-optimizer.html \"Catalyst\") under the hood transforms your SQL query so that it runs as efficiently as possible. \n", 1429 | "\n", 1430 | "

8.1. Example Queries

\n", 1431 | "\n", 1432 | "Spark SQL can leverage the same functionality as the DataFrame API provides. In fact, it provides more functionality via SQL capabilities and HQL capabilities that are available to Spark SQL environment. \n", 1433 | "\n", 1434 | "For the sake of time constrains, I will explain different functions available in Spark SQL environment by using examples that use multiple functions. This will benefit by:\n", 1435 | "+ Covering many functions that are possible via spark SQL\n", 1436 | "+ Giving an understanding about how to pipe multiple functions together\n" 1437 | ] 1438 | }, 1439 | { 1440 | "cell_type": "code", 1441 | "execution_count": 38, 1442 | "metadata": { 1443 | "collapsed": false 1444 | }, 1445 | "outputs": [ 1446 | { 1447 | "name": "stdout", 1448 | "output_type": "stream", 1449 | "text": [ 1450 | "There are 30000 reviews about 2469 products\n" 1451 | ] 1452 | } 1453 | ], 1454 | "source": [ 1455 | "# Read the reviews parquet file\n", 1456 | "reviewsDF = sqlContext.read.parquet('Data/Outputs/reviews_filtered.parquet')\n", 1457 | "\n", 1458 | "# Register the DataFrames to be used in sql\n", 1459 | "reviewsDF.registerTempTable(\"reviews\")\n", 1460 | "productDF.registerTempTable(\"products\")\n", 1461 | "\n", 1462 | "print 'There are {0} reviews about {1} products'.format(reviewsDF.count(),productDF.count())" 1463 | ] 1464 | }, 1465 | { 1466 | "cell_type": "code", 1467 | "execution_count": 39, 1468 | "metadata": { 1469 | "collapsed": false 1470 | }, 1471 | "outputs": [ 1472 | { 1473 | "name": "stdout", 1474 | "output_type": "stream", 1475 | "text": [ 1476 | "+----------+-------+--------------------+------+\n", 1477 | "| asin|overall| reviewText| price|\n", 1478 | "+----------+-------+--------------------+------+\n", 1479 | "|0528881469| 5.0|Love it has every...|299.99|\n", 1480 | "|0528881469| 1.0|I have owned two ...|299.99|\n", 1481 | "|0528881469| 5.0|We got this GPS f...|299.99|\n", 1482 | "|0528881469| 1.0|I'm a professiona...|299.99|\n", 1483 | "|0528881469| 4.0|This is a great t...|299.99|\n", 1484 | "|0528881469| 3.0|Well, what can I ...|299.99|\n", 1485 | "|0528881469| 2.0|Not going to writ...|299.99|\n", 1486 | "|0528881469| 2.0|My brother is a t...|299.99|\n", 1487 | "|0528881469| 4.0|This unit is a fa...|299.99|\n", 1488 | "|0528881469| 5.0|I did a lot of co...|299.99|\n", 1489 | "|0528881469| 4.0|I purchased this ...|299.99|\n", 1490 | "|0528881469| 5.0|EXCELLENT. BEST T...|299.99|\n", 1491 | "|0528881469| 1.0|I was real psyche...|299.99|\n", 1492 | "|0528881469| 4.0|Well as one of th...|299.99|\n", 1493 | "|0528881469| 1.0|Thought the unit ...|299.99|\n", 1494 | "|0528881469| 4.0|Was fast and what...|299.99|\n", 1495 | "|0528881469| 2.0|Twice this item h...|299.99|\n", 1496 | "|0528881469| 1.0|DONT WAIST YOUR M...|299.99|\n", 1497 | "|0528881469| 5.0|We had the GPS fo...|299.99|\n", 1498 | "|0528881469| 5.0|Back in the old d...|299.99|\n", 1499 | "+----------+-------+--------------------+------+\n", 1500 | "only showing top 20 rows\n", 1501 | "\n" 1502 | ] 1503 | } 1504 | ], 1505 | "source": [ 1506 | "sql_query = \"\"\"SELECT reviews.asin, overall, reviewText, price\n", 1507 | " FROM reviews JOIN products ON reviews.asin=products.asin\n", 1508 | " WHERE price > 50.00\n", 1509 | "\"\"\"\n", 1510 | "\n", 1511 | "result = sqlContext.sql(sql_query)\n", 1512 | "result.show()" 1513 | ] 1514 | }, 1515 | { 1516 | "cell_type": "markdown", 1517 | "metadata": {}, 1518 | "source": [ 1519 | "
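As one more illustrative query (not part of the original notebook), the registered reviews and products tables can be joined and aggregated in a single statement, which is where combining several SQL functions starts to pay off:

# Average rating and number of reviews per product title.
sql_query_agg = """SELECT products.title, COUNT(*) AS numReviews, AVG(overall) AS avgRating
                   FROM reviews JOIN products ON reviews.asin = products.asin
                   GROUP BY products.title
"""

result_agg = sqlContext.sql(sql_query_agg)
result_agg.show()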

Optional: User Defined Functions

\n", 1520 | "\n", 1521 | "Spark SQL also provides the functionality similar to User Defined Functions (UDF) offering in Hive. Spark uses registerFunction() function to register python functions in SQLContext." 1522 | ] 1523 | }, 1524 | { 1525 | "cell_type": "code", 1526 | "execution_count": 40, 1527 | "metadata": { 1528 | "collapsed": false 1529 | }, 1530 | "outputs": [ 1531 | { 1532 | "name": "stdout", 1533 | "output_type": "stream", 1534 | "text": [ 1535 | "+----------+--------------------+--------------------+\n", 1536 | "| asin| reviewText| cleaned|\n", 1537 | "+----------+--------------------+--------------------+\n", 1538 | "|0528881469|Love it has every...|love it has every...|\n", 1539 | "|0528881469|I have owned two ...|i have owned two ...|\n", 1540 | "|0528881469|We got this GPS f...|we got this gps f...|\n", 1541 | "|0528881469|I'm a professiona...|im a professional...|\n", 1542 | "|0528881469|This is a great t...|this is a great t...|\n", 1543 | "|0528881469|Well, what can I ...|well what can i s...|\n", 1544 | "|0528881469|Not going to writ...|not going to writ...|\n", 1545 | "|0528881469|My brother is a t...|my brother is a t...|\n", 1546 | "|0528881469|This unit is a fa...|this unit is a fa...|\n", 1547 | "|0528881469|I did a lot of co...|i did a lot of co...|\n", 1548 | "|0528881469|I purchased this ...|i purchased this ...|\n", 1549 | "|0528881469|EXCELLENT. BEST T...|excellent best tr...|\n", 1550 | "|0528881469|I was real psyche...|i was real psyche...|\n", 1551 | "|0528881469|Well as one of th...|well as one of th...|\n", 1552 | "|0528881469|Thought the unit ...|thought the unit ...|\n", 1553 | "|0528881469|Was fast and what...|was fast and what...|\n", 1554 | "|0528881469|Twice this item h...|twice this item h...|\n", 1555 | "|0528881469|DONT WAIST YOUR M...|dont waist your m...|\n", 1556 | "|0528881469|We had the GPS fo...|we had the gps fo...|\n", 1557 | "|0528881469|Back in the old d...|back in the old d...|\n", 1558 | "+----------+--------------------+--------------------+\n", 1559 | "only showing top 20 rows\n", 1560 | "\n" 1561 | ] 1562 | } 1563 | ], 1564 | "source": [ 1565 | "import re\n", 1566 | "\n", 1567 | "def transform_review(review):\n", 1568 | " x1 = re.sub('[^0-9a-zA-Z\\s]+','',review)\n", 1569 | " return x1.lower()\n", 1570 | "\n", 1571 | "result.registerTempTable(\"result\")\n", 1572 | "sqlContext.registerFunction(\"to_lowercase\", lambda x:transform_review(x), returnType=StringType())\n", 1573 | "\n", 1574 | "sql_query_transform = \"\"\"SELECT asin, reviewText, to_lowercase(reviewText) as cleaned\n", 1575 | " FROM result\n", 1576 | "\"\"\"\n", 1577 | "\n", 1578 | "result_transform = sqlContext.sql(sql_query_transform)\n", 1579 | "result_transform.show()" 1580 | ] 1581 | }, 1582 | { 1583 | "cell_type": "markdown", 1584 | "metadata": {}, 1585 | "source": [ 1586 | "
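The same cleaning function can also be applied through the DataFrame API rather than SQL, using pyspark.sql.functions.udf. A minimal sketch, assuming the result DataFrame and the transform_review() function defined above:

from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

# Wrap the python function as a column expression and add a cleaned column.
to_lowercase_udf = udf(transform_review, StringType())
result.withColumn('cleaned', to_lowercase_udf(result.reviewText)).show()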

Optional: Mix and Match!!

\n", 1587 | "\n", 1588 | "You can also mix DataFrames, RDDs and Spark SQL in the same pipeline, picking whichever API suits each step. \n", 1589 | "\n", 1590 | "

Scenario

\n", 1591 | "\n", 1592 | "We want to investigate the average rating of reviews in terms of the categories they belong to. In order to do this, we:\n", 1593 | "+ query the needed data using DataFrames API\n", 1594 | "+ classify the reviews into different categories using core RDD API\n", 1595 | "+ query the avearage rating for each category using Spark SQL" 1596 | ] 1597 | }, 1598 | { 1599 | "cell_type": "code", 1600 | "execution_count": null, 1601 | "metadata": { 1602 | "collapsed": false 1603 | }, 1604 | "outputs": [ 1605 | { 1606 | "name": "stdout", 1607 | "output_type": "stream", 1608 | "text": [ 1609 | "+----------+--------------------+-----------+\n", 1610 | "| asin| review| category|\n", 1611 | "+----------+--------------------+-----------+\n", 1612 | "|0528881469|Love it has every...|electronics|\n", 1613 | "|0528881469|I have owned two ...|electronics|\n", 1614 | "|0528881469|We got this GPS f...|electronics|\n", 1615 | "|0528881469|I'm a professiona...|electronics|\n", 1616 | "|0528881469|This is a great t...|electronics|\n", 1617 | "|0528881469|Well, what can I ...|electronics|\n", 1618 | "|0528881469|Not going to writ...|electronics|\n", 1619 | "|0528881469|My brother is a t...|electronics|\n", 1620 | "|0528881469|This unit is a fa...| sports|\n", 1621 | "|0528881469|I did a lot of co...|electronics|\n", 1622 | "|0528881469|I purchased this ...|electronics|\n", 1623 | "|0528881469|EXCELLENT. BEST T...|electronics|\n", 1624 | "|0528881469|I was real psyche...|electronics|\n", 1625 | "|0528881469|Well as one of th...|electronics|\n", 1626 | "|0528881469|Thought the unit ...|electronics|\n", 1627 | "|0528881469|Was fast and what...| sports|\n", 1628 | "|0528881469|Twice this item h...| sports|\n", 1629 | "|0528881469|DONT WAIST YOUR M...|electronics|\n", 1630 | "|0528881469|We had the GPS fo...|electronics|\n", 1631 | "|0528881469|Back in the old d...|electronics|\n", 1632 | "+----------+--------------------+-----------+\n", 1633 | "only showing top 20 rows\n", 1634 | "\n" 1635 | ] 1636 | } 1637 | ], 1638 | "source": [ 1639 | "import sklearn\n", 1640 | "import cPickle\n", 1641 | "\n", 1642 | "from pyspark.sql import Row\n", 1643 | "\n", 1644 | "model = cPickle.load(open('Data/classifiers/classifier.pkl', 'r'))\n", 1645 | "classifier_b = sc.broadcast(model)\n", 1646 | "\n", 1647 | "classifiedRDD = result_transform.filter(\"cleaned <> ''\")\\\n", 1648 | " .map(lambda row: \n", 1649 | " (row.asin,row.reviewText,str(classifier_b.value.predict([row.reviewText])[0]))\n", 1650 | " )\n", 1651 | "\n", 1652 | "CLASSIFIED_SCHEMA = StructType([\n", 1653 | " StructField('asin', StringType(), True),\n", 1654 | " StructField('review', StringType(), True),\n", 1655 | " StructField('category', StringType(), True)\n", 1656 | " ])\n", 1657 | "\n", 1658 | "classifiedDF = sqlContext.createDataFrame(classifiedRDD,CLASSIFIED_SCHEMA)\n", 1659 | "\n", 1660 | "classifiedDF.show()" 1661 | ] 1662 | }, 1663 | { 1664 | "cell_type": "code", 1665 | "execution_count": null, 1666 | "metadata": { 1667 | "collapsed": false 1668 | }, 1669 | "outputs": [], 1670 | "source": [ 1671 | "classifiedDF.registerTempTable('enrichedReviews')\n", 1672 | "\n", 1673 | "sql_query_test = \"\"\"SELECT category, avg(overall) as avgRating\n", 1674 | " FROM reviews \n", 1675 | " JOIN products ON reviews.asin=products.asin \n", 1676 | " JOIN enrichedReviews ON products.asin=enrichedReviews.asin\n", 1677 | " WHERE price > 50.0\n", 1678 | " GROUP BY enrichedReviews.category\n", 1679 | "\"\"\"\n", 1680 | "\n", 1681 | "resultTest = 
sqlContext.sql(sql_query_test)\n", 1682 | "resultTest.show()" 1683 | ] 1684 | } 1685 | ], 1686 | "metadata": { 1687 | "kernelspec": { 1688 | "display_name": "pySpark (Spark 1.4.0)", 1689 | "language": "python", 1690 | "name": "pyspark" 1691 | }, 1692 | "language_info": { 1693 | "codemirror_mode": { 1694 | "name": "ipython", 1695 | "version": 2 1696 | }, 1697 | "file_extension": ".py", 1698 | "mimetype": "text/x-python", 1699 | "name": "python", 1700 | "nbconvert_exporter": "python", 1701 | "pygments_lexer": "ipython2", 1702 | "version": "2.7.7" 1703 | } 1704 | }, 1705 | "nbformat": 4, 1706 | "nbformat_minor": 0 1707 | } 1708 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy==1.10.4 2 | scipy==0.17.0 3 | sklearn 4 | ipython==5.1.0 5 | jupyter==1.0.0 -------------------------------------------------------------------------------- /spark_tutorial.py: -------------------------------------------------------------------------------- 1 | import json 2 | from math import exp 3 | from datetime import datetime 4 | import sklearn 5 | import pickle 6 | import re 7 | from pyspark.sql import SQLContext # You need SQL context 8 | from pyspark.sql.types import * # Export the type modules for schema 9 | from pyspark.sql import Row 10 | 11 | 12 | # Instanciate SQL Context 13 | sqlContext = SQLContext(sc) 14 | 15 | ############################################# SPARK CORE ############################################# 16 | 17 | #### PART 1: Creating an RDD 18 | 19 | # We start by creating the 3 RDDs from the different datasets from Amazon product reviews. 20 | # Note that it does not move the data at this stage due to the lazy evaluation nature. 21 | fashion = sc.textFile('Data/Reviews/fashion.json') 22 | electronics = sc.textFile('Data/Reviews/electronics.json') 23 | sports = sc.textFile('Data/Reviews/sports.json') 24 | 25 | # Example of a basic transformation 26 | print "Result 1: ", fashion.map(lambda x: len(x)) 27 | 28 | # Example of an action: 29 | print "Result 2: ", fashion.count() 30 | 31 | # Let's do some data exploration. 32 | print "fashion has {0} rows, electronics {1} rows and sports {2} rows".format(fashion.count(), electronics.count(), sports.count()) 33 | print "fashion first row:" 34 | fashion.first() 35 | 36 | # We can union them. 37 | union_of_rdds = fashion.union(electronics).union(sports) 38 | print union_of_rdds.first() 39 | 40 | # We can now parse the file using the json library. 41 | parsed_fashion = fashion.map(lambda x: json.loads(x)) 42 | parsed_fashion.first() 43 | 44 | # Another way of loading files is by using a list of comma-separated file paths or a wildcard. 45 | data = sc.textFile('Data/Reviews/fashion.json,Data/Reviews/electronics.json,Data/Reviews/sports.json').map(lambda x: json.loads(x)) 46 | 47 | # QUESTION: How many partitions does the rdd have? 48 | data.getNumPartitions() 49 | 50 | # Now let's imagine we want to know the number of lines in each partition. 51 | # For that, we need to access the data in each single partition and run operations on them instead of on each row. 52 | # For this, we will use mapPartitionsWithIndex which takes a partition index and an iterator over the data as arguments. 53 | # Each function in the API is documented in: https://spark.apache.org/docs/1.3.1/api/python/pyspark.html#pyspark.RDD. 
54 | indexed_data = data.mapPartitionsWithIndex(lambda splitIndex, it: [(splitIndex, len([x for x in it]))]) 55 | indexed_data.collect() 56 | 57 | #### PART 2: Reducers 58 | 59 | # The next thing we have been tasked to do is to get the minimum and maximum number of reviews per product. 60 | 61 | product_num = data.map(lambda x: (x['asin'], 1)).reduceByKey(lambda x,y: x+y) 62 | # The rdd product_num will contain (product_asin, total_number_reviews) 63 | 64 | # What are the maximum and minimum number of reviews? 65 | max_num = product_num.map(lambda x: x[1]).max() 66 | min_num = product_num.map(lambda x: x[1]).min() 67 | 68 | print "Max number of reviews is {0}, min number of reviews is {1}".format(max_num, min_num) 69 | 70 | # EXERCISE: what is the max score for each product? 71 | 72 | #### PART 3: Joining multiple data sources 73 | 74 | # We want to join the product reviews by users to the product metadata. 75 | product_metadata = sc.textFile('data/sample_metadata.json').map(lambda x: json.loads(x)) 76 | print product_metadata.first() 77 | 78 | # The categories are a list of lists, so we will make it easier to handle by 'flattening them out'. 79 | def flatten_categories(line): 80 | old_cats = line['categories'] 81 | line['categories'] = [item for sublist in old_cats for item in sublist] 82 | return line 83 | 84 | product_metadata = product_metadata.map(flatten_categories) 85 | print product_metadata.first() 86 | 87 | # We want to join the review data to the metadata about the product. 88 | # We can use the 'asin' for that, which is a unique identifier for each product. 89 | # In order to do a join, we need to turn each structure into key-value pairs. 90 | key_val_data = data.map(lambda x: (x['asin'], x)) 91 | key_val_metadata = product_metadata.map(lambda x: (x['asin'], x)) 92 | 93 | print "We are joining {0} product reviews to {1} rows of metadata information about the products.\n".format(key_val_data.count(),key_val_metadata.count()) 94 | print "First row of key_val_data:" 95 | print key_val_data.first() 96 | 97 | print "number partitions key_val_data: ", 98 | print key_val_data.getNumPartitions() 99 | print "number partitions key_val_metadata: ", 100 | print key_val_metadata.getNumPartitions() 101 | joined = key_val_data.join(key_val_metadata) 102 | 103 | key, (review, product) = joined.first() 104 | print "For key {0}:\n\nthe review is {1}\n\nthe product metadata is {2}.\n".format(key, review, product) 105 | 106 | # What is the number of output partitions of the join? To understand this, 107 | # the best is to refer back to the Pyspark source code: 108 | # https://github.com/apache/spark/blob/branch-1.3/python/pyspark/join.py 109 | 110 | # QUESTION: what is the number of partitions in joined? 111 | print "This RDD has {0} partitions.".format(joined.getNumPartitions()) 112 | 113 | # To make it easier to manipulate, we will change the structure of the joined rdd to be a single dictionary. 114 | def merge_dictionaries(metadata_line, review_line): 115 | new_dict = review_line 116 | new_dict.update(metadata_line) 117 | return new_dict 118 | 119 | nice_joined = joined.map(lambda x: merge_dictionaries(x[1][0], x[1][1])) 120 | row0, row1 = nice_joined.take(2) 121 | 122 | print "row 0:\n\n{0}\n\nrow 1:\n\n{1}\n".format(row0, row1) 123 | 124 | #### PART 4: GroupByKey 125 | 126 | # Now that we have joined two data sources, we can start doing some ad-hoc analysis of the data! 127 | # Now the task is to get the average product review length for each category. 
The categories are encoded as a list of 128 | # categories, so we first need to 'flatten them out'. 129 | 130 | nice_joined.cache() 131 | nice_joined.count() 132 | 133 | original_categories = nice_joined.map(lambda x: x['categories']) 134 | flat_categories = nice_joined.flatMap(lambda x: x['categories']) 135 | 136 | print "original_categories.take(5):\n" 137 | print '\n'.join([str(x) for x in original_categories.take(5)]) + '\n' 138 | 139 | print "flat_categories.take(5):\n" 140 | print '\n'.join([str(x) for x in flat_categories.take(5)]) + '\n' 141 | 142 | num_categories = flat_categories.distinct().count() 143 | print "There are {0} distinct categories.".format(num_categories) 144 | 145 | # Next, in order to get the average review length across all categories, we will use a new function: groupByKey! 146 | 147 | category_review = nice_joined.flatMap(lambda x: [(y, len(x['reviewText'])) for y in x['categories']]) 148 | print "After the flatMap: " + str(category_review.first()) 149 | print "After the groupByKey: " + str(category_review.groupByKey().map(lambda x: (x[0], list(x[1]))).first()) 150 | print 151 | 152 | grouped_category_review = category_review.groupByKey().map(lambda x: (x[0], sum(x[1])/float(len(x[1])))) 153 | print "grouped_category_review.first(): " + str(grouped_category_review.first()) + '\n' 154 | 155 | ### Now we can sort the categories by average product review length 156 | print "The top 10 categories are: " + str(sorted(grouped_category_review.collect(), key=lambda x: x[1], reverse=True)[:10]) 157 | 158 | # EXERCISE: Do the same thing, but this time you are not allowed to use groupByKey()! 159 | 160 | #### Optional: Data skewness 161 | def get_part_index(splitIndex, iterator): 162 | for it in iterator: 163 | yield (splitIndex, it) 164 | 165 | def count_elements(splitIndex, iterator): 166 | n = sum(1 for _ in iterator) 167 | yield (splitIndex, n) 168 | 169 | print "***Creating the large rdd***" 170 | num_parts = 16 171 | # create the large skewed rdd 172 | skewed_large_rdd = sc.parallelize(range(0,num_parts), num_parts).flatMap(lambda x: range(0, int(exp(x)))).mapPartitionsWithIndex(lambda ind, x: get_part_index(ind, x)).cache() 173 | print "first 5 items:" + str(skewed_large_rdd.take(5)) 174 | print "num rows: " + str(skewed_large_rdd.count()) 175 | print "num partitions: " + str(skewed_large_rdd.getNumPartitions()) 176 | print "The distribution of elements per partition is " + str(skewed_large_rdd.mapPartitionsWithIndex(lambda ind, x: count_elements(ind, x)).collect()) 177 | print 178 | 179 | print "***Creating the small rdd***" 180 | small_rdd = sc.parallelize(range(0,num_parts), num_parts).map(lambda x: (x, x)) 181 | print "first 5 items:" + str(small_rdd.take(5)) 182 | print "num rows: " + str(small_rdd.count()) 183 | print "num partitions: " + str(small_rdd.getNumPartitions()) 184 | print "The distribution of elements per partition is " + str(small_rdd.mapPartitionsWithIndex(lambda ind, x: count_elements(ind, x)).collect()) 185 | 186 | print 187 | 188 | print "Joining them" 189 | t0 = datetime.now() 190 | result = skewed_large_rdd.leftOuterJoin(small_rdd) 191 | result.count() 192 | print "The direct join takes %s"%(str(datetime.now() - t0)) 193 | print "The joined rdd has {0} partitions and {1} rows".format(result.getNumPartitions(), result.count()) 194 | 195 | #### Optional: Integrating Spark with popular Python libraries 196 | 197 | model = pickle.load(open('data/classifier.pkl', 'r')) 198 | model_b = sc.broadcast(model) 199 | fashion.map(lambda x: 
eval(x)['reviewText']).map(lambda x: (x, model_b.value.predict([x])[0])).first() 200 | 201 | ################################### Spark DataFrame API and Spark SQL ################################### 202 | 203 | # Part 5 : Loading data to spark 204 | # We start by loading the files to spark 205 | # First, load them as text file to validate 206 | review_filepaths = 'Data/Reviews/*' 207 | textRDD = sc.textFile(review_filepaths) 208 | print 'number of reviews : {0}'.format(textRDD.count()) 209 | print 'sample row : \n{0}'.format(textRDD.first()) 210 | 211 | # You can let spark infer the schema of your DataFrame 212 | inferredDF = sqlContext.read.json(review_filepaths) 213 | inferredDF.first() 214 | 215 | # Or you can programmatically tell spark how the schema looks like 216 | # Define Schema 217 | REVIEWS_SCHEMA_DEF = StructType([ 218 | StructField('reviewerID', StringType(), True), 219 | StructField('asin', StringType(), True), 220 | StructField('reviewerName', StringType(), True), 221 | StructField('helpful', ArrayType( 222 | IntegerType(), True), 223 | True), 224 | StructField('reviewText', StringType(), True), 225 | StructField('reviewTime', StringType(), True), 226 | StructField('overall', DoubleType(), True) 227 | ]) 228 | # View schema definition 229 | print REVIEWS_SCHEMA_DEF 230 | 231 | # Apply schema to data 232 | appliedDF = sqlContext.read.json(review_filepaths,schema=REVIEWS_SCHEMA_DEF) 233 | appliedDF.first() 234 | 235 | 236 | # Part 6: DataFrame Operations 237 | 238 | # Spark DataFrame API allow you to do multiple operations on the Data. The primary advantage of using the DataFrame API is that you can do data transoformations with the high level API without having to use Python. Using the high level API has its advantages which will be explained later in the tutorial. 239 | 240 | # DataFrame API have functionality similar to that of Core RDD API. For example: 241 | # + map : foreach, Select 242 | # + mapPartition : foreachPartition 243 | # + filter : filter 244 | # + groupByKey, reduceByKey : groupBy 245 | 246 | # 6.1 Selecting columns 247 | 248 | # You can use SELECT statement to select columns from your dataframe 249 | 250 | columnDF = appliedDF.select(appliedDF.asin, 251 | appliedDF.overall, 252 | appliedDF.reviewText, 253 | appliedDF.reviewerID, 254 | appliedDF.unixReviewTime) 255 | columnDF.show() 256 | 257 | # 6.2 Missing Values 258 | 259 | # Similar to Pandas, DataFrames come equipped with functions to address missing data. 260 | # + dropna function: can be used to remove observations with missing values 261 | # + fillna function: can be used to fill missing values with a default value 262 | 263 | # get null observations out 264 | densedDF=columnDF.dropna(subset=["overall"]).fillna(0.0,subset=["helpful"]) 265 | densedDF.show() 266 | 267 | # 6.3 Filtering Rows 268 | 269 | # filter keywords allow you to filter rows in DFs 270 | filteredDF=densedDF.filter(densedDF.overall>=3) 271 | filteredDF.show() 272 | # CODE WILL BE SHARED DURING THE TUTORIAL AS THIS IS PART OF AN EXERCISE 273 | 274 | # 6.5 group by 275 | 276 | # Grouping is equivalent to the groupByKey in the core RDD API. You can transform the grouped values using a summary action such as: 277 | # + count 278 | # + sum 279 | # + average 280 | # + max and so on ... 281 | 282 | grouped = filteredDF.groupBy("overall").count() 283 | grouped.show() 284 | 285 | 286 | # 6.5. 
Joining DataFrames together 287 | 288 | # first, load the product dataset 289 | product_filepaths = 'Data/Products/*' 290 | productRDD = sc.textFile(product_filepaths) 291 | productRDD.first() 292 | 293 | # Load it as a dataframe 294 | # Load Dataset2 : Amazon Product information 295 | # First, define Schema for second Dataset 296 | PRODUCTS_SCHEMA_DEF = StructType([ 297 | StructField('asin', StringType(), True), 298 | StructField('title', StringType(), True), 299 | StructField('price', DoubleType(), True), 300 | StructField('categories', ArrayType(ArrayType( 301 | StringType(), True),True),True), 302 | StructField('related', MapType(StringType(), ArrayType( 303 | StringType(), True),True)), 304 | StructField('imUrl', StringType(), True), 305 | StructField('salesRank', MapType(StringType(), IntegerType(), True),True) 306 | ]) 307 | 308 | # Load the dataset 309 | productDF = sqlContext.read.json(product_filepaths,PRODUCTS_SCHEMA_DEF) 310 | # productDF.show() 311 | # productDF.first() 312 | 313 | """ 314 | *QUESTION*: What do you think will happen if we remove some fields from this schema? 315 | 316 | 1. The schema fails 317 | 2. The schema works fine 318 | 319 | ANSWER??? 320 | 321 | Now lets join the two datasets 322 | """ 323 | 324 | enrichedReviews = filteredDF.join(productDF, productDF.asin==filteredDF.asin).dropna(subset="title") 325 | enrichedReviews.count() 326 | 327 | enrichedReviews.show() 328 | 329 | 330 | # 7. Saving your DataFrame 331 | # Now that we have done some operations on the data, we can save the file for later use. Standard data formats are a 332 | # great way to opening up valuable data to your entire organization. Spark DataFrames can be saved in many different 333 | # formats including and not limited to JSON, parquet, Hive and etc... 334 | 335 | try: 336 | columnDF.write.parquet('Data/Outputs/reviews_filtered.parquet') 337 | except: 338 | pass 339 | 340 | print "Saved as parquet successfully" 341 | 342 | 343 | # 8. Using Spark SQL 344 | 345 | # Spark DataFrames also allow you to use Spark SQL to query from Petabytes of data. Spark comes with a SQL like query 346 | # language which can be used to query from Distributed DataFrames. A key advantage of using Spark SQL is that the 347 | # (https://databricks.com/blog/2015/04/13/deep-dive-into-spark-sqls-catalyst-optimizer.html) under the hood transforms 348 | # your SQL query to run it most efficiently. 349 | 350 | # Spark SQL can leverage the same functionality as the DataFrame API provides. In fact, it provides more functionality via SQL capabilities and HQL capabilities that are available to Spark SQL environment. 351 | 352 | # For the sake of time constrains, I will explain different functions available in Spark SQL environment by using examples that use multiple functions. 
This will benefit by: 353 | # + Covering many functions that are possible via spark SQL 354 | # + Giving an understanding about how to pipe multiple functions together 355 | 356 | # Read the reviews parquet file 357 | reviewsDF = sqlContext.read.parquet('Data/Outputs/reviews_filtered.parquet') 358 | 359 | # Register the DataFrames to be used in sql 360 | reviewsDF.registerAsTable("reviews") 361 | productDF.registerAsTable("products") 362 | 363 | print 'There are {0} reviews about {1} products'.format(reviewsDF.count(),productDF.count()) 364 | 365 | 366 | # NOW LET'S RUN A SQL QUERY 367 | 368 | sql_query = """SELECT reviews.asin, overall, reviewText, price 369 | FROM reviews JOIN products ON reviews.asin=products.asin 370 | WHERE price > 50.00 371 | """ 372 | 373 | result = sqlContext.sql(sql_query) 374 | result.show() 375 | 376 | # User defined functions 377 | # Spark SQL also provides the functionality similar to User Defined Functions (UDF) offering in Hive. 378 | # Spark uses registerFunction() function to register python functions in SQLContext. 379 | 380 | # user defined function 381 | def transform_review(review): 382 | x1 = re.sub('[^0-9a-zA-Z\s]+','',review) 383 | return x1.lower() 384 | 385 | # register table from above 386 | result.registerAsTable("result") 387 | 388 | # register function from above 389 | sqlContext.registerFunction("to_lowercase", lambda x:transform_review(x),returnType=ArrayType(StringType(), True)) 390 | 391 | # use the registered function inside SQL 392 | sql_query_transform = """SELECT asin, reviewText, to_lowercase(reviewText) as cleaned 393 | FROM result 394 | """ 395 | 396 | result_transform = sqlContext.sql(sql_query_transform) 397 | result_transform.show() 398 | 399 | # FINALLY, Mix and Match!! 400 | 401 | # You can also mix DataFrames, RDDs and SparkSQL to make it work for you. 402 | 403 | # Scenario: 404 | # We want to investigate the average rating of reviews in terms of the categories they belong to. 
In order to do this, we: 405 | # + query the needed data using DataFrames API 406 | # + classify the reviews into different categories using core RDD API 407 | # + query the avearage rating for each category using Spark SQL 408 | 409 | # load classifier and broadcast it 410 | model = pickle.load(open('Data/classifiers/classifier.pkl', 'r')) 411 | classifier_b = sc.broadcast(model) 412 | 413 | # DO CLASSIFICATION IN CORE RDD FORMAT 414 | # fashion.map(lambda x: eval(x)['reviewText']).map(lambda x: (x, model_b.value.predict([x])[0])).first() 415 | classifiedRDD = result_transform.map(lambda row: 416 | (row.asin,row.reviewText,str(classifier_b.value.predict(row.cleaned)[0])) 417 | ) 418 | 419 | classifiedRDD.first() 420 | 421 | # Transform the RDD into a DataFrame 422 | CLASSIFIED_SCHEMA = StructType([ 423 | StructField('asin', StringType(), True), 424 | StructField('review', StringType(), True), 425 | StructField('category', StringType(), True) 426 | ]) 427 | 428 | classifiedDF = sqlContext.createDataFrame(classifiedRDD,CLASSIFIED_SCHEMA) 429 | 430 | classifiedDF.show() 431 | 432 | # run a SQL query on the data 433 | classifiedDF.registerAsTable('enrichedReviews') 434 | 435 | sql_query_test = """SELECT category, avg(overall) as avgRating 436 | FROM reviews 437 | JOIN products ON reviews.asin=products.asin 438 | JOIN enrichedReviews ON products.asin=enrichedReviews.asin 439 | WHERE price > 50.0 440 | GROUP BY enrichedReviews.category 441 | """ 442 | 443 | resultTest = sqlContext.sql(sql_query_test) 444 | resultTest.show() 445 | 446 | 447 | 448 | 449 | --------------------------------------------------------------------------------