├── README.md
└── SPARK
    ├── Dataframe
    │   ├── Emp.csv
    │   └── spark dataframe - All Most API Practice.ipynb
    └── RDD
        ├── .ipynb_checkpoints
        │   └── Spark_RDD_retaiDB_2-checkpoint.ipynb
        ├── SPARK_RDD.ipynb
        ├── o_items
        ├── orders
        └── products
/README.md:
--------------------------------------------------------------------------------
1 | # PySpark DataFrame made easy 😀
2 | > (Zero to Hero)
3 | 
4 | - Vinay Chaudhari
5 | 
6 | ___
7 | 
8 | > Before you start,
9 | 1) Open Google Colab or any IDE and run the following command:
10 | 2) `pip install pyspark`
11 | 
12 | 
13 | 
14 | 
15 | 
16 | 
17 | 
18 | 
19 | ____
20 | 
21 | # A
22 | ### ○ agg
23 | ### ○ alias
24 | 
25 | 
26 | 
27 | # C
28 | ### ○ cache
29 | ### ○ coalesce
30 | ### ○ collect
31 | ### ○ columns
32 | ### ○ corr
33 | ### ○ count
34 | ### ○ cov
35 | ### ○ crosstab
36 | ### ○ cube
37 | 
38 | # D
39 | ### ○ describe
40 | ### ○ distinct
41 | ### ○ drop
42 | ### ○ dropDuplicates
43 | ### ○ dropna
44 | ### ○ dtypes
45 | 
46 | 
47 | # E
48 | ### ○ explain
49 | 
50 | 
51 | # F
52 | ### ○ fillna
53 | ### ○ filter
54 | ### ○ first
55 | ### ○ flatMap
56 | ### ○ foreach
57 | ### ○ foreachPartition
58 | ### ○ freqItems
59 | 
60 | # G
61 | ### ○ groupBy
62 | 
63 | # H
64 | ### ○ head
65 | 
66 | # I
67 | ### ○ intersect
68 | ### ○ isLocal
69 | 
70 | # J
71 | ### ○ join
72 | 
73 | # L
74 | ### ○ limit
75 | 
76 | # M
77 | ### ○ map
78 | ### ○ mapPartitions
79 | 
80 | # N
81 | ### ○ na
82 | 
83 | # O
84 | ### ○ orderBy
85 | 
86 | # P
87 | ### ○ persist
88 | ### ○ printSchema
89 | 
90 | # R
91 | ### ○ randomSplit
92 | ### ○ rdd
93 | ### ○ registerTempTable
94 | ### ○ repartition
95 | ### ○ replace
96 | ### ○ rollup
97 | 
98 | # S
99 | ### ○ sample
100 | ### ○ sampleBy
101 | ### ○ schema
102 | ### ○ select
103 | ### ○ selectExpr
104 | ### ○ show
105 | ### ○ sort
106 | ### ○ sortWithinPartitions
107 | ### ○ stat
108 | ### ○ subtract
109 | 
110 | 
111 | ## ✔ CONVERSIONS
112 | 
113 | 
114 | # T
115 | ### ○ take
116 | ### ○ toDF
117 | ### ○ toJSON
118 | ### ○ toPandas
119 | 
120 | # U
121 | ### ○ unionAll
122 | ### ○ unpersist
123 | 
124 | # W
125 | ### ○ where(filter)
126 | ### ○ withColumn
127 | ### ○ withColumnRenamed
128 | ### ○ write
129 | 
130 | **** MAKE A PR FOR CONTRIBUTIONS AND SUGGESTIONS :) ****
131 | 
132 | 
133 | ```python
134 | import IPython
135 | 
136 | ```
137 | 
138 | 
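Every example below assumes a live `sqlContext` (and, for the RDD-based ones, a SparkContext `sc`). Those handles exist automatically in the old pyspark shell, but not in a fresh Colab runtime, so here is a minimal setup sketch — my addition, not part of the original notebook:

```python
# Minimal setup sketch (assumed, not from the original notebook):
# builds the `sc` and `sqlContext` handles used by every example below.
from pyspark.sql import SparkSession, SQLContext

spark = SparkSession.builder.appName("pyspark-df-practice").getOrCreate()
sc = spark.sparkContext      # used by the RDD examples (sc.parallelize)
sqlContext = SQLContext(sc)  # used by every createDataFrame example
```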
139 | ```python
140 | #agg
141 | 
142 | x = sqlContext.createDataFrame([("vinay","sunny",100),("deepak","parag",200),("akash","pravin",300)],['from','to','amount'])
143 | y = x.agg({"amount":"avg"})
144 | 
145 | x.show()
146 | y.show()
147 | ```
148 | 
149 | +------+------+------+
150 | | from| to|amount|
151 | +------+------+------+
152 | | vinay| sunny| 100|
153 | |deepak| parag| 200|
154 | | akash|pravin| 300|
155 | +------+------+------+
156 | 
157 | +-----------+
158 | |avg(amount)|
159 | +-----------+
160 | | 200.0|
161 | +-----------+
162 | 
163 | 
164 | 
165 | 
166 | ```python
167 | #alias
168 | from pyspark.sql.functions import col
169 | x = sqlContext.createDataFrame([("vinay","sunny",100),("deepak","parag",200),("akash","pravin",300)],['from','to','amount'])
170 | y = x.alias("transactions")
171 | 
172 | x.show()
173 | y.select(col("transactions.to")).show()
174 | ```
175 | 
176 | +------+------+------+
177 | | from| to|amount|
178 | +------+------+------+
179 | | vinay| sunny| 100|
180 | |deepak| parag| 200|
181 | | akash|pravin| 300|
182 | +------+------+------+
183 | 
184 | +------+
185 | | to|
186 | +------+
187 | | sunny|
188 | | parag|
189 | |pravin|
190 | +------+
191 | 
192 | 
193 | 
194 | 
195 | ```python
196 | #cache
197 | 
198 | x = sqlContext.createDataFrame([("vinay","sunny",100),("deepak","parag",200),("akash","pravin",300)],['from','to','amount'])
199 | x.cache()
200 | 
201 | print(x.count()) #first action materializes x in memory
202 | print(x.count()) #later actions avoid IO overhead
203 | ```
204 | 
205 | 3
206 | 3
207 | 
208 | 
209 | 
210 | ```python
211 | #coalesce
212 | 
213 | x_rdd = sc.parallelize([("vinay","sunny",100),("deepak","parag",200),("akash","pravin",300)],4)
214 | x = sqlContext.createDataFrame(x_rdd,['from','to','amount'])
215 | y = x.coalesce(numPartitions=1)
216 | 
217 | print(x.rdd.getNumPartitions())
218 | print(y.rdd.getNumPartitions())
219 | 
220 | x.show()
221 | y.show()
222 | ```
223 | 
224 | 4
225 | 1
226 | +------+------+------+
227 | | from| to|amount|
228 | +------+------+------+
229 | | vinay| sunny| 100|
230 | |deepak| parag| 200|
231 | | akash|pravin| 300|
232 | +------+------+------+
233 | 
234 | +------+------+------+
235 | | from| to|amount|
236 | +------+------+------+
237 | | vinay| sunny| 100|
238 | |deepak| parag| 200|
239 | | akash|pravin| 300|
240 | +------+------+------+
241 | 
242 | 
243 | 
244 | 
245 | ```python
246 | #collect
247 | 
248 | x = sqlContext.createDataFrame([("vinay","sunny",100),("deepak","parag",200),("akash","pravin",300)],['from','to','amount'])
249 | y = x.collect() # returns a list of Row objects
250 | x.show()
251 | print(y)
252 | ```
253 | 
254 | +------+------+------+
255 | | from| to|amount|
256 | +------+------+------+
257 | | vinay| sunny| 100|
258 | |deepak| parag| 200|
259 | | akash|pravin| 300|
260 | +------+------+------+
261 | 
262 | [Row(from='vinay', to='sunny', amount=100), Row(from='deepak', to='parag', amount=200), Row(from='akash', to='pravin', amount=300)]
263 | 
264 | 
265 | 
266 | ```python
267 | #columns
268 | 
269 | x = sqlContext.createDataFrame([("vinay","sunny",100),("deepak","parag",200),("akash","pravin",300)],['from','to','amount'])
270 | 
271 | y = x.columns
272 | x.show()
273 | print(y)
274 | ```
275 | 
276 | +------+------+------+
277 | | from| to|amount|
278 | +------+------+------+
279 | | vinay| sunny| 100|
280 | |deepak| parag| 200|
281 | | akash|pravin| 300|
282 | +------+------+------+
283 | 
284 | ['from', 'to', 'amount']
285 | 
286 | 
287 | 
288 | ```python
289 | #corr : Calculates the correlation of
290 | # two columns of a DataFrame as a double value.
291 | 
292 | x = sqlContext.createDataFrame([("vinay","sunny",100,300),("deepak","parag",200,600),("akash","pravin",300,900)], ['from','to','amount','fees'])
293 | y = x.corr(col1="amount",col2="fees")
294 | x.show()
295 | print(y)
296 | ```
297 | 
298 | +------+------+------+----+
299 | | from| to|amount|fees|
300 | +------+------+------+----+
301 | | vinay| sunny| 100| 300|
302 | |deepak| parag| 200| 600|
303 | | akash|pravin| 300| 900|
304 | +------+------+------+----+
305 | 
306 | 1.0
307 | 
308 | 
309 | 
310 | ```python
311 | #count
312 | #Returns the number of rows in this DataFrame.
313 | x = sqlContext.createDataFrame([("vinay","sunny",100),("deepak","parag",200),("akash","pravin",300)],['from','to','amount'])
314 | x.show()
315 | print(x.count())
316 | ```
317 | 
318 | +------+------+------+
319 | | from| to|amount|
320 | +------+------+------+
321 | | vinay| sunny| 100|
322 | |deepak| parag| 200|
323 | | akash|pravin| 300|
324 | +------+------+------+
325 | 
326 | 3
327 | 
328 | 
329 | 
330 | ```python
331 | #cov
332 | #Calculate the sample covariance for the given columns,
333 | #specified by their names, as a double value.
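# A worked check of that formula on the data below (my addition):
#   cov(amount,fees) = sum((a_i - mean_a)*(f_i - mean_f)) / (n - 1)
#                    = ((-100)*(-300) + 0*0 + 100*300) / (3 - 1) = 30000.0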
334 | 335 | x = sqlContext.createDataFrame([("vinay","sunny",100,300),("deepak","parag",200,600),("akash","pravin",300,900)], ['from','to','amount','fees']) 336 | y = x.cov(col1="amount",col2="fees") 337 | 338 | x.show() 339 | print(y) 340 | ``` 341 | 342 | +------+------+------+----+ 343 | | from| to|amount|fees| 344 | +------+------+------+----+ 345 | | vinay| sunny| 100| 300| 346 | |deepak| parag| 200| 600| 347 | | akash|pravin| 300| 900| 348 | +------+------+------+----+ 349 | 350 | 30000.0 351 | 352 | 353 | 354 | ```python 355 | #crosstab 356 | x = sqlContext.createDataFrame([("vinay","deepak",0.1),("sunny","pratik",0.2),("parag","akash",0.3)], ['from','to','amt']) 357 | y = x.crosstab(col1='from',col2='to') 358 | x.show() 359 | y.show() 360 | 361 | ``` 362 | 363 | +-----+------+---+ 364 | | from| to|amt| 365 | +-----+------+---+ 366 | |vinay|deepak|0.1| 367 | |sunny|pratik|0.2| 368 | |parag| akash|0.3| 369 | +-----+------+---+ 370 | 371 | +-------+-----+------+------+ 372 | |from_to|akash|deepak|pratik| 373 | +-------+-----+------+------+ 374 | | parag| 1| 0| 0| 375 | | vinay| 0| 1| 0| 376 | | sunny| 0| 0| 1| 377 | +-------+-----+------+------+ 378 | 379 | 380 | 381 | ### col1 – The name of the first column. Distinct items will make the first item of each row. 382 | ### col2 – The name of the second column. Distinct items will make the column names of the DataFrame. 383 | 384 | 385 | ```python 386 | #cube 387 | 388 | # Create a multi-dimensional cube for the current DataFrame using the specified columns, 389 | # so we can run aggregation on them 390 | 391 | x = sqlContext.createDataFrame([("vinay","deepak",1),("sunny","pratik",2),("parag","akash",3)], ['from','to','amt']) 392 | 393 | y = x.cube('from','to') 394 | x.show() 395 | print(y) 396 | y.sum().show() 397 | y.max().show() 398 | ``` 399 | 400 | +-----+------+---+ 401 | | from| to|amt| 402 | +-----+------+---+ 403 | |vinay|deepak| 1| 404 | |sunny|pratik| 2| 405 | |parag| akash| 3| 406 | +-----+------+---+ 407 | 408 | 409 | +-----+------+--------+ 410 | | from| to|sum(amt)| 411 | +-----+------+--------+ 412 | | null| akash| 3| 413 | | null| null| 6| 414 | |vinay|deepak| 1| 415 | |vinay| null| 1| 416 | | null|deepak| 1| 417 | |parag| akash| 3| 418 | | null|pratik| 2| 419 | |parag| null| 3| 420 | |sunny| null| 2| 421 | |sunny|pratik| 2| 422 | +-----+------+--------+ 423 | 424 | +-----+------+--------+ 425 | | from| to|max(amt)| 426 | +-----+------+--------+ 427 | | null| akash| 3| 428 | | null| null| 3| 429 | |vinay|deepak| 1| 430 | |vinay| null| 1| 431 | | null|deepak| 1| 432 | |parag| akash| 3| 433 | | null|pratik| 2| 434 | |parag| null| 3| 435 | |sunny| null| 2| 436 | |sunny|pratik| 2| 437 | +-----+------+--------+ 438 | 439 | 440 | 441 | 442 | ```python 443 | # Describe 444 | 445 | x = sqlContext.createDataFrame([("vinay","deepak",1),("sunny","pratik",2),("parag","akash",3)], ['from','to','amt']) 446 | 447 | x.show() 448 | x.describe().show() 449 | ``` 450 | 451 | +-----+------+---+ 452 | | from| to|amt| 453 | +-----+------+---+ 454 | |vinay|deepak| 1| 455 | |sunny|pratik| 2| 456 | |parag| akash| 3| 457 | +-----+------+---+ 458 | 459 | +-------+-----+------+---+ 460 | |summary| from| to|amt| 461 | +-------+-----+------+---+ 462 | | count| 3| 3| 3| 463 | | mean| null| null|2.0| 464 | | stddev| null| null|1.0| 465 | | min|parag| akash| 1| 466 | | max|vinay|pratik| 3| 467 | +-------+-----+------+---+ 468 | 469 | 470 | 471 | 472 | ```python 473 | # Distinct 474 | 475 | x = 
sqlContext.createDataFrame([("vinay","deepak",1),("sunny","pratik",2),("parag","akash",3),("parag","akash",3),("parag","akash",3)], ['from','to','amt']) 476 | y = x.distinct() 477 | 478 | x.show() 479 | y.show() 480 | ``` 481 | 482 | +-----+------+---+ 483 | | from| to|amt| 484 | +-----+------+---+ 485 | |vinay|deepak| 1| 486 | |sunny|pratik| 2| 487 | |parag| akash| 3| 488 | |parag| akash| 3| 489 | |parag| akash| 3| 490 | +-----+------+---+ 491 | 492 | +-----+------+---+ 493 | | from| to|amt| 494 | +-----+------+---+ 495 | |sunny|pratik| 2| 496 | |vinay|deepak| 1| 497 | |parag| akash| 3| 498 | +-----+------+---+ 499 | 500 | 501 | 502 | 503 | ```python 504 | # Drop 505 | 506 | x = sqlContext.createDataFrame([("vinay","deepak",1),("sunny","pratik",2),("parag","akash",3)], ['from','to','amt']) 507 | y = x.drop('amt') 508 | 509 | x.show() 510 | y.show() 511 | ``` 512 | 513 | +-----+------+---+ 514 | | from| to|amt| 515 | +-----+------+---+ 516 | |vinay|deepak| 1| 517 | |sunny|pratik| 2| 518 | |parag| akash| 3| 519 | +-----+------+---+ 520 | 521 | +-----+------+ 522 | | from| to| 523 | +-----+------+ 524 | |vinay|deepak| 525 | |sunny|pratik| 526 | |parag| akash| 527 | +-----+------+ 528 | 529 | 530 | 531 | 532 | ```python 533 | # dropDuplicates 534 | 535 | x = sqlContext.createDataFrame([("vinay","deepak",1),("sunny","pratik",2),("parag","akash",3),("parag","akash",3),("parag","akash",3)], ['from','to','amt']) 536 | y = x.dropDuplicates(subset=['from','to']) 537 | 538 | x.show() 539 | y.show() 540 | ``` 541 | 542 | +-----+------+---+ 543 | | from| to|amt| 544 | +-----+------+---+ 545 | |vinay|deepak| 1| 546 | |sunny|pratik| 2| 547 | |parag| akash| 3| 548 | |parag| akash| 3| 549 | |parag| akash| 3| 550 | +-----+------+---+ 551 | 552 | +-----+------+---+ 553 | | from| to|amt| 554 | +-----+------+---+ 555 | |vinay|deepak| 1| 556 | |sunny|pratik| 2| 557 | |parag| akash| 3| 558 | +-----+------+---+ 559 | 560 | 561 | 562 | 563 | ```python 564 | #dropna 565 | x = sqlContext.createDataFrame([(None,"vinay",0.1),("vinay","sunny",None),("Peter",None,0.3),("Mark","Steve",0.2)], ['from','to','amount']) 566 | y = x.dropna(how='any',subset=['from','to']) 567 | x.show() 568 | y.show() 569 | ``` 570 | 571 | +-----+-----+------+ 572 | | from| to|amount| 573 | +-----+-----+------+ 574 | | null|vinay| 0.1| 575 | |vinay|sunny| null| 576 | |Peter| null| 0.3| 577 | | Mark|Steve| 0.2| 578 | +-----+-----+------+ 579 | 580 | +-----+-----+------+ 581 | | from| to|amount| 582 | +-----+-----+------+ 583 | |vinay|sunny| null| 584 | | Mark|Steve| 0.2| 585 | +-----+-----+------+ 586 | 587 | 588 | 589 | 590 | ```python 591 | #dtypes 592 | 593 | x = sqlContext.createDataFrame([("vinay","deepak",1),("sunny","pratik",2),("parag","akash",3),("parag","akash",3),("parag","akash",3)], ['from','to','amt']) 594 | y = x.dtypes 595 | 596 | x.show() 597 | print(y) 598 | ``` 599 | 600 | +-----+------+---+ 601 | | from| to|amt| 602 | +-----+------+---+ 603 | |vinay|deepak| 1| 604 | |sunny|pratik| 2| 605 | |parag| akash| 3| 606 | |parag| akash| 3| 607 | |parag| akash| 3| 608 | +-----+------+---+ 609 | 610 | [('from', 'string'), ('to', 'string'), ('amt', 'bigint')] 611 | 612 | 613 | 614 | ```python 615 | #Explain 616 | 617 | x = sqlContext.createDataFrame([("vinay","deepak",1),("sunny","pratik",2),("parag","akash",3)], ['from','to','amt']) 618 | x.show() 619 | 620 | x.agg({"amt":"avg"}).explain(extended = True) 621 | 622 | 623 | ``` 624 | 625 | +-----+------+---+ 626 | | from| to|amt| 627 | +-----+------+---+ 628 | |vinay|deepak| 1| 629 | 
|sunny|pratik| 2| 630 | |parag| akash| 3| 631 | +-----+------+---+ 632 | 633 | == Parsed Logical Plan == 634 | 'Aggregate ['avg(amt#169L) AS avg(amt)#187] 635 | +- AnalysisBarrier 636 | +- LogicalRDD [from#167, to#168, amt#169L], false 637 | 638 | == Analyzed Logical Plan == 639 | avg(amt): double 640 | Aggregate [avg(amt#169L) AS avg(amt)#187] 641 | +- LogicalRDD [from#167, to#168, amt#169L], false 642 | 643 | == Optimized Logical Plan == 644 | Aggregate [avg(amt#169L) AS avg(amt)#187] 645 | +- Project [amt#169L] 646 | +- LogicalRDD [from#167, to#168, amt#169L], false 647 | 648 | == Physical Plan == 649 | *(2) HashAggregate(keys=[], functions=[avg(amt#169L)], output=[avg(amt)#187]) 650 | +- Exchange SinglePartition 651 | +- *(1) HashAggregate(keys=[], functions=[partial_avg(amt#169L)], output=[sum#192, count#193L]) 652 | +- *(1) Project [amt#169L] 653 | +- Scan ExistingRDD[from#167,to#168,amt#169L] 654 | 655 | 656 | 657 | ```python 658 | #fillna 659 | 660 | x = sqlContext.createDataFrame([(None,"deepak",1),("sunny",None,2),("parag",None,3)], ['from','to','amt']) 661 | y = x.fillna(value = '---',subset = ['from','to']) 662 | 663 | x.show() 664 | y.show() 665 | ``` 666 | 667 | +-----+------+---+ 668 | | from| to|amt| 669 | +-----+------+---+ 670 | | null|deepak| 1| 671 | |sunny| null| 2| 672 | |parag| null| 3| 673 | +-----+------+---+ 674 | 675 | +-----+------+---+ 676 | | from| to|amt| 677 | +-----+------+---+ 678 | | ---|deepak| 1| 679 | |sunny| ---| 2| 680 | |parag| ---| 3| 681 | +-----+------+---+ 682 | 683 | 684 | 685 | ## Filter (Most used api) 686 | 687 | 688 | ```python 689 | # Filter 690 | 691 | x = sqlContext.createDataFrame([("vinay","deepak",1),("sunny","pratik",2),("parag","akash",3)], ['from','to','amt']) 692 | y = x.filter("amt > 2 ") 693 | 694 | x.show() 695 | y.show() 696 | ``` 697 | 698 | +-----+------+---+ 699 | | from| to|amt| 700 | +-----+------+---+ 701 | |vinay|deepak| 1| 702 | |sunny|pratik| 2| 703 | |parag| akash| 3| 704 | +-----+------+---+ 705 | 706 | +-----+-----+---+ 707 | | from| to|amt| 708 | +-----+-----+---+ 709 | |parag|akash| 3| 710 | +-----+-----+---+ 711 | 712 | 713 | 714 | 715 | ```python 716 | # First 717 | 718 | x = sqlContext.createDataFrame([("vinay","deepak",1),("sunny","pratik",2),("parag","akash",3)], ['from','to','amt']) 719 | y = x.first() 720 | 721 | x.show() 722 | print(y) 723 | ``` 724 | 725 | +-----+------+---+ 726 | | from| to|amt| 727 | +-----+------+---+ 728 | |vinay|deepak| 1| 729 | |sunny|pratik| 2| 730 | |parag| akash| 3| 731 | +-----+------+---+ 732 | 733 | Row(from='vinay', to='deepak', amt=1) 734 | 735 | 736 | ## Foreach 737 | 738 | 739 | ```python 740 | # foreach 741 | from __future__ import print_function 742 | 743 | # setup 744 | fn = './foreachExampleDataFrames.txt' 745 | open(fn, 'w').close() # clear the file 746 | def fappend(el,f): 747 | '''appends el to file f''' 748 | print(el,file=open(f, 'a+') ) 749 | 750 | # example 751 | x = sqlContext.createDataFrame([("vinay","deepak",1),("sunny","pratik",2),("parag","akash",3)], ['from','to','amt']) 752 | 753 | y = x.foreach(lambda x: fappend(x,fn)) # writes into foreachExampleDataFrames.txt 754 | x.show() # original dataframe 755 | print(y) # foreach returns 'None' 756 | # print the contents of the file 757 | with open(fn, "r") as foreachExample: 758 | print (foreachExample.read()) 759 | ``` 760 | 761 | +-----+------+---+ 762 | | from| to|amt| 763 | +-----+------+---+ 764 | |vinay|deepak| 1| 765 | |sunny|pratik| 2| 766 | |parag| akash| 3| 767 | +-----+------+---+ 768 | 769 | None 
770 | Row(from='vinay', to='deepak', amt=1)
771 | Row(from='sunny', to='pratik', amt=2)
772 | 
773 | 
774 | 
775 | 
776 | ```python
777 | # foreachPartition
778 | from __future__ import print_function
779 | 
780 | # setup
781 | fn = './foreachExampleDataFrames.txt'
782 | open(fn, 'w').close() # clear the file
783 | def fappend(el,f):
784 |     '''appends el to file f'''
785 |     print(el,file=open(f, 'a+') )
786 | 
787 | # example
788 | x = sqlContext.createDataFrame([("vinay","deepak",1),("sunny","pratik",2),("parag","akash",3)], ['from','to','amt'])
789 | 
790 | y = x.foreachPartition(lambda partition: [fappend(el,fn) for el in partition]) # writes each partition's rows into foreachExampleDataFrames.txt
791 | x.show() # original dataframe
792 | print(y) # foreachPartition returns 'None'
793 | # print the contents of the file
794 | with open(fn, "r") as foreachExample:
795 |     print (foreachExample.read())
796 | ```
797 | 
798 | +-----+------+---+
799 | | from| to|amt|
800 | +-----+------+---+
801 | |vinay|deepak| 1|
802 | |sunny|pratik| 2|
803 | |parag| akash| 3|
804 | +-----+------+---+
805 | 
806 | None
807 | Row(from='parag', to='akash', amt=3)
808 | Row(from='sunny', to='pratik', amt=2)
809 | Row(from='vinay', to='deepak', amt=1)
810 | 
811 | 
812 | 
813 | 
814 | ```python
815 | # freqItems
816 | 
817 | x = sqlContext.createDataFrame([("Vinay","sunny",50), \
818 | ("Deepak","sunny",30), \
819 | ("Vinay","Parag",20), \
820 | ("Vinay","ram",50), \
821 | ("sham","sunny",90), \
822 | ("Vinay","pushpak",50), \
823 | ("om","sunny",50), \
824 | ("sagar","sunny",50), \
825 | ("Vinay","rahul",80), \
826 | ("akash","sunny",50), \
827 | ("puranik","pranav",70)],\
828 | ['from','to','amount'])
829 | 
830 | y = x.freqItems(cols=['from','amount'],support=0.8)
831 | 
832 | x.show()
833 | y.show()
834 | ```
835 | 
836 | +-------+-------+------+
837 | | from| to|amount|
838 | +-------+-------+------+
839 | | Vinay| sunny| 50|
840 | | Deepak| sunny| 30|
841 | | Vinay| Parag| 20|
842 | | Vinay| ram| 50|
843 | | sham| sunny| 90|
844 | | Vinay|pushpak| 50|
845 | | om| sunny| 50|
846 | | sagar| sunny| 50|
847 | | Vinay| rahul| 80|
848 | | akash| sunny| 50|
849 | |puranik| pranav| 70|
850 | +-------+-------+------+
851 | 
852 | +--------------+----------------+
853 | |from_freqItems|amount_freqItems|
854 | +--------------+----------------+
855 | | [Vinay]| [50]|
856 | +--------------+----------------+
857 | 
858 | 
859 | 
860 | ## groupBy (most used API)
861 | 
862 | 
863 | ```python
864 | # groupBy
865 | 
866 | x = sqlContext.createDataFrame([("vinay","deepak",1),("sunny","pratik",2),("parag","akash",3)], ['from','to','amt'])
867 | y = x.groupBy('amt')
868 | 
869 | x.show()
870 | print(y) # y is a pyspark.sql.group.GroupedData object
871 | 
872 | ```
873 | 
874 | +-----+------+---+
875 | | from| to|amt|
876 | +-----+------+---+
877 | |vinay|deepak| 1|
878 | |sunny|pratik| 2|
879 | |parag| akash| 3|
880 | +-----+------+---+
881 | 
882 | 
883 | 
884 | 
885 | 
886 | ```python
887 | # groupBy (col1).avg(col2)
888 | 
889 | x = sqlContext.createDataFrame([("vinay","deepak",12466641),("sunny","pratik",451232),("parag","akash",2555455)], ['from','to','amt'])
890 | y = x.groupBy('from').avg('amt')
891 | 
892 | x.show()
893 | y.show()
894 | 
895 | ```
896 | 
897 | +-----+------+--------+
898 | | from| to| amt|
899 | +-----+------+--------+
900 | |vinay|deepak|12466641|
901 | |sunny|pratik| 451232|
902 | |parag| akash| 2555455|
903 | +-----+------+--------+
904 | 
905 | +-----+-----------+
906 | | from| avg(amt)|
907 | +-----+-----------+
908 | |parag| 2555455.0|
909 | |sunny| 451232.0|
910 | |vinay|1.2466641E7|
911 | +-----+-----------+
912 | 
913 | 
914 | 
915 | 
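Because `groupBy` hands back a `GroupedData` object, you are not limited to single-shortcut aggregations like `avg`; `agg` takes a dict and computes several aggregates in one pass. A small sketch along the same lines — my addition, not a cell from the original notebook:

```python
# groupBy(col).agg({...}) -- several aggregates in one pass
# (a sketch, not from the original notebook)
x = sqlContext.createDataFrame([("vinay","deepak",12466641),("sunny","pratik",451232),("parag","akash",2555455)], ['from','to','amt'])
y = x.groupBy('from').agg({'amt':'sum','to':'count'})

x.show()
y.show()  # columns: from, count(to), sum(amt)
```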
```python 917 | # head 918 | 919 | x = sqlContext.createDataFrame([("vinay","deepak",12466641),("sunny","pratik",451232),("parag","akash",2555455)], ['from','to','amt']) 920 | 921 | y = x.head(2) 922 | x.show() 923 | print(y) 924 | ``` 925 | 926 | +-----+------+--------+ 927 | | from| to| amt| 928 | +-----+------+--------+ 929 | |vinay|deepak|12466641| 930 | |sunny|pratik| 451232| 931 | |parag| akash| 2555455| 932 | +-----+------+--------+ 933 | 934 | [Row(from='vinay', to='deepak', amt=12466641), Row(from='sunny', to='pratik', amt=451232)] 935 | 936 | 937 | 938 | ```python 939 | # intersect 940 | 941 | x = sqlContext.createDataFrame([("vinay","deepak",12466641),("sunny","pratik",451232),("parag","akash",2555455),("parag","akash",2555455)], ['from','to','amt']) 942 | 943 | y = sqlContext.createDataFrame([("vinay","deepak",12466641),("sunny","pratik",451232),("parag","akash",2555455),("parag","akashay",2555455)], ['from','to','amt']) 944 | 945 | z = x.intersect(y) 946 | 947 | x.show() 948 | y.show() 949 | z.show() 950 | ``` 951 | 952 | +-----+------+--------+ 953 | | from| to| amt| 954 | +-----+------+--------+ 955 | |vinay|deepak|12466641| 956 | |sunny|pratik| 451232| 957 | |parag| akash| 2555455| 958 | |parag| akash| 2555455| 959 | +-----+------+--------+ 960 | 961 | +-----+-------+--------+ 962 | | from| to| amt| 963 | +-----+-------+--------+ 964 | |vinay| deepak|12466641| 965 | |sunny| pratik| 451232| 966 | |parag| akash| 2555455| 967 | |parag|akashay| 2555455| 968 | +-----+-------+--------+ 969 | 970 | +-----+------+--------+ 971 | | from| to| amt| 972 | +-----+------+--------+ 973 | |sunny|pratik| 451232| 974 | |vinay|deepak|12466641| 975 | |parag| akash| 2555455| 976 | +-----+------+--------+ 977 | 978 | 979 | 980 | 981 | ```python 982 | # isLocal 983 | 984 | x = sqlContext.createDataFrame([("vinay","deepak",12466641),("sunny","pratik",451232),("parag","akash",2555455),("parag","akash",2555455)], ['from','to','amt']) 985 | 986 | 987 | y = x.isLocal() 988 | 989 | x.show() 990 | print(y) 991 | 992 | ``` 993 | 994 | +-----+------+--------+ 995 | | from| to| amt| 996 | +-----+------+--------+ 997 | |vinay|deepak|12466641| 998 | |sunny|pratik| 451232| 999 | |parag| akash| 2555455| 1000 | |parag| akash| 2555455| 1001 | +-----+------+--------+ 1002 | 1003 | False 1004 | 1005 | 1006 | ## join (Most used api) 1007 | 1008 | 1009 | ```python 1010 | # join 1011 | x = sqlContext.createDataFrame([("vinay","deepak",12466641),("sunny","pratik",451232),("parag","akash",2555455),("Salman","akash",2555455)], ['from','to','amt']) 1012 | y = sqlContext.createDataFrame([('Andy',20),("Steve",40),("Elon",80)], ['name','age']) 1013 | z = x.join(y,x.to ==y.name,'inner').select('from','to','amt','age') 1014 | x.show() 1015 | y.show() 1016 | z.show() 1017 | ``` 1018 | 1019 | +------+------+--------+ 1020 | | from| to| amt| 1021 | +------+------+--------+ 1022 | | vinay|deepak|12466641| 1023 | | sunny|pratik| 451232| 1024 | | parag| akash| 2555455| 1025 | |Salman| akash| 2555455| 1026 | +------+------+--------+ 1027 | 1028 | +-----+---+ 1029 | | name|age| 1030 | +-----+---+ 1031 | | Andy| 20| 1032 | |Steve| 40| 1033 | | Elon| 80| 1034 | +-----+---+ 1035 | 1036 | +----+---+---+---+ 1037 | |from| to|amt|age| 1038 | +----+---+---+---+ 1039 | +----+---+---+---+ 1040 | 1041 | 1042 | 1043 | 1044 | ```python 1045 | # join 1046 | x = sqlContext.createDataFrame([("vinay","deepak",12466641),("sunny","pratik",451232),("parag","akash",2555455),("Salman","akash",2555455)], ['from','to','amt']) 1047 | y = 
sqlContext.createDataFrame([('Andy',20),("Steve",40),("Elon",80)], ['name','age'])
1048 | z = x.join(y,x.to ==y.name,'outer').select('from','to','amt','age')
1049 | x.show()
1050 | y.show()
1051 | z.show()
1052 | ```
1053 | 
1054 | +------+------+--------+
1055 | | from| to| amt|
1056 | +------+------+--------+
1057 | | vinay|deepak|12466641|
1058 | | sunny|pratik| 451232|
1059 | | parag| akash| 2555455|
1060 | |Salman| akash| 2555455|
1061 | +------+------+--------+
1062 | 
1063 | +-----+---+
1064 | | name|age|
1065 | +-----+---+
1066 | | Andy| 20|
1067 | |Steve| 40|
1068 | | Elon| 80|
1069 | +-----+---+
1070 | 
1071 | +------+------+--------+----+
1072 | | from| to| amt| age|
1073 | +------+------+--------+----+
1074 | | null| null| null| 40|
1075 | | sunny|pratik| 451232|null|
1076 | | vinay|deepak|12466641|null|
1077 | | null| null| null| 20|
1078 | | parag| akash| 2555455|null|
1079 | |Salman| akash| 2555455|null|
1080 | | null| null| null| 80|
1081 | +------+------+--------+----+
1082 | 
1083 | 
1084 | 
1085 | 
1086 | ```python
1087 | # Limit
1088 | 
1089 | 
1090 | x = sqlContext.createDataFrame([("vinay","deepak",12466641),("sunny","pratik",451232),("parag","akash",2555455),("Salman","akash",2555455)], ['from','to','amt'])
1091 | y = x.limit(2)
1092 | x.show()
1093 | y.show()
1094 | ```
1095 | 
1096 | +------+------+--------+
1097 | | from| to| amt|
1098 | +------+------+--------+
1099 | | vinay|deepak|12466641|
1100 | | sunny|pratik| 451232|
1101 | | parag| akash| 2555455|
1102 | |Salman| akash| 2555455|
1103 | +------+------+--------+
1104 | 
1105 | +-----+------+--------+
1106 | | from| to| amt|
1107 | +-----+------+--------+
1108 | |vinay|deepak|12466641|
1109 | |sunny|pratik| 451232|
1110 | +-----+------+--------+
1111 | 
1112 | 
1113 | 
1114 | 
1115 | 
1116 | 
1117 | ```python
1118 | # na
1119 | 
1120 | x = sqlContext.createDataFrame([(None,"Bob",0.1),("Bob","Carol",None),("Carol",None,0.3),("Bob","Carol",0.2)], ['from','to','amt'])
1121 | y = x.na # returns an object for handling missing values, supports drop, fill, and replace methods
1122 | x.show()
1123 | print(y)
1124 | y.drop().show()
1125 | 
1126 | y.fill({'from':'unknown','to':'unknown','amt':0}).show()
1127 | y.fill('--').show()
1128 | ```
1129 | 
1130 | +-----+-----+----+
1131 | | from| to| amt|
1132 | +-----+-----+----+
1133 | | null| Bob| 0.1|
1134 | | Bob|Carol|null|
1135 | |Carol| null| 0.3|
1136 | | Bob|Carol| 0.2|
1137 | +-----+-----+----+
1138 | 
1139 | 
1140 | +----+-----+---+
1141 | |from| to|amt|
1142 | +----+-----+---+
1143 | | Bob|Carol|0.2|
1144 | +----+-----+---+
1145 | 
1146 | +-------+-------+---+
1147 | | from| to|amt|
1148 | +-------+-------+---+
1149 | |unknown| Bob|0.1|
1150 | | Bob| Carol|0.0|
1151 | | Carol|unknown|0.3|
1152 | | Bob| Carol|0.2|
1153 | +-------+-------+---+
1154 | 
1155 | +-----+-----+----+
1156 | | from| to| amt|
1157 | +-----+-----+----+
1158 | | --| Bob| 0.1|
1159 | | Bob|Carol|null|
1160 | |Carol| --| 0.3|
1161 | | Bob|Carol| 0.2|
1162 | +-----+-----+----+
1163 | 
1164 | 
1165 | 
1166 | 
1167 | ```python
1168 | # orderBy
1169 | 
1170 | x = sqlContext.createDataFrame([("vinay","deepak",12466641),("sunny","pratik",451232),("parag","akash",2555455)], ['from','to','amt'])
1171 | 
1172 | y = x.orderBy(['amt'],ascending=[False])
1173 | z = x.orderBy(['amt'],ascending=[True])
1174 | x.show()
1175 | y.show()
1176 | z.show()
1177 | 
1178 | ```
1179 | 
1180 | +-----+------+--------+
1181 | | from| to| amt|
1182 | +-----+------+--------+
1183 | |vinay|deepak|12466641|
1184 | |sunny|pratik| 451232|
1185 | |parag| akash| 2555455|
1186 | +-----+------+--------+
1187 | 
1188 | +-----+------+--------+
1189 | | from| to| amt|
1190 | +-----+------+--------+
1191 | |vinay|deepak|12466641|
1192 | |parag| akash| 2555455|
1193 | |sunny|pratik| 451232|
1194 | +-----+------+--------+
1195 | 
1196 | +-----+------+--------+
1197 | | from| to| amt|
1198 | +-----+------+--------+
1199 | |sunny|pratik| 451232|
1200 | |parag| akash| 2555455|
1201 | |vinay|deepak|12466641|
1202 | +-----+------+--------+
1203 | 
1204 | 
1205 | 
1206 | 
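`orderBy` also takes several columns, with a direction per column. A short sketch — my addition, using the same assumed `sqlContext`:

```python
# orderBy on two columns with mixed directions
# (a sketch, not from the original notebook)
x = sqlContext.createDataFrame([("vinay","deepak",2),("vinay","akash",1),("parag","akash",3)], ['from','to','amt'])
x.orderBy(['from','amt'], ascending=[True,False]).show()
```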
1207 | ```python
1208 | # PrintSchema
1209 | 
1210 | x = sqlContext.createDataFrame([("vinay","deepak",12466641),("sunny","pratik",451232),("parag","akash",2555455)], ['from','to','amt'])
1211 | x.show()
1212 | x.printSchema()
1213 | ```
1214 | 
1215 | +-----+------+--------+
1216 | | from| to| amt|
1217 | +-----+------+--------+
1218 | |vinay|deepak|12466641|
1219 | |sunny|pratik| 451232|
1220 | |parag| akash| 2555455|
1221 | +-----+------+--------+
1222 | 
1223 | root
1224 | |-- from: string (nullable = true)
1225 | |-- to: string (nullable = true)
1226 | |-- amt: long (nullable = true)
1227 | 
1228 | 
1229 | 
1230 | 
1231 | ```python
1232 | # randomSplit
1233 | 
1234 | x = sqlContext.createDataFrame([("vinay","deepak",12466641),("sunny","pratik",451232),("parag","akash",2555455)], ['from','to','amt'])
1235 | y = x.randomSplit([0.5,0.5])
1236 | 
1237 | x.show()
1238 | y[0].show()
1239 | y[1].show()
1240 | 
1241 | 
1242 | 
1243 | ```
1244 | 
1245 | +-----+------+--------+
1246 | | from| to| amt|
1247 | +-----+------+--------+
1248 | |vinay|deepak|12466641|
1249 | |sunny|pratik| 451232|
1250 | |parag| akash| 2555455|
1251 | +-----+------+--------+
1252 | 
1253 | +-----+------+-------+
1254 | | from| to| amt|
1255 | +-----+------+-------+
1256 | |sunny|pratik| 451232|
1257 | |parag| akash|2555455|
1258 | +-----+------+-------+
1259 | 
1260 | +-----+------+--------+
1261 | | from| to| amt|
1262 | +-----+------+--------+
1263 | |vinay|deepak|12466641|
1264 | +-----+------+--------+
1265 | 
1266 | 
1267 | 
1268 | 
1269 | ```python
1270 | # rdd
1271 | 
1272 | x = sqlContext.createDataFrame([("vinay","deepak",12466641),("sunny","pratik",451232),("parag","akash",2555455)], ['from','to','amt'])
1273 | y = x.rdd
1274 | 
1275 | x.show()
1276 | print(y.collect())
1277 | 
1278 | ```
1279 | 
1280 | +-----+------+--------+
1281 | | from| to| amt|
1282 | +-----+------+--------+
1283 | |vinay|deepak|12466641|
1284 | |sunny|pratik| 451232|
1285 | |parag| akash| 2555455|
1286 | +-----+------+--------+
1287 | 
1288 | [Row(from='vinay', to='deepak', amt=12466641), Row(from='sunny', to='pratik', amt=451232), Row(from='parag', to='akash', amt=2555455)]
1289 | 
1290 | 
1291 | 
1292 | ```python
1293 | # registerTempTable
1294 | 
1295 | x = sqlContext.createDataFrame([("vinay","deepak",12466641),("sunny","pratik",451232),("parag","akash",2555455)], ['from','to','amt'])
1296 | x.registerTempTable(name="TRANS")
1297 | y = sqlContext.sql('SELECT * FROM TRANS WHERE amt > 451232')
1298 | 
1299 | x.show()
1300 | y.show()
1301 | ```
1302 | 
1303 | +-----+------+--------+
1304 | | from| to| amt|
1305 | +-----+------+--------+
1306 | |vinay|deepak|12466641|
1307 | |sunny|pratik| 451232|
1308 | |parag| akash| 2555455|
1309 | +-----+------+--------+
1310 | 
1311 | +-----+------+--------+
1312 | | from| to| amt|
1313 | +-----+------+--------+
1314 | |vinay|deepak|12466641|
1315 | |parag| akash| 2555455|
1316 | +-----+------+--------+
1317 | 
1318 | 
1319 | 
1320 | 
1321 | ```python
1322 | # repartition
1323 | 
1324 | x = 
sqlContext.createDataFrame([("vinay","deepak",12466641),("sunny","pratik",451232),("parag","akash",2555455)], ['from','to','amt']) 1325 | y = x.repartition(3) 1326 | 1327 | print(x.rdd.getNumPartitions()) 1328 | print(y.rdd.getNumPartitions()) 1329 | y.show() 1330 | ``` 1331 | 1332 | 4 1333 | 3 1334 | +-----+------+--------+ 1335 | | from| to| amt| 1336 | +-----+------+--------+ 1337 | |parag| akash| 2555455| 1338 | |vinay|deepak|12466641| 1339 | |sunny|pratik| 451232| 1340 | +-----+------+--------+ 1341 | 1342 | 1343 | 1344 | 1345 | ```python 1346 | # replace 1347 | 1348 | x = sqlContext.createDataFrame([("vinay","deepak",12466641),("sunny","pratik",451232),("parag","akash",2555455)], ['from','to','amt']) 1349 | y = x.replace('vinay','sunny',['from','to']) 1350 | 1351 | x.show() 1352 | y.show() 1353 | 1354 | 1355 | ``` 1356 | 1357 | +-----+------+--------+ 1358 | | from| to| amt| 1359 | +-----+------+--------+ 1360 | |vinay|deepak|12466641| 1361 | |sunny|pratik| 451232| 1362 | |parag| akash| 2555455| 1363 | +-----+------+--------+ 1364 | 1365 | +-----+------+--------+ 1366 | | from| to| amt| 1367 | +-----+------+--------+ 1368 | |sunny|deepak|12466641| 1369 | |sunny|pratik| 451232| 1370 | |parag| akash| 2555455| 1371 | +-----+------+--------+ 1372 | 1373 | 1374 | 1375 | 1376 | ```python 1377 | # replace 1378 | 1379 | x = sqlContext.createDataFrame([('Sunny',"chirag",0.1),("deepak","vinay",0.2),("Carol","Dave",0.3)], ['from','to','amt']) 1380 | y = x.replace('Sunny','Pranav',['from','to']) 1381 | 1382 | x.show() 1383 | y.show() 1384 | 1385 | ``` 1386 | 1387 | +------+------+---+ 1388 | | from| to|amt| 1389 | +------+------+---+ 1390 | | Sunny|chirag|0.1| 1391 | |deepak| vinay|0.2| 1392 | | Carol| Dave|0.3| 1393 | +------+------+---+ 1394 | 1395 | +------+------+---+ 1396 | | from| to|amt| 1397 | +------+------+---+ 1398 | |Pranav|chirag|0.1| 1399 | |deepak| vinay|0.2| 1400 | | Carol| Dave|0.3| 1401 | +------+------+---+ 1402 | 1403 | 1404 | 1405 | 1406 | ```python 1407 | #rollup 1408 | 1409 | x = sqlContext.createDataFrame([("vinay","deepak",1246.6641),("sunny","pratik",4512.32),("parag","akash",2555.455)], ['from','to','amt']) 1410 | y = x.rollup(['from','to']) 1411 | x.show() 1412 | 1413 | print(y) 1414 | #y is a grouped data object 1415 | #aggregations will be applied to all numerical columns 1416 | 1417 | y.sum().show() 1418 | y.max().show() 1419 | y.min().show() 1420 | ``` 1421 | 1422 | +-----+------+---------+ 1423 | | from| to| amt| 1424 | +-----+------+---------+ 1425 | |vinay|deepak|1246.6641| 1426 | |sunny|pratik| 4512.32| 1427 | |parag| akash| 2555.455| 1428 | +-----+------+---------+ 1429 | 1430 | 1431 | +-----+------+---------+ 1432 | | from| to| sum(amt)| 1433 | +-----+------+---------+ 1434 | | null| null|8314.4391| 1435 | |vinay|deepak|1246.6641| 1436 | |vinay| null|1246.6641| 1437 | |parag| akash| 2555.455| 1438 | |parag| null| 2555.455| 1439 | |sunny| null| 4512.32| 1440 | |sunny|pratik| 4512.32| 1441 | +-----+------+---------+ 1442 | 1443 | +-----+------+---------+ 1444 | | from| to| max(amt)| 1445 | +-----+------+---------+ 1446 | | null| null| 4512.32| 1447 | |vinay|deepak|1246.6641| 1448 | |vinay| null|1246.6641| 1449 | |parag| akash| 2555.455| 1450 | |parag| null| 2555.455| 1451 | |sunny| null| 4512.32| 1452 | |sunny|pratik| 4512.32| 1453 | +-----+------+---------+ 1454 | 1455 | +-----+------+---------+ 1456 | | from| to| min(amt)| 1457 | +-----+------+---------+ 1458 | | null| null|1246.6641| 1459 | |vinay|deepak|1246.6641| 1460 | |vinay| null|1246.6641| 1461 | 
|parag| akash| 2555.455|
1462 | |parag| null| 2555.455|
1463 | |sunny| null| 4512.32|
1464 | |sunny|pratik| 4512.32|
1465 | +-----+------+---------+
1466 | 
1467 | 
1468 | 
1469 | 
1470 | ```python
1471 | # sample:-
1472 | # Returns a random sample of rows, drawn without replacement,
1473 | # using the given fraction of the DataFrame.
1474 | 
1475 | x = sqlContext.createDataFrame([("vinay","deepak",1246.6641),("sunny","pratik",4512.32),("parag","akash",2555.455)], ['from','to','amt'])
1476 | y = x.sample(False,0.5)
1477 | 
1478 | x.show()
1479 | y.show()
1480 | 
1481 | ```
1482 | 
1483 | +-----+------+---------+
1484 | | from| to| amt|
1485 | +-----+------+---------+
1486 | |vinay|deepak|1246.6641|
1487 | |sunny|pratik| 4512.32|
1488 | |parag| akash| 2555.455|
1489 | +-----+------+---------+
1490 | 
1491 | +-----+------+--------+
1492 | | from| to| amt|
1493 | +-----+------+--------+
1494 | |sunny|pratik| 4512.32|
1495 | |parag| akash|2555.455|
1496 | +-----+------+--------+
1497 | 
1498 | 
1499 | 
1500 | 
1501 | ```python
1502 | #schema
1503 | x = sqlContext.createDataFrame([("vinay","deepak",1246.6641),("sunny","pratik",4512.32),("parag","akash",2555.455)], ['from','to','amt'])
1504 | y = x.schema
1505 | x.show()
1506 | print(y)
1507 | ```
1508 | 
1509 | +-----+------+---------+
1510 | | from| to| amt|
1511 | +-----+------+---------+
1512 | |vinay|deepak|1246.6641|
1513 | |sunny|pratik| 4512.32|
1514 | |parag| akash| 2555.455|
1515 | +-----+------+---------+
1516 | 
1517 | StructType(List(StructField(from,StringType,true),StructField(to,StringType,true),StructField(amt,DoubleType,true)))
1518 | 
1519 | 
1520 | 
1521 | ```python
1522 | # selectExpr
1523 | x = sqlContext.createDataFrame([("vinay","deepak",1246.6641),("sunny","pratik",4512.32),("parag","akash",2555.455)], ['from','to','amt'])
1524 | y = x.selectExpr(['substr(from,1,1)','amt+1000'])
1525 | 
1526 | x.show()
1527 | y.show()
1528 | ```
1529 | 
1530 | +-----+------+---------+
1531 | | from| to| amt|
1532 | +-----+------+---------+
1533 | |vinay|deepak|1246.6641|
1534 | |sunny|pratik| 4512.32|
1535 | |parag| akash| 2555.455|
1536 | +-----+------+---------+
1537 | 
1538 | +---------------------+------------+
1539 | |substring(from, 1, 1)|(amt + 1000)|
1540 | +---------------------+------------+
1541 | | v| 2246.6641|
1542 | | s| 5512.32|
1543 | | p| 3555.455|
1544 | +---------------------+------------+
1545 | 
1546 | 
1547 | 
1548 | 
1549 | ```python
1550 | # show
1551 | 
1552 | x = sqlContext.createDataFrame([("vinay","deepak",1246.6641),("sunny","pratik",4512.32),("parag","akash",2555.455)], ['from','to','amt'])
1553 | x.show()
1554 | ```
1555 | 
1556 | +-----+------+---------+
1557 | | from| to| amt|
1558 | +-----+------+---------+
1559 | |vinay|deepak|1246.6641|
1560 | |sunny|pratik| 4512.32|
1561 | |parag| akash| 2555.455|
1562 | +-----+------+---------+
1563 | 
1564 | 
1565 | 
1566 | 
1567 | ```python
1568 | # sort
1569 | 
1570 | x = sqlContext.createDataFrame([("vinay","deepak",1246.6641),("sunny","pratik",4512.32),("parag","akash",2555.455)], ['from','to','amt'])
1571 | y = x.sort(['amt'])
1572 | 
1573 | x.show()
1574 | y.show()
1575 | ```
1576 | 
1577 | +-----+------+---------+
1578 | | from| to| amt|
1579 | +-----+------+---------+
1580 | |vinay|deepak|1246.6641|
1581 | |sunny|pratik| 4512.32|
1582 | |parag| akash| 2555.455|
1583 | +-----+------+---------+
1584 | 
1585 | +-----+------+---------+
1586 | | from| to| amt|
1587 | +-----+------+---------+
1588 | |vinay|deepak|1246.6641|
1589 | |parag| akash| 2555.455|
1590 | |sunny|pratik| 4512.32|
1591 | 
+-----+------+---------+ 1592 | 1593 | 1594 | 1595 | 1596 | ```python 1597 | # sortWithinPartitions 1598 | x = sqlContext.createDataFrame([('vinay',"Bobby",0.1,1),("Bobby","sunny",0.2,2),("deepak","parag",0.3,2)], \ 1599 | ['from','to','amt','p_id']).repartition(2,'p_id') 1600 | y = x.sortWithinPartitions(['to']) 1601 | x.show() 1602 | y.show() 1603 | print(x.rdd.glom().collect()) # glom() flattens elements on the same partition 1604 | print("\n") 1605 | print(y.rdd.glom().collect()) 1606 | ``` 1607 | 1608 | +------+-----+---+----+ 1609 | | from| to|amt|p_id| 1610 | +------+-----+---+----+ 1611 | | Bobby|sunny|0.2| 2| 1612 | |deepak|parag|0.3| 2| 1613 | | vinay|Bobby|0.1| 1| 1614 | +------+-----+---+----+ 1615 | 1616 | +------+-----+---+----+ 1617 | | from| to|amt|p_id| 1618 | +------+-----+---+----+ 1619 | |deepak|parag|0.3| 2| 1620 | | Bobby|sunny|0.2| 2| 1621 | | vinay|Bobby|0.1| 1| 1622 | +------+-----+---+----+ 1623 | 1624 | [[Row(from='Bobby', to='sunny', amt=0.2, p_id=2), Row(from='deepak', to='parag', amt=0.3, p_id=2)], [Row(from='vinay', to='Bobby', amt=0.1, p_id=1)]] 1625 | 1626 | 1627 | [[Row(from='deepak', to='parag', amt=0.3, p_id=2), Row(from='Bobby', to='sunny', amt=0.2, p_id=2)], [Row(from='vinay', to='Bobby', amt=0.1, p_id=1)]] 1628 | 1629 | 1630 | 1631 | ```python 1632 | # Stat :-Returns a 1633 | # DataFrameStatFunctions for statistic functions. 1634 | 1635 | x = sqlContext.createDataFrame([("vinay","Bobby",0.1,0.001),("Bobby","sunny",0.2,0.02),("sunny","pranav",0.3,0.02)], ['from','to','amt','fees']) 1636 | y = x.stat 1637 | x.show() 1638 | print(y) 1639 | print(y.corr(col1="amt",col2="fees")) 1640 | 1641 | ``` 1642 | 1643 | +-----+------+---+-----+ 1644 | | from| to|amt| fees| 1645 | +-----+------+---+-----+ 1646 | |vinay| Bobby|0.1|0.001| 1647 | |Bobby| sunny|0.2| 0.02| 1648 | |sunny|pranav|0.3| 0.02| 1649 | +-----+------+---+-----+ 1650 | 1651 | 1652 | 0.8660254037844386 1653 | 1654 | 1655 | 1656 | ```python 1657 | # subtract 1658 | 1659 | x = sqlContext.createDataFrame([("vinay","Bobby",0.1,0.001),("Bobby","sunny",0.2,0.02),("sunny","pranav",0.3,0.02)], ['from','to','amt','fees']) 1660 | y = sqlContext.createDataFrame([("vinay","Bobby",0.1,0.001),("Bobby","sunny",0.2,0.02),("sunny","pranav",0.3,0.01)], ['from','to','amt','fees']) 1661 | 1662 | z = x.subtract(y) 1663 | x.show() 1664 | y.show() 1665 | z.show() 1666 | ``` 1667 | 1668 | +-----+------+---+-----+ 1669 | | from| to|amt| fees| 1670 | +-----+------+---+-----+ 1671 | |vinay| Bobby|0.1|0.001| 1672 | |Bobby| sunny|0.2| 0.02| 1673 | |sunny|pranav|0.3| 0.02| 1674 | +-----+------+---+-----+ 1675 | 1676 | +-----+------+---+-----+ 1677 | | from| to|amt| fees| 1678 | +-----+------+---+-----+ 1679 | |vinay| Bobby|0.1|0.001| 1680 | |Bobby| sunny|0.2| 0.02| 1681 | |sunny|pranav|0.3| 0.01| 1682 | +-----+------+---+-----+ 1683 | 1684 | +-----+------+---+----+ 1685 | | from| to|amt|fees| 1686 | +-----+------+---+----+ 1687 | |sunny|pranav|0.3|0.02| 1688 | +-----+------+---+----+ 1689 | 1690 | 1691 | 1692 | 1693 | ```python 1694 | x = sqlContext.createDataFrame([("vinay","Bobby",0.1,0.001),("Bobby","sunny",0.2,0.02),("sunny","pranav",0.3,0.02)], ['from','to','amt','fees']) 1695 | 1696 | y = x.take(num=2) 1697 | x.show() 1698 | print(y) 1699 | ``` 1700 | 1701 | +-----+------+---+-----+ 1702 | | from| to|amt| fees| 1703 | +-----+------+---+-----+ 1704 | |vinay| Bobby|0.1|0.001| 1705 | |Bobby| sunny|0.2| 0.02| 1706 | |sunny|pranav|0.3| 0.02| 1707 | +-----+------+---+-----+ 1708 | 1709 | [Row(from='vinay', to='Bobby', amt=0.1, 
fees=0.001), Row(from='Bobby', to='sunny', amt=0.2, fees=0.02)]
1710 | 
1711 | 
1712 | # Conversions
1713 | 
1714 | 
1715 | ```python
1716 | #toDF
1717 | 
1718 | x = sqlContext.createDataFrame([('Alice',"Bob",0.1),("Bob","Carol",0.2),("Carol","Dave",0.3)], ['from','to','amt'])
1719 | y = x.toDF("seller","buyer","amt")
1720 | x.show()
1721 | y.show()
1722 | ```
1723 | 
1724 | +-----+-----+---+
1725 | | from| to|amt|
1726 | +-----+-----+---+
1727 | |Alice| Bob|0.1|
1728 | | Bob|Carol|0.2|
1729 | |Carol| Dave|0.3|
1730 | +-----+-----+---+
1731 | 
1732 | +------+-----+---+
1733 | |seller|buyer|amt|
1734 | +------+-----+---+
1735 | | Alice| Bob|0.1|
1736 | | Bob|Carol|0.2|
1737 | | Carol| Dave|0.3|
1738 | +------+-----+---+
1739 | 
1740 | 
1741 | 
1742 | 
1743 | ```python
1744 | # toJSON
1745 | x = sqlContext.createDataFrame([('Alice',"Bob",0.1),("Bob","Carol",0.2),("Carol","Alice",0.3)], ['from','to','amt'])
1746 | y = x.toJSON()
1747 | 
1748 | x.show()
1749 | print(y)
1750 | print("\n")
1751 | print(y.collect())
1752 | 
1753 | ```
1754 | 
1755 | +-----+-----+---+
1756 | | from| to|amt|
1757 | +-----+-----+---+
1758 | |Alice| Bob|0.1|
1759 | | Bob|Carol|0.2|
1760 | |Carol|Alice|0.3|
1761 | +-----+-----+---+
1762 | 
1763 | MapPartitionsRDD[193] at toJavaRDD at NativeMethodAccessorImpl.java:0
1764 | 
1765 | 
1766 | ['{"from":"Alice","to":"Bob","amt":0.1}', '{"from":"Bob","to":"Carol","amt":0.2}', '{"from":"Carol","to":"Alice","amt":0.3}']
1767 | 
1768 | 
1769 | 
1770 | ```python
1771 | # toPandas
1772 | 
1773 | x = sqlContext.createDataFrame([('Alice',"Bob",0.1),("Bob","Carol",0.2),("Carol","Alice",0.3)], ['from','to','amt'])
1774 | y = x.toPandas()
1775 | x.show()
1776 | print(type(y))
1777 | y
1778 | ```
1779 | 
1780 | +-----+-----+---+
1781 | | from| to|amt|
1782 | +-----+-----+---+
1783 | |Alice| Bob|0.1|
1784 | | Bob|Carol|0.2|
1785 | |Carol|Alice|0.3|
1786 | +-----+-----+---+
1787 | 
1788 | 
1789 | 
1790 | 
1791 | 
1792 | 
1793 | 
1794 | 
1795 | 
1796 | 
1797 | 
1798 | 
1799 | ```python
1800 | # unionAll
1801 | 
1802 | x = sqlContext.createDataFrame([('Alice',"Bob",0.1),("Bob","Carol",0.2),("Carol","Alice",0.3)], ['from','to','amt'])
1803 | y = sqlContext.createDataFrame([('sunny',"Bob",0.1),("vinay","Carol",0.2),("pranav","Alice",0.3)], ['from','to','amt'])
1804 | 
1805 | z = x.unionAll(y)
1806 | 
1807 | x.show()
1808 | y.show()
1809 | z.show()
1810 | 
1811 | ```
1812 | 
1813 | +-----+-----+---+
1814 | | from| to|amt|
1815 | +-----+-----+---+
1816 | |Alice| Bob|0.1|
1817 | | Bob|Carol|0.2|
1818 | |Carol|Alice|0.3|
1819 | +-----+-----+---+
1820 | 
1821 | +------+-----+---+
1822 | | from| to|amt|
1823 | +------+-----+---+
1824 | | sunny| Bob|0.1|
1825 | | vinay|Carol|0.2|
1826 | |pranav|Alice|0.3|
1827 | +------+-----+---+
1828 | 
1829 | +------+-----+---+
1830 | | from| to|amt|
1831 | +------+-----+---+
1832 | | Alice| Bob|0.1|
1833 | | Bob|Carol|0.2|
1834 | | Carol|Alice|0.3|
1835 | | sunny| Bob|0.1|
1836 | | vinay|Carol|0.2|
1837 | |pranav|Alice|0.3|
1838 | +------+-----+---+
1839 | 
1840 | 
1841 | 
1842 | 
1843 | ```python
1844 | # unpersist
1845 | 
1846 | x = sqlContext.createDataFrame([('Alice',"Bob",0.1),("Bob","Carol",0.2),("Carol","Alice",0.3)], ['from','to','amt'])
1847 | x.cache()
1848 | x.count()
1849 | x.show()
1850 | 
1851 | print(x.is_cached)
1852 | x.unpersist()
1853 | print(x.is_cached)
1854 | ```
1855 | 
1856 | +-----+-----+---+
1857 | | from| to|amt|
1858 | +-----+-----+---+
1859 | |Alice| Bob|0.1|
1860 | | Bob|Carol|0.2|
1861 | |Carol|Alice|0.3|
1862 | +-----+-----+---+
1863 | 
1864 | True
1865 | False
1866 | 
1867 | 
1868 | 
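`persist` sits in the P list above but never gets a cell of its own: it is `cache` with an explicit storage level, and `unpersist` (just shown) is its inverse. A minimal sketch — my addition, not from the original notebook:

```python
# persist -- like cache, but with an explicit StorageLevel
# (a sketch, not from the original notebook)
from pyspark import StorageLevel

x = sqlContext.createDataFrame([('Alice',"Bob",0.1),("Bob","Carol",0.2)], ['from','to','amt'])
x.persist(StorageLevel.MEMORY_AND_DISK)
print(x.count())    # first action materializes x
print(x.is_cached)  # True
x.unpersist()
```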
1869 | ```python
1870 | # where
1871 | x = sqlContext.createDataFrame([('Alice',"Bob",0.1),("Bob","Carol",0.2),("Carol","Alice",0.3)], ['from','to','amt'])
1872 | y = x.where("amt > 0.2")
1873 | 
1874 | x.show()
1875 | y.show()
1876 | ```
1877 | 
1878 | +-----+-----+---+
1879 | | from| to|amt|
1880 | +-----+-----+---+
1881 | |Alice| Bob|0.1|
1882 | | Bob|Carol|0.2|
1883 | |Carol|Alice|0.3|
1884 | +-----+-----+---+
1885 | 
1886 | +-----+-----+---+
1887 | | from| to|amt|
1888 | +-----+-----+---+
1889 | |Carol|Alice|0.3|
1890 | +-----+-----+---+
1891 | 
1892 | 
1893 | 
1894 | 
1895 | ```python
1896 | # withColumn
1897 | 
1898 | x = sqlContext.createDataFrame([('Alice',"Bob",0.1),("Bob","Carol",0.2),("Carol","Alice",0.3)], ['from','to','amt'])
1899 | y = x.withColumn('conf',x.amt.isNotNull())
1900 | 
1901 | x.show()
1902 | y.show()
1903 | ```
1904 | 
1905 | +-----+-----+---+
1906 | | from| to|amt|
1907 | +-----+-----+---+
1908 | |Alice| Bob|0.1|
1909 | | Bob|Carol|0.2|
1910 | |Carol|Alice|0.3|
1911 | +-----+-----+---+
1912 | 
1913 | +-----+-----+---+----+
1914 | | from| to|amt|conf|
1915 | +-----+-----+---+----+
1916 | |Alice| Bob|0.1|true|
1917 | | Bob|Carol|0.2|true|
1918 | |Carol|Alice|0.3|true|
1919 | +-----+-----+---+----+
1920 | 
1921 | 
1922 | 
1923 | 
1924 | ```python
1925 | # withColumnRenamed
1926 | x = sqlContext.createDataFrame([('Alice',"Bob",0.1),("Bob","Carol",0.2),("Carol","Dave",0.3)], ['from','to','amt'])
1927 | y = x.withColumnRenamed('amt','amount')
1928 | x.show()
1929 | y.show()
1930 | ```
1931 | 
1932 | +-----+-----+---+
1933 | | from| to|amt|
1934 | +-----+-----+---+
1935 | |Alice| Bob|0.1|
1936 | | Bob|Carol|0.2|
1937 | |Carol| Dave|0.3|
1938 | +-----+-----+---+
1939 | 
1940 | +-----+-----+------+
1941 | | from| to|amount|
1942 | +-----+-----+------+
1943 | |Alice| Bob| 0.1|
1944 | | Bob|Carol| 0.2|
1945 | |Carol| Dave| 0.3|
1946 | +-----+-----+------+
1947 | 
1948 | 
1949 | 
1950 | 
1951 | ```python
1952 | # write
1953 | 
1954 | x = sqlContext.createDataFrame([('Alice',"Bob",0.1),("Bob","Carol",0.2),("Carol","Dave",0.3)], ['from','to','amt'])
1955 | y = x.write.mode('overwrite').json('./dataframeWriteExample.json')
1956 | x.show()
1957 | 
1958 | 
1959 | # Read the DF back in from file
1960 | sqlContext.read.json('./dataframeWriteExample.json').show()
1961 | ```
1962 | 
1963 | +-----+-----+---+
1964 | | from| to|amt|
1965 | +-----+-----+---+
1966 | |Alice| Bob|0.1|
1967 | | Bob|Carol|0.2|
1968 | |Carol| Dave|0.3|
1969 | +-----+-----+---+
1970 | 
1971 | +---+-----+-----+
1972 | |amt| from| to|
1973 | +---+-----+-----+
1974 | |0.3|Carol| Dave|
1975 | |0.1|Alice| Bob|
1976 | |0.2| Bob|Carol|
1977 | +---+-----+-----+
1978 | 
1979 | 
1980 | 
1981 | 
1982 | ```python
1983 | 
1984 | ```
1985 | 
--------------------------------------------------------------------------------
/SPARK/Dataframe/spark dataframe - All Most API Practice.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# SPARK DATAFRAME API WITH PYTHON (Zero to Hero)\n",
8 | "\n",
9 | "#### - Vinay Chaudhari "
10 | ]
11 | },
12 | {
13 | "cell_type": "markdown",
14 | "metadata": {},
15 | "source": [
16 | "\n",
17 | "### Learn the Dataframe API in a FUN WAY!\n",
\n", 18 | "HAPPY LEARNING ☺\n", 19 | "======================================================================================================\n", 20 | "\n", 21 | "# A\n", 22 | "### ○ agg \n", 23 | "### ○ alias \n", 24 | "### ○ agg\n", 25 | "\n", 26 | "======================================================================================================\n", 27 | "# C\n", 28 | "### ○ cache\n", 29 | "### ○ coalesce\n", 30 | "### ○ columns\n", 31 | "### ○ corr\n", 32 | "### ○ count\n", 33 | "### ○ cov\n", 34 | "### ○ crosstab\n", 35 | "### ○ cube\n", 36 | "### ○ coalesce\n", 37 | "======================================================================================================\n", 38 | "# D\n", 39 | "### ○ describe\n", 40 | "### ○ destinct\n", 41 | "### ○ drop\n", 42 | "### ○ dropDuplicates\n", 43 | "### ○ dropna\n", 44 | "### ○ dtypes\n", 45 | "======================================================================================================\n", 46 | "\n", 47 | "# E\n", 48 | "### ○ explain\n", 49 | "======================================================================================================\n", 50 | "\n", 51 | "# F\n", 52 | "### ○ fillna\n", 53 | "### ○ filter\n", 54 | "### ○ first\n", 55 | "### ○ flatmap\n", 56 | "### ○ foreach\n", 57 | "### ○ foreachPartition\n", 58 | "### ○ freqItems\n", 59 | "======================================================================================================\n", 60 | "# G\n", 61 | "### ○ groupBy \n", 62 | "======================================================================================================\n", 63 | "# H\n", 64 | "### ○ head\n", 65 | "======================================================================================================\n", 66 | "# I\n", 67 | "### ○ intersect\n", 68 | "### ○ isLocal\n", 69 | "======================================================================================================\n", 70 | "# J\n", 71 | "### ○ join\n", 72 | "======================================================================================================\n", 73 | "# L\n", 74 | "### ○ limit\n", 75 | "======================================================================================================\n", 76 | "# M\n", 77 | "### ○ map\n", 78 | "### ○ mapPartitions\n", 79 | "======================================================================================================\n", 80 | "# N\n", 81 | "### ○ na\n", 82 | "======================================================================================================\n", 83 | "# O\n", 84 | "### ○ orderBy\n", 85 | "======================================================================================================\n", 86 | "# P\n", 87 | "### ○ persist\n", 88 | "### ○ printSchema\n", 89 | "======================================================================================================\n", 90 | "# R\n", 91 | "### ○ randomSplit\n", 92 | "### ○ rdd\n", 93 | "### ○ registerTempTable\n", 94 | "### ○ repartition\n", 95 | "### ○ replace\n", 96 | "### ○ rollup\n", 97 | "======================================================================================================\n", 98 | "# S\n", 99 | "### ○ sample\n", 100 | "### ○ sampleBy\n", 101 | "### ○ schema\n", 102 | "### ○ select\n", 103 | "### ○ selectExpr\n", 104 | "### ○ show\n", 105 | "### ○ sort\n", 106 | "### ○ sortWithPartitions\n", 107 | "### ○ stat\n", 108 | "### ○ subtract\n", 109 | "======================================================================================================\n", 110 | "## ✔ 
CONVERSIONS \n", 111 | "======================================================================================================\n", 112 | "\n", 113 | "# T\n", 114 | "### ○ take\n", 115 | "### ○ toDF\n", 116 | "### ○ toJSON\n", 117 | "### ○ toPANDAS\n", 118 | "======================================================================================================\n", 119 | "# U\n", 120 | "### ○ unionAll\n", 121 | "### ○ upersist\n", 122 | "======================================================================================================\n", 123 | "# W\n", 124 | "### ○ where(filter)\n", 125 | "### ○ withColumn\n", 126 | "### ○ withColumnRenamed\n", 127 | "### ○ write\n", 128 | "\n", 129 | "**** IF YOU WANT TO CONTRIBUTE IN THIS FILE YOU ARE WELCOME :) ****" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": 1, 135 | "metadata": {}, 136 | "outputs": [], 137 | "source": [ 138 | "import IPython \n" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": 3, 144 | "metadata": {}, 145 | "outputs": [ 146 | { 147 | "name": "stdout", 148 | "output_type": "stream", 149 | "text": [ 150 | "+------+------+------+\n", 151 | "| from| to|amount|\n", 152 | "+------+------+------+\n", 153 | "| vinay| sunny| 100|\n", 154 | "|deepak| parag| 200|\n", 155 | "| akash|pravin| 300|\n", 156 | "+------+------+------+\n", 157 | "\n", 158 | "+-----------+\n", 159 | "|avg(amount)|\n", 160 | "+-----------+\n", 161 | "| 200.0|\n", 162 | "+-----------+\n", 163 | "\n" 164 | ] 165 | } 166 | ], 167 | "source": [ 168 | "#agg\n", 169 | "\n", 170 | "x = sqlContext.createDataFrame([(\"vinay\",\"sunny\",100),(\"deepak\",\"parag\",200),(\"akash\",\"pravin\",300)],['from','to','amount'])\n", 171 | "y = x.agg({\"amount\":\"avg\"})\n", 172 | "\n", 173 | "x.show()\n", 174 | "y.show()" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": 6, 180 | "metadata": {}, 181 | "outputs": [ 182 | { 183 | "name": "stdout", 184 | "output_type": "stream", 185 | "text": [ 186 | "+------+------+------+\n", 187 | "| from| to|amount|\n", 188 | "+------+------+------+\n", 189 | "| vinay| sunny| 100|\n", 190 | "|deepak| parag| 200|\n", 191 | "| akash|pravin| 300|\n", 192 | "+------+------+------+\n", 193 | "\n", 194 | "+------+\n", 195 | "| to|\n", 196 | "+------+\n", 197 | "| sunny|\n", 198 | "| parag|\n", 199 | "|pravin|\n", 200 | "+------+\n", 201 | "\n" 202 | ] 203 | } 204 | ], 205 | "source": [ 206 | "#alias\n", 207 | "from pyspark.sql.functions import col\n", 208 | "x = sqlContext.createDataFrame([(\"vinay\",\"sunny\",100),(\"deepak\",\"parag\",200),(\"akash\",\"pravin\",300)],['from','to','amount'])\n", 209 | "y = x.alias(\"transactions\")\n", 210 | "\n", 211 | "x.show()\n", 212 | "y.select(col(\"transactions.to\")).show()" 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": 7, 218 | "metadata": {}, 219 | "outputs": [ 220 | { 221 | "name": "stdout", 222 | "output_type": "stream", 223 | "text": [ 224 | "3\n", 225 | "3\n" 226 | ] 227 | } 228 | ], 229 | "source": [ 230 | "#cache \n", 231 | "\n", 232 | "x = sqlContext.createDataFrame([(\"vinay\",\"sunny\",100),(\"deepak\",\"parag\",200),(\"akash\",\"pravin\",300)],['from','to','amount'])\n", 233 | "x.cache()\n", 234 | "\n", 235 | "print(x.count()) #first action materializes x in memory\n", 236 | "print(x.count()) #later actions avoid IO overhead" 237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": 10, 242 | "metadata": { 243 | "scrolled": true 244 | }, 245 | "outputs": [ 246 | { 247 
| "name": "stdout", 248 | "output_type": "stream", 249 | "text": [ 250 | "4\n", 251 | "1\n", 252 | "+------+------+------+\n", 253 | "| from| to|amount|\n", 254 | "+------+------+------+\n", 255 | "| vinay| sunny| 100|\n", 256 | "|deepak| parag| 200|\n", 257 | "| akash|pravin| 300|\n", 258 | "+------+------+------+\n", 259 | "\n", 260 | "+------+------+------+\n", 261 | "| from| to|amount|\n", 262 | "+------+------+------+\n", 263 | "| vinay| sunny| 100|\n", 264 | "|deepak| parag| 200|\n", 265 | "| akash|pravin| 300|\n", 266 | "+------+------+------+\n", 267 | "\n" 268 | ] 269 | } 270 | ], 271 | "source": [ 272 | "#coalesce\n", 273 | "\n", 274 | "x_rdd = sc.parallelize([(\"vinay\",\"sunny\",100),(\"deepak\",\"parag\",200),(\"akash\",\"pravin\",300)],4)\n", 275 | "x = sqlContext.createDataFrame(x_rdd,['from','to','amount'])\n", 276 | "y = x.coalesce(numPartitions=1)\n", 277 | "\n", 278 | "print(x.rdd.getNumPartitions())\n", 279 | "print(y.rdd.getNumPartitions())\n", 280 | "\n", 281 | "x.show()\n", 282 | "y.show()" 283 | ] 284 | }, 285 | { 286 | "cell_type": "code", 287 | "execution_count": 13, 288 | "metadata": {}, 289 | "outputs": [ 290 | { 291 | "name": "stdout", 292 | "output_type": "stream", 293 | "text": [ 294 | "+------+------+------+\n", 295 | "| from| to|amount|\n", 296 | "+------+------+------+\n", 297 | "| vinay| sunny| 100|\n", 298 | "|deepak| parag| 200|\n", 299 | "| akash|pravin| 300|\n", 300 | "+------+------+------+\n", 301 | "\n", 302 | "[Row(from='vinay', to='sunny', amount=100), Row(from='deepak', to='parag', amount=200), Row(from='akash', to='pravin', amount=300)]\n" 303 | ] 304 | } 305 | ], 306 | "source": [ 307 | "#collect\n", 308 | "\n", 309 | "x = sqlContext.createDataFrame([(\"vinay\",\"sunny\",100),(\"deepak\",\"parag\",200),(\"akash\",\"pravin\",300)],['from','to','amount'])\n", 310 | "y = x.collect() # it creates list of rows.\n", 311 | "x.show()\n", 312 | "print(y)" 313 | ] 314 | }, 315 | { 316 | "cell_type": "code", 317 | "execution_count": 14, 318 | "metadata": {}, 319 | "outputs": [ 320 | { 321 | "name": "stdout", 322 | "output_type": "stream", 323 | "text": [ 324 | "+------+------+------+\n", 325 | "| from| to|amount|\n", 326 | "+------+------+------+\n", 327 | "| vinay| sunny| 100|\n", 328 | "|deepak| parag| 200|\n", 329 | "| akash|pravin| 300|\n", 330 | "+------+------+------+\n", 331 | "\n", 332 | "['from', 'to', 'amount']\n" 333 | ] 334 | } 335 | ], 336 | "source": [ 337 | "#columns \n", 338 | "\n", 339 | "x = sqlContext.createDataFrame([(\"vinay\",\"sunny\",100),(\"deepak\",\"parag\",200),(\"akash\",\"pravin\",300)],['from','to','amount'])\n", 340 | "\n", 341 | "y = x.columns\n", 342 | "x.show()\n", 343 | "print(y)" 344 | ] 345 | }, 346 | { 347 | "cell_type": "code", 348 | "execution_count": 18, 349 | "metadata": {}, 350 | "outputs": [ 351 | { 352 | "name": "stdout", 353 | "output_type": "stream", 354 | "text": [ 355 | "+------+------+------+----+\n", 356 | "| from| to|amount|fees|\n", 357 | "+------+------+------+----+\n", 358 | "| vinay| sunny| 100| 300|\n", 359 | "|deepak| parag| 200| 600|\n", 360 | "| akash|pravin| 300| 900|\n", 361 | "+------+------+------+----+\n", 362 | "\n", 363 | "1.0\n" 364 | ] 365 | } 366 | ], 367 | "source": [ 368 | "#corr : Calculates the correlation of\n", 369 | "# two columns of a DataFrame as a double value. 
\n", 370 | "\n", 371 | "x = sqlContext.createDataFrame([(\"vinay\",\"sunny\",100,300),(\"deepak\",\"parag\",200,600),(\"akash\",\"pravin\",300,900)], ['from','to','amount','fees'])\n", 372 | "y = x.corr(col1=\"amount\",col2=\"fees\")\n", 373 | "x.show()\n", 374 | "print(y)" 375 | ] 376 | }, 377 | { 378 | "cell_type": "code", 379 | "execution_count": 19, 380 | "metadata": {}, 381 | "outputs": [ 382 | { 383 | "name": "stdout", 384 | "output_type": "stream", 385 | "text": [ 386 | "+------+------+------+\n", 387 | "| from| to|amount|\n", 388 | "+------+------+------+\n", 389 | "| vinay| sunny| 100|\n", 390 | "|deepak| parag| 200|\n", 391 | "| akash|pravin| 300|\n", 392 | "+------+------+------+\n", 393 | "\n", 394 | "3\n" 395 | ] 396 | } 397 | ], 398 | "source": [ 399 | "#count \n", 400 | "#Returns the number of rows in this DataFrame.\n", 401 | "x = sqlContext.createDataFrame([(\"vinay\",\"sunny\",100),(\"deepak\",\"parag\",200),(\"akash\",\"pravin\",300)],['from','to','amount'])\n", 402 | "x.show()\n", 403 | "print(x.count())" 404 | ] 405 | }, 406 | { 407 | "cell_type": "code", 408 | "execution_count": 20, 409 | "metadata": {}, 410 | "outputs": [ 411 | { 412 | "name": "stdout", 413 | "output_type": "stream", 414 | "text": [ 415 | "+------+------+------+----+\n", 416 | "| from| to|amount|fees|\n", 417 | "+------+------+------+----+\n", 418 | "| vinay| sunny| 100| 300|\n", 419 | "|deepak| parag| 200| 600|\n", 420 | "| akash|pravin| 300| 900|\n", 421 | "+------+------+------+----+\n", 422 | "\n", 423 | "30000.0\n" 424 | ] 425 | } 426 | ], 427 | "source": [ 428 | "#cov\n", 429 | "#Calculate the sample covariance for the given columns,\n", 430 | "#specified by their names, as a double value. \n", 431 | "\n", 432 | "x = sqlContext.createDataFrame([(\"vinay\",\"sunny\",100,300),(\"deepak\",\"parag\",200,600),(\"akash\",\"pravin\",300,900)], ['from','to','amount','fees'])\n", 433 | "y = x.cov(col1=\"amount\",col2=\"fees\")\n", 434 | "\n", 435 | "x.show()\n", 436 | "print(y)" 437 | ] 438 | }, 439 | { 440 | "cell_type": "code", 441 | "execution_count": 24, 442 | "metadata": {}, 443 | "outputs": [ 444 | { 445 | "name": "stdout", 446 | "output_type": "stream", 447 | "text": [ 448 | "+-----+------+---+\n", 449 | "| from| to|amt|\n", 450 | "+-----+------+---+\n", 451 | "|vinay|deepak|0.1|\n", 452 | "|sunny|pratik|0.2|\n", 453 | "|parag| akash|0.3|\n", 454 | "+-----+------+---+\n", 455 | "\n", 456 | "+-------+-----+------+------+\n", 457 | "|from_to|akash|deepak|pratik|\n", 458 | "+-------+-----+------+------+\n", 459 | "| parag| 1| 0| 0|\n", 460 | "| vinay| 0| 1| 0|\n", 461 | "| sunny| 0| 0| 1|\n", 462 | "+-------+-----+------+------+\n", 463 | "\n" 464 | ] 465 | } 466 | ], 467 | "source": [ 468 | "#crosstab\n", 469 | "x = sqlContext.createDataFrame([(\"vinay\",\"deepak\",0.1),(\"sunny\",\"pratik\",0.2),(\"parag\",\"akash\",0.3)], ['from','to','amt'])\n", 470 | "y = x.crosstab(col1='from',col2='to')\n", 471 | "x.show()\n", 472 | "y.show()\n" 473 | ] 474 | }, 475 | { 476 | "cell_type": "markdown", 477 | "metadata": {}, 478 | "source": [ 479 | "### col1 – The name of the first column. Distinct items will make the first item of each row.\n", 480 | "### col2 – The name of the second column. Distinct items will make the column names of the DataFrame." 
481 | ] 482 | }, 483 | { 484 | "cell_type": "code", 485 | "execution_count": 27, 486 | "metadata": {}, 487 | "outputs": [ 488 | { 489 | "name": "stdout", 490 | "output_type": "stream", 491 | "text": [ 492 | "+-----+------+---+\n", 493 | "| from| to|amt|\n", 494 | "+-----+------+---+\n", 495 | "|vinay|deepak| 1|\n", 496 | "|sunny|pratik| 2|\n", 497 | "|parag| akash| 3|\n", 498 | "+-----+------+---+\n", 499 | "\n", 500 | "\n", 501 | "+-----+------+--------+\n", 502 | "| from| to|sum(amt)|\n", 503 | "+-----+------+--------+\n", 504 | "| null| akash| 3|\n", 505 | "| null| null| 6|\n", 506 | "|vinay|deepak| 1|\n", 507 | "|vinay| null| 1|\n", 508 | "| null|deepak| 1|\n", 509 | "|parag| akash| 3|\n", 510 | "| null|pratik| 2|\n", 511 | "|parag| null| 3|\n", 512 | "|sunny| null| 2|\n", 513 | "|sunny|pratik| 2|\n", 514 | "+-----+------+--------+\n", 515 | "\n", 516 | "+-----+------+--------+\n", 517 | "| from| to|max(amt)|\n", 518 | "+-----+------+--------+\n", 519 | "| null| akash| 3|\n", 520 | "| null| null| 3|\n", 521 | "|vinay|deepak| 1|\n", 522 | "|vinay| null| 1|\n", 523 | "| null|deepak| 1|\n", 524 | "|parag| akash| 3|\n", 525 | "| null|pratik| 2|\n", 526 | "|parag| null| 3|\n", 527 | "|sunny| null| 2|\n", 528 | "|sunny|pratik| 2|\n", 529 | "+-----+------+--------+\n", 530 | "\n" 531 | ] 532 | } 533 | ], 534 | "source": [ 535 | "#cube\n", 536 | "\n", 537 | "# Create a multi-dimensional cube for the current DataFrame using the specified columns,\n", 538 | "# so we can run aggregation on them\n", 539 | "\n", 540 | "x = sqlContext.createDataFrame([(\"vinay\",\"deepak\",1),(\"sunny\",\"pratik\",2),(\"parag\",\"akash\",3)], ['from','to','amt'])\n", 541 | "\n", 542 | "y = x.cube('from','to')\n", 543 | "x.show()\n", 544 | "print(y)\n", 545 | "y.sum().show()\n", 546 | "y.max().show()" 547 | ] 548 | }, 549 | { 550 | "cell_type": "code", 551 | "execution_count": 28, 552 | "metadata": {}, 553 | "outputs": [ 554 | { 555 | "name": "stdout", 556 | "output_type": "stream", 557 | "text": [ 558 | "+-----+------+---+\n", 559 | "| from| to|amt|\n", 560 | "+-----+------+---+\n", 561 | "|vinay|deepak| 1|\n", 562 | "|sunny|pratik| 2|\n", 563 | "|parag| akash| 3|\n", 564 | "+-----+------+---+\n", 565 | "\n", 566 | "+-------+-----+------+---+\n", 567 | "|summary| from| to|amt|\n", 568 | "+-------+-----+------+---+\n", 569 | "| count| 3| 3| 3|\n", 570 | "| mean| null| null|2.0|\n", 571 | "| stddev| null| null|1.0|\n", 572 | "| min|parag| akash| 1|\n", 573 | "| max|vinay|pratik| 3|\n", 574 | "+-------+-----+------+---+\n", 575 | "\n" 576 | ] 577 | } 578 | ], 579 | "source": [ 580 | "# Describe \n", 581 | "\n", 582 | "x = sqlContext.createDataFrame([(\"vinay\",\"deepak\",1),(\"sunny\",\"pratik\",2),(\"parag\",\"akash\",3)], ['from','to','amt'])\n", 583 | "\n", 584 | "x.show()\n", 585 | "x.describe().show()" 586 | ] 587 | }, 588 | { 589 | "cell_type": "code", 590 | "execution_count": 30, 591 | "metadata": {}, 592 | "outputs": [ 593 | { 594 | "name": "stdout", 595 | "output_type": "stream", 596 | "text": [ 597 | "+-----+------+---+\n", 598 | "| from| to|amt|\n", 599 | "+-----+------+---+\n", 600 | "|vinay|deepak| 1|\n", 601 | "|sunny|pratik| 2|\n", 602 | "|parag| akash| 3|\n", 603 | "|parag| akash| 3|\n", 604 | "|parag| akash| 3|\n", 605 | "+-----+------+---+\n", 606 | "\n", 607 | "+-----+------+---+\n", 608 | "| from| to|amt|\n", 609 | "+-----+------+---+\n", 610 | "|sunny|pratik| 2|\n", 611 | "|vinay|deepak| 1|\n", 612 | "|parag| akash| 3|\n", 613 | "+-----+------+---+\n", 614 | "\n" 615 | ] 616 | } 617 | ], 618 | 
"source": [ 619 | "# Distinct \n", 620 | "\n", 621 | "x = sqlContext.createDataFrame([(\"vinay\",\"deepak\",1),(\"sunny\",\"pratik\",2),(\"parag\",\"akash\",3),(\"parag\",\"akash\",3),(\"parag\",\"akash\",3)], ['from','to','amt'])\n", 622 | "y = x.distinct()\n", 623 | "\n", 624 | "x.show()\n", 625 | "y.show()" 626 | ] 627 | }, 628 | { 629 | "cell_type": "code", 630 | "execution_count": 32, 631 | "metadata": {}, 632 | "outputs": [ 633 | { 634 | "name": "stdout", 635 | "output_type": "stream", 636 | "text": [ 637 | "+-----+------+---+\n", 638 | "| from| to|amt|\n", 639 | "+-----+------+---+\n", 640 | "|vinay|deepak| 1|\n", 641 | "|sunny|pratik| 2|\n", 642 | "|parag| akash| 3|\n", 643 | "+-----+------+---+\n", 644 | "\n", 645 | "+-----+------+\n", 646 | "| from| to|\n", 647 | "+-----+------+\n", 648 | "|vinay|deepak|\n", 649 | "|sunny|pratik|\n", 650 | "|parag| akash|\n", 651 | "+-----+------+\n", 652 | "\n" 653 | ] 654 | } 655 | ], 656 | "source": [ 657 | "# Drop \n", 658 | "\n", 659 | "x = sqlContext.createDataFrame([(\"vinay\",\"deepak\",1),(\"sunny\",\"pratik\",2),(\"parag\",\"akash\",3)], ['from','to','amt'])\n", 660 | "y = x.drop('amt')\n", 661 | "\n", 662 | "x.show()\n", 663 | "y.show()" 664 | ] 665 | }, 666 | { 667 | "cell_type": "code", 668 | "execution_count": 1, 669 | "metadata": {}, 670 | "outputs": [ 671 | { 672 | "name": "stdout", 673 | "output_type": "stream", 674 | "text": [ 675 | "+-----+------+---+\n", 676 | "| from| to|amt|\n", 677 | "+-----+------+---+\n", 678 | "|vinay|deepak| 1|\n", 679 | "|sunny|pratik| 2|\n", 680 | "|parag| akash| 3|\n", 681 | "|parag| akash| 3|\n", 682 | "|parag| akash| 3|\n", 683 | "+-----+------+---+\n", 684 | "\n", 685 | "+-----+------+---+\n", 686 | "| from| to|amt|\n", 687 | "+-----+------+---+\n", 688 | "|vinay|deepak| 1|\n", 689 | "|sunny|pratik| 2|\n", 690 | "|parag| akash| 3|\n", 691 | "+-----+------+---+\n", 692 | "\n" 693 | ] 694 | } 695 | ], 696 | "source": [ 697 | "# dropDuplicates\n", 698 | "\n", 699 | "x = sqlContext.createDataFrame([(\"vinay\",\"deepak\",1),(\"sunny\",\"pratik\",2),(\"parag\",\"akash\",3),(\"parag\",\"akash\",3),(\"parag\",\"akash\",3)], ['from','to','amt'])\n", 700 | "y = x.dropDuplicates(subset=['from','to'])\n", 701 | "\n", 702 | "x.show()\n", 703 | "y.show()" 704 | ] 705 | }, 706 | { 707 | "cell_type": "code", 708 | "execution_count": 4, 709 | "metadata": {}, 710 | "outputs": [ 711 | { 712 | "name": "stdout", 713 | "output_type": "stream", 714 | "text": [ 715 | "+-----+-----+------+\n", 716 | "| from| to|amount|\n", 717 | "+-----+-----+------+\n", 718 | "| null|vinay| 0.1|\n", 719 | "|vinay|sunny| null|\n", 720 | "|Peter| null| 0.3|\n", 721 | "| Mark|Steve| 0.2|\n", 722 | "+-----+-----+------+\n", 723 | "\n", 724 | "+-----+-----+------+\n", 725 | "| from| to|amount|\n", 726 | "+-----+-----+------+\n", 727 | "|vinay|sunny| null|\n", 728 | "| Mark|Steve| 0.2|\n", 729 | "+-----+-----+------+\n", 730 | "\n" 731 | ] 732 | } 733 | ], 734 | "source": [ 735 | "#dropna \n", 736 | "x = sqlContext.createDataFrame([(None,\"vinay\",0.1),(\"vinay\",\"sunny\",None),(\"Peter\",None,0.3),(\"Mark\",\"Steve\",0.2)], ['from','to','amount'])\n", 737 | "y = x.dropna(how='any',subset=['from','to'])\n", 738 | "x.show()\n", 739 | "y.show()" 740 | ] 741 | }, 742 | { 743 | "cell_type": "code", 744 | "execution_count": 5, 745 | "metadata": {}, 746 | "outputs": [ 747 | { 748 | "name": "stdout", 749 | "output_type": "stream", 750 | "text": [ 751 | "+-----+------+---+\n", 752 | "| from| to|amt|\n", 753 | "+-----+------+---+\n", 754 | 
"|vinay|deepak| 1|\n", 755 | "|sunny|pratik| 2|\n", 756 | "|parag| akash| 3|\n", 757 | "|parag| akash| 3|\n", 758 | "|parag| akash| 3|\n", 759 | "+-----+------+---+\n", 760 | "\n", 761 | "[('from', 'string'), ('to', 'string'), ('amt', 'bigint')]\n" 762 | ] 763 | } 764 | ], 765 | "source": [ 766 | "#dtypes\n", 767 | "\n", 768 | "x = sqlContext.createDataFrame([(\"vinay\",\"deepak\",1),(\"sunny\",\"pratik\",2),(\"parag\",\"akash\",3),(\"parag\",\"akash\",3),(\"parag\",\"akash\",3)], ['from','to','amt'])\n", 769 | "y = x.dtypes\n", 770 | "\n", 771 | "x.show()\n", 772 | "print(y)" 773 | ] 774 | }, 775 | { 776 | "cell_type": "code", 777 | "execution_count": 8, 778 | "metadata": {}, 779 | "outputs": [ 780 | { 781 | "name": "stdout", 782 | "output_type": "stream", 783 | "text": [ 784 | "+-----+------+---+\n", 785 | "| from| to|amt|\n", 786 | "+-----+------+---+\n", 787 | "|vinay|deepak| 1|\n", 788 | "|sunny|pratik| 2|\n", 789 | "|parag| akash| 3|\n", 790 | "+-----+------+---+\n", 791 | "\n", 792 | "== Parsed Logical Plan ==\n", 793 | "'Aggregate ['avg(amt#169L) AS avg(amt)#187]\n", 794 | "+- AnalysisBarrier\n", 795 | " +- LogicalRDD [from#167, to#168, amt#169L], false\n", 796 | "\n", 797 | "== Analyzed Logical Plan ==\n", 798 | "avg(amt): double\n", 799 | "Aggregate [avg(amt#169L) AS avg(amt)#187]\n", 800 | "+- LogicalRDD [from#167, to#168, amt#169L], false\n", 801 | "\n", 802 | "== Optimized Logical Plan ==\n", 803 | "Aggregate [avg(amt#169L) AS avg(amt)#187]\n", 804 | "+- Project [amt#169L]\n", 805 | " +- LogicalRDD [from#167, to#168, amt#169L], false\n", 806 | "\n", 807 | "== Physical Plan ==\n", 808 | "*(2) HashAggregate(keys=[], functions=[avg(amt#169L)], output=[avg(amt)#187])\n", 809 | "+- Exchange SinglePartition\n", 810 | " +- *(1) HashAggregate(keys=[], functions=[partial_avg(amt#169L)], output=[sum#192, count#193L])\n", 811 | " +- *(1) Project [amt#169L]\n", 812 | " +- Scan ExistingRDD[from#167,to#168,amt#169L]\n" 813 | ] 814 | } 815 | ], 816 | "source": [ 817 | "#Explain\n", 818 | "\n", 819 | "x = sqlContext.createDataFrame([(\"vinay\",\"deepak\",1),(\"sunny\",\"pratik\",2),(\"parag\",\"akash\",3)], ['from','to','amt'])\n", 820 | "x.show()\n", 821 | "\n", 822 | "x.agg({\"amt\":\"avg\"}).explain(extended = True)\n", 823 | "\n" 824 | ] 825 | }, 826 | { 827 | "cell_type": "code", 828 | "execution_count": 10, 829 | "metadata": {}, 830 | "outputs": [ 831 | { 832 | "name": "stdout", 833 | "output_type": "stream", 834 | "text": [ 835 | "+-----+------+---+\n", 836 | "| from| to|amt|\n", 837 | "+-----+------+---+\n", 838 | "| null|deepak| 1|\n", 839 | "|sunny| null| 2|\n", 840 | "|parag| null| 3|\n", 841 | "+-----+------+---+\n", 842 | "\n", 843 | "+-----+------+---+\n", 844 | "| from| to|amt|\n", 845 | "+-----+------+---+\n", 846 | "| ---|deepak| 1|\n", 847 | "|sunny| ---| 2|\n", 848 | "|parag| ---| 3|\n", 849 | "+-----+------+---+\n", 850 | "\n" 851 | ] 852 | } 853 | ], 854 | "source": [ 855 | "#fillna \n", 856 | "\n", 857 | "x = sqlContext.createDataFrame([(None,\"deepak\",1),(\"sunny\",None,2),(\"parag\",None,3)], ['from','to','amt'])\n", 858 | "y = x.fillna(value = '---',subset = ['from','to'])\n", 859 | "\n", 860 | "x.show()\n", 861 | "y.show()" 862 | ] 863 | }, 864 | { 865 | "cell_type": "markdown", 866 | "metadata": {}, 867 | "source": [ 868 | "## Filter (Most used api)" 869 | ] 870 | }, 871 | { 872 | "cell_type": "code", 873 | "execution_count": 11, 874 | "metadata": {}, 875 | "outputs": [ 876 | { 877 | "name": "stdout", 878 | "output_type": "stream", 879 | "text": [ 880 | 
"+-----+------+---+\n", 881 | "| from| to|amt|\n", 882 | "+-----+------+---+\n", 883 | "|vinay|deepak| 1|\n", 884 | "|sunny|pratik| 2|\n", 885 | "|parag| akash| 3|\n", 886 | "+-----+------+---+\n", 887 | "\n", 888 | "+-----+-----+---+\n", 889 | "| from| to|amt|\n", 890 | "+-----+-----+---+\n", 891 | "|parag|akash| 3|\n", 892 | "+-----+-----+---+\n", 893 | "\n" 894 | ] 895 | } 896 | ], 897 | "source": [ 898 | "# Filter \n", 899 | "\n", 900 | "x = sqlContext.createDataFrame([(\"vinay\",\"deepak\",1),(\"sunny\",\"pratik\",2),(\"parag\",\"akash\",3)], ['from','to','amt'])\n", 901 | "y = x.filter(\"amt > 2 \")\n", 902 | "\n", 903 | "x.show()\n", 904 | "y.show()" 905 | ] 906 | }, 907 | { 908 | "cell_type": "code", 909 | "execution_count": 12, 910 | "metadata": {}, 911 | "outputs": [ 912 | { 913 | "name": "stdout", 914 | "output_type": "stream", 915 | "text": [ 916 | "+-----+------+---+\n", 917 | "| from| to|amt|\n", 918 | "+-----+------+---+\n", 919 | "|vinay|deepak| 1|\n", 920 | "|sunny|pratik| 2|\n", 921 | "|parag| akash| 3|\n", 922 | "+-----+------+---+\n", 923 | "\n", 924 | "Row(from='vinay', to='deepak', amt=1)\n" 925 | ] 926 | } 927 | ], 928 | "source": [ 929 | "# First\n", 930 | "\n", 931 | "x = sqlContext.createDataFrame([(\"vinay\",\"deepak\",1),(\"sunny\",\"pratik\",2),(\"parag\",\"akash\",3)], ['from','to','amt'])\n", 932 | "y = x.first()\n", 933 | "\n", 934 | "x.show()\n", 935 | "print(y)" 936 | ] 937 | }, 938 | { 939 | "cell_type": "markdown", 940 | "metadata": {}, 941 | "source": [ 942 | "## Foreach " 943 | ] 944 | }, 945 | { 946 | "cell_type": "code", 947 | "execution_count": 16, 948 | "metadata": {}, 949 | "outputs": [ 950 | { 951 | "name": "stdout", 952 | "output_type": "stream", 953 | "text": [ 954 | "+-----+------+---+\n", 955 | "| from| to|amt|\n", 956 | "+-----+------+---+\n", 957 | "|vinay|deepak| 1|\n", 958 | "|sunny|pratik| 2|\n", 959 | "|parag| akash| 3|\n", 960 | "+-----+------+---+\n", 961 | "\n", 962 | "None\n", 963 | "Row(from='vinay', to='deepak', amt=1)\n", 964 | "Row(from='sunny', to='pratik', amt=2)\n", 965 | "\n" 966 | ] 967 | } 968 | ], 969 | "source": [ 970 | "# foreach\n", 971 | "from __future__ import print_function\n", 972 | "\n", 973 | "# setup\n", 974 | "fn = './foreachExampleDataFrames.txt' \n", 975 | "open(fn, 'w').close() # clear the file\n", 976 | "def fappend(el,f):\n", 977 | " '''appends el to file f'''\n", 978 | " print(el,file=open(f, 'a+') )\n", 979 | "\n", 980 | "# example\n", 981 | "x = sqlContext.createDataFrame([(\"vinay\",\"deepak\",1),(\"sunny\",\"pratik\",2),(\"parag\",\"akash\",3)], ['from','to','amt'])\n", 982 | "\n", 983 | "y = x.foreach(lambda x: fappend(x,fn)) # writes into foreachExampleDataFrames.txt\n", 984 | "x.show() # original dataframe\n", 985 | "print(y) # foreach returns 'None'\n", 986 | "# print the contents of the file\n", 987 | "with open(fn, \"r\") as foreachExample:\n", 988 | " print (foreachExample.read())" 989 | ] 990 | }, 991 | { 992 | "cell_type": "code", 993 | "execution_count": 1, 994 | "metadata": {}, 995 | "outputs": [ 996 | { 997 | "name": "stdout", 998 | "output_type": "stream", 999 | "text": [ 1000 | "+-----+------+---+\n", 1001 | "| from| to|amt|\n", 1002 | "+-----+------+---+\n", 1003 | "|vinay|deepak| 1|\n", 1004 | "|sunny|pratik| 2|\n", 1005 | "|parag| akash| 3|\n", 1006 | "+-----+------+---+\n", 1007 | "\n", 1008 | "None\n", 1009 | "Row(from='parag', to='akash', amt=3)\n", 1010 | "Row(from='sunny', to='pratik', amt=2)\n", 1011 | "Row(from='vinay', to='deepak', amt=1)\n", 1012 | "\n" 1013 | ] 1014 | } 1015 
| ], 1016 | "source": [ 1017 | "# foreachPartition\n", 1018 | "from __future__ import print_function\n", 1019 | "\n", 1020 | "# setup\n", 1021 | "fn = './foreachExampleDataFrames.txt' \n", 1022 | "open(fn, 'w').close() # clear the file\n", 1023 | "def fappend(el,f):\n", 1024 | " '''appends el to file f'''\n", 1025 | " print(el,file=open(f, 'a+') )\n", 1026 | "\n", 1027 | "# example\n", 1028 | "x = sqlContext.createDataFrame([(\"vinay\",\"deepak\",1),(\"sunny\",\"pratik\",2),(\"parag\",\"akash\",3)], ['from','to','amt'])\n", 1029 | "\n", 1030 | "y = x.foreachPartition(lambda rows: [fappend(el,fn) for el in rows]) # writes each partition's rows into foreachExampleDataFrames.txt\n", 1031 | "x.show() # original dataframe\n", 1032 | "print(y) # foreachPartition returns 'None'\n", 1033 | "# print the contents of the file\n", 1034 | "with open(fn, \"r\") as foreachExample:\n", 1035 | " print(foreachExample.read())" 1036 | ] 1037 | }, 1038 | { 1039 | "cell_type": "code", 1040 | "execution_count": 3, 1041 | "metadata": {}, 1042 | "outputs": [ 1043 | { 1044 | "name": "stdout", 1045 | "output_type": "stream", 1046 | "text": [ 1047 | "+-------+-------+------+\n", 1048 | "| from| to|amount|\n", 1049 | "+-------+-------+------+\n", 1050 | "| Vinay| sunny| 50|\n", 1051 | "| Deepak| sunny| 30|\n", 1052 | "| Vinay| Parag| 20|\n", 1053 | "| Vinay| ram| 50|\n", 1054 | "| sham| sunny| 90|\n", 1055 | "| Vinay|pushpak| 50|\n", 1056 | "| om| sunny| 50|\n", 1057 | "| sagar| sunny| 50|\n", 1058 | "| Vinay| rahul| 80|\n", 1059 | "| akash| sunny| 50|\n", 1060 | "|puranik| pranav| 70|\n", 1061 | "+-------+-------+------+\n", 1062 | "\n", 1063 | "+--------------+----------------+\n", 1064 | "|from_freqItems|amount_freqItems|\n", 1065 | "+--------------+----------------+\n", 1066 | "| [Vinay]| [50]|\n", 1067 | "+--------------+----------------+\n", 1068 | "\n" 1069 | ] 1070 | } 1071 | ], 1072 | "source": [ 1073 | "# freqItems \n", 1074 | "\n", 1075 | "x = sqlContext.createDataFrame([(\"Vinay\",\"sunny\",50), \\\n", 1076 | " (\"Deepak\",\"sunny\",30), \\\n", 1077 | " (\"Vinay\",\"Parag\",20), \\\n", 1078 | " (\"Vinay\",\"ram\",50), \\\n", 1079 | " (\"sham\",\"sunny\",90), \\\n", 1080 | " (\"Vinay\",\"pushpak\",50), \\\n", 1081 | " (\"om\",\"sunny\",50), \\\n", 1082 | " (\"sagar\",\"sunny\",50), \\\n", 1083 | " (\"Vinay\",\"rahul\",80), \\\n", 1084 | " (\"akash\",\"sunny\",50), \\\n", 1085 | " (\"puranik\",\"pranav\",70)],\\\n", 1086 | " ['from','to','amount'])\n", 1087 | "\n", 1088 | "y = x.freqItems(cols=['from','amount'],support=0.8) # approximate; may report items below the support threshold\n", 1089 | "\n", 1090 | "x.show()\n", 1091 | "y.show()" 1092 | ] 1093 | }, 1094 | { 1095 | "cell_type": "markdown", 1096 | "metadata": {}, 1097 | "source": [ 1098 | "## groupBy (most used API)" 1099 | ] 1100 | }, 1101 | { 1102 | "cell_type": "code", 1103 | "execution_count": 7, 1104 | "metadata": {}, 1105 | "outputs": [ 1106 | { 1107 | "name": "stdout", 1108 | "output_type": "stream", 1109 | "text": [ 1110 | "+-----+------+---+\n", 1111 | "| from| to|amt|\n", 1112 | "+-----+------+---+\n", 1113 | "|vinay|deepak| 1|\n", 1114 | "|sunny|pratik| 2|\n", 1115 | "|parag| akash| 3|\n", 1116 | "+-----+------+---+\n", 1117 | "\n", 1118 | "<pyspark.sql.group.GroupedData object at 0x...>\n" 1119 | ] 1120 | } 1121 | ], 1122 | "source": [ 1123 | "# groupBy\n", 1124 | "\n", 1125 | "x = sqlContext.createDataFrame([(\"vinay\",\"deepak\",1),(\"sunny\",\"pratik\",2),(\"parag\",\"akash\",3)], ['from','to','amt'])\n", 1126 | "y = x.groupBy('amt')\n", 1127 | "\n", 1128 | "x.show()\n", 1129 | "print(y)\n" 1130 | ] 1131 | }, 1132 | { 1133 | "cell_type": "code", 1134 | "execution_count": 9, 1135 | "metadata": {}, 1136 
| "outputs": [ 1137 | { 1138 | "name": "stdout", 1139 | "output_type": "stream", 1140 | "text": [ 1141 | "+-----+------+--------+\n", 1142 | "| from| to| amt|\n", 1143 | "+-----+------+--------+\n", 1144 | "|vinay|deepak|12466641|\n", 1145 | "|sunny|pratik| 451232|\n", 1146 | "|parag| akash| 2555455|\n", 1147 | "+-----+------+--------+\n", 1148 | "\n", 1149 | "+-----+-----------+\n", 1150 | "| from| avg(amt)|\n", 1151 | "+-----+-----------+\n", 1152 | "|parag| 2555455.0|\n", 1153 | "|sunny| 451232.0|\n", 1154 | "|vinay|1.2466641E7|\n", 1155 | "+-----+-----------+\n", 1156 | "\n" 1157 | ] 1158 | } 1159 | ], 1160 | "source": [ 1161 | "# groupBy (col1).avg(col2)\n", 1162 | "\n", 1163 | "x = sqlContext.createDataFrame([(\"vinay\",\"deepak\",12466641),(\"sunny\",\"pratik\",451232),(\"parag\",\"akash\",2555455)], ['from','to','amt'])\n", 1164 | "y = x.groupBy('from').avg('amt')\n", 1165 | "\n", 1166 | "x.show()\n", 1167 | "y.show()\n" 1168 | ] 1169 | }, 1170 | { 1171 | "cell_type": "code", 1172 | "execution_count": 10, 1173 | "metadata": {}, 1174 | "outputs": [ 1175 | { 1176 | "name": "stdout", 1177 | "output_type": "stream", 1178 | "text": [ 1179 | "+-----+------+--------+\n", 1180 | "| from| to| amt|\n", 1181 | "+-----+------+--------+\n", 1182 | "|vinay|deepak|12466641|\n", 1183 | "|sunny|pratik| 451232|\n", 1184 | "|parag| akash| 2555455|\n", 1185 | "+-----+------+--------+\n", 1186 | "\n", 1187 | "[Row(from='vinay', to='deepak', amt=12466641), Row(from='sunny', to='pratik', amt=451232)]\n" 1188 | ] 1189 | } 1190 | ], 1191 | "source": [ 1192 | "# head\n", 1193 | "\n", 1194 | "x = sqlContext.createDataFrame([(\"vinay\",\"deepak\",12466641),(\"sunny\",\"pratik\",451232),(\"parag\",\"akash\",2555455)], ['from','to','amt'])\n", 1195 | "\n", 1196 | "y = x.head(2)\n", 1197 | "x.show()\n", 1198 | "print(y)" 1199 | ] 1200 | }, 1201 | { 1202 | "cell_type": "code", 1203 | "execution_count": 14, 1204 | "metadata": { 1205 | "scrolled": true 1206 | }, 1207 | "outputs": [ 1208 | { 1209 | "name": "stdout", 1210 | "output_type": "stream", 1211 | "text": [ 1212 | "+-----+------+--------+\n", 1213 | "| from| to| amt|\n", 1214 | "+-----+------+--------+\n", 1215 | "|vinay|deepak|12466641|\n", 1216 | "|sunny|pratik| 451232|\n", 1217 | "|parag| akash| 2555455|\n", 1218 | "|parag| akash| 2555455|\n", 1219 | "+-----+------+--------+\n", 1220 | "\n", 1221 | "+-----+-------+--------+\n", 1222 | "| from| to| amt|\n", 1223 | "+-----+-------+--------+\n", 1224 | "|vinay| deepak|12466641|\n", 1225 | "|sunny| pratik| 451232|\n", 1226 | "|parag| akash| 2555455|\n", 1227 | "|parag|akashay| 2555455|\n", 1228 | "+-----+-------+--------+\n", 1229 | "\n", 1230 | "+-----+------+--------+\n", 1231 | "| from| to| amt|\n", 1232 | "+-----+------+--------+\n", 1233 | "|sunny|pratik| 451232|\n", 1234 | "|vinay|deepak|12466641|\n", 1235 | "|parag| akash| 2555455|\n", 1236 | "+-----+------+--------+\n", 1237 | "\n" 1238 | ] 1239 | } 1240 | ], 1241 | "source": [ 1242 | "# intersect\n", 1243 | "\n", 1244 | "x = sqlContext.createDataFrame([(\"vinay\",\"deepak\",12466641),(\"sunny\",\"pratik\",451232),(\"parag\",\"akash\",2555455),(\"parag\",\"akash\",2555455)], ['from','to','amt'])\n", 1245 | "\n", 1246 | "y = sqlContext.createDataFrame([(\"vinay\",\"deepak\",12466641),(\"sunny\",\"pratik\",451232),(\"parag\",\"akash\",2555455),(\"parag\",\"akashay\",2555455)], ['from','to','amt'])\n", 1247 | "\n", 1248 | "z = x.intersect(y)\n", 1249 | "\n", 1250 | "x.show()\n", 1251 | "y.show()\n", 1252 | "z.show()" 1253 | ] 1254 | }, 1255 | { 1256 | 
"cell_type": "code", 1257 | "execution_count": 17, 1258 | "metadata": {}, 1259 | "outputs": [ 1260 | { 1261 | "name": "stdout", 1262 | "output_type": "stream", 1263 | "text": [ 1264 | "+-----+------+--------+\n", 1265 | "| from| to| amt|\n", 1266 | "+-----+------+--------+\n", 1267 | "|vinay|deepak|12466641|\n", 1268 | "|sunny|pratik| 451232|\n", 1269 | "|parag| akash| 2555455|\n", 1270 | "|parag| akash| 2555455|\n", 1271 | "+-----+------+--------+\n", 1272 | "\n", 1273 | "False\n" 1274 | ] 1275 | } 1276 | ], 1277 | "source": [ 1278 | "# isLocal\n", 1279 | "\n", 1280 | "x = sqlContext.createDataFrame([(\"vinay\",\"deepak\",12466641),(\"sunny\",\"pratik\",451232),(\"parag\",\"akash\",2555455),(\"parag\",\"akash\",2555455)], ['from','to','amt'])\n", 1281 | "\n", 1282 | "\n", 1283 | "y = x.isLocal()\n", 1284 | "\n", 1285 | "x.show()\n", 1286 | "print(y)\n" 1287 | ] 1288 | }, 1289 | { 1290 | "cell_type": "markdown", 1291 | "metadata": {}, 1292 | "source": [ 1293 | "## join (Most used api)" 1294 | ] 1295 | }, 1296 | { 1297 | "cell_type": "code", 1298 | "execution_count": 19, 1299 | "metadata": {}, 1300 | "outputs": [ 1301 | { 1302 | "name": "stdout", 1303 | "output_type": "stream", 1304 | "text": [ 1305 | "+------+------+--------+\n", 1306 | "| from| to| amt|\n", 1307 | "+------+------+--------+\n", 1308 | "| vinay|deepak|12466641|\n", 1309 | "| sunny|pratik| 451232|\n", 1310 | "| parag| akash| 2555455|\n", 1311 | "|Salman| akash| 2555455|\n", 1312 | "+------+------+--------+\n", 1313 | "\n", 1314 | "+-----+---+\n", 1315 | "| name|age|\n", 1316 | "+-----+---+\n", 1317 | "| Andy| 20|\n", 1318 | "|Steve| 40|\n", 1319 | "| Elon| 80|\n", 1320 | "+-----+---+\n", 1321 | "\n", 1322 | "+----+---+---+---+\n", 1323 | "|from| to|amt|age|\n", 1324 | "+----+---+---+---+\n", 1325 | "+----+---+---+---+\n", 1326 | "\n" 1327 | ] 1328 | } 1329 | ], 1330 | "source": [ 1331 | "# join \n", 1332 | "x = sqlContext.createDataFrame([(\"vinay\",\"deepak\",12466641),(\"sunny\",\"pratik\",451232),(\"parag\",\"akash\",2555455),(\"Salman\",\"akash\",2555455)], ['from','to','amt'])\n", 1333 | "y = sqlContext.createDataFrame([('Andy',20),(\"Steve\",40),(\"Elon\",80)], ['name','age'])\n", 1334 | "z = x.join(y,x.to ==y.name,'inner').select('from','to','amt','age')\n", 1335 | "x.show()\n", 1336 | "y.show()\n", 1337 | "z.show()" 1338 | ] 1339 | }, 1340 | { 1341 | "cell_type": "code", 1342 | "execution_count": 20, 1343 | "metadata": {}, 1344 | "outputs": [ 1345 | { 1346 | "name": "stdout", 1347 | "output_type": "stream", 1348 | "text": [ 1349 | "+------+------+--------+\n", 1350 | "| from| to| amt|\n", 1351 | "+------+------+--------+\n", 1352 | "| vinay|deepak|12466641|\n", 1353 | "| sunny|pratik| 451232|\n", 1354 | "| parag| akash| 2555455|\n", 1355 | "|Salman| akash| 2555455|\n", 1356 | "+------+------+--------+\n", 1357 | "\n", 1358 | "+-----+---+\n", 1359 | "| name|age|\n", 1360 | "+-----+---+\n", 1361 | "| Andy| 20|\n", 1362 | "|Steve| 40|\n", 1363 | "| Elon| 80|\n", 1364 | "+-----+---+\n", 1365 | "\n", 1366 | "+------+------+--------+----+\n", 1367 | "| from| to| amt| age|\n", 1368 | "+------+------+--------+----+\n", 1369 | "| null| null| null| 40|\n", 1370 | "| sunny|pratik| 451232|null|\n", 1371 | "| vinay|deepak|12466641|null|\n", 1372 | "| null| null| null| 20|\n", 1373 | "| parag| akash| 2555455|null|\n", 1374 | "|Salman| akash| 2555455|null|\n", 1375 | "| null| null| null| 80|\n", 1376 | "+------+------+--------+----+\n", 1377 | "\n" 1378 | ] 1379 | } 1380 | ], 1381 | "source": [ 1382 | "# join \n", 1383 | "x = 
sqlContext.createDataFrame([(\"vinay\",\"deepak\",12466641),(\"sunny\",\"pratik\",451232),(\"parag\",\"akash\",2555455),(\"Salman\",\"akash\",2555455)], ['from','to','amt'])\n", 1384 | "y = sqlContext.createDataFrame([('Andy',20),(\"Steve\",40),(\"Elon\",80)], ['name','age'])\n", 1385 | "z = x.join(y,x.to == y.name,'outer').select('from','to','amt','age')\n", 1386 | "x.show()\n", 1387 | "y.show()\n", 1388 | "z.show()" 1389 | ] 1390 | }, 1391 | { 1392 | "cell_type": "code", 1393 | "execution_count": 21, 1394 | "metadata": {}, 1395 | "outputs": [ 1396 | { 1397 | "name": "stdout", 1398 | "output_type": "stream", 1399 | "text": [ 1400 | "+------+------+--------+\n", 1401 | "| from| to| amt|\n", 1402 | "+------+------+--------+\n", 1403 | "| vinay|deepak|12466641|\n", 1404 | "| sunny|pratik| 451232|\n", 1405 | "| parag| akash| 2555455|\n", 1406 | "|Salman| akash| 2555455|\n", 1407 | "+------+------+--------+\n", 1408 | "\n", 1409 | "+-----+------+--------+\n", 1410 | "| from| to| amt|\n", 1411 | "+-----+------+--------+\n", 1412 | "|vinay|deepak|12466641|\n", 1413 | "|sunny|pratik| 451232|\n", 1414 | "+-----+------+--------+\n", 1415 | "\n" 1416 | ] 1417 | } 1418 | ], 1419 | "source": [ 1420 | "# Limit\n", 1421 | "\n", 1422 | "# limit(2) returns a new DataFrame with only the first two rows\n", 1423 | "x = sqlContext.createDataFrame([(\"vinay\",\"deepak\",12466641),(\"sunny\",\"pratik\",451232),(\"parag\",\"akash\",2555455),(\"Salman\",\"akash\",2555455)], ['from','to','amt'])\n", 1424 | "y = x.limit(2)\n", 1425 | "x.show()\n", 1426 | "y.show()" 1427 | ] 1428 | }, 1429 | { 1430 | "cell_type": "markdown", 1431 | "metadata": {}, 1432 | "source": [] 1433 | }, 1434 | { 1435 | "cell_type": "code", 1436 | "execution_count": 27, 1437 | "metadata": {}, 1438 | "outputs": [ 1439 | { 1440 | "name": "stdout", 1441 | "output_type": "stream", 1442 | "text": [ 1443 | "+-----+-----+----+\n", 1444 | "| from| to| amt|\n", 1445 | "+-----+-----+----+\n", 1446 | "| null| Bob| 0.1|\n", 1447 | "| Bob|Carol|null|\n", 1448 | "|Carol| null| 0.3|\n", 1449 | "| Bob|Carol| 0.2|\n", 1450 | "+-----+-----+----+\n", 1451 | "\n", 1452 | "<pyspark.sql.dataframe.DataFrameNaFunctions object at 0x...>\n", 1453 | "+----+-----+---+\n", 1454 | "|from| to|amt|\n", 1455 | "+----+-----+---+\n", 1456 | "| Bob|Carol|0.2|\n", 1457 | "+----+-----+---+\n", 1458 | "\n", 1459 | "+-------+-------+---+\n", 1460 | "| from| to|amt|\n", 1461 | "+-------+-------+---+\n", 1462 | "|unknown| Bob|0.1|\n", 1463 | "| Bob| Carol|0.0|\n", 1464 | "| Carol|unknown|0.3|\n", 1465 | "| Bob| Carol|0.2|\n", 1466 | "+-------+-------+---+\n", 1467 | "\n", 1468 | "+-----+-----+----+\n", 1469 | "| from| to| amt|\n", 1470 | "+-----+-----+----+\n", 1471 | "| --| Bob| 0.1|\n", 1472 | "| Bob|Carol|null|\n", 1473 | "|Carol| --| 0.3|\n", 1474 | "| Bob|Carol| 0.2|\n", 1475 | "+-----+-----+----+\n", 1476 | "\n" 1477 | ] 1478 | } 1479 | ], 1480 | "source": [ 1481 | "# na\n", 1482 | "\n", 1483 | "x = sqlContext.createDataFrame([(None,\"Bob\",0.1),(\"Bob\",\"Carol\",None),(\"Carol\",None,0.3),(\"Bob\",\"Carol\",0.2)], ['from','to','amt'])\n", 1484 | "y = x.na # returns an object for handling missing values, supports drop, fill, and replace methods\n", 1485 | "x.show()\n", 1486 | "print(y)\n", 1487 | "y.drop().show()\n", 1488 | "\n", 1489 | "y.fill({'from':'unknown','to':'unknown','amt':0}).show()\n", 1490 | "y.fill('--').show()" 1491 | ] 1492 | }, 1493 | { 1494 | "cell_type": "code", 1495 | "execution_count": 4, 1496 | "metadata": {}, 1497 | "outputs": [ 1498 | { 1499 | "name": "stdout", 1500 | "output_type": "stream", 1501 | "text": [ 1502 | "+-----+------+--------+\n", 1503 | "| from| to| 
amt|\n", 1504 | "+-----+------+--------+\n", 1505 | "|vinay|deepak|12466641|\n", 1506 | "|sunny|pratik| 451232|\n", 1507 | "|parag| akash| 2555455|\n", 1508 | "+-----+------+--------+\n", 1509 | "\n", 1510 | "+-----+------+--------+\n", 1511 | "| from| to| amt|\n", 1512 | "+-----+------+--------+\n", 1513 | "|vinay|deepak|12466641|\n", 1514 | "|parag| akash| 2555455|\n", 1515 | "|sunny|pratik| 451232|\n", 1516 | "+-----+------+--------+\n", 1517 | "\n", 1518 | "+-----+------+--------+\n", 1519 | "| from| to| amt|\n", 1520 | "+-----+------+--------+\n", 1521 | "|sunny|pratik| 451232|\n", 1522 | "|parag| akash| 2555455|\n", 1523 | "|vinay|deepak|12466641|\n", 1524 | "+-----+------+--------+\n", 1525 | "\n" 1526 | ] 1527 | } 1528 | ], 1529 | "source": [ 1530 | "# orderBy\n", 1531 | "\n", 1532 | "x = sqlContext.createDataFrame([(\"vinay\",\"deepak\",12466641),(\"sunny\",\"pratik\",451232),(\"parag\",\"akash\",2555455)], ['from','to','amt'])\n", 1533 | "\n", 1534 | "y = x.orderBy(['amt'],ascending=[False])\n", 1535 | "z = x.orderBy(['amt'],ascending=[True])\n", 1536 | "x.show()\n", 1537 | "y.show()\n", 1538 | "z.show()\n" 1539 | ] 1540 | }, 1541 | { 1542 | "cell_type": "code", 1543 | "execution_count": 5, 1544 | "metadata": {}, 1545 | "outputs": [ 1546 | { 1547 | "name": "stdout", 1548 | "output_type": "stream", 1549 | "text": [ 1550 | "+-----+------+--------+\n", 1551 | "| from| to| amt|\n", 1552 | "+-----+------+--------+\n", 1553 | "|vinay|deepak|12466641|\n", 1554 | "|sunny|pratik| 451232|\n", 1555 | "|parag| akash| 2555455|\n", 1556 | "+-----+------+--------+\n", 1557 | "\n", 1558 | "root\n", 1559 | " |-- from: string (nullable = true)\n", 1560 | " |-- to: string (nullable = true)\n", 1561 | " |-- amt: long (nullable = true)\n", 1562 | "\n" 1563 | ] 1564 | } 1565 | ], 1566 | "source": [ 1567 | "# PrintSchema\n", 1568 | "\n", 1569 | "x = sqlContext.createDataFrame([(\"vinay\",\"deepak\",12466641),(\"sunny\",\"pratik\",451232),(\"parag\",\"akash\",2555455)], ['from','to','amt'])\n", 1570 | "x.show()\n", 1571 | "x.printSchema()" 1572 | ] 1573 | }, 1574 | { 1575 | "cell_type": "code", 1576 | "execution_count": 6, 1577 | "metadata": {}, 1578 | "outputs": [ 1579 | { 1580 | "name": "stdout", 1581 | "output_type": "stream", 1582 | "text": [ 1583 | "+-----+------+--------+\n", 1584 | "| from| to| amt|\n", 1585 | "+-----+------+--------+\n", 1586 | "|vinay|deepak|12466641|\n", 1587 | "|sunny|pratik| 451232|\n", 1588 | "|parag| akash| 2555455|\n", 1589 | "+-----+------+--------+\n", 1590 | "\n", 1591 | "+-----+------+-------+\n", 1592 | "| from| to| amt|\n", 1593 | "+-----+------+-------+\n", 1594 | "|sunny|pratik| 451232|\n", 1595 | "|parag| akash|2555455|\n", 1596 | "+-----+------+-------+\n", 1597 | "\n", 1598 | "+-----+------+--------+\n", 1599 | "| from| to| amt|\n", 1600 | "+-----+------+--------+\n", 1601 | "|vinay|deepak|12466641|\n", 1602 | "+-----+------+--------+\n", 1603 | "\n" 1604 | ] 1605 | } 1606 | ], 1607 | "source": [ 1608 | "# randomSplit\n", 1609 | "\n", 1610 | "x = sqlContext.createDataFrame([(\"vinay\",\"deepak\",12466641),(\"sunny\",\"pratik\",451232),(\"parag\",\"akash\",2555455)], ['from','to','amt'])\n", 1611 | "y = x.randomSplit([0.5,0.5])\n", 1612 | "\n", 1613 | "x.show()\n", 1614 | "y[0].show()\n", 1615 | "y[1].show()\n", 1616 | "\n", 1617 | "\n" 1618 | ] 1619 | }, 1620 | { 1621 | "cell_type": "code", 1622 | "execution_count": 7, 1623 | "metadata": {}, 1624 | "outputs": [ 1625 | { 1626 | "name": "stdout", 1627 | "output_type": "stream", 1628 | "text": [ 1629 | 
"+-----+------+--------+\n", 1630 | "| from| to| amt|\n", 1631 | "+-----+------+--------+\n", 1632 | "|vinay|deepak|12466641|\n", 1633 | "|sunny|pratik| 451232|\n", 1634 | "|parag| akash| 2555455|\n", 1635 | "+-----+------+--------+\n", 1636 | "\n", 1637 | "[Row(from='vinay', to='deepak', amt=12466641), Row(from='sunny', to='pratik', amt=451232), Row(from='parag', to='akash', amt=2555455)]\n" 1638 | ] 1639 | } 1640 | ], 1641 | "source": [ 1642 | "# rdd \n", 1643 | "\n", 1644 | "x = sqlContext.createDataFrame([(\"vinay\",\"deepak\",12466641),(\"sunny\",\"pratik\",451232),(\"parag\",\"akash\",2555455)], ['from','to','amt'])\n", 1645 | "y = x.rdd\n", 1646 | "\n", 1647 | "x.show()\n", 1648 | "print(y.collect())\n" 1649 | ] 1650 | }, 1651 | { 1652 | "cell_type": "code", 1653 | "execution_count": 9, 1654 | "metadata": {}, 1655 | "outputs": [ 1656 | { 1657 | "name": "stdout", 1658 | "output_type": "stream", 1659 | "text": [ 1660 | "+-----+------+--------+\n", 1661 | "| from| to| amt|\n", 1662 | "+-----+------+--------+\n", 1663 | "|vinay|deepak|12466641|\n", 1664 | "|sunny|pratik| 451232|\n", 1665 | "|parag| akash| 2555455|\n", 1666 | "+-----+------+--------+\n", 1667 | "\n", 1668 | "+-----+------+--------+\n", 1669 | "| from| to| amt|\n", 1670 | "+-----+------+--------+\n", 1671 | "|vinay|deepak|12466641|\n", 1672 | "|parag| akash| 2555455|\n", 1673 | "+-----+------+--------+\n", 1674 | "\n" 1675 | ] 1676 | } 1677 | ], 1678 | "source": [ 1679 | "# registerTempTable\n", 1680 | "\n", 1681 | "x = sqlContext.createDataFrame([(\"vinay\",\"deepak\",12466641),(\"sunny\",\"pratik\",451232),(\"parag\",\"akash\",2555455)], ['from','to','amt'])\n", 1682 | "x.registerTempTable(name=\"TRANS\")\n", 1683 | "y = sqlContext.sql('SELECT * FROM TRANS WHERE amt > 451232')\n", 1684 | "\n", 1685 | "x.show()\n", 1686 | "y.show()" 1687 | ] 1688 | }, 1689 | { 1690 | "cell_type": "code", 1691 | "execution_count": 11, 1692 | "metadata": {}, 1693 | "outputs": [ 1694 | { 1695 | "name": "stdout", 1696 | "output_type": "stream", 1697 | "text": [ 1698 | "4\n", 1699 | "3\n", 1700 | "+-----+------+--------+\n", 1701 | "| from| to| amt|\n", 1702 | "+-----+------+--------+\n", 1703 | "|parag| akash| 2555455|\n", 1704 | "|vinay|deepak|12466641|\n", 1705 | "|sunny|pratik| 451232|\n", 1706 | "+-----+------+--------+\n", 1707 | "\n" 1708 | ] 1709 | } 1710 | ], 1711 | "source": [ 1712 | "# repartiton \n", 1713 | "\n", 1714 | "x = sqlContext.createDataFrame([(\"vinay\",\"deepak\",12466641),(\"sunny\",\"pratik\",451232),(\"parag\",\"akash\",2555455)], ['from','to','amt'])\n", 1715 | "y = x.repartition(3)\n", 1716 | "\n", 1717 | "print(x.rdd.getNumPartitions())\n", 1718 | "print(y.rdd.getNumPartitions())\n", 1719 | "y.show()" 1720 | ] 1721 | }, 1722 | { 1723 | "cell_type": "code", 1724 | "execution_count": 14, 1725 | "metadata": {}, 1726 | "outputs": [ 1727 | { 1728 | "name": "stdout", 1729 | "output_type": "stream", 1730 | "text": [ 1731 | "+-----+------+--------+\n", 1732 | "| from| to| amt|\n", 1733 | "+-----+------+--------+\n", 1734 | "|vinay|deepak|12466641|\n", 1735 | "|sunny|pratik| 451232|\n", 1736 | "|parag| akash| 2555455|\n", 1737 | "+-----+------+--------+\n", 1738 | "\n", 1739 | "+-----+------+--------+\n", 1740 | "| from| to| amt|\n", 1741 | "+-----+------+--------+\n", 1742 | "|sunny|deepak|12466641|\n", 1743 | "|sunny|pratik| 451232|\n", 1744 | "|parag| akash| 2555455|\n", 1745 | "+-----+------+--------+\n", 1746 | "\n" 1747 | ] 1748 | } 1749 | ], 1750 | "source": [ 1751 | "# replace\n", 1752 | "\n", 1753 | "x = 
sqlContext.createDataFrame([(\"vinay\",\"deepak\",12466641),(\"sunny\",\"pratik\",451232),(\"parag\",\"akash\",2555455)], ['from','to','amt'])\n", 1754 | "y = x.replace('vinay','sunny',['from','to'])\n", 1755 | "\n", 1756 | "x.show()\n", 1757 | "y.show()\n", 1758 | "\n" 1759 | ] 1760 | }, 1761 | { 1762 | "cell_type": "code", 1763 | "execution_count": 16, 1764 | "metadata": {}, 1765 | "outputs": [ 1766 | { 1767 | "name": "stdout", 1768 | "output_type": "stream", 1769 | "text": [ 1770 | "+------+------+---+\n", 1771 | "| from| to|amt|\n", 1772 | "+------+------+---+\n", 1773 | "| Sunny|chirag|0.1|\n", 1774 | "|deepak| vinay|0.2|\n", 1775 | "| Carol| Dave|0.3|\n", 1776 | "+------+------+---+\n", 1777 | "\n", 1778 | "+------+------+---+\n", 1779 | "| from| to|amt|\n", 1780 | "+------+------+---+\n", 1781 | "|Pranav|chirag|0.1|\n", 1782 | "|deepak| vinay|0.2|\n", 1783 | "| Carol| Dave|0.3|\n", 1784 | "+------+------+---+\n", 1785 | "\n" 1786 | ] 1787 | } 1788 | ], 1789 | "source": [ 1790 | "# replace\n", 1791 | "\n", 1792 | "x = sqlContext.createDataFrame([('Sunny',\"chirag\",0.1),(\"deepak\",\"vinay\",0.2),(\"Carol\",\"Dave\",0.3)], ['from','to','amt'])\n", 1793 | "y = x.replace('Sunny','Pranav',['from','to'])\n", 1794 | "\n", 1795 | "x.show()\n", 1796 | "y.show()\n" 1797 | ] 1798 | }, 1799 | { 1800 | "cell_type": "code", 1801 | "execution_count": 18, 1802 | "metadata": {}, 1803 | "outputs": [ 1804 | { 1805 | "name": "stdout", 1806 | "output_type": "stream", 1807 | "text": [ 1808 | "+-----+------+---------+\n", 1809 | "| from| to| amt|\n", 1810 | "+-----+------+---------+\n", 1811 | "|vinay|deepak|1246.6641|\n", 1812 | "|sunny|pratik| 4512.32|\n", 1813 | "|parag| akash| 2555.455|\n", 1814 | "+-----+------+---------+\n", 1815 | "\n", 1816 | "\n", 1817 | "+-----+------+---------+\n", 1818 | "| from| to| sum(amt)|\n", 1819 | "+-----+------+---------+\n", 1820 | "| null| null|8314.4391|\n", 1821 | "|vinay|deepak|1246.6641|\n", 1822 | "|vinay| null|1246.6641|\n", 1823 | "|parag| akash| 2555.455|\n", 1824 | "|parag| null| 2555.455|\n", 1825 | "|sunny| null| 4512.32|\n", 1826 | "|sunny|pratik| 4512.32|\n", 1827 | "+-----+------+---------+\n", 1828 | "\n", 1829 | "+-----+------+---------+\n", 1830 | "| from| to| max(amt)|\n", 1831 | "+-----+------+---------+\n", 1832 | "| null| null| 4512.32|\n", 1833 | "|vinay|deepak|1246.6641|\n", 1834 | "|vinay| null|1246.6641|\n", 1835 | "|parag| akash| 2555.455|\n", 1836 | "|parag| null| 2555.455|\n", 1837 | "|sunny| null| 4512.32|\n", 1838 | "|sunny|pratik| 4512.32|\n", 1839 | "+-----+------+---------+\n", 1840 | "\n", 1841 | "+-----+------+---------+\n", 1842 | "| from| to| min(amt)|\n", 1843 | "+-----+------+---------+\n", 1844 | "| null| null|1246.6641|\n", 1845 | "|vinay|deepak|1246.6641|\n", 1846 | "|vinay| null|1246.6641|\n", 1847 | "|parag| akash| 2555.455|\n", 1848 | "|parag| null| 2555.455|\n", 1849 | "|sunny| null| 4512.32|\n", 1850 | "|sunny|pratik| 4512.32|\n", 1851 | "+-----+------+---------+\n", 1852 | "\n" 1853 | ] 1854 | } 1855 | ], 1856 | "source": [ 1857 | "#rollup\n", 1858 | "\n", 1859 | "x = sqlContext.createDataFrame([(\"vinay\",\"deepak\",1246.6641),(\"sunny\",\"pratik\",4512.32),(\"parag\",\"akash\",2555.455)], ['from','to','amt'])\n", 1860 | "y = x.rollup(['from','to'])\n", 1861 | "x.show()\n", 1862 | "\n", 1863 | "print(y)\n", 1864 | "#y is a grouped data object \n", 1865 | "#aggregations will be applied to all numerical columns\n", 1866 | "\n", 1867 | "y.sum().show()\n", 1868 | "y.max().show()\n", 1869 | "y.min().show()" 1870 | ] 1871 | 
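}, { "cell_type": "markdown", "metadata": {}, "source": [ "`rollup` builds hierarchical grouping sets only - the (from,to) pairs, per-`from` subtotals, and the grand total - while `cube` also adds the per-`to` combinations. A minimal sketch of the difference (assuming the same `sqlContext` session; the row counts in the comments follow from the grouping sets):" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# rollup: hierarchical grouping sets; cube: every combination of the grouping columns\n", "x2 = sqlContext.createDataFrame([(\"vinay\",\"deepak\",1),(\"sunny\",\"pratik\",2)], ['from','to','amt'])\n", "print(x2.rollup('from','to').count().count()) # 5 rows: 2 pairs + 2 'from' subtotals + grand total\n", "print(x2.cube('from','to').count().count())   # 7 rows: adds the 2 'to'-only subtotals" ]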
}, 1872 | { 1873 | "cell_type": "code", 1874 | "execution_count": 19, 1875 | "metadata": {}, 1876 | "outputs": [ 1877 | { 1878 | "name": "stdout", 1879 | "output_type": "stream", 1880 | "text": [ 1881 | "+-----+------+---------+\n", 1882 | "| from| to| amt|\n", 1883 | "+-----+------+---------+\n", 1884 | "|vinay|deepak|1246.6641|\n", 1885 | "|sunny|pratik| 4512.32|\n", 1886 | "|parag| akash| 2555.455|\n", 1887 | "+-----+------+---------+\n", 1888 | "\n", 1889 | "+-----+------+--------+\n", 1890 | "| from| to| amt|\n", 1891 | "+-----+------+--------+\n", 1892 | "|sunny|pratik| 4512.32|\n", 1893 | "|parag| akash|2555.455|\n", 1894 | "+-----+------+--------+\n", 1895 | "\n" 1896 | ] 1897 | } 1898 | ], 1899 | "source": [ 1900 | "# sample\n", 1901 | "# Returns a randomly sampled subset of rows: sample(withReplacement, fraction, seed=None) -\n", 1902 | "# here roughly 50% of rows without replacement (for stratified sampling, see sampleBy).\n", 1903 | "\n", 1904 | "x = sqlContext.createDataFrame([(\"vinay\",\"deepak\",1246.6641),(\"sunny\",\"pratik\",4512.32),(\"parag\",\"akash\",2555.455)], ['from','to','amt'])\n", 1905 | "y = x.sample(False,0.5)\n", 1906 | "\n", 1907 | "x.show()\n", 1908 | "y.show()\n" 1909 | ] 1910 | }, 1911 | { 1912 | "cell_type": "code", 1913 | "execution_count": 23, 1914 | "metadata": {}, 1915 | "outputs": [ 1916 | { 1917 | "name": "stdout", 1918 | "output_type": "stream", 1919 | "text": [ 1920 | "+-----+------+---------+\n", 1921 | "| from| to| amt|\n", 1922 | "+-----+------+---------+\n", 1923 | "|vinay|deepak|1246.6641|\n", 1924 | "|sunny|pratik| 4512.32|\n", 1925 | "|parag| akash| 2555.455|\n", 1926 | "+-----+------+---------+\n", 1927 | "\n", 1928 | "StructType(List(StructField(from,StringType,true),StructField(to,StringType,true),StructField(amt,DoubleType,true)))\n" 1929 | ] 1930 | } 1931 | ], 1932 | "source": [ 1933 | "#schema \n", 1934 | "x = sqlContext.createDataFrame([(\"vinay\",\"deepak\",1246.6641),(\"sunny\",\"pratik\",4512.32),(\"parag\",\"akash\",2555.455)], ['from','to','amt'])\n", 1935 | "y = x.schema\n", 1936 | "x.show()\n", 1937 | "print(y)" 1938 | ] 1939 | }, 1940 | { 1941 | "cell_type": "code", 1942 | "execution_count": 26, 1943 | "metadata": {}, 1944 | "outputs": [ 1945 | { 1946 | "name": "stdout", 1947 | "output_type": "stream", 1948 | "text": [ 1949 | "+-----+------+---------+\n", 1950 | "| from| to| amt|\n", 1951 | "+-----+------+---------+\n", 1952 | "|vinay|deepak|1246.6641|\n", 1953 | "|sunny|pratik| 4512.32|\n", 1954 | "|parag| akash| 2555.455|\n", 1955 | "+-----+------+---------+\n", 1956 | "\n", 1957 | "+---------------------+------------+\n", 1958 | "|substring(from, 1, 1)|(amt + 1000)|\n", 1959 | "+---------------------+------------+\n", 1960 | "| v| 2246.6641|\n", 1961 | "| s| 5512.32|\n", 1962 | "| p| 3555.455|\n", 1963 | "+---------------------+------------+\n", 1964 | "\n" 1965 | ] 1966 | } 1967 | ], 1968 | "source": [ 1969 | "# selectExpr\n", 1970 | "x = sqlContext.createDataFrame([(\"vinay\",\"deepak\",1246.6641),(\"sunny\",\"pratik\",4512.32),(\"parag\",\"akash\",2555.455)], ['from','to','amt'])\n", 1971 | "y = x.selectExpr(['substr(from,1,1)','amt+1000'])\n", 1972 | "\n", 1973 | "x.show()\n", 1974 | "y.show()" 1975 | ] 1976 | }, 1977 | { 1978 | "cell_type": "code", 1979 | "execution_count": 27, 1980 | "metadata": {}, 1981 | "outputs": [ 1982 | { 1983 | "name": "stdout", 1984 | "output_type": "stream", 1985 | "text": [ 1986 | "+-----+------+---------+\n", 1987 | "| from| to| amt|\n", 1988 | "+-----+------+---------+\n", 1989 | "|vinay|deepak|1246.6641|\n", 1990 | "|sunny|pratik| 4512.32|\n", 1991 | "|parag| 
akash| 2555.455|\n", 1992 | "+-----+------+---------+\n", 1993 | "\n" 1994 | ] 1995 | } 1996 | ], 1997 | "source": [ 1998 | "# show\n", 1999 | "\n", 2000 | "x = sqlContext.createDataFrame([(\"vinay\",\"deepak\",1246.6641),(\"sunny\",\"pratik\",4512.32),(\"parag\",\"akash\",2555.455)], ['from','to','amt'])\n", 2001 | "x.show()" 2002 | ] 2003 | }, 2004 | { 2005 | "cell_type": "code", 2006 | "execution_count": 29, 2007 | "metadata": {}, 2008 | "outputs": [ 2009 | { 2010 | "name": "stdout", 2011 | "output_type": "stream", 2012 | "text": [ 2013 | "+-----+------+---------+\n", 2014 | "| from| to| amt|\n", 2015 | "+-----+------+---------+\n", 2016 | "|vinay|deepak|1246.6641|\n", 2017 | "|sunny|pratik| 4512.32|\n", 2018 | "|parag| akash| 2555.455|\n", 2019 | "+-----+------+---------+\n", 2020 | "\n", 2021 | "+-----+------+---------+\n", 2022 | "| from| to| amt|\n", 2023 | "+-----+------+---------+\n", 2024 | "|vinay|deepak|1246.6641|\n", 2025 | "|parag| akash| 2555.455|\n", 2026 | "|sunny|pratik| 4512.32|\n", 2027 | "+-----+------+---------+\n", 2028 | "\n" 2029 | ] 2030 | } 2031 | ], 2032 | "source": [ 2033 | "# sort\n", 2034 | "\n", 2035 | "x = sqlContext.createDataFrame([(\"vinay\",\"deepak\",1246.6641),(\"sunny\",\"pratik\",4512.32),(\"parag\",\"akash\",2555.455)], ['from','to','amt'])\n", 2036 | "y = x.sort(['amt'])\n", 2037 | "\n", 2038 | "x.show()\n", 2039 | "y.show()" 2040 | ] 2041 | }, 2042 | { 2043 | "cell_type": "code", 2044 | "execution_count": 2, 2045 | "metadata": {}, 2046 | "outputs": [ 2047 | { 2048 | "name": "stdout", 2049 | "output_type": "stream", 2050 | "text": [ 2051 | "+------+-----+---+----+\n", 2052 | "| from| to|amt|p_id|\n", 2053 | "+------+-----+---+----+\n", 2054 | "| Bobby|sunny|0.2| 2|\n", 2055 | "|deepak|parag|0.3| 2|\n", 2056 | "| vinay|Bobby|0.1| 1|\n", 2057 | "+------+-----+---+----+\n", 2058 | "\n", 2059 | "+------+-----+---+----+\n", 2060 | "| from| to|amt|p_id|\n", 2061 | "+------+-----+---+----+\n", 2062 | "|deepak|parag|0.3| 2|\n", 2063 | "| Bobby|sunny|0.2| 2|\n", 2064 | "| vinay|Bobby|0.1| 1|\n", 2065 | "+------+-----+---+----+\n", 2066 | "\n", 2067 | "[[Row(from='Bobby', to='sunny', amt=0.2, p_id=2), Row(from='deepak', to='parag', amt=0.3, p_id=2)], [Row(from='vinay', to='Bobby', amt=0.1, p_id=1)]]\n", 2068 | "\n", 2069 | "\n", 2070 | "[[Row(from='deepak', to='parag', amt=0.3, p_id=2), Row(from='Bobby', to='sunny', amt=0.2, p_id=2)], [Row(from='vinay', to='Bobby', amt=0.1, p_id=1)]]\n" 2071 | ] 2072 | } 2073 | ], 2074 | "source": [ 2075 | "# sortWithinPartitions\n", 2076 | "x = sqlContext.createDataFrame([('vinay',\"Bobby\",0.1,1),(\"Bobby\",\"sunny\",0.2,2),(\"deepak\",\"parag\",0.3,2)], \\\n", 2077 | " ['from','to','amt','p_id']).repartition(2,'p_id')\n", 2078 | "y = x.sortWithinPartitions(['to'])\n", 2079 | "x.show()\n", 2080 | "y.show()\n", 2081 | "print(x.rdd.glom().collect()) # glom() flattens elements on the same partition\n", 2082 | "print(\"\\n\")\n", 2083 | "print(y.rdd.glom().collect())" 2084 | ] 2085 | }, 2086 | { 2087 | "cell_type": "code", 2088 | "execution_count": 7, 2089 | "metadata": {}, 2090 | "outputs": [ 2091 | { 2092 | "name": "stdout", 2093 | "output_type": "stream", 2094 | "text": [ 2095 | "+-----+------+---+-----+\n", 2096 | "| from| to|amt| fees|\n", 2097 | "+-----+------+---+-----+\n", 2098 | "|vinay| Bobby|0.1|0.001|\n", 2099 | "|Bobby| sunny|0.2| 0.02|\n", 2100 | "|sunny|pranav|0.3| 0.02|\n", 2101 | "+-----+------+---+-----+\n", 2102 | "\n", 2103 | "\n", 2104 | "0.8660254037844386\n" 2105 | ] 2106 | } 2107 | ], 2108 | "source": 
[ 2109 | "# Stat :-Returns a \n", 2110 | "# DataFrameStatFunctions for statistic functions.\n", 2111 | "\n", 2112 | "x = sqlContext.createDataFrame([(\"vinay\",\"Bobby\",0.1,0.001),(\"Bobby\",\"sunny\",0.2,0.02),(\"sunny\",\"pranav\",0.3,0.02)], ['from','to','amt','fees'])\n", 2113 | "y = x.stat\n", 2114 | "x.show()\n", 2115 | "print(y)\n", 2116 | "print(y.corr(col1=\"amt\",col2=\"fees\"))\n" 2117 | ] 2118 | }, 2119 | { 2120 | "cell_type": "code", 2121 | "execution_count": 8, 2122 | "metadata": {}, 2123 | "outputs": [ 2124 | { 2125 | "name": "stdout", 2126 | "output_type": "stream", 2127 | "text": [ 2128 | "+-----+------+---+-----+\n", 2129 | "| from| to|amt| fees|\n", 2130 | "+-----+------+---+-----+\n", 2131 | "|vinay| Bobby|0.1|0.001|\n", 2132 | "|Bobby| sunny|0.2| 0.02|\n", 2133 | "|sunny|pranav|0.3| 0.02|\n", 2134 | "+-----+------+---+-----+\n", 2135 | "\n", 2136 | "+-----+------+---+-----+\n", 2137 | "| from| to|amt| fees|\n", 2138 | "+-----+------+---+-----+\n", 2139 | "|vinay| Bobby|0.1|0.001|\n", 2140 | "|Bobby| sunny|0.2| 0.02|\n", 2141 | "|sunny|pranav|0.3| 0.01|\n", 2142 | "+-----+------+---+-----+\n", 2143 | "\n", 2144 | "+-----+------+---+----+\n", 2145 | "| from| to|amt|fees|\n", 2146 | "+-----+------+---+----+\n", 2147 | "|sunny|pranav|0.3|0.02|\n", 2148 | "+-----+------+---+----+\n", 2149 | "\n" 2150 | ] 2151 | } 2152 | ], 2153 | "source": [ 2154 | "# subtract\n", 2155 | "\n", 2156 | "x = sqlContext.createDataFrame([(\"vinay\",\"Bobby\",0.1,0.001),(\"Bobby\",\"sunny\",0.2,0.02),(\"sunny\",\"pranav\",0.3,0.02)], ['from','to','amt','fees'])\n", 2157 | "y = sqlContext.createDataFrame([(\"vinay\",\"Bobby\",0.1,0.001),(\"Bobby\",\"sunny\",0.2,0.02),(\"sunny\",\"pranav\",0.3,0.01)], ['from','to','amt','fees'])\n", 2158 | "\n", 2159 | "z = x.subtract(y)\n", 2160 | "x.show()\n", 2161 | "y.show()\n", 2162 | "z.show()" 2163 | ] 2164 | }, 2165 | { 2166 | "cell_type": "code", 2167 | "execution_count": 9, 2168 | "metadata": {}, 2169 | "outputs": [ 2170 | { 2171 | "name": "stdout", 2172 | "output_type": "stream", 2173 | "text": [ 2174 | "+-----+------+---+-----+\n", 2175 | "| from| to|amt| fees|\n", 2176 | "+-----+------+---+-----+\n", 2177 | "|vinay| Bobby|0.1|0.001|\n", 2178 | "|Bobby| sunny|0.2| 0.02|\n", 2179 | "|sunny|pranav|0.3| 0.02|\n", 2180 | "+-----+------+---+-----+\n", 2181 | "\n", 2182 | "[Row(from='vinay', to='Bobby', amt=0.1, fees=0.001), Row(from='Bobby', to='sunny', amt=0.2, fees=0.02)]\n" 2183 | ] 2184 | } 2185 | ], 2186 | "source": [ 2187 | "x = sqlContext.createDataFrame([(\"vinay\",\"Bobby\",0.1,0.001),(\"Bobby\",\"sunny\",0.2,0.02),(\"sunny\",\"pranav\",0.3,0.02)], ['from','to','amt','fees'])\n", 2188 | "\n", 2189 | "y = x.take(num=2)\n", 2190 | "x.show()\n", 2191 | "print(y)" 2192 | ] 2193 | }, 2194 | { 2195 | "cell_type": "markdown", 2196 | "metadata": {}, 2197 | "source": [ 2198 | "# Conversions " 2199 | ] 2200 | }, 2201 | { 2202 | "cell_type": "code", 2203 | "execution_count": 11, 2204 | "metadata": {}, 2205 | "outputs": [ 2206 | { 2207 | "name": "stdout", 2208 | "output_type": "stream", 2209 | "text": [ 2210 | "+-----+-----+---+\n", 2211 | "| from| to|amt|\n", 2212 | "+-----+-----+---+\n", 2213 | "|Alice| Bob|0.1|\n", 2214 | "| Bob|Carol|0.2|\n", 2215 | "|Carol| Dave|0.3|\n", 2216 | "+-----+-----+---+\n", 2217 | "\n", 2218 | "+------+-----+---+\n", 2219 | "|seller|buyer|amt|\n", 2220 | "+------+-----+---+\n", 2221 | "| Alice| Bob|0.1|\n", 2222 | "| Bob|Carol|0.2|\n", 2223 | "| Carol| Dave|0.3|\n", 2224 | "+------+-----+---+\n", 2225 | "\n" 2226 | ] 2227 | } 
2228 | ], 2229 | "source": [ 2230 | "#toDF\n", 2231 | "\n", 2232 | "x = sqlContext.createDataFrame([('Alice',\"Bob\",0.1),(\"Bob\",\"Carol\",0.2),(\"Carol\",\"Dave\",0.3)], ['from','to','amt'])\n", 2233 | "y = x.toDF(\"seller\",\"buyer\",\"amt\")\n", 2234 | "x.show()\n", 2235 | "y.show()" 2236 | ] 2237 | }, 2238 | { 2239 | "cell_type": "code", 2240 | "execution_count": 13, 2241 | "metadata": {}, 2242 | "outputs": [ 2243 | { 2244 | "name": "stdout", 2245 | "output_type": "stream", 2246 | "text": [ 2247 | "+-----+-----+---+\n", 2248 | "| from| to|amt|\n", 2249 | "+-----+-----+---+\n", 2250 | "|Alice| Bob|0.1|\n", 2251 | "| Bob|Carol|0.2|\n", 2252 | "|Carol|Alice|0.3|\n", 2253 | "+-----+-----+---+\n", 2254 | "\n", 2255 | "MapPartitionsRDD[193] at toJavaRDD at NativeMethodAccessorImpl.java:0\n", 2256 | "\n", 2257 | "\n", 2258 | "['{\"from\":\"Alice\",\"to\":\"Bob\",\"amt\":0.1}', '{\"from\":\"Bob\",\"to\":\"Carol\",\"amt\":0.2}', '{\"from\":\"Carol\",\"to\":\"Alice\",\"amt\":0.3}']\n" 2259 | ] 2260 | } 2261 | ], 2262 | "source": [ 2263 | "# toJSON\n", 2264 | "x = sqlContext.createDataFrame([('Alice',\"Bob\",0.1),(\"Bob\",\"Carol\",0.2),(\"Carol\",\"Alice\",0.3)], ['from','to','amt'])\n", 2265 | "y = x.toJSON()\n", 2266 | "\n", 2267 | "x.show()\n", 2268 | "print(y)\n", 2269 | "print(\"\\n\")\n", 2270 | "print(y.collect())\n" 2271 | ] 2272 | }, 2273 | { 2274 | "cell_type": "code", 2275 | "execution_count": 14, 2276 | "metadata": {}, 2277 | "outputs": [ 2278 | { 2279 | "name": "stdout", 2280 | "output_type": "stream", 2281 | "text": [ 2282 | "+-----+-----+---+\n", 2283 | "| from| to|amt|\n", 2284 | "+-----+-----+---+\n", 2285 | "|Alice| Bob|0.1|\n", 2286 | "| Bob|Carol|0.2|\n", 2287 | "|Carol|Alice|0.3|\n", 2288 | "+-----+-----+---+\n", 2289 | "\n", 2290 | "<class 'pandas.core.frame.DataFrame'>\n" 2291 | ] 2292 | }, 2293 | { 2294 | "data": { 2295 | "text/plain": [ 2296 | "    from     to  amt\n0  Alice    Bob  0.1\n1    Bob  Carol  0.2\n2  Carol  Alice  0.3" 2297 | ] 2298 | }, 2299 | "execution_count": 14, 2300 | "metadata": {}, 2301 | "output_type": "execute_result" 2302 | } 2303 | ], 2304 | "source": [ 2305 | "# toPandas\n", 2306 | "\n", 2307 | "x = sqlContext.createDataFrame([('Alice',\"Bob\",0.1),(\"Bob\",\"Carol\",0.2),(\"Carol\",\"Alice\",0.3)], ['from','to','amt'])\n", 2308 | "y = x.toPandas() # note the call - without the parentheses y would just be the bound method\n", 2309 | "x.show()\n", 2310 | "print(type(y))\n", 2311 | "y" 2312 | ] 2313 | }, 2314 | { 2315 | "cell_type": "code", 2316 | "execution_count": 15, 2317 | "metadata": {}, 2318 | "outputs": [ 2319 | { 2320 | "name": "stdout", 2321 | "output_type": "stream", 2322 | "text": [ 2323 | "+-----+-----+---+\n", 2324 | "| from| to|amt|\n", 2325 | "+-----+-----+---+\n", 2326 | "|Alice| Bob|0.1|\n", 2327 | "| Bob|Carol|0.2|\n", 2328 | "|Carol|Alice|0.3|\n", 2329 | "+-----+-----+---+\n", 2330 | "\n", 2331 | "+------+-----+---+\n", 2332 | "| from| to|amt|\n", 2333 | "+------+-----+---+\n", 2334 | "| sunny| Bob|0.1|\n", 2335 | "| vinay|Carol|0.2|\n", 2336 | "|pranav|Alice|0.3|\n", 2337 | "+------+-----+---+\n", 2338 | "\n", 2339 | "+------+-----+---+\n", 2340 | "| from| to|amt|\n", 2341 | "+------+-----+---+\n", 2342 | "| Alice| Bob|0.1|\n", 2343 | "| Bob|Carol|0.2|\n", 2344 | "| Carol|Alice|0.3|\n", 2345 | "| sunny| Bob|0.1|\n", 2346 | "| vinay|Carol|0.2|\n", 2347 | "|pranav|Alice|0.3|\n", 2348 | "+------+-----+---+\n", 2349 | "\n" 2350 | ] 2351 | } 2352 | ], 2353 | "source": [ 2354 | "# unionAll (alias of union since Spark 2.0)\n", 2355 | "\n", 2356 | "x = sqlContext.createDataFrame([('Alice',\"Bob\",0.1),(\"Bob\",\"Carol\",0.2),(\"Carol\",\"Alice\",0.3)], ['from','to','amt'])\n", 2357 | "y = 
sqlContext.createDataFrame([('sunny',\"Bob\",0.1),(\"vinay\",\"Carol\",0.2),(\"pranav\",\"Alice\",0.3)], ['from','to','amt'])\n", 2358 | "\n", 2359 | "z = x.unionAll(y)\n", 2360 | "\n", 2361 | "x.show()\n", 2362 | "y.show()\n", 2363 | "z.show()\n" 2364 | ] 2365 | }, 2366 | { 2367 | "cell_type": "code", 2368 | "execution_count": 16, 2369 | "metadata": {}, 2370 | "outputs": [ 2371 | { 2372 | "name": "stdout", 2373 | "output_type": "stream", 2374 | "text": [ 2375 | "+-----+-----+---+\n", 2376 | "| from| to|amt|\n", 2377 | "+-----+-----+---+\n", 2378 | "|Alice| Bob|0.1|\n", 2379 | "| Bob|Carol|0.2|\n", 2380 | "|Carol|Alice|0.3|\n", 2381 | "+-----+-----+---+\n", 2382 | "\n", 2383 | "True\n", 2384 | "False\n" 2385 | ] 2386 | } 2387 | ], 2388 | "source": [ 2389 | "# unpersist\n", 2390 | "\n", 2391 | "x = sqlContext.createDataFrame([('Alice',\"Bob\",0.1),(\"Bob\",\"Carol\",0.2),(\"Carol\",\"Alice\",0.3)], ['from','to','amt'])\n", 2392 | "x.cache()\n", 2393 | "x.count()\n", 2394 | "x.show()\n", 2395 | "\n", 2396 | "print(x.is_cached)\n", 2397 | "x.unpersist()\n", 2398 | "print(x.is_cached)" 2399 | ] 2400 | }, 2401 | { 2402 | "cell_type": "code", 2403 | "execution_count": 17, 2404 | "metadata": {}, 2405 | "outputs": [ 2406 | { 2407 | "name": "stdout", 2408 | "output_type": "stream", 2409 | "text": [ 2410 | "+-----+-----+---+\n", 2411 | "| from| to|amt|\n", 2412 | "+-----+-----+---+\n", 2413 | "|Alice| Bob|0.1|\n", 2414 | "| Bob|Carol|0.2|\n", 2415 | "|Carol|Alice|0.3|\n", 2416 | "+-----+-----+---+\n", 2417 | "\n", 2418 | "+-----+-----+---+\n", 2419 | "| from| to|amt|\n", 2420 | "+-----+-----+---+\n", 2421 | "|Carol|Alice|0.3|\n", 2422 | "+-----+-----+---+\n", 2423 | "\n" 2424 | ] 2425 | } 2426 | ], 2427 | "source": [ 2428 | "# where \n", 2429 | "x = sqlContext.createDataFrame([('Alice',\"Bob\",0.1),(\"Bob\",\"Carol\",0.2),(\"Carol\",\"Alice\",0.3)], ['from','to','amt'])\n", 2430 | "y = x.where(\"amt > 0.2\")\n", 2431 | "\n", 2432 | "x.show()\n", 2433 | "y.show()" 2434 | ] 2435 | }, 2436 | { 2437 | "cell_type": "code", 2438 | "execution_count": 18, 2439 | "metadata": {}, 2440 | "outputs": [ 2441 | { 2442 | "name": "stdout", 2443 | "output_type": "stream", 2444 | "text": [ 2445 | "+-----+-----+---+\n", 2446 | "| from| to|amt|\n", 2447 | "+-----+-----+---+\n", 2448 | "|Alice| Bob|0.1|\n", 2449 | "| Bob|Carol|0.2|\n", 2450 | "|Carol|Alice|0.3|\n", 2451 | "+-----+-----+---+\n", 2452 | "\n", 2453 | "+-----+-----+---+----+\n", 2454 | "| from| to|amt|conf|\n", 2455 | "+-----+-----+---+----+\n", 2456 | "|Alice| Bob|0.1|true|\n", 2457 | "| Bob|Carol|0.2|true|\n", 2458 | "|Carol|Alice|0.3|true|\n", 2459 | "+-----+-----+---+----+\n", 2460 | "\n" 2461 | ] 2462 | } 2463 | ], 2464 | "source": [ 2465 | "# withColumn\n", 2466 | "\n", 2467 | "x = sqlContext.createDataFrame([('Alice',\"Bob\",0.1),(\"Bob\",\"Carol\",0.2),(\"Carol\",\"Alice\",0.3)], ['from','to','amt'])\n", 2468 | "y = x.withColumn('conf',x.amt.isNotNull())\n", 2469 | "\n", 2470 | "x.show()\n", 2471 | "y.show()" 2472 | ] 2473 | }, 2474 | { 2475 | "cell_type": "code", 2476 | "execution_count": 19, 2477 | "metadata": {}, 2478 | "outputs": [ 2479 | { 2480 | "name": "stdout", 2481 | "output_type": "stream", 2482 | "text": [ 2483 | "+-----+-----+---+\n", 2484 | "| from| to|amt|\n", 2485 | "+-----+-----+---+\n", 2486 | "|Alice| Bob|0.1|\n", 2487 | "| Bob|Carol|0.2|\n", 2488 | "|Carol| Dave|0.3|\n", 2489 | "+-----+-----+---+\n", 2490 | "\n", 2491 | "+-----+-----+------+\n", 2492 | "| from| to|amount|\n", 2493 | "+-----+-----+------+\n", 2494 | "|Alice| Bob| 
0.1|\n", 2495 | "| Bob|Carol| 0.2|\n", 2496 | "|Carol| Dave| 0.3|\n", 2497 | "+-----+-----+------+\n", 2498 | "\n" 2499 | ] 2500 | } 2501 | ], 2502 | "source": [ 2503 | "# withColumnRenamed\n", 2504 | "x = sqlContext.createDataFrame([('Alice',\"Bob\",0.1),(\"Bob\",\"Carol\",0.2),(\"Carol\",\"Dave\",0.3)], ['from','to','amt'])\n", 2505 | "y = x.withColumnRenamed('amt','amount')\n", 2506 | "x.show()\n", 2507 | "y.show()" 2508 | ] 2509 | }, 2510 | { 2511 | "cell_type": "code", 2512 | "execution_count": 22, 2513 | "metadata": {}, 2514 | "outputs": [ 2515 | { 2516 | "name": "stdout", 2517 | "output_type": "stream", 2518 | "text": [ 2519 | "+-----+-----+---+\n", 2520 | "| from| to|amt|\n", 2521 | "+-----+-----+---+\n", 2522 | "|Alice| Bob|0.1|\n", 2523 | "| Bob|Carol|0.2|\n", 2524 | "|Carol| Dave|0.3|\n", 2525 | "+-----+-----+---+\n", 2526 | "\n", 2527 | "+---+-----+-----+\n", 2528 | "|amt| from| to|\n", 2529 | "+---+-----+-----+\n", 2530 | "|0.3|Carol| Dave|\n", 2531 | "|0.1|Alice| Bob|\n", 2532 | "|0.2| Bob|Carol|\n", 2533 | "+---+-----+-----+\n", 2534 | "\n" 2535 | ] 2536 | } 2537 | ], 2538 | "source": [ 2539 | "# write\n", 2540 | "import json\n", 2541 | "x = sqlContext.createDataFrame([('Alice',\"Bob\",0.1),(\"Bob\",\"Carol\",0.2),(\"Carol\",\"Dave\",0.3)], ['from','to','amt'])\n", 2542 | "y = x.write.mode('overwrite').json('./dataframeWriteExample.json')\n", 2543 | "x.show()\n", 2544 | "\n", 2545 | "\n", 2546 | "# Read the DF back in from file\n", 2547 | "sqlContext.read.json('./dataframeWriteExample.json').show()" 2548 | ] 2549 | }, 2550 | { 2551 | "cell_type": "code", 2552 | "execution_count": null, 2553 | "metadata": {}, 2554 | "outputs": [], 2555 | "source": [] 2556 | } 2557 | ], 2558 | "metadata": { 2559 | "kernelspec": { 2560 | "display_name": "Python 3", 2561 | "language": "python", 2562 | "name": "python3" 2563 | }, 2564 | "language_info": { 2565 | "codemirror_mode": { 2566 | "name": "ipython", 2567 | "version": 3 2568 | }, 2569 | "file_extension": ".py", 2570 | "mimetype": "text/x-python", 2571 | "name": "python", 2572 | "nbconvert_exporter": "python", 2573 | "pygments_lexer": "ipython3", 2574 | "version": "3.6.5" 2575 | } 2576 | }, 2577 | "nbformat": 4, 2578 | "nbformat_minor": 2 2579 | } 2580 | -------------------------------------------------------------------------------- /SPARK/RDD/.ipynb_checkpoints/Spark_RDD_retaiDB_2-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 2 6 | } 7 | -------------------------------------------------------------------------------- /SPARK/RDD/SPARK_RDD.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# SPARK RDD PROJECT ON \n", 8 | "### (Retail_db)" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": null, 14 | "metadata": {}, 15 | "outputs": [], 16 | "source": [ 17 | "\n", 18 | "#Get total count of records\n", 19 | "\n", 20 | "ordersRDD = sc.textFile(\"orders\")" 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "metadata": {}, 26 | "source": [ 27 | "ordersRDD.count()" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 8, 33 | "metadata": {}, 34 | "outputs": [ 35 | { 36 | "data": { 37 | "text/plain": [ 38 | "34322619.9300295" 39 | ] 40 | }, 41 | "execution_count": 8, 42 | "metadata": {}, 43 | "output_type": "execute_result" 44 | } 45 | 
], 46 | "source": [ 47 | "# Total revenue: sum of the order_item_subtotal field (index 4)\n", "orderitemsRDD = sc.textFile(\"o_items\")\n", 48 | "map1 = orderitemsRDD.map(lambda a: float(a.split(\",\")[4]))\n", 49 | "map1.reduce(lambda rev1, rev2: rev1 + rev2)" 50 | ] 51 | }, 52 | { 53 | "cell_type": "markdown", 54 | "metadata": {}, 55 | "source": [ 56 | "## Get the total revenue" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": 18, 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "orditmsRDD = sc.textFile(\"o_items\")" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 22, 71 | "metadata": {}, 72 | "outputs": [], 73 | "source": [ 74 | "totalrevenue = orditmsRDD.map(lambda rec: float(rec.split(\",\")[4])).reduce(lambda acc, val: acc + val)" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": 23, 80 | "metadata": {}, 81 | "outputs": [ 82 | { 83 | "data": { 84 | "text/plain": [ 85 | "34322619.9300295" 86 | ] 87 | }, 88 | "execution_count": 23, 89 | "metadata": {}, 90 | "output_type": "execute_result" 91 | } 92 | ], 93 | "source": [ 94 | "totalrevenue" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": 24, 100 | "metadata": {}, 101 | "outputs": [ 102 | { 103 | "data": { 104 | "text/plain": [ 105 | "['1,1,957,1,299.98,299.98',\n", 106 | " '2,2,1073,1,199.99,199.99',\n", 107 | " '3,2,502,5,250.0,50.0',\n", 108 | " '4,2,403,1,129.99,129.99',\n", 109 | " '5,4,897,2,49.98,24.99',\n", 110 | " '6,4,365,5,299.95,59.99',\n", 111 | " '7,4,502,3,150.0,50.0',\n", 112 | " '8,4,1014,4,199.92,49.98',\n", 113 | " '9,5,957,1,299.98,299.98',\n", 114 | " '10,5,365,5,299.95,59.99']" 115 | ] 116 | }, 117 | "execution_count": 24, 118 | "metadata": {}, 119 | "output_type": "execute_result" 120 | } 121 | ], 122 | "source": [ 123 | "orditmsRDD.take(10)" 124 | ] 125 | }, 126 | { 127 | "cell_type": "markdown", 128 | "metadata": {}, 129 | "source": [ 130 | "## Get the total number of distinct orders" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": 32, 136 | "metadata": {}, 137 | "outputs": [], 138 | "source": [ 139 | "total_ord = orditmsRDD.map(lambda rec: rec.split(\",\")[1])" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": 33, 145 | "metadata": {}, 146 | "outputs": [ 147 | { 148 | "data": { 149 | "text/plain": [ 150 | "['1', '2', '2', '2', '4']" 151 | ] 152 | }, 153 | "execution_count": 33, 154 | "metadata": {}, 155 | "output_type": "execute_result" 156 | } 157 | ], 158 | "source": [ 159 | "total_ord.take(5)" 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": 34, 165 | "metadata": {}, 166 | "outputs": [ 167 | { 168 | "data": { 169 | "text/plain": [ 170 | "57431" 171 | ] 172 | }, 173 | "execution_count": 34, 174 | "metadata": {}, 175 | "output_type": "execute_result" 176 | } 177 | ], 178 | "source": [ 179 | "total_ord.distinct().count()" 180 | ] 181 | }, 182 | { 183 | "cell_type": "markdown", 184 | "metadata": {}, 185 | "source": [ 186 | "## countByKey()" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": 30, 192 | "metadata": {}, 193 | "outputs": [], 194 | "source": [ 195 | "ordRDD = sc.textFile(\"orders\")\n", 196 | "ordmap = ordRDD.map(lambda rec: (rec.split(\",\")[3],0)) \n", 197 | "\n", 198 | "#In place of zero, any number can be used. Even 1 can be used!\n",
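"\n", "# countByKey() is an action: it returns a plain Python dict of\n", "# {key: number of records}, ignoring the values entirely. A hypothetical\n", "# variant with None as the value would give exactly the same counts:\n", "# ordRDD.map(lambda rec: (rec.split(\",\")[3], None)).countByKey()"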
199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": 31, 204 | "metadata": {}, 205 | "outputs": [ 206 | { 207 | "data": { 208 | "text/plain": [ 209 | "[('CLOSED', 0),\n", 210 | " ('PENDING_PAYMENT', 0),\n", 211 | " ('COMPLETE', 0),\n", 212 | " ('CLOSED', 0),\n", 213 | " ('COMPLETE', 0)]" 214 | ] 215 | }, 216 | "execution_count": 31, 217 | "metadata": {}, 218 | "output_type": "execute_result" 219 | } 220 | ], 221 | "source": [ 222 | "ordmap.take(5)\n", 223 | "\n", 224 | "# STRUCTURE OF ORDERS :-\n", 225 | "\n", 226 | "#'1, 2013-07-25 00:00:00.0, 11599, CLOSED'" 227 | ] 228 | }, 229 | { 230 | "cell_type": "code", 231 | "execution_count": 42, 232 | "metadata": {}, 233 | "outputs": [ 234 | { 235 | "data": { 236 | "text/plain": [ 237 | "defaultdict(int,\n", 238 | " {'CLOSED': 7556,\n", 239 | " 'PENDING_PAYMENT': 15030,\n", 240 | " 'COMPLETE': 22899,\n", 241 | " 'PROCESSING': 8275,\n", 242 | " 'PAYMENT_REVIEW': 729,\n", 243 | " 'PENDING': 7610,\n", 244 | " 'ON_HOLD': 3798,\n", 245 | " 'CANCELED': 1428,\n", 246 | " 'SUSPECTED_FRAUD': 1558})" 247 | ] 248 | }, 249 | "execution_count": 42, 250 | "metadata": {}, 251 | "output_type": "execute_result" 252 | } 253 | ], 254 | "source": [ 255 | "ordmap.countByKey()" 256 | ] 257 | }, 258 | { 259 | "cell_type": "markdown", 260 | "metadata": {}, 261 | "source": [ 262 | "#### If items() is not used, iterating over the result gives only the keys. Using items() gives the total number of records for each order status." 263 | ] 264 | }, 265 | { 266 | "cell_type": "code", 267 | "execution_count": 43, 268 | "metadata": {}, 269 | "outputs": [ 270 | { 271 | "name": "stdout", 272 | "output_type": "stream", 273 | "text": [ 274 | "('CLOSED', 7556)\n", 275 | "('PENDING_PAYMENT', 15030)\n", 276 | "('COMPLETE', 22899)\n", 277 | "('PROCESSING', 8275)\n", 278 | "('PAYMENT_REVIEW', 729)\n", 279 | "('PENDING', 7610)\n", 280 | "('ON_HOLD', 3798)\n", 281 | "('CANCELED', 1428)\n", 282 | "('SUSPECTED_FRAUD', 1558)\n" 283 | ] 284 | } 285 | ], 286 | "source": [ 287 | "for i in ordmap.countByKey().items():\n", 288 | " print(i)" 289 | ] 290 | }, 291 | { 292 | "cell_type": "markdown", 293 | "metadata": {}, 294 | "source": [ 295 | "##### 1) groupByKey() - \n", 296 | "input data for aggregation is very large and after aggregation the number of records is more or less the same (it does not use a combiner, so all values are shuffled)\n", 297 | "\n", 298 | "##### 2) reduceByKey() - \n", 299 | "input data for aggregation is very large and after aggregation the number of records is small (it uses a combiner)\n", 300 | "\n", 301 | "##### 3) aggregateByKey() - \n", "like reduceByKey(), but it takes a zero value, and the output value type may differ from the input value type\n", 302 | "\n", 303 | "##### 4) combineByKey() -\n", 304 | "input data for aggregation is very large and after aggregation the number of records is small\n" 305 | ] 306 | }, 307 | { 308 | "cell_type": "code", 309 | "execution_count": 54, 310 | "metadata": {}, 311 | "outputs": [ 312 | { 313 | "data": { 314 | "text/plain": [ 315 | "[('CLOSED', 1),\n", 316 | " ('PENDING_PAYMENT', 1),\n", 317 | " ('COMPLETE', 1),\n", 318 | " ('CLOSED', 1),\n", 319 | " ('COMPLETE', 1)]" 320 | ] 321 | }, 322 | "execution_count": 54, 323 | "metadata": {}, 324 | "output_type": "execute_result" 325 | } 326 | ], 327 | "source": [ 328 | "# groupByKey:-\n", 329 | "\n", 330 | "ordmap = ordRDD.map(lambda rec: (rec.split(\",\")[3],1))\n", 331 | "\n", 332 | "ordmap.take(5)" 333 | ] 334 | }, 335 | { 336 | "cell_type": "code", 337 | "execution_count": 57, 338 | "metadata": {}, 339 | "outputs": [], 340 | "source": [ 341 | "ordbystat = ordmap.groupByKey().map(lambda rec: (rec[0], sum(rec[1])))\n",
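"\n", "# groupByKey() yields (key, iterable-of-1s) pairs and sum(rec[1]) adds the 1s\n", "# up into a per-key record count. A shorter equivalent sketch (same ordmap\n", "# assumed): ordbystat = ordmap.groupByKey().mapValues(sum)"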
343 | ] 344 | }, 345 | { 346 | "cell_type": "code", 347 | "execution_count": 58, 348 | "metadata": {}, 349 | "outputs": [ 350 | { 351 | "data": { 352 | "text/plain": [ 353 | "[('CLOSED', 7556),\n", 354 | " ('CANCELED', 1428),\n", 355 | " ('PENDING_PAYMENT', 15030),\n", 356 | " ('COMPLETE', 22899),\n", 357 | " ('PROCESSING', 8275)]" 358 | ] 359 | }, 360 | "execution_count": 58, 361 | "metadata": {}, 362 | "output_type": "execute_result" 363 | } 364 | ], 365 | "source": [ 366 | "ordbystat.take(5)" 367 | ] 368 | }, 369 | { 370 | "cell_type": "code", 371 | "execution_count": 59, 372 | "metadata": {}, 373 | "outputs": [ 374 | { 375 | "data": { 376 | "text/plain": [ 377 | "[('CLOSED', 7556),\n", 378 | " ('CANCELED', 1428),\n", 379 | " ('PENDING_PAYMENT', 15030),\n", 380 | " ('COMPLETE', 22899),\n", 381 | " ('PROCESSING', 8275),\n", 382 | " ('PAYMENT_REVIEW', 729),\n", 383 | " ('PENDING', 7610),\n", 384 | " ('ON_HOLD', 3798),\n", 385 | " ('SUSPECTED_FRAUD', 1558)]" 386 | ] 387 | }, 388 | "execution_count": 59, 389 | "metadata": {}, 390 | "output_type": "execute_result" 391 | } 392 | ], 393 | "source": [ 394 | "# reduceByKey():-\n", 395 | "# its function takes 2 parameters, an accumulator and a value, so an\n", 396 | "# aggregation like counting the sum\n", 397 | "# is simpler here compared to groupByKey().\n", 398 | "\n", 399 | "# It also uses a combiner on the map side.\n", 400 | "\n", 401 | "\n", 402 | "ordbystat = ordmap.reduceByKey(lambda acc, val: acc + val)\n", 403 | "ordbystat.take(10)\n", 404 | "\n" 405 | ] 406 | }, 407 | { 408 | "cell_type": "code", 409 | "execution_count": 60, 410 | "metadata": {}, 411 | "outputs": [ 412 | { 413 | "data": { 414 | "text/plain": [ 415 | "[('CLOSED', '1,2013-07-25 00:00:00.0,11599,CLOSED'),\n", 416 | " ('PENDING_PAYMENT', '2,2013-07-25 00:00:00.0,256,PENDING_PAYMENT'),\n", 417 | " ('COMPLETE', '3,2013-07-25 00:00:00.0,12111,COMPLETE'),\n", 418 | " ('CLOSED', '4,2013-07-25 00:00:00.0,8827,CLOSED'),\n", 419 | " ('COMPLETE', '5,2013-07-25 00:00:00.0,11318,COMPLETE'),\n", 420 | " ('COMPLETE', '6,2013-07-25 00:00:00.0,7130,COMPLETE'),\n", 421 | " ('COMPLETE', '7,2013-07-25 00:00:00.0,4530,COMPLETE'),\n", 422 | " ('PROCESSING', '8,2013-07-25 00:00:00.0,2911,PROCESSING'),\n", 423 | " ('PENDING_PAYMENT', '9,2013-07-25 00:00:00.0,5657,PENDING_PAYMENT'),\n", 424 | " ('PENDING_PAYMENT', '10,2013-07-25 00:00:00.0,5648,PENDING_PAYMENT')]" 425 | ] 426 | }, 427 | "execution_count": 60, 428 | "metadata": {}, 429 | "output_type": "execute_result" 430 | } 431 | ], 432 | "source": [ 433 | "# preparing input for aggregateByKey(): map each record to (status, full record)\n", 434 | "\n", 435 | "ordmap = ordRDD.map(lambda rec: (rec.split(\",\")[3], rec))\n", 436 | "ordmap.take(10)\n" 437 | ] 438 | }, 439 | { 440 | "cell_type": "markdown", 441 | "metadata": {}, 442 | "source": [ 443 | "### a) countByKey()" 444 | ] 445 | }, 446 | { 447 | "cell_type": "code", 448 | "execution_count": 61, 449 | "metadata": {}, 450 | "outputs": [ 451 | { 452 | "data": { 453 | "text/plain": [ 454 | "PythonRDD[82] at RDD at PythonRDD.scala:49" 455 | ] 456 | }, 457 | "execution_count": 61, 458 | "metadata": {}, 459 | "output_type": "execute_result" 460 | } 461 | ], 462 | "source": [ 463 | "\n", 464 | "ordRDD = sc.textFile(\"orders\")\n", 465 | "ordmap = ordRDD.map(lambda a:(a.split(\",\")[3],1)) \n", 466 | "# the last expression is not assigned; the cell just returns this (lazy) RDD\n", "ordRDD.map(lambda a:(a.split(\",\")[3],2)) \n" 467 | ] 468 | }, 469 | { 470 | "cell_type": "code", 471 | "execution_count": 63, 472 | "metadata": {}, 473 | "outputs": [ 474 | { 475 | "name": "stdout", 476 | "output_type": "stream", 477 | "text": 
[ 478 | "defaultdict(<class 'int'>, {'CLOSED': 7556, 'PENDING_PAYMENT': 15030, 'COMPLETE': 22899, 'PROCESSING': 8275, 'PAYMENT_REVIEW': 729, 'PENDING': 7610, 'ON_HOLD': 3798, 'CANCELED': 1428, 'SUSPECTED_FRAUD': 1558})\n" 479 | ] 480 | } 481 | ], 482 | "source": [ 483 | "ordcnt = ordmap.countByKey()\n", 484 | "print(ordcnt)\n", 485 | "\n" 486 | ] 487 | }, 488 | { 489 | "cell_type": "markdown", 490 | "metadata": {}, 491 | "source": [ 492 | "#### The output from countByKey() is a dictionary; the entries need to be extracted with items()." 493 | ] 494 | }, 495 | { 496 | "cell_type": "code", 497 | "execution_count": 64, 498 | "metadata": {}, 499 | "outputs": [], 500 | "source": [ 501 | "ordcnt = ordmap.countByKey().items()" 502 | ] 503 | }, 504 | { 505 | "cell_type": "code", 506 | "execution_count": 65, 507 | "metadata": {}, 508 | "outputs": [ 509 | { 510 | "name": "stdout", 511 | "output_type": "stream", 512 | "text": [ 513 | "dict_items([('CLOSED', 7556), ('PENDING_PAYMENT', 15030), ('COMPLETE', 22899), ('PROCESSING', 8275), ('PAYMENT_REVIEW', 729), ('PENDING', 7610), ('ON_HOLD', 3798), ('CANCELED', 1428), ('SUSPECTED_FRAUD', 1558)])\n" 514 | ] 515 | } 516 | ], 517 | "source": [ 518 | "print(ordcnt)" 519 | ] 520 | }, 521 | { 522 | "cell_type": "code", 523 | "execution_count": 66, 524 | "metadata": {}, 525 | "outputs": [ 526 | { 527 | "name": "stdout", 528 | "output_type": "stream", 529 | "text": [ 530 | "('CLOSED', 7556)\n", 531 | "('PENDING_PAYMENT', 15030)\n", 532 | "('COMPLETE', 22899)\n", 533 | "('PROCESSING', 8275)\n", 534 | "('PAYMENT_REVIEW', 729)\n", 535 | "('PENDING', 7610)\n", 536 | "('ON_HOLD', 3798)\n", 537 | "('CANCELED', 1428)\n", 538 | "('SUSPECTED_FRAUD', 1558)\n" 539 | ] 540 | } 541 | ], 542 | "source": [ 543 | "for i in ordmap.countByKey().items():print(i)" 544 | ] 545 | }, 546 | { 547 | "cell_type": "markdown", 548 | "metadata": {}, 549 | "source": [ 550 | "### b) groupByKey()\n" 551 | ] 552 | }, 553 | { 554 | "cell_type": "code", 555 | "execution_count": 67, 556 | "metadata": {}, 557 | "outputs": [], 558 | "source": [ 559 | "ordRDD = sc.textFile(\"orders\")\n", 560 | "ordmap = ordRDD.map(lambda a:(a.split(\",\")[3],1))\n", 561 | "ordcnt = ordmap.groupByKey().map(lambda t: (t[0], sum(t[1])))" 562 | ] 563 | }, 564 | { 565 | "cell_type": "code", 566 | "execution_count": 68, 567 | "metadata": {}, 568 | "outputs": [ 569 | { 570 | "data": { 571 | "text/plain": [ 572 | "[('CLOSED', 7556),\n", 573 | " ('CANCELED', 1428),\n", 574 | " ('PENDING_PAYMENT', 15030),\n", 575 | " ('COMPLETE', 22899),\n", 576 | " ('PROCESSING', 8275)]" 577 | ] 578 | }, 579 | "execution_count": 68, 580 | "metadata": {}, 581 | "output_type": "execute_result" 582 | } 583 | ], 584 | "source": [ 585 | "ordcnt.take(5)" 586 | ] 587 | }, 588 | { 589 | "cell_type": "code", 590 | "execution_count": 69, 591 | "metadata": {}, 592 | "outputs": [ 593 | { 594 | "name": "stdout", 595 | "output_type": "stream", 596 | "text": [ 597 | "('CLOSED', 7556)\n", 598 | "('CANCELED', 1428)\n", 599 | "('PENDING_PAYMENT', 15030)\n", 600 | "('COMPLETE', 22899)\n", 601 | "('PROCESSING', 8275)\n", 602 | "('PAYMENT_REVIEW', 729)\n", 603 | "('PENDING', 7610)\n", 604 | "('ON_HOLD', 3798)\n", 605 | "('SUSPECTED_FRAUD', 1558)\n" 606 | ] 607 | } 608 | ], 609 | "source": [ 610 | "for i in ordcnt.collect():print(i)\n" 611 | ] 612 | }, 613 | { 614 | "cell_type": "markdown", 615 | "metadata": {}, 616 | "source": [ 617 | "### c) reduceByKey()\n" 618 | ] 619 | }, 620 | { 621 | "cell_type": "code", 622 | "execution_count": 70, 623 | "metadata": {}, 624 | "outputs": [], 625 | 
"source": [ 626 | "ordRDD = sc.textFile(\"orders\")\n", 627 | "ordmap = ordRDD.map(lambda a:(a.split(\",\")[3],1)) \n", 628 | "ordcnt = ordmap.reduceByKey(lambda a,b: a+b)" 629 | ] 630 | }, 631 | { 632 | "cell_type": "code", 633 | "execution_count": 71, 634 | "metadata": {}, 635 | "outputs": [ 636 | { 637 | "name": "stdout", 638 | "output_type": "stream", 639 | "text": [ 640 | "('CLOSED', 7556)\n", 641 | "('CANCELED', 1428)\n", 642 | "('PENDING_PAYMENT', 15030)\n", 643 | "('COMPLETE', 22899)\n", 644 | "('PROCESSING', 8275)\n", 645 | "('PAYMENT_REVIEW', 729)\n", 646 | "('PENDING', 7610)\n", 647 | "('ON_HOLD', 3798)\n", 648 | "('SUSPECTED_FRAUD', 1558)\n" 649 | ] 650 | } 651 | ], 652 | "source": [ 653 | "for i in ordcnt.collect():print(i)\n" 654 | ] 655 | }, 656 | { 657 | "cell_type": "markdown", 658 | "metadata": {}, 659 | "source": [ 660 | "### d) aggregateByKey()\n" 661 | ] 662 | }, 663 | { 664 | "cell_type": "code", 665 | "execution_count": 72, 666 | "metadata": {}, 667 | "outputs": [], 668 | "source": [ 669 | "ordRDD = sc.textFile(\"orders\")\n", 670 | "ordmap = ordRDD.map(lambda a: (a.split(\",\")[3], a))\n", 671 | "ordcnt = ordmap.aggregateByKey(0, lambda acc, val: acc+1, lambda acc,val:acc+val)\n" 672 | ] 673 | }, 674 | { 675 | "cell_type": "code", 676 | "execution_count": 73, 677 | "metadata": {}, 678 | "outputs": [ 679 | { 680 | "name": "stdout", 681 | "output_type": "stream", 682 | "text": [ 683 | "('CLOSED', 7556)\n", 684 | "('CANCELED', 1428)\n", 685 | "('PENDING_PAYMENT', 15030)\n", 686 | "('COMPLETE', 22899)\n", 687 | "('PROCESSING', 8275)\n", 688 | "('PAYMENT_REVIEW', 729)\n", 689 | "('PENDING', 7610)\n", 690 | "('ON_HOLD', 3798)\n", 691 | "('SUSPECTED_FRAUD', 1558)\n" 692 | ] 693 | } 694 | ], 695 | "source": [ 696 | "for i in ordcnt.collect():print(i)\n" 697 | ] 698 | }, 699 | { 700 | "cell_type": "markdown", 701 | "metadata": {}, 702 | "source": [ 703 | "# Number of orders by Order date and Order status" 704 | ] 705 | }, 706 | { 707 | "cell_type": "code", 708 | "execution_count": 81, 709 | "metadata": {}, 710 | "outputs": [], 711 | "source": [ 712 | "# The table to be referred is orders. 
\n", 713 | "ordRDD = sc.textFile(\"orders\")" 714 | ] 715 | }, 716 | { 717 | "cell_type": "code", 718 | "execution_count": 82, 719 | "metadata": {}, 720 | "outputs": [], 721 | "source": [ 722 | "ordmap = ordRDD.map(lambda a: ((a.split(\",\")[1], a.split(\",\")[3]),1))\n" 723 | ] 724 | }, 725 | { 726 | "cell_type": "code", 727 | "execution_count": 83, 728 | "metadata": {}, 729 | "outputs": [], 730 | "source": [ 731 | "ordcnt = ordmap.reduceByKey(lambda a,b: a + b)" 732 | ] 733 | }, 734 | { 735 | "cell_type": "code", 736 | "execution_count": 85, 737 | "metadata": {}, 738 | "outputs": [ 739 | { 740 | "data": { 741 | "text/plain": [ 742 | "[(('2013-07-25 00:00:00.0', 'PENDING_PAYMENT'), 41),\n", 743 | " (('2013-07-25 00:00:00.0', 'COMPLETE'), 42),\n", 744 | " (('2013-07-25 00:00:00.0', 'PROCESSING'), 16),\n", 745 | " (('2013-07-25 00:00:00.0', 'PAYMENT_REVIEW'), 3),\n", 746 | " (('2013-07-25 00:00:00.0', 'PENDING'), 13)]" 747 | ] 748 | }, 749 | "execution_count": 85, 750 | "metadata": {}, 751 | "output_type": "execute_result" 752 | } 753 | ], 754 | "source": [ 755 | "ordcnt.take(5)" 756 | ] 757 | }, 758 | { 759 | "cell_type": "code", 760 | "execution_count": 86, 761 | "metadata": {}, 762 | "outputs": [ 763 | { 764 | "data": { 765 | "text/plain": [ 766 | "[(('PENDING_PAYMENT', '2013-07-25 00:00:00.0'), 41),\n", 767 | " (('COMPLETE', '2013-07-25 00:00:00.0'), 42),\n", 768 | " (('PROCESSING', '2013-07-25 00:00:00.0'), 16),\n", 769 | " (('PAYMENT_REVIEW', '2013-07-25 00:00:00.0'), 3),\n", 770 | " (('PENDING', '2013-07-25 00:00:00.0'), 13)]" 771 | ] 772 | }, 773 | "execution_count": 86, 774 | "metadata": {}, 775 | "output_type": "execute_result" 776 | } 777 | ], 778 | "source": [ 779 | "ordmap = ordRDD.map(lambda a:((a.split(\",\")[3], a.split(\",\")[1]),1))\n", 780 | "ordcnt = ordmap.reduceByKey(lambda a,b:a+b)\n", 781 | "ordcnt.take(5)" 782 | ] 783 | }, 784 | { 785 | "cell_type": "markdown", 786 | "metadata": {}, 787 | "source": [ 788 | "#### The group by keys is included under the variable 'K' in (K,V). 
This can be extended to any number of grouping fields. As the task involves aggregation, groupByKey() is best avoided.\n", 789 | "\n", 790 | "#### But we can obtain the same result using countByKey() and groupByKey(). The best option is reduceByKey(), as the logic for the combiner and the reducer is the same.\n", 791 | "\n" 792 | ] 793 | }, 794 | { 795 | "cell_type": "code", 796 | "execution_count": 92, 797 | "metadata": {}, 798 | "outputs": [ 799 | { 800 | "data": { 801 | "text/plain": [ 802 | "[(('2013-07-25 00:00:00.0', 'CLOSED'), 1),\n", 803 | " (('2013-07-25 00:00:00.0', 'PENDING_PAYMENT'), 1),\n", 804 | " (('2013-07-25 00:00:00.0', 'COMPLETE'), 1),\n", 805 | " (('2013-07-25 00:00:00.0', 'CLOSED'), 1),\n", 806 | " (('2013-07-25 00:00:00.0', 'COMPLETE'), 1)]" 807 | ] 808 | }, 809 | "execution_count": 92, 810 | "metadata": {}, 811 | "output_type": "execute_result" 812 | } 813 | ], 814 | "source": [ 815 | "ordRDD = sc.textFile(\"orders\")\n", 816 | "ordmap = ordRDD.map(lambda a: ((a.split(\",\")[1], a.split(\",\")[3]),1))\n", 817 | "ordmap.take(5)" 818 | ] 819 | }, 820 | { 821 | "cell_type": "code", 822 | "execution_count": 93, 823 | "metadata": {}, 824 | "outputs": [ 825 | { 826 | "data": { 827 | "text/plain": [ 828 | "[(('2013-07-25 00:00:00.0', 'PENDING_PAYMENT'), 41),\n", 829 | " (('2013-07-25 00:00:00.0', 'COMPLETE'), 42),\n", 830 | " (('2013-07-25 00:00:00.0', 'PROCESSING'), 16),\n", 831 | " (('2013-07-25 00:00:00.0', 'PAYMENT_REVIEW'), 3),\n", 832 | " (('2013-07-25 00:00:00.0', 'PENDING'), 13)]" 833 | ] 834 | }, 835 | "execution_count": 93, 836 | "metadata": {}, 837 | "output_type": "execute_result" 838 | } 839 | ], 840 | "source": [ 841 | "ordRDD = sc.textFile(\"orders\")\n", 842 | "ordmap = ordRDD.map(lambda a: ((a.split(\",\")[1], a.split(\",\")[3]),1))\n", 843 | "ordcnt = ordmap.groupByKey().map(lambda t: (t[0], sum(t[1])))\n", 844 | "ordcnt.take(5)" 845 | ] 846 | }, 847 | { 848 | "cell_type": "code", 849 | "execution_count": 94, 850 | "metadata": {}, 851 | "outputs": [ 852 | { 853 | "data": { 854 | "text/plain": [ 855 | "[(('2013-07-25 00:00:00.0', 'PENDING_PAYMENT'), 41),\n", 856 | " (('2013-07-25 00:00:00.0', 'COMPLETE'), 42),\n", 857 | " (('2013-07-25 00:00:00.0', 'PROCESSING'), 16),\n", 858 | " (('2013-07-25 00:00:00.0', 'PAYMENT_REVIEW'), 3),\n", 859 | " (('2013-07-25 00:00:00.0', 'PENDING'), 13)]" 860 | ] 861 | }, 862 | "execution_count": 94, 863 | "metadata": {}, 864 | "output_type": "execute_result" 865 | } 866 | ], 867 | "source": [ 868 | "ordRDD = sc.textFile(\"orders\")\n", 869 | "ordmap = ordRDD.map(lambda a: ((a.split(\",\")[1], a.split(\",\")[3]),a))\n", 870 | "ordcnt = ordmap.aggregateByKey(0, lambda acc, val: acc+1, lambda acc, val: acc+val)\n", 871 | "ordcnt.take(5)" 872 | ] 873 | }, 874 | { 875 | "cell_type": "code", 876 | "execution_count": 95, 877 | "metadata": {}, 878 | "outputs": [ 879 | { 880 | "data": { 881 | "text/plain": [ 882 | "[(('2013-07-25 00:00:00.0', 'PENDING_PAYMENT'), 41),\n", 883 | " (('2013-07-25 00:00:00.0', 'COMPLETE'), 42),\n", 884 | " (('2013-07-25 00:00:00.0', 'PROCESSING'), 16),\n", 885 | " (('2013-07-25 00:00:00.0', 'PAYMENT_REVIEW'), 3),\n", 886 | " (('2013-07-25 00:00:00.0', 'PENDING'), 13)]" 887 | ] 888 | }, 889 | "execution_count": 95, 890 | "metadata": {}, 891 | "output_type": "execute_result" 892 | } 893 | ], 894 | "source": [ 895 | "ordRDD = sc.textFile(\"orders\")\n", 896 | "ordmap = ordRDD.map(lambda a : ((a.split(\",\")[1], a.split(\",\")[3]),a))\n", 897 | "ordcnt = ordmap.combineByKey(lambda val:1, lambda acc, val:acc+1, lambda acc, val: acc+val)\n", 898 | 
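"# combineByKey() takes three functions: a createCombiner (lambda val: 1 starts\n", "# a count when a key is first seen in a partition), a mergeValue (acc+1 folds\n", "# in another record within the partition) and a mergeCombiners (acc+val adds\n", "# the per-partition counts together).\n",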
"ordcnt.take(5)\n" 899 | ] 900 | }, 901 | { 902 | "cell_type": "markdown", 903 | "metadata": {}, 904 | "source": [ 905 | "# Generate average revenue per Day" 906 | ] 907 | }, 908 | { 909 | "cell_type": "code", 910 | "execution_count": 97, 911 | "metadata": {}, 912 | "outputs": [], 913 | "source": [ 914 | "ordRDD = sc.textFile(\"orders\")\n", 915 | "orditmRDD = sc.textFile(\"o_items\")\n", 916 | "ordmap = ordRDD.map(lambda a: (a.split(\",\")[0], a))\n", 917 | "orditmap = orditmRDD.map(lambda a: (a.split(\",\")[1], a))\n", 918 | "joinds = orditmap.join(ordmap)\n" 919 | ] 920 | }, 921 | { 922 | "cell_type": "code", 923 | "execution_count": 98, 924 | "metadata": {}, 925 | "outputs": [ 926 | { 927 | "data": { 928 | "text/plain": [ 929 | "[('4', ('5,4,897,2,49.98,24.99', '4,2013-07-25 00:00:00.0,8827,CLOSED')),\n", 930 | " ('4', ('6,4,365,5,299.95,59.99', '4,2013-07-25 00:00:00.0,8827,CLOSED')),\n", 931 | " ('4', ('7,4,502,3,150.0,50.0', '4,2013-07-25 00:00:00.0,8827,CLOSED')),\n", 932 | " ('4', ('8,4,1014,4,199.92,49.98', '4,2013-07-25 00:00:00.0,8827,CLOSED')),\n", 933 | " ('10',\n", 934 | " ('24,10,1073,1,199.99,199.99',\n", 935 | " '10,2013-07-25 00:00:00.0,5648,PENDING_PAYMENT'))]" 936 | ] 937 | }, 938 | "execution_count": 98, 939 | "metadata": {}, 940 | "output_type": "execute_result" 941 | } 942 | ], 943 | "source": [ 944 | "joinds.take(5)" 945 | ] 946 | }, 947 | { 948 | "cell_type": "code", 949 | "execution_count": 99, 950 | "metadata": {}, 951 | "outputs": [], 952 | "source": [ 953 | "map1 = joinds.map(lambda t: ((t[1][1].split(\",\")[1], t[1][1].split(\",\")[0]), float(t[1][0].split(\",\")[4])))\n" 954 | ] 955 | }, 956 | { 957 | "cell_type": "code", 958 | "execution_count": 100, 959 | "metadata": {}, 960 | "outputs": [ 961 | { 962 | "data": { 963 | "text/plain": [ 964 | "[(('2013-07-25 00:00:00.0', '4'), 49.98),\n", 965 | " (('2013-07-25 00:00:00.0', '4'), 299.95),\n", 966 | " (('2013-07-25 00:00:00.0', '4'), 150.0),\n", 967 | " (('2013-07-25 00:00:00.0', '4'), 199.92),\n", 968 | " (('2013-07-25 00:00:00.0', '10'), 199.99)]" 969 | ] 970 | }, 971 | "execution_count": 100, 972 | "metadata": {}, 973 | "output_type": "execute_result" 974 | } 975 | ], 976 | "source": [ 977 | "map1.take(5)" 978 | ] 979 | }, 980 | { 981 | "cell_type": "code", 982 | "execution_count": 107, 983 | "metadata": {}, 984 | "outputs": [ 985 | { 986 | "data": { 987 | "text/plain": [ 988 | "[(('2013-07-26 00:00:00.0', '110'), 594.93),\n", 989 | " (('2013-07-26 00:00:00.0', '111'), 249.9),\n", 990 | " (('2013-07-26 00:00:00.0', '112'), 979.8800000000001),\n", 991 | " (('2013-07-26 00:00:00.0', '113'), 619.87),\n", 992 | " (('2013-07-26 00:00:00.0', '115'), 599.96)]" 993 | ] 994 | }, 995 | "execution_count": 107, 996 | "metadata": {}, 997 | "output_type": "execute_result" 998 | } 999 | ], 1000 | "source": [ 1001 | "revperord = map1.reduceByKey(lambda a,b: a+b)\n", 1002 | "revperord.take(5)\n" 1003 | ] 1004 | }, 1005 | { 1006 | "cell_type": "code", 1007 | "execution_count": 108, 1008 | "metadata": {}, 1009 | "outputs": [ 1010 | { 1011 | "data": { 1012 | "text/plain": [ 1013 | "[('2013-07-26 00:00:00.0', 594.93),\n", 1014 | " ('2013-07-26 00:00:00.0', 249.9),\n", 1015 | " ('2013-07-26 00:00:00.0', 979.8800000000001),\n", 1016 | " ('2013-07-26 00:00:00.0', 619.87),\n", 1017 | " ('2013-07-26 00:00:00.0', 599.96)]" 1018 | ] 1019 | }, 1020 | "execution_count": 108, 1021 | "metadata": {}, 1022 | "output_type": "execute_result" 1023 | } 1024 | ], 1025 | "source": [ 1026 | "rpomap = revperord.map(lambda a: (a[0][0], a[1]))\n", 
1027 | "rpomap.take(5)" 1028 | ] 1029 | }, 1030 | { 1031 | "cell_type": "code", 1032 | "execution_count": 109, 1033 | "metadata": {}, 1034 | "outputs": [ 1035 | { 1036 | "data": { 1037 | "text/plain": [ 1038 | "[('2013-07-25 00:00:00.0', 579.98),\n", 1039 | " ('2013-07-25 00:00:00.0', 1129.8600000000001),\n", 1040 | " ('2013-07-25 00:00:00.0', 919.79),\n", 1041 | " ('2013-07-25 00:00:00.0', 127.96),\n", 1042 | " ('2013-07-25 00:00:00.0', 749.97)]" 1043 | ] 1044 | }, 1045 | "execution_count": 109, 1046 | "metadata": {}, 1047 | "output_type": "execute_result" 1048 | } 1049 | ], 1050 | "source": [ 1051 | "rpomaps = rpomap.sortByKey()\n", 1052 | "rpomaps.take(5)" 1053 | ] 1054 | }, 1055 | { 1056 | "cell_type": "code", 1057 | "execution_count": 110, 1058 | "metadata": {}, 1059 | "outputs": [ 1060 | { 1061 | "data": { 1062 | "text/plain": [ 1063 | "57431" 1064 | ] 1065 | }, 1066 | "execution_count": 110, 1067 | "metadata": {}, 1068 | "output_type": "execute_result" 1069 | } 1070 | ], 1071 | "source": [ 1072 | "rpomaps.count()" 1073 | ] 1074 | } 1075 | ], 1076 | "metadata": { 1077 | "kernelspec": { 1078 | "display_name": "Python 3", 1079 | "language": "python", 1080 | "name": "python3" 1081 | }, 1082 | "language_info": { 1083 | "codemirror_mode": { 1084 | "name": "ipython", 1085 | "version": 3 1086 | }, 1087 | "file_extension": ".py", 1088 | "mimetype": "text/x-python", 1089 | "name": "python", 1090 | "nbconvert_exporter": "python", 1091 | "pygments_lexer": "ipython3", 1092 | "version": "3.6.5" 1093 | } 1094 | }, 1095 | "nbformat": 4, 1096 | "nbformat_minor": 2 1097 | } 1098 | --------------------------------------------------------------------------------