├── .gitignore
├── README.md
├── build.sbt
├── project
│   ├── build.properties
│   └── plugins.sbt
├── src
│   └── main
│       ├── resources
│       │   ├── adult.csv
│       │   ├── applestock.csv
│       │   ├── customers.csv
│       │   ├── data.txt
│       │   ├── eventimedata.md
│       │   └── sales.csv
│       └── scala
│           └── com
│               └── madhukaraphatak
│                   └── examples
│                       └── sparktwo
│                           ├── CatalogExample.scala
│                           ├── CustomOptimizationExample.scala
│                           ├── DataSetWordCount.scala
│                           ├── DataSourceV2Example.scala
│                           ├── DatasetVsDataFrame.scala
│                           ├── MysqlTransactionExample.scala
│                           ├── RDDToDataSet.scala
│                           ├── SparkSessionExample.scala
│                           ├── TimeWindowExample.scala
│                           ├── datasourcev2
│                           │   ├── MysqlWithTransaction.scala
│                           │   ├── SimpleCsvDataSource.scala
│                           │   ├── SimpleDataSource.scala
│                           │   ├── SimpleDataSourceWithPartitionAffinity.scala
│                           │   ├── SimpleMultiDataSource.scala
│                           │   ├── SimpleMysqlDataSource.scala
│                           │   └── SimpleMysqlWriter.scala
│                           ├── ml
│                           │   ├── MultiColumnTransformation.scala
│                           │   └── ParallelCrossValidation.scala
│                           └── streaming
│                               ├── EventTimeExample.scala
│                               ├── FileStreamExample.scala
│                               ├── IngestionTimeWindow.scala
│                               ├── ProcessingTimeWindow.scala
│                               ├── RecoverableAggregation.scala
│                               ├── SessionisationExample.scala
│                               ├── SocketMiniBatchExample.scala
│                               ├── SocketReadExample.scala
│                               ├── SocketWordCount.scala
│                               ├── StatelessWordCount.scala
│                               ├── StreamJoin.scala
│                               └── WaterMarkExample.scala
└── todo.txt

/.gitignore:
--------------------------------------------------------------------------------
1 | *.iml
2 | .idea/
3 | *.iml
4 | target/
5 | project/target
6 | dependency-reduced-pom.xml
7 | *.pdf
8 | *.swp
9 | *.sw*
10 | metastore_db
11 | *.log
12 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | Examples for Spark 2.0 release.
2 | 3 | # Build 4 | 5 | sbt clean package -------------------------------------------------------------------------------- /build.sbt: -------------------------------------------------------------------------------- 1 | name := "SparkTwoExperiments" 2 | 3 | version := "1.0" 4 | 5 | scalaVersion := "2.11.8" 6 | 7 | val sparkVersion = "2.3.0" 8 | 9 | 10 | resolvers ++= Seq( 11 | "apache-snapshots" at "http://repository.apache.org/snapshots/" 12 | ) 13 | 14 | libraryDependencies ++= Seq( 15 | "org.apache.spark" %% "spark-core" % sparkVersion, 16 | "org.apache.spark" %% "spark-sql" % sparkVersion, 17 | "org.apache.spark" %% "spark-mllib" % sparkVersion, 18 | "org.apache.spark" %% "spark-streaming" % sparkVersion, 19 | "org.apache.spark" %% "spark-hive" % sparkVersion, 20 | "mysql" % "mysql-connector-java" % "5.1.6" 21 | ) 22 | 23 | -------------------------------------------------------------------------------- /project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version = 0.13.8 -------------------------------------------------------------------------------- /project/plugins.sbt: -------------------------------------------------------------------------------- 1 | logLevel := Level.Warn -------------------------------------------------------------------------------- /src/main/resources/adult.csv: -------------------------------------------------------------------------------- 1 | age,workclass,fnlwgt,education,education_num,martial_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,salary 2 | 39, State-gov, 77516, Bachelors, 13, Never-married, Adm-clerical, Not-in-family, White, Male, 2174, 0, 40, United-States, <=50K 3 | 50, Self-emp-not-inc, 83311, Bachelors, 13, Married-civ-spouse, Exec-managerial, Husband, White, Male, 0, 0, 13, United-States, <=50K 4 | 38, Private, 215646, HS-grad, 9, Divorced, Handlers-cleaners, Not-in-family, White, Male, 0, 0, 40, United-States, <=50K 5 | 53, Private, 234721, 11th, 7, Married-civ-spouse, Handlers-cleaners, Husband, Black, Male, 0, 0, 40, United-States, <=50K 6 | 28, Private, 338409, Bachelors, 13, Married-civ-spouse, Prof-specialty, Wife, Black, Female, 0, 0, 40, Cuba, <=50K 7 | 37, Private, 284582, Masters, 14, Married-civ-spouse, Exec-managerial, Wife, White, Female, 0, 0, 40, United-States, <=50K 8 | 49, Private, 160187, 9th, 5, Married-spouse-absent, Other-service, Not-in-family, Black, Female, 0, 0, 16, Jamaica, <=50K 9 | 52, Self-emp-not-inc, 209642, HS-grad, 9, Married-civ-spouse, Exec-managerial, Husband, White, Male, 0, 0, 45, United-States, >50K 10 | 31, Private, 45781, Masters, 14, Never-married, Prof-specialty, Not-in-family, White, Female, 14084, 0, 50, United-States, >50K 11 | 42, Private, 159449, Bachelors, 13, Married-civ-spouse, Exec-managerial, Husband, White, Male, 5178, 0, 40, United-States, >50K 12 | 37, Private, 280464, Some-college, 10, Married-civ-spouse, Exec-managerial, Husband, Black, Male, 0, 0, 80, United-States, >50K 13 | 30, State-gov, 141297, Bachelors, 13, Married-civ-spouse, Prof-specialty, Husband, Asian-Pac-Islander, Male, 0, 0, 40, India, >50K 14 | 23, Private, 122272, Bachelors, 13, Never-married, Adm-clerical, Own-child, White, Female, 0, 0, 30, United-States, <=50K 15 | 32, Private, 205019, Assoc-acdm, 12, Never-married, Sales, Not-in-family, Black, Male, 0, 0, 50, United-States, <=50K 16 | 40, Private, 121772, Assoc-voc, 11, Married-civ-spouse, Craft-repair, Husband, Asian-Pac-Islander, Male, 0, 0, 40, ?, 
>50K 17 | 34, Private, 245487, 7th-8th, 4, Married-civ-spouse, Transport-moving, Husband, Amer-Indian-Eskimo, Male, 0, 0, 45, Mexico, <=50K 18 | 25, Self-emp-not-inc, 176756, HS-grad, 9, Never-married, Farming-fishing, Own-child, White, Male, 0, 0, 35, United-States, <=50K 19 | 32, Private, 186824, HS-grad, 9, Never-married, Machine-op-inspct, Unmarried, White, Male, 0, 0, 40, United-States, <=50K 20 | 38, Private, 28887, 11th, 7, Married-civ-spouse, Sales, Husband, White, Male, 0, 0, 50, United-States, <=50K 21 | 43, Self-emp-not-inc, 292175, Masters, 14, Divorced, Exec-managerial, Unmarried, White, Female, 0, 0, 45, United-States, >50K 22 | 40, Private, 193524, Doctorate, 16, Married-civ-spouse, Prof-specialty, Husband, White, Male, 0, 0, 60, United-States, >50K 23 | 54, Private, 302146, HS-grad, 9, Separated, Other-service, Unmarried, Black, Female, 0, 0, 20, United-States, <=50K 24 | 35, Federal-gov, 76845, 9th, 5, Married-civ-spouse, Farming-fishing, Husband, Black, Male, 0, 0, 40, United-States, <=50K 25 | 43, Private, 117037, 11th, 7, Married-civ-spouse, Transport-moving, Husband, White, Male, 0, 2042, 40, United-States, <=50K 26 | 59, Private, 109015, HS-grad, 9, Divorced, Tech-support, Unmarried, White, Female, 0, 0, 40, United-States, <=50K 27 | 56, Local-gov, 216851, Bachelors, 13, Married-civ-spouse, Tech-support, Husband, White, Male, 0, 0, 40, United-States, >50K 28 | 19, Private, 168294, HS-grad, 9, Never-married, Craft-repair, Own-child, White, Male, 0, 0, 40, United-States, <=50K 29 | 54, ?, 180211, Some-college, 10, Married-civ-spouse, ?, Husband, Asian-Pac-Islander, Male, 0, 0, 60, South, >50K 30 | 39, Private, 367260, HS-grad, 9, Divorced, Exec-managerial, Not-in-family, White, Male, 0, 0, 80, United-States, <=50K 31 | 49, Private, 193366, HS-grad, 9, Married-civ-spouse, Craft-repair, Husband, White, Male, 0, 0, 40, United-States, <=50K 32 | 23, Local-gov, 190709, Assoc-acdm, 12, Never-married, Protective-serv, Not-in-family, White, Male, 0, 0, 52, United-States, <=50K 33 | 20, Private, 266015, Some-college, 10, Never-married, Sales, Own-child, Black, Male, 0, 0, 44, United-States, <=50K 34 | 45, Private, 386940, Bachelors, 13, Divorced, Exec-managerial, Own-child, White, Male, 0, 1408, 40, United-States, <=50K 35 | 30, Federal-gov, 59951, Some-college, 10, Married-civ-spouse, Adm-clerical, Own-child, White, Male, 0, 0, 40, United-States, <=50K 36 | 22, State-gov, 311512, Some-college, 10, Married-civ-spouse, Other-service, Husband, Black, Male, 0, 0, 15, United-States, <=50K 37 | 48, Private, 242406, 11th, 7, Never-married, Machine-op-inspct, Unmarried, White, Male, 0, 0, 40, Puerto-Rico, <=50K 38 | 21, Private, 197200, Some-college, 10, Never-married, Machine-op-inspct, Own-child, White, Male, 0, 0, 40, United-States, <=50K 39 | 19, Private, 544091, HS-grad, 9, Married-AF-spouse, Adm-clerical, Wife, White, Female, 0, 0, 25, United-States, <=50K 40 | 31, Private, 84154, Some-college, 10, Married-civ-spouse, Sales, Husband, White, Male, 0, 0, 38, ?, >50K 41 | 48, Self-emp-not-inc, 265477, Assoc-acdm, 12, Married-civ-spouse, Prof-specialty, Husband, White, Male, 0, 0, 40, United-States, <=50K 42 | 31, Private, 507875, 9th, 5, Married-civ-spouse, Machine-op-inspct, Husband, White, Male, 0, 0, 43, United-States, <=50K 43 | 53, Self-emp-not-inc, 88506, Bachelors, 13, Married-civ-spouse, Prof-specialty, Husband, White, Male, 0, 0, 40, United-States, <=50K 44 | 24, Private, 172987, Bachelors, 13, Married-civ-spouse, Tech-support, Husband, White, Male, 0, 0, 50, United-States, 
<=50K 45 | 49, Private, 94638, HS-grad, 9, Separated, Adm-clerical, Unmarried, White, Female, 0, 0, 40, United-States, <=50K 46 | 25, Private, 289980, HS-grad, 9, Never-married, Handlers-cleaners, Not-in-family, White, Male, 0, 0, 35, United-States, <=50K 47 | 57, Federal-gov, 337895, Bachelors, 13, Married-civ-spouse, Prof-specialty, Husband, Black, Male, 0, 0, 40, United-States, >50K 48 | 53, Private, 144361, HS-grad, 9, Married-civ-spouse, Machine-op-inspct, Husband, White, Male, 0, 0, 38, United-States, <=50K 49 | 44, Private, 128354, Masters, 14, Divorced, Exec-managerial, Unmarried, White, Female, 0, 0, 40, United-States, <=50K 50 | 41, State-gov, 101603, Assoc-voc, 11, Married-civ-spouse, Craft-repair, Husband, White, Male, 0, 0, 40, United-States, <=50K 51 | 29, Private, 271466, Assoc-voc, 11, Never-married, Prof-specialty, Not-in-family, White, Male, 0, 0, 43, United-States, <=50K 52 | 25, Private, 32275, Some-college, 10, Married-civ-spouse, Exec-managerial, Wife, Other, Female, 0, 0, 40, United-States, <=50K 53 | 18, Private, 226956, HS-grad, 9, Never-married, Other-service, Own-child, White, Female, 0, 0, 30, ?, <=50K 54 | 47, Private, 51835, Prof-school, 15, Married-civ-spouse, Prof-specialty, Wife, White, Female, 0, 1902, 60, Honduras, >50K 55 | 50, Federal-gov, 251585, Bachelors, 13, Divorced, Exec-managerial, Not-in-family, White, Male, 0, 0, 55, United-States, >50K 56 | 47, Self-emp-inc, 109832, HS-grad, 9, Divorced, Exec-managerial, Not-in-family, White, Male, 0, 0, 60, United-States, <=50K 57 | 43, Private, 237993, Some-college, 10, Married-civ-spouse, Tech-support, Husband, White, Male, 0, 0, 40, United-States, >50K 58 | 46, Private, 216666, 5th-6th, 3, Married-civ-spouse, Machine-op-inspct, Husband, White, Male, 0, 0, 40, Mexico, <=50K 59 | 35, Private, 56352, Assoc-voc, 11, Married-civ-spouse, Other-service, Husband, White, Male, 0, 0, 40, Puerto-Rico, <=50K 60 | 41, Private, 147372, HS-grad, 9, Married-civ-spouse, Adm-clerical, Husband, White, Male, 0, 0, 48, United-States, <=50K 61 | 30, Private, 188146, HS-grad, 9, Married-civ-spouse, Machine-op-inspct, Husband, White, Male, 5013, 0, 40, United-States, <=50K 62 | 30, Private, 59496, Bachelors, 13, Married-civ-spouse, Sales, Husband, White, Male, 2407, 0, 40, United-States, <=50K 63 | 32, ?, 293936, 7th-8th, 4, Married-spouse-absent, ?, Not-in-family, White, Male, 0, 0, 40, ?, <=50K 64 | 48, Private, 149640, HS-grad, 9, Married-civ-spouse, Transport-moving, Husband, White, Male, 0, 0, 40, United-States, <=50K 65 | 42, Private, 116632, Doctorate, 16, Married-civ-spouse, Prof-specialty, Husband, White, Male, 0, 0, 45, United-States, >50K 66 | 29, Private, 105598, Some-college, 10, Divorced, Tech-support, Not-in-family, White, Male, 0, 0, 58, United-States, <=50K 67 | 36, Private, 155537, HS-grad, 9, Married-civ-spouse, Craft-repair, Husband, White, Male, 0, 0, 40, United-States, <=50K 68 | 28, Private, 183175, Some-college, 10, Divorced, Adm-clerical, Not-in-family, White, Female, 0, 0, 40, United-States, <=50K 69 | 53, Private, 169846, HS-grad, 9, Married-civ-spouse, Adm-clerical, Wife, White, Female, 0, 0, 40, United-States, >50K 70 | 49, Self-emp-inc, 191681, Some-college, 10, Married-civ-spouse, Exec-managerial, Husband, White, Male, 0, 0, 50, United-States, >50K 71 | 25, ?, 200681, Some-college, 10, Never-married, ?, Own-child, White, Male, 0, 0, 40, United-States, <=50K 72 | 19, Private, 101509, Some-college, 10, Never-married, Prof-specialty, Own-child, White, Male, 0, 0, 32, United-States, <=50K 73 | 31, 
Private, 309974, Bachelors, 13, Separated, Sales, Own-child, Black, Female, 0, 0, 40, United-States, <=50K 74 | 29, Self-emp-not-inc, 162298, Bachelors, 13, Married-civ-spouse, Sales, Husband, White, Male, 0, 0, 70, United-States, >50K 75 | 23, Private, 211678, Some-college, 10, Never-married, Machine-op-inspct, Not-in-family, White, Male, 0, 0, 40, United-States, <=50K 76 | 79, Private, 124744, Some-college, 10, Married-civ-spouse, Prof-specialty, Other-relative, White, Male, 0, 0, 20, United-States, <=50K 77 | 27, Private, 213921, HS-grad, 9, Never-married, Other-service, Own-child, White, Male, 0, 0, 40, Mexico, <=50K 78 | 40, Private, 32214, Assoc-acdm, 12, Married-civ-spouse, Adm-clerical, Husband, White, Male, 0, 0, 40, United-States, <=50K 79 | 67, ?, 212759, 10th, 6, Married-civ-spouse, ?, Husband, White, Male, 0, 0, 2, United-States, <=50K 80 | 18, Private, 309634, 11th, 7, Never-married, Other-service, Own-child, White, Female, 0, 0, 22, United-States, <=50K 81 | 31, Local-gov, 125927, 7th-8th, 4, Married-civ-spouse, Farming-fishing, Husband, White, Male, 0, 0, 40, United-States, <=50K 82 | 18, Private, 446839, HS-grad, 9, Never-married, Sales, Not-in-family, White, Male, 0, 0, 30, United-States, <=50K 83 | 52, Private, 276515, Bachelors, 13, Married-civ-spouse, Other-service, Husband, White, Male, 0, 0, 40, Cuba, <=50K 84 | 46, Private, 51618, HS-grad, 9, Married-civ-spouse, Other-service, Wife, White, Female, 0, 0, 40, United-States, <=50K 85 | 59, Private, 159937, HS-grad, 9, Married-civ-spouse, Sales, Husband, White, Male, 0, 0, 48, United-States, <=50K 86 | 44, Private, 343591, HS-grad, 9, Divorced, Craft-repair, Not-in-family, White, Female, 14344, 0, 40, United-States, >50K 87 | 53, Private, 346253, HS-grad, 9, Divorced, Sales, Own-child, White, Female, 0, 0, 35, United-States, <=50K 88 | 49, Local-gov, 268234, HS-grad, 9, Married-civ-spouse, Protective-serv, Husband, White, Male, 0, 0, 40, United-States, >50K 89 | 33, Private, 202051, Masters, 14, Married-civ-spouse, Prof-specialty, Husband, White, Male, 0, 0, 50, United-States, <=50K 90 | 30, Private, 54334, 9th, 5, Never-married, Sales, Not-in-family, White, Male, 0, 0, 40, United-States, <=50K 91 | 43, Federal-gov, 410867, Doctorate, 16, Never-married, Prof-specialty, Not-in-family, White, Female, 0, 0, 50, United-States, >50K 92 | 57, Private, 249977, Assoc-voc, 11, Married-civ-spouse, Prof-specialty, Husband, White, Male, 0, 0, 40, United-States, <=50K 93 | 37, Private, 286730, Some-college, 10, Divorced, Craft-repair, Unmarried, White, Female, 0, 0, 40, United-States, <=50K 94 | 28, Private, 212563, Some-college, 10, Divorced, Machine-op-inspct, Unmarried, Black, Female, 0, 0, 25, United-States, <=50K 95 | 30, Private, 117747, HS-grad, 9, Married-civ-spouse, Sales, Wife, Asian-Pac-Islander, Female, 0, 1573, 35, ?, <=50K 96 | 34, Local-gov, 226296, Bachelors, 13, Married-civ-spouse, Protective-serv, Husband, White, Male, 0, 0, 40, United-States, >50K 97 | 29, Local-gov, 115585, Some-college, 10, Never-married, Handlers-cleaners, Not-in-family, White, Male, 0, 0, 50, United-States, <=50K 98 | 48, Self-emp-not-inc, 191277, Doctorate, 16, Married-civ-spouse, Prof-specialty, Husband, White, Male, 0, 1902, 60, United-States, >50K 99 | 37, Private, 202683, Some-college, 10, Married-civ-spouse, Sales, Husband, White, Male, 0, 0, 48, United-States, >50K 100 | 48, Private, 171095, Assoc-acdm, 12, Divorced, Exec-managerial, Unmarried, White, Female, 0, 0, 40, England, <=50K 101 | 32, Federal-gov, 249409, HS-grad, 9, 
Never-married, Other-service, Own-child, Black, Male, 0, 0, 40, United-States, <=50K 102 | 76, Private, 124191, Masters, 14, Married-civ-spouse, Exec-managerial, Husband, White, Male, 0, 0, 40, United-States, >50K 103 | -------------------------------------------------------------------------------- /src/main/resources/customers.csv: -------------------------------------------------------------------------------- 1 | customerId,customerName 2 | 1,John 3 | 2,Clerk 4 | 3,Micheal 5 | 4,Sample 6 | -------------------------------------------------------------------------------- /src/main/resources/data.txt: -------------------------------------------------------------------------------- 1 | hello how are you 2 | hello how are you -------------------------------------------------------------------------------- /src/main/resources/eventimedata.md: -------------------------------------------------------------------------------- 1 | ## Event Time Example 2 | //The first records is for time Wed, 27 Apr 2016 11:34:22 GMT. 3 | 4 | 1461756862000,"aapl",500.0 5 | 6 | // Event after 5 seconds 7 | 8 | 1461756867001,"aapl",600.0 9 | 10 | // Event after 11 seconds 11 | 12 | 1461756872000,"aapl",400.0 13 | 14 | 15 | ## Late Events Example 16 | 17 | //It’s an event is for Wed, 27 Apr 2016 11:34:27 which is 5 seconds before the last event. 18 | 19 | 1461756867001,"aapl",200.0 20 | 21 | 22 | ## Session Window Input 23 | 24 | // start two sessions 25 | 26 | session1,100 27 | session2,200 28 | 29 | // Additional Event for Session 1 30 | 31 | session1,200 32 | 33 | // End Session 1 34 | 35 | session1,200,end 36 | 37 | 38 | // Starting new session1 and updating existing session 2 39 | 40 | session1,100 41 | session2,200 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | -------------------------------------------------------------------------------- /src/main/resources/sales.csv: -------------------------------------------------------------------------------- 1 | transactionId,customerId,itemId,amountPaid 2 | 111,1,1,100.0 3 | 112,2,2,505.0 4 | 113,3,3,510.0 5 | 114,4,4,600.0 6 | 115,1,2,500.0 7 | 116,1,2,500.0 8 | 117,1,2,500.0 9 | 118,1,2,500.0 10 | 119,2,3,500.0 11 | 120,1,2,500.0 12 | 121,1,4,500.0 13 | 122,1,2,500.0 14 | 123,1,4,500.0 15 | 124,1,2,500.0 -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/examples/sparktwo/CatalogExample.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.examples.sparktwo 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | 6 | /** 7 | * Catalogue Example 8 | */ 9 | object CatalogExample { 10 | 11 | def main(args: Array[String]) { 12 | 13 | val sparkSession = SparkSession.builder. 
14 | master("local") 15 | .appName("example") 16 | .getOrCreate() 17 | 18 | 19 | val df = sparkSession.read.csv("src/main/resources/sales.csv") 20 | df.createTempView("sales") 21 | 22 | //interacting with catalogue 23 | 24 | val catalog = sparkSession.catalog 25 | 26 | //print the databases 27 | 28 | catalog.listDatabases().select("name").show() 29 | 30 | // print all the tables 31 | 32 | catalog.listTables().select("name").show() 33 | 34 | // is cached 35 | println(catalog.isCached("sales")) 36 | df.cache() 37 | println(catalog.isCached("sales")) 38 | 39 | // drop the table 40 | catalog.dropTempView("sales") 41 | catalog.listTables().select("name").show() 42 | 43 | // list functions 44 | catalog.listFunctions().select("name","description","className","isTemporary").show(100) 45 | } 46 | 47 | } 48 | -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/examples/sparktwo/CustomOptimizationExample.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.examples.sparktwo 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.catalyst.expressions.{Literal, Multiply} 5 | import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan 6 | import org.apache.spark.sql.catalyst.rules.Rule 7 | 8 | /** 9 | * User Defined Optimization 10 | */ 11 | object CustomOptimizationExample { 12 | 13 | object MultiplyOptimizationRule extends Rule[LogicalPlan] { 14 | def apply(plan: LogicalPlan): LogicalPlan = plan transformAllExpressions { 15 | case Multiply(left,right) if right.isInstanceOf[Literal] && 16 | right.asInstanceOf[Literal].value.asInstanceOf[Double] == 1.0 => 17 | println("optimization of one applied") 18 | left 19 | } 20 | } 21 | 22 | 23 | def main(args: Array[String]) { 24 | 25 | val sparkSession = SparkSession.builder. 26 | master("local") 27 | .appName("example") 28 | .getOrCreate() 29 | 30 | 31 | val df = sparkSession.read.option("header","true").csv("src/main/resources/sales.csv") 32 | val multipliedDF = df.selectExpr("amountPaid * 1") 33 | println(multipliedDF.queryExecution.optimizedPlan.numberedTreeString) 34 | 35 | //add our custom optimization 36 | sparkSession.experimental.extraOptimizations = Seq(MultiplyOptimizationRule) 37 | val multipliedDFWithOptimization = df.selectExpr("amountPaid * 1") 38 | println("after optimization") 39 | 40 | println(multipliedDFWithOptimization.queryExecution.optimizedPlan.numberedTreeString) 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/examples/sparktwo/DataSetWordCount.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.examples.sparktwo 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | /** 6 | * Created by madhu on 6/5/16. 7 | */ 8 | object DataSetWordCount { 9 | 10 | def main(args: Array[String]) { 11 | 12 | val sparkSession = SparkSession.builder. 
13 | master("local") 14 | .appName("example") 15 | .getOrCreate() 16 | 17 | import sparkSession.implicits._ 18 | val data = sparkSession.read.text("src/main/resources/data.txt").as[String] 19 | 20 | val words = data.flatMap(value => value.split("\\s+")) 21 | 22 | val groupedWords = words.groupByKey(_.toLowerCase) 23 | 24 | val counts = groupedWords.count() 25 | 26 | counts.show() 27 | 28 | 29 | } 30 | 31 | } 32 | -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/examples/sparktwo/DataSourceV2Example.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.examples.sparktwo.datasourcev2 2 | 3 | import org.apache.spark.Partition 4 | import org.apache.spark.sql.SparkSession 5 | 6 | object DataSourceV2Example { 7 | 8 | def main(args: Array[String]) { 9 | 10 | val sparkSession = SparkSession.builder 11 | .master("local[2]") 12 | .appName("example") 13 | .getOrCreate() 14 | 15 | val simpleDf = sparkSession.read 16 | .format("com.madhukaraphatak.examples.sparktwo.datasourcev2.simple") 17 | .load() 18 | 19 | simpleDf.show() 20 | println( 21 | "number of partitions in simple source is " + simpleDf.rdd.getNumPartitions) 22 | 23 | val simpleMultiDf = sparkSession.read 24 | .format("com.madhukaraphatak.examples.sparktwo.datasourcev2.simplemulti") 25 | .load() 26 | 27 | simpleMultiDf.show() 28 | println( 29 | "number of partitions in simple multi source is " + simpleMultiDf.rdd.getNumPartitions) 30 | 31 | val simpleCsvDf = sparkSession.read 32 | .format("com.madhukaraphatak.examples.sparktwo.datasourcev2.simplecsv") 33 | .load("src/main/resources/sales.csv") 34 | 35 | simpleCsvDf.printSchema() 36 | simpleCsvDf.show() 37 | println( 38 | "number of partitions in simple csv source is " + simpleCsvDf.rdd.getNumPartitions) 39 | 40 | val simpleMysqlDf = sparkSession.read 41 | .format("com.madhukaraphatak.examples.sparktwo.datasourcev2.simplemysql") 42 | .load() 43 | 44 | simpleMysqlDf.printSchema() 45 | simpleMysqlDf.filter("user=\"root\"").show() 46 | println( 47 | "number of partitions in simple mysql source is " + simpleMysqlDf.rdd.getNumPartitions) 48 | 49 | //write examples 50 | simpleMysqlDf.write 51 | .format( 52 | "com.madhukaraphatak.examples.sparktwo.datasourcev2.simplemysqlwriter") 53 | .save() 54 | simpleMysqlDf.write 55 | .format( 56 | "com.madhukaraphatak.examples.sparktwo.datasourcev2.mysqlwithtransaction") 57 | .save() 58 | 59 | val simplePartitoningDf = sparkSession.read 60 | .format( 61 | "com.madhukaraphatak.examples.sparktwo.datasourcev2.partitionaffinity") 62 | .load() 63 | 64 | val dfRDD = simplePartitoningDf.rdd 65 | val baseRDD = 66 | dfRDD.dependencies.head.rdd.dependencies.head.rdd.dependencies.head.rdd 67 | 68 | val partition = baseRDD.partitions(0) 69 | val getPrefferedLocationDef = baseRDD.getClass 70 | .getMethod("getPreferredLocations", classOf[Partition]) 71 | val preferredLocation = getPrefferedLocationDef 72 | .invoke(baseRDD, partition) 73 | .asInstanceOf[Seq[String]] 74 | println("preferred location is " + preferredLocation) 75 | 76 | sparkSession.stop() 77 | 78 | } 79 | } 80 | -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/examples/sparktwo/DatasetVsDataFrame.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.examples.sparktwo 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | /** 6 | * Logical Plans 
for Dataframe and Dataset 7 | */ 8 | object DatasetVsDataFrame { 9 | 10 | case class Sales(transactionId:Int,customerId:Int,itemId:Int,amountPaid:Double) 11 | 12 | def main(args: Array[String]) { 13 | 14 | val sparkSession = SparkSession.builder. 15 | master("local") 16 | .appName("example") 17 | .getOrCreate() 18 | 19 | val sparkContext = sparkSession.sparkContext 20 | import sparkSession.implicits._ 21 | 22 | 23 | //read data from text file 24 | 25 | val df = sparkSession.read.option("header","true").option("inferSchema","true").csv("src/main/resources/sales.csv") 26 | val ds = sparkSession.read.option("header","true").option("inferSchema","true").csv("src/main/resources/sales.csv").as[Sales] 27 | 28 | 29 | val selectedDF = df.select("itemId") 30 | 31 | val selectedDS = ds.map(_.itemId) 32 | 33 | println(selectedDF.queryExecution.optimizedPlan.numberedTreeString) 34 | 35 | println(selectedDS.queryExecution.optimizedPlan.numberedTreeString) 36 | 37 | 38 | } 39 | 40 | } 41 | -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/examples/sparktwo/MysqlTransactionExample.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.examples.sparktwo 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | object MysqlTransactionExample { 6 | 7 | def main(args: Array[String]) { 8 | 9 | val sparkSession = SparkSession.builder 10 | .master("local[2]") 11 | .appName("example") 12 | .getOrCreate() 13 | 14 | import sparkSession.implicits._ 15 | val simpleDf = sparkSession.read 16 | .format("com.madhukaraphatak.examples.sparktwo.datasourcev2.simple") 17 | .load() 18 | .as[String] 19 | 20 | val errorDf = simpleDf.map(value => { 21 | if (value == "3") throw new IllegalArgumentException("value cannot be 3") 22 | else value 23 | }) 24 | 25 | //errorDf.show() 26 | 27 | // results in partial writes 28 | errorDf.write 29 | .format( 30 | "com.madhukaraphatak.examples.sparktwo.datasourcev2.simplemysqlwriter") 31 | .save() 32 | 33 | //use transactional ones 34 | 35 | errorDf.write 36 | .format( 37 | "com.madhukaraphatak.examples.sparktwo.datasourcev2.mysqlwithtransaction") 38 | .save() 39 | 40 | sparkSession.stop() 41 | 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/examples/sparktwo/RDDToDataSet.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.examples.sparktwo 2 | 3 | import org.apache.spark.sql.types.{DoubleType, StringType, StructField, StructType} 4 | import org.apache.spark.sql.{Row, SparkSession} 5 | 6 | /** 7 | * RDD API to Dataset API 8 | */ 9 | object RDDToDataSet { 10 | 11 | def main(args: Array[String]) { 12 | 13 | val sparkSession = SparkSession.builder. 
14 | master("local") 15 | .appName("example") 16 | .getOrCreate() 17 | 18 | val sparkContext = sparkSession.sparkContext 19 | import sparkSession.implicits._ 20 | 21 | 22 | //read data from text file 23 | val rdd = sparkContext.textFile("src/main/resources/data.txt") 24 | val ds = sparkSession.read.text("src/main/resources/data.txt").as[String] 25 | 26 | 27 | // do count 28 | println("count ") 29 | println(rdd.count()) 30 | println(ds.count()) 31 | 32 | // wordcount 33 | println(" wordcount ") 34 | 35 | val wordsRDD = rdd.flatMap(value => value.split("\\s+")) 36 | val wordsPair = wordsRDD.map(word => (word,1)) 37 | val wordCount = wordsPair.reduceByKey(_+_) 38 | println(wordCount.collect.toList) 39 | 40 | val wordsDs = ds.flatMap(value => value.split("\\s+")) 41 | val wordsPairDs = wordsDs.groupByKey(value => value) 42 | val wordCountDs = wordsPairDs.count 43 | wordCountDs.show() 44 | 45 | //cache 46 | rdd.cache() 47 | ds.cache() 48 | 49 | //filter 50 | 51 | val filteredRDD = wordsRDD.filter(value => value =="hello") 52 | println(filteredRDD.collect().toList) 53 | 54 | val filteredDS = wordsDs.filter(value => value =="hello") 55 | filteredDS.show() 56 | 57 | 58 | //map partitions 59 | 60 | val mapPartitionsRDD = rdd.mapPartitions(iterator => List(iterator.count(value => true)).iterator) 61 | println(s" the count each partition is ${mapPartitionsRDD.collect().toList}") 62 | 63 | val mapPartitionsDs = ds.mapPartitions(iterator => List(iterator.count(value => true)).iterator) 64 | mapPartitionsDs.show() 65 | 66 | //converting to each other 67 | val dsToRDD = ds.rdd 68 | println(dsToRDD.collect()) 69 | 70 | val rddStringToRowRDD = rdd.map(value => Row(value)) 71 | val dfschema = StructType(Array(StructField("value",StringType))) 72 | val rddToDF = sparkSession.createDataFrame(rddStringToRowRDD,dfschema) 73 | val rDDToDataSet = rddToDF.as[String] 74 | rDDToDataSet.show() 75 | 76 | // double based operation 77 | 78 | val doubleRDD = sparkContext.makeRDD(List(1.0,5.0,8.9,9.0)) 79 | val rddSum =doubleRDD.sum() 80 | val rddMean = doubleRDD.mean() 81 | 82 | println(s"sum is $rddSum") 83 | println(s"mean is $rddMean") 84 | 85 | val rowRDD = doubleRDD.map(value => Row.fromSeq(List(value))) 86 | val schema = StructType(Array(StructField("value",DoubleType))) 87 | val doubleDS = sparkSession.createDataFrame(rowRDD,schema) 88 | 89 | import org.apache.spark.sql.functions._ 90 | doubleDS.agg(sum("value")).show() 91 | doubleDS.agg(mean("value")).show() 92 | 93 | //reduceByKey API 94 | val reduceCountByRDD = wordsPair.reduceByKey(_+_) 95 | val reduceCountByDs = wordsPairDs.mapGroups((key,values) =>(key,values.length)) 96 | 97 | println(reduceCountByRDD.collect().toList) 98 | println(reduceCountByDs.collect().toList) 99 | 100 | //reduce function 101 | val rddReduce = doubleRDD.reduce((a,b) => a +b) 102 | val dsReduce = doubleDS.reduce((row1,row2) =>Row(row1.getDouble(0) + row2.getDouble(0))) 103 | 104 | println("rdd reduce is " +rddReduce +" dataset reduce "+dsReduce) 105 | 106 | } 107 | 108 | } 109 | -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/examples/sparktwo/SparkSessionExample.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.examples.sparktwo 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | /** 6 | * Spark Session example 7 | * 8 | */ 9 | object SparkSessionExample { 10 | 11 | def main(args: Array[String]) { 12 | 13 | val sparkSession = 
SparkSession.builder 14 | .master("local") 15 | .appName("spark session example") 16 | .getOrCreate() 17 | 18 | val df = sparkSession.read 19 | .option("header", "true") 20 | .csv("src/main/resources/sales.csv") 21 | 22 | df.show() 23 | 24 | } 25 | 26 | } 27 | -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/examples/sparktwo/TimeWindowExample.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.examples.sparktwo 2 | 3 | import org.apache.spark.sql.{DataFrame, SparkSession} 4 | import org.apache.spark.sql.functions._ 5 | 6 | /** 7 | * Time window Example 8 | */ 9 | object TimeWindowExample { 10 | 11 | 12 | def printWindow(windowDF:DataFrame, aggCol:String) ={ 13 | windowDF.sort("window.start").select("window.start","window.end",s"$aggCol"). 14 | show(truncate = false) 15 | } 16 | 17 | def main(args: Array[String]) { 18 | 19 | val sparkSession = SparkSession.builder. 20 | master("local") 21 | .appName("time window example") 22 | .getOrCreate() 23 | 24 | 25 | sparkSession.sparkContext.setLogLevel("ERROR") 26 | val stocksDF = sparkSession.read.option("header","true"). 27 | option("inferSchema","true") 28 | .csv("src/main/resources/applestock.csv") 29 | 30 | //weekly average of 2016 31 | 32 | val stocks2016 = stocksDF.filter("year(Date)==2016") 33 | 34 | val tumblingWindowDS = stocks2016.groupBy(window(stocks2016.col("Date"),"1 week")) 35 | .agg(avg("Close").as("weekly_average")) 36 | println("weekly average in 2016 using tumbling window is") 37 | printWindow(tumblingWindowDS,"weekly_average") 38 | 39 | 40 | val windowWithStartTime = stocks2016.groupBy(window(stocks2016.col("Date"),"1 week","1 week", "4 days")). 41 | agg(avg("Close").as("weekly_average")) 42 | println("weekly average in 2016 using sliding window is") 43 | printWindow(windowWithStartTime,"weekly_average") 44 | 45 | val filteredWindow = windowWithStartTime.filter("year(window.start)=2016") 46 | println("weekly average in 2016 after filtering is") 47 | printWindow(filteredWindow,"weekly_average") 48 | 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/examples/sparktwo/datasourcev2/MysqlWithTransaction.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.examples.sparktwo.datasourcev2.mysqlwithtransaction 2 | 3 | import org.apache.spark.sql.sources.v2._ 4 | import org.apache.spark.sql.Row 5 | import org.apache.spark.sql.types._ 6 | import org.apache.spark.sql.sources.v2.writer._ 7 | import scala.collection.JavaConverters._ 8 | import org.apache.spark.sql.SparkSession 9 | import org.apache.spark.sql.sources._ 10 | import java.util.Optional 11 | import org.apache.spark.sql.SaveMode 12 | import java.sql.{Connection, DriverManager} 13 | 14 | class DefaultSource extends DataSourceV2 with WriteSupport { 15 | 16 | def createWriter(jobId: String, 17 | schema: StructType, 18 | mode: SaveMode, 19 | options: DataSourceOptions): Optional[DataSourceWriter] = { 20 | Optional.of(new MysqlDataSourceWriter()) 21 | 22 | } 23 | } 24 | 25 | class MysqlDataSourceWriter extends DataSourceWriter { 26 | 27 | override def createWriterFactory(): DataWriterFactory[Row] = { 28 | new MysqlDataWriterFactory() 29 | } 30 | 31 | override def commit(messages: Array[WriterCommitMessage]) = {} 32 | 33 | override def abort(messages: Array[WriterCommitMessage]) = { 34 | println("abort 
is called in data source writer") 35 | } 36 | 37 | } 38 | 39 | class MysqlDataWriterFactory extends DataWriterFactory[Row] { 40 | override def createDataWriter(partitionId: Int, 41 | attemptNumber: Int): DataWriter[Row] = { 42 | new MysqlDataWriter() 43 | } 44 | } 45 | 46 | class MysqlDataWriter extends DataWriter[Row] { 47 | 48 | val url = "jdbc:mysql://localhost/test" 49 | val user = "root" 50 | val password = "abc123" 51 | val table = "userwrite" 52 | 53 | val connection = DriverManager.getConnection(url, user, password) 54 | connection.setAutoCommit(false) 55 | val statement = s"insert into $table (user) values (?)" 56 | val preparedStatement = connection.prepareStatement(statement) 57 | 58 | def write(record: Row) = { 59 | val value = record.getString(0) 60 | preparedStatement.setString(1, value) 61 | preparedStatement.executeUpdate() 62 | } 63 | 64 | def commit(): WriterCommitMessage = { 65 | connection.commit() 66 | WriteSucceeded 67 | } 68 | 69 | def abort() = { 70 | println("abort is called in data writer") 71 | } 72 | 73 | object WriteSucceeded extends WriterCommitMessage 74 | 75 | } 76 | -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/examples/sparktwo/datasourcev2/SimpleCsvDataSource.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.examples.sparktwo.datasourcev2.simplecsv 2 | 3 | import org.apache.spark.sql.sources.v2._ 4 | import org.apache.spark.sql.Row 5 | import org.apache.spark.sql.types._ 6 | import org.apache.spark.sql.sources.v2.reader._ 7 | import scala.collection.JavaConverters._ 8 | import org.apache.spark.sql.SparkSession 9 | 10 | class DefaultSource extends DataSourceV2 with ReadSupport { 11 | 12 | def createReader(options: DataSourceOptions) = { 13 | val path = options.get("path").get 14 | new SimpleCsvDataSourceReader(path) 15 | } 16 | } 17 | 18 | class SimpleCsvDataSourceReader(path: String) extends DataSourceReader { 19 | 20 | def readSchema() = { 21 | val sparkContext = SparkSession.builder.getOrCreate().sparkContext 22 | val firstLine = sparkContext.textFile(path).first() 23 | val columnNames = firstLine.split(",") 24 | val structFields = columnNames.map(value ⇒ StructField(value, StringType)) 25 | StructType(structFields) 26 | } 27 | 28 | def createDataReaderFactories = { 29 | val sparkContext = SparkSession.builder.getOrCreate().sparkContext 30 | val rdd = sparkContext.textFile(path) 31 | 32 | val factoryList = new java.util.ArrayList[DataReaderFactory[Row]] 33 | (0 to rdd.getNumPartitions - 1).foreach(value ⇒ 34 | factoryList.add(new SimpleCsvDataSourceReaderFactory(value, path))) 35 | factoryList 36 | } 37 | 38 | } 39 | 40 | class SimpleCsvDataSourceReaderFactory(partitionNumber: Int, filePath: String, hasHeader: Boolean = true) extends DataReaderFactory[Row] { 41 | 42 | def createDataReader = new SimpleCsvDataReader(partitionNumber, filePath, hasHeader) 43 | } 44 | 45 | class SimpleCsvDataReader(partitionNumber: Int, filePath: String, hasHeader: Boolean = true) extends DataReader[Row] { 46 | 47 | var iterator: Iterator[String] = null 48 | 49 | @transient 50 | def next = { 51 | if (iterator == null) { 52 | val sparkContext = SparkSession.builder.getOrCreate().sparkContext 53 | val rdd = sparkContext.textFile(filePath) 54 | val filterRDD = if (hasHeader) { 55 | val firstLine = rdd.first 56 | rdd.filter(_ != firstLine) 57 | } 58 | else rdd 59 | val partition = filterRDD.partitions(partitionNumber) 60 | iterator = 
filterRDD.iterator(partition, org.apache.spark.TaskContext.get()) 61 | } 62 | iterator.hasNext 63 | } 64 | 65 | def get = { 66 | println("calling get") 67 | val line = iterator.next() 68 | Row.fromSeq(line.split(",")) 69 | } 70 | def close() = Unit 71 | } 72 | 73 | -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/examples/sparktwo/datasourcev2/SimpleDataSource.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.examples.sparktwo.datasourcev2.simple 2 | 3 | import org.apache.spark.sql.sources.v2._ 4 | import org.apache.spark.sql.Row 5 | import org.apache.spark.sql.types._ 6 | import org.apache.spark.sql.sources.v2.reader._ 7 | import scala.collection.JavaConverters._ 8 | 9 | class DefaultSource extends DataSourceV2 with ReadSupport { 10 | 11 | def createReader(options: DataSourceOptions) = new SimpleDataSourceReader() 12 | 13 | } 14 | 15 | class SimpleDataSourceReader extends DataSourceReader { 16 | 17 | def readSchema() = StructType(Array(StructField("value", StringType))) 18 | 19 | def createDataReaderFactories = { 20 | val factoryList = new java.util.ArrayList[DataReaderFactory[Row]] 21 | factoryList.add(new SimpleDataSourceReaderFactory()) 22 | factoryList 23 | } 24 | 25 | } 26 | 27 | class SimpleDataSourceReaderFactory extends DataReaderFactory[Row] with DataReader[Row] { 28 | def createDataReader = new SimpleDataSourceReaderFactory() 29 | 30 | val values = Array("1", "2", "3", "4", "5") 31 | 32 | var index = 0 33 | 34 | def next = index < values.length 35 | 36 | def get = { 37 | val row = Row(values(index)) 38 | index = index + 1 39 | row 40 | } 41 | 42 | def close() = Unit 43 | } 44 | 45 | -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/examples/sparktwo/datasourcev2/SimpleDataSourceWithPartitionAffinity.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.examples.sparktwo.datasourcev2.partitionaffinity 2 | 3 | import org.apache.spark.sql.Row 4 | import org.apache.spark.sql.sources.v2._ 5 | import org.apache.spark.sql.sources.v2.reader._ 6 | import org.apache.spark.sql.types._ 7 | 8 | class DefaultSource extends DataSourceV2 with ReadSupport { 9 | 10 | def createReader(options: DataSourceOptions) = new SimpleDataSourceReader() 11 | 12 | } 13 | 14 | class SimpleDataSourceReader extends DataSourceReader { 15 | 16 | def readSchema() = StructType(Array(StructField("value", StringType))) 17 | 18 | def createDataReaderFactories = { 19 | val factoryList = new java.util.ArrayList[DataReaderFactory[Row]] 20 | factoryList.add(new SimpleDataSourceReaderFactory(0, 4)) 21 | factoryList.add(new SimpleDataSourceReaderFactory(5, 9)) 22 | factoryList 23 | } 24 | 25 | } 26 | 27 | class SimpleDataSourceReaderFactory(var start: Int, var end: Int) 28 | extends DataReaderFactory[Row] { 29 | def createDataReader = new SimpleDataReader(start, end) 30 | 31 | override def preferredLocations(): Array[String] = Array("sample-hostname") 32 | } 33 | 34 | class SimpleDataReader(var start: Int, end: Int) extends DataReader[Row] { 35 | 36 | val values = Array("1", "2", "3", "4", "5", "6", "7", "8", "9", "10") 37 | 38 | var index = 0 39 | 40 | def next = start <= end 41 | 42 | def get = { 43 | val row = Row(values(start)) 44 | start = start + 1 45 | row 46 | } 47 | 48 | def close() = Unit 49 | } 50 | 
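The partition-affinity source above registers two reader factories, one producing the values "1" to "5" and the other "6" to "10", and each factory advertises "sample-hostname" as a preferred location. A minimal usage sketch for it follows; the object name and the glom()-based inspection are illustrative additions rather than part of the repository, and the advisory hostname is simply ignored when no executor runs on such a host.

```scala
import org.apache.spark.sql.SparkSession

// Sketch: load the partition-affinity source defined above and inspect how the two
// reader factories split the rows across partitions. Assumes this project's compiled
// classes are on the application classpath.
object PartitionAffinityUsage {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder
      .master("local[2]")
      .appName("partition-affinity-usage")
      .getOrCreate()

    val df = spark.read
      .format("com.madhukaraphatak.examples.sparktwo.datasourcev2.partitionaffinity")
      .load()

    // glom() groups each partition's rows into an array, so the expected split
    // ("1".."5" in partition 0, "6".."10" in partition 1) becomes directly visible.
    df.rdd.glom().collect().zipWithIndex.foreach { case (rows, index) =>
      println(s"partition $index -> ${rows.map(_.getString(0)).mkString(",")}")
    }

    spark.stop()
  }
}
```

DataSourceV2Example.scala goes one step further and reads the preferred location back through reflection on the underlying RDD.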
-------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/examples/sparktwo/datasourcev2/SimpleMultiDataSource.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.examples.sparktwo.datasourcev2.simplemulti 2 | 3 | import org.apache.spark.sql.sources.v2._ 4 | import org.apache.spark.sql.Row 5 | import org.apache.spark.sql.types._ 6 | import org.apache.spark.sql.sources.v2.reader._ 7 | import scala.collection.JavaConverters._ 8 | 9 | class DefaultSource extends DataSourceV2 with ReadSupport { 10 | 11 | def createReader(options: DataSourceOptions) = new SimpleDataSourceReader() 12 | 13 | } 14 | 15 | class SimpleDataSourceReader extends DataSourceReader { 16 | 17 | def readSchema() = StructType(Array(StructField("value", StringType))) 18 | 19 | def createDataReaderFactories = { 20 | val factoryList = new java.util.ArrayList[DataReaderFactory[Row]] 21 | factoryList.add(new SimpleDataSourceReaderFactory(0, 4)) 22 | factoryList.add(new SimpleDataSourceReaderFactory(5, 9)) 23 | factoryList 24 | } 25 | 26 | } 27 | 28 | class SimpleDataSourceReaderFactory(var start: Int, var end: Int) extends DataReaderFactory[Row] with DataReader[Row] { 29 | 30 | def createDataReader = new SimpleDataSourceReaderFactory(start, end) 31 | 32 | val values = Array("1", "2", "3", "4", "5", "6", "7", "8", "9", "10") 33 | 34 | var index = 0 35 | 36 | def next = start <= end 37 | 38 | def get = { 39 | val row = Row(values(start)) 40 | start = start + 1 41 | row 42 | } 43 | 44 | def close() = Unit 45 | } 46 | 47 | -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/examples/sparktwo/datasourcev2/SimpleMysqlDataSource.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.examples.sparktwo.datasourcev2.simplemysql 2 | 3 | import org.apache.spark.sql.sources.v2._ 4 | import org.apache.spark.sql.Row 5 | import org.apache.spark.sql.types._ 6 | import org.apache.spark.sql.sources.v2.reader._ 7 | import scala.collection.JavaConverters._ 8 | import org.apache.spark.sql.SparkSession 9 | import org.apache.spark.sql.sources._ 10 | 11 | class DefaultSource extends DataSourceV2 with ReadSupport { 12 | 13 | def createReader(options: DataSourceOptions) = { 14 | new SimpleMysqlDataSourceReader() 15 | } 16 | } 17 | 18 | class SimpleMysqlDataSourceReader() 19 | extends DataSourceReader 20 | with SupportsPushDownFilters { 21 | 22 | var pushedFilters: Array[Filter] = Array[Filter]() 23 | def readSchema() = { 24 | val columnNames = Array("user") 25 | val structFields = columnNames.map(value ⇒ StructField(value, StringType)) 26 | StructType(structFields) 27 | } 28 | 29 | def pushFilters(filters: Array[Filter]) = { 30 | println("Filters " + filters.toList) 31 | pushedFilters = filters 32 | pushedFilters 33 | } 34 | 35 | def createDataReaderFactories = { 36 | val sparkContext = SparkSession.builder.getOrCreate().sparkContext 37 | 38 | val factoryList = new java.util.ArrayList[DataReaderFactory[Row]] 39 | factoryList.add(new SimpleMysqlDataSourceReaderFactory(pushedFilters)) 40 | factoryList 41 | } 42 | 43 | } 44 | 45 | class SimpleMysqlDataSourceReaderFactory(pushedFilters: Array[Filter]) 46 | extends DataReaderFactory[Row] { 47 | 48 | def createDataReader = new SimpleMysqlDataReader(pushedFilters: Array[Filter]) 49 | } 50 | 51 | class SimpleMysqlDataReader(pushedFilters: Array[Filter]) 52 | 
extends DataReader[Row] { 53 | 54 | var iterator: Iterator[Row] = null 55 | 56 | val getQuery: String = { 57 | if (pushedFilters == null || pushedFilters.isEmpty) 58 | "(select user from user)a" 59 | else { 60 | pushedFilters(1) match { 61 | case filter: EqualTo => 62 | val condition = s"${filter.attribute} = '${filter.value}'" 63 | s"(select user from user where $condition)a" 64 | case _ => "(select user from user)a" 65 | } 66 | } 67 | } 68 | 69 | def next = { 70 | if (iterator == null) { 71 | val url = "jdbc:mysql://localhost/mysql" 72 | val user = "root" 73 | val password = "abc123" 74 | 75 | val properties = new java.util.Properties() 76 | properties.setProperty("user", user) 77 | properties.setProperty("password", password) 78 | 79 | val sparkSession = SparkSession.builder.getOrCreate() 80 | val df = sparkSession.read.jdbc(url, getQuery, properties) 81 | val rdd = df.rdd 82 | val partition = rdd.partitions(0) 83 | iterator = rdd.iterator(partition, org.apache.spark.TaskContext.get()) 84 | } 85 | iterator.hasNext 86 | } 87 | 88 | def get = { 89 | iterator.next() 90 | } 91 | def close() = Unit 92 | } 93 | -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/examples/sparktwo/datasourcev2/SimpleMysqlWriter.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.examples.sparktwo.datasourcev2.simplemysqlwriter 2 | 3 | import org.apache.spark.sql.sources.v2._ 4 | import org.apache.spark.sql.Row 5 | import org.apache.spark.sql.types._ 6 | import org.apache.spark.sql.sources.v2.writer._ 7 | import scala.collection.JavaConverters._ 8 | import org.apache.spark.sql.SparkSession 9 | import org.apache.spark.sql.sources._ 10 | import java.util.Optional 11 | import org.apache.spark.sql.SaveMode 12 | import java.sql.{Connection,DriverManager} 13 | 14 | class DefaultSource extends DataSourceV2 with WriteSupport { 15 | 16 | def createWriter(jobId: String, schema: StructType, mode: SaveMode, 17 | options: DataSourceOptions): Optional[DataSourceWriter] = { 18 | Optional.of(new MysqlDataSourceWriter()) 19 | 20 | } 21 | } 22 | 23 | class MysqlDataSourceWriter extends DataSourceWriter { 24 | 25 | override def createWriterFactory(): DataWriterFactory[Row] = { 26 | new MysqlDataWriterFactory() 27 | } 28 | 29 | override def commit(messages: Array[WriterCommitMessage]) = { 30 | 31 | } 32 | 33 | override def abort(messages: Array[WriterCommitMessage]) = { 34 | 35 | } 36 | 37 | } 38 | 39 | class MysqlDataWriterFactory extends DataWriterFactory[Row] { 40 | override def createDataWriter(partitionId: Int, attemptNumber: Int): DataWriter[Row] = { 41 | new MysqlDataWriter() 42 | } 43 | } 44 | 45 | class MysqlDataWriter extends DataWriter[Row] { 46 | 47 | val url = "jdbc:mysql://localhost/test" 48 | val user = "root" 49 | val password = "abc123" 50 | val table ="userwrite" 51 | 52 | val connection = DriverManager.getConnection(url,user,password) 53 | val statement = "insert into userwrite (user) values (?)" 54 | val preparedStatement = connection.prepareStatement(statement) 55 | 56 | 57 | def write(record: Row) = { 58 | val value = record.getString(0) 59 | preparedStatement.setString(1,value) 60 | preparedStatement.executeUpdate() 61 | } 62 | 63 | def commit(): WriterCommitMessage = { 64 | WriteSucceeded 65 | } 66 | 67 | def abort() = { 68 | 69 | } 70 | 71 | object WriteSucceeded extends WriterCommitMessage 72 | 73 | } 74 | 75 | 
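Both MySQL writers above (SimpleMysqlWriter and MysqlWithTransaction) insert into a userwrite table with a single user column in the local test database, using the hard-coded root credentials. A minimal setup sketch for that table is shown below; only the connection values, table name and column name come from the writers, while the varchar length is an assumed choice, so adapt it to your local MySQL instance.

```scala
import java.sql.DriverManager

// Sketch: create the table that SimpleMysqlWriter and MysqlWithTransaction write into.
// URL, user and password mirror the values hard-coded in the writers above.
object CreateUserWriteTable {
  def main(args: Array[String]): Unit = {
    val connection = DriverManager.getConnection("jdbc:mysql://localhost/test", "root", "abc123")
    try {
      val statement = connection.createStatement()
      // Column name matches "insert into userwrite (user) values (?)"; varchar(255) is assumed.
      statement.executeUpdate("create table if not exists userwrite (user varchar(255))")
    } finally {
      connection.close()
    }
  }
}
```

The difference between the two writers is what MysqlTransactionExample.scala demonstrates: the plain writer commits every executeUpdate immediately and can leave partial rows behind when a task fails, whereas the transactional writer disables auto-commit and only commits in commit().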
-------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/examples/sparktwo/ml/MultiColumnTransformation.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.examples.sparktwo.ml 2 | 3 | import org.apache.spark.ml.Pipeline 4 | import org.apache.spark.ml.feature.{OneHotEncoderEstimator, StringIndexer} 5 | import org.apache.spark.sql.SparkSession 6 | 7 | object MultiColumnTransformation { 8 | 9 | def main(args: Array[String]): Unit = { 10 | 11 | val sparkSession = SparkSession.builder. 12 | master("local") 13 | .appName("example") 14 | .getOrCreate() 15 | 16 | 17 | val salaryDf = sparkSession.read.format("csv").option("header", "true").load("src/main/resources/adult.csv") 18 | 19 | val stringColumns = Array("workclass", "occupation", "sex") 20 | 21 | val outputColumns = stringColumns.map(_ + "_onehot") 22 | 23 | val indexers = stringColumns.map(column => { 24 | val indexer = new StringIndexer() 25 | indexer.setInputCol(column) 26 | indexer.setOutputCol(column + "_index") 27 | }) 28 | 29 | val singleOneHotEncoder = new OneHotEncoderEstimator() 30 | singleOneHotEncoder.setInputCols(stringColumns.map(_ + "_index")) 31 | singleOneHotEncoder.setOutputCols(outputColumns) 32 | 33 | val pipeline = new Pipeline() 34 | pipeline.setStages(indexers ++ Array(singleOneHotEncoder)) 35 | 36 | val outputDf = pipeline.fit(salaryDf).transform(salaryDf) 37 | 38 | outputDf.select(outputColumns.head, outputColumns.tail: _*).show() 39 | 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/examples/sparktwo/ml/ParallelCrossValidation.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.examples.sparktwo.ml 2 | 3 | import org.apache.spark.ml.Pipeline 4 | import org.apache.spark.ml.classification.LogisticRegression 5 | import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator 6 | import org.apache.spark.ml.feature.{OneHotEncoderEstimator, StringIndexer, VectorAssembler} 7 | import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder} 8 | import org.apache.spark.sql.SparkSession 9 | 10 | object ParallelCrossValidation { 11 | 12 | def main(args: Array[String]): Unit = { 13 | 14 | 15 | val sparkSession = SparkSession.builder. 
16 | master("local[*]") 17 | .appName("example") 18 | .getOrCreate() 19 | 20 | 21 | val salaryDf = sparkSession.read.format("csv") 22 | .option("header", "true") 23 | .option("inferSchema", "true") 24 | .load("src/main/resources/adult.csv") 25 | 26 | val stringColumns = Array("workclass", "occupation", "sex", "education", "martial_status", "relationship", 27 | "race", "native_country") 28 | 29 | val numericalColumns = Array("age", "fnlwgt", "capital_loss", "capital_gain") 30 | 31 | val labelColumn = "salary" 32 | val outputColumns = stringColumns.map(_ + "_onehot") 33 | 34 | val indexers = stringColumns.map(column => { 35 | val indexer = new StringIndexer() 36 | indexer.setInputCol(column) 37 | indexer.setHandleInvalid("keep") 38 | indexer.setOutputCol(column + "_index") 39 | }) 40 | 41 | val singleOneHotEncoder = new OneHotEncoderEstimator() 42 | singleOneHotEncoder.setInputCols(stringColumns.map(_ + "_index")) 43 | singleOneHotEncoder.setOutputCols(outputColumns) 44 | 45 | val vectorAssembler = new VectorAssembler() 46 | vectorAssembler.setInputCols(outputColumns ++ numericalColumns) 47 | vectorAssembler.setOutputCol("features") 48 | 49 | val labelIndexer = new StringIndexer() 50 | labelIndexer.setInputCol("salary") 51 | labelIndexer.setOutputCol("label") 52 | 53 | val logisticRegression = new LogisticRegression() 54 | 55 | 56 | val pipeline = new Pipeline() 57 | pipeline.setStages(indexers ++ Array(singleOneHotEncoder) 58 | ++ Array(vectorAssembler) ++ Array(labelIndexer) ++ Array(logisticRegression)) 59 | 60 | val paramMap = new ParamGridBuilder() 61 | .addGrid(logisticRegression.maxIter, Array(1, 2, 3)).build() 62 | 63 | 64 | val crossValidator = new CrossValidator() 65 | crossValidator.setEstimator(pipeline) 66 | crossValidator.setEvaluator(new BinaryClassificationEvaluator()) 67 | crossValidator.setEstimatorParamMaps(paramMap) 68 | crossValidator.setParallelism(3) 69 | 70 | crossValidator.fit(salaryDf) 71 | 72 | } 73 | 74 | } 75 | -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/examples/sparktwo/streaming/EventTimeExample.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.examples.sparktwo.streaming 2 | 3 | import java.sql.Timestamp 4 | 5 | import org.apache.spark.sql.SparkSession 6 | import org.apache.spark.sql.functions._ 7 | import org.apache.spark.sql.streaming.OutputMode 8 | 9 | object EventTimeExample { 10 | 11 | case class Stock(time: Timestamp, symbol: String, value: Double) 12 | 13 | def main(args: Array[String]): Unit = { 14 | val sparkSession = SparkSession.builder 15 | .master("local") 16 | .appName("example") 17 | .getOrCreate() 18 | //create stream from socket 19 | 20 | import sparkSession.implicits._ 21 | sparkSession.sparkContext.setLogLevel("ERROR") 22 | val socketStreamDs = sparkSession.readStream 23 | .format("socket") 24 | .option("host", "localhost") 25 | .option("port", 50050) 26 | .load() 27 | .as[String] 28 | 29 | // read as stock 30 | val stockDs = socketStreamDs.map(value => { 31 | val columns = value.split(",") 32 | Stock(new Timestamp(columns(0).toLong), columns(1), columns(2).toDouble) 33 | }) 34 | 35 | val windowedCount = stockDs 36 | .groupBy( 37 | window($"time", "10 seconds") 38 | ) 39 | .sum("value") 40 | 41 | 42 | val query = 43 | windowedCount.writeStream 44 | .format("console") 45 | .option("truncate", "false") 46 | .outputMode(OutputMode.Complete()) 47 | 48 | query.start().awaitTermination() 49 | } 50 | } 51 | 
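EventTimeExample above groups by a 10-second event-time window and writes in Complete output mode, so Spark retains state for every window it has ever seen and late records (such as the late-event sample in eventimedata.md) are folded in with no bound on how late they may arrive. Bounding that state with a watermark is the subject of the repository's WaterMarkExample.scala; the sketch below is an illustrative variant of the pipeline above rather than the contents of that file, and the 5-second watermark threshold is an assumed value.

```scala
import java.sql.Timestamp

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._
import org.apache.spark.sql.streaming.OutputMode

// Sketch: the event-time aggregation from EventTimeExample with a watermark, so state
// for windows older than (max event time - 5 seconds) can be dropped. Port and record
// layout match the example above.
object EventTimeWithWatermarkSketch {

  case class Stock(time: Timestamp, symbol: String, value: Double)

  def main(args: Array[String]): Unit = {
    val sparkSession = SparkSession.builder
      .master("local")
      .appName("example")
      .getOrCreate()
    sparkSession.sparkContext.setLogLevel("ERROR")

    import sparkSession.implicits._
    val socketStreamDs = sparkSession.readStream
      .format("socket")
      .option("host", "localhost")
      .option("port", 50050)
      .load()
      .as[String]

    val stockDs = socketStreamDs.map { value =>
      val columns = value.split(",")
      Stock(new Timestamp(columns(0).toLong), columns(1), columns(2).toDouble)
    }

    // Records arriving more than 5 seconds behind the latest observed event time are
    // treated as too late once their window has been finalized.
    val windowedSum = stockDs
      .withWatermark("time", "5 seconds")
      .groupBy(window($"time", "10 seconds"))
      .sum("value")

    val query = windowedSum.writeStream
      .format("console")
      .option("truncate", "false")
      .outputMode(OutputMode.Update())
      .start()

    query.awaitTermination()
  }
}
```

Update output mode is used here because watermark-based state cleanup does not apply in Complete mode, which must keep every window.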
-------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/examples/sparktwo/streaming/FileStreamExample.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.examples.sparktwo.streaming 2 | 3 | import java.util.concurrent.TimeUnit 4 | 5 | import org.apache.spark.sql.SparkSession 6 | import org.apache.spark.sql.streaming.{OutputMode, Trigger} 7 | import org.apache.spark.sql.types.{StringType, StructField, StructType} 8 | 9 | /** 10 | * Created by madhu on 24/07/17. 11 | */ 12 | object FileStreamExample { 13 | 14 | def main(args: Array[String]): Unit = { 15 | 16 | val sparkSession = SparkSession.builder 17 | .master("local") 18 | .appName("example") 19 | .getOrCreate() 20 | 21 | val schema = StructType( 22 | Array(StructField("transactionId", StringType), 23 | StructField("customerId", StringType), 24 | StructField("itemId", StringType), 25 | StructField("amountPaid", StringType))) 26 | 27 | //create stream from folder 28 | val fileStreamDf = sparkSession.readStream 29 | .option("header", "true") 30 | .schema(schema) 31 | .csv("/tmp/input") 32 | 33 | val query = fileStreamDf.writeStream 34 | .format("console") 35 | .outputMode(OutputMode.Append()).start() 36 | 37 | query.awaitTermination() 38 | 39 | } 40 | 41 | } 42 | -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/examples/sparktwo/streaming/IngestionTimeWindow.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.examples.sparktwo.streaming 2 | 3 | import java.sql.Timestamp 4 | 5 | import org.apache.spark.sql.SparkSession 6 | import org.apache.spark.sql.functions._ 7 | import org.apache.spark.sql.streaming.OutputMode 8 | 9 | object IngestionTimeWindow { 10 | 11 | def main(args: Array[String]): Unit = { 12 | val sparkSession = SparkSession.builder 13 | .master("local") 14 | .appName("example") 15 | .getOrCreate() 16 | //create stream from socket 17 | sparkSession.sparkContext.setLogLevel("ERROR") 18 | val socketStreamDf = sparkSession.readStream 19 | .format("socket") 20 | .option("host", "localhost") 21 | .option("port", 50050) 22 | .option("includeTimestamp", true) 23 | .load() 24 | import sparkSession.implicits._ 25 | val socketDs = socketStreamDf.as[(String, Timestamp)] 26 | val wordsDs = socketDs 27 | .flatMap(line => line._1.split(" ").map(word => { 28 | Thread.sleep(15000) 29 | (word, line._2) 30 | })) 31 | .toDF("word", "timestamp") 32 | 33 | val windowedCount = wordsDs 34 | .groupBy( 35 | window($"timestamp", "15 seconds") 36 | ) 37 | .count() 38 | .orderBy("window") 39 | 40 | 41 | val query = 42 | windowedCount.writeStream 43 | .format("console").option("truncate","false") 44 | .outputMode(OutputMode.Complete()).start() 45 | 46 | 47 | query.awaitTermination() 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/examples/sparktwo/streaming/ProcessingTimeWindow.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.examples.sparktwo.streaming 2 | import java.sql.Timestamp 3 | 4 | import org.apache.spark.sql.SparkSession 5 | import org.apache.spark.sql.functions._ 6 | 7 | import org.apache.spark.sql.streaming.OutputMode 8 | 9 | object ProcessingTimeWindow { 10 | 11 | def main(args: Array[String]): Unit = { 12 | val sparkSession = 
SparkSession.builder 13 | .master("local") 14 | .appName("example") 15 | .getOrCreate() 16 | //create stream from socket 17 | sparkSession.sparkContext.setLogLevel("ERROR") 18 | val socketStreamDf = sparkSession.readStream 19 | .format("socket") 20 | .option("host", "localhost") 21 | .option("port", 50050) 22 | .load() 23 | val currentTimeDf = socketStreamDf.withColumn("processingTime",current_timestamp()) 24 | import sparkSession.implicits._ 25 | val socketDs = currentTimeDf.as[(String, Timestamp)] 26 | val wordsDs = socketDs 27 | .flatMap(line => line._1.split(" ").map(word => (word, line._2))) 28 | .toDF("word", "processingTime") 29 | 30 | val windowedCount = wordsDs 31 | .groupBy( 32 | window($"processingTime", "15 seconds") 33 | ) 34 | .count() 35 | .orderBy("window") 36 | 37 | val query = 38 | windowedCount.writeStream 39 | .format("console") 40 | .option("truncate","false") 41 | .outputMode(OutputMode.Complete()) 42 | 43 | query.start().awaitTermination() 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/examples/sparktwo/streaming/RecoverableAggregation.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.examples.sparktwo.streaming 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.streaming.OutputMode 5 | import org.apache.spark.sql.types.{ 6 | DoubleType, 7 | StringType, 8 | StructField, 9 | StructType 10 | } 11 | 12 | object RecoverableAggregation { 13 | 14 | def main(args: Array[String]): Unit = { 15 | val sparkSession = SparkSession.builder 16 | .master("local") 17 | .appName("example") 18 | .getOrCreate() 19 | 20 | val schema = StructType( 21 | Array(StructField("transactionId", StringType), 22 | StructField("customerId", StringType), 23 | StructField("itemId", StringType), 24 | StructField("amountPaid", DoubleType))) 25 | 26 | //create stream from folder 27 | val fileStreamDf = sparkSession.readStream 28 | .option("header", "true") 29 | .schema(schema) 30 | .csv("/tmp/input") 31 | 32 | val countDs = fileStreamDf.groupBy("customerId").sum("amountPaid") 33 | val query = 34 | countDs.writeStream 35 | .format("console") 36 | .option("checkpointLocation", "/tmp/checkpoint") 37 | .outputMode(OutputMode.Complete()) 38 | 39 | query.start().awaitTermination() 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/examples/sparktwo/streaming/SessionisationExample.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.examples.sparktwo.streaming 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.streaming.{ GroupState, GroupStateTimeout } 5 | 6 | import scala.util.Try 7 | 8 | case class Session(sessionId: String, value: Double, endSignal: Option[String]) 9 | 10 | case class SessionInfo( 11 | totalSum: Double) 12 | 13 | case class SessionUpdate( 14 | id: String, 15 | totalSum: Double, 16 | expired: Boolean) 17 | 18 | object SessionisationExample { 19 | 20 | def main(args: Array[String]): Unit = { 21 | val sparkSession = SparkSession.builder 22 | .master("local") 23 | .appName("example") 24 | .getOrCreate() 25 | //create stream from socket 26 | sparkSession.sparkContext.setLogLevel("ERROR") 27 | val socketStreamDf = sparkSession.readStream 28 | .format("socket") 29 | .option("host", "localhost") 30 | .option("port", 50050) 31 | 
.load() 32 | import sparkSession.implicits._ 33 | val socketDs = socketStreamDf.as[String] 34 | 35 | // events 36 | val events = socketDs.map(line ⇒ { 37 | val columns = line.split(",") 38 | val endSignal = Try(Some(columns(2))).getOrElse(None) 39 | Session(columns(0), columns(1).toDouble, endSignal) 40 | }) 41 | 42 | val sessionUpdates = events.groupByKey(_.sessionId) 43 | .mapGroupsWithState[SessionInfo, SessionUpdate](GroupStateTimeout.NoTimeout()) { 44 | case (sessionId: String, eventsIter: Iterator[Session], state: GroupState[SessionInfo]) ⇒ 45 | val events = eventsIter.toSeq 46 | val updatedSession = if (state.exists) { 47 | val existingState = state.get 48 | val updatedEvents = SessionInfo(existingState.totalSum + events.map(event ⇒ event.value).reduce(_ + _)) 49 | updatedEvents 50 | } 51 | else { 52 | SessionInfo(events.map(event => event.value).reduce(_+_)) 53 | } 54 | state.update(updatedSession) 55 | //check did we get end signal or not 56 | val isEndSignal = events.filter(value ⇒ value.endSignal.isDefined).length > 0 57 | if (isEndSignal) { 58 | state.remove() 59 | SessionUpdate(sessionId, updatedSession.totalSum, true) 60 | } 61 | else { 62 | SessionUpdate(sessionId, updatedSession.totalSum, false) 63 | } 64 | } 65 | 66 | val query = sessionUpdates 67 | .writeStream 68 | .outputMode("update") 69 | .format("console") 70 | .start() 71 | 72 | query.awaitTermination() 73 | 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/examples/sparktwo/streaming/SocketMiniBatchExample.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.examples.sparktwo.streaming 2 | 3 | import java.util.concurrent.TimeUnit 4 | 5 | import org.apache.spark.sql.SparkSession 6 | import org.apache.spark.sql.streaming.{OutputMode, Trigger} 7 | 8 | /** 9 | * Created by madhu on 24/07/17. 10 | */ 11 | object SocketMiniBatchExample { 12 | 13 | def main(args: Array[String]): Unit = { 14 | 15 | val sparkSession = SparkSession.builder. 16 | master("local") 17 | .appName("example") 18 | .getOrCreate() 19 | 20 | //create stream from socket 21 | 22 | val socketStreamDf = sparkSession.readStream. 23 | format("socket") 24 | .option("host", "localhost") 25 | .option("port", 50050).load() 26 | 27 | val query = socketStreamDf.writeStream.format("console").outputMode(OutputMode.Append()).trigger( 28 | Trigger.ProcessingTime(10, TimeUnit.SECONDS) 29 | ).start() 30 | 31 | query.awaitTermination() 32 | } 33 | 34 | } 35 | -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/examples/sparktwo/streaming/SocketReadExample.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.examples.sparktwo.streaming 2 | 3 | import org.apache.spark.sql.streaming.OutputMode 4 | import org.apache.spark.sql.{SaveMode, SparkSession} 5 | 6 | /** 7 | * Created by madhu on 24/07/17. 
8 | */ 9 | object SocketReadExample { 10 | 11 | def main(args: Array[String]): Unit = { 12 | 13 | val sparkSession = SparkSession.builder 14 | .master("local") 15 | .appName("example") 16 | .getOrCreate() 17 | 18 | //create stream from socket 19 | 20 | val socketStreamDf = sparkSession.readStream 21 | .format("socket") 22 | .option("host", "localhost") 23 | .option("port", 50050) 24 | .load() 25 | 26 | val consoleDataFrameWriter = socketStreamDf.writeStream 27 | .format("console") 28 | .outputMode(OutputMode.Append()) 29 | 30 | val query = consoleDataFrameWriter.start() 31 | 32 | query.awaitTermination() 33 | 34 | } 35 | 36 | } 37 | -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/examples/sparktwo/streaming/SocketWordCount.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.examples.sparktwo.streaming 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.streaming.OutputMode 5 | 6 | /** 7 | * Created by madhu on 24/07/17. 8 | */ 9 | object SocketWordCount { 10 | 11 | def main(args: Array[String]): Unit = { 12 | val sparkSession = SparkSession.builder 13 | .master("local") 14 | .appName("example") 15 | .getOrCreate() 16 | 17 | //create stream from socket 18 | 19 | val socketStreamDf = sparkSession.readStream 20 | .format("socket") 21 | .option("host", "localhost") 22 | .option("port", 50050) 23 | .load() 24 | 25 | import sparkSession.implicits._ 26 | val socketDs = socketStreamDf.as[String] 27 | val wordsDs = socketDs.flatMap(value => value.split(" ")) 28 | val countDs = wordsDs.groupBy("value").count() 29 | 30 | val query = 31 | countDs.writeStream.format("console").outputMode(OutputMode.Complete()) 32 | 33 | query.start().awaitTermination() 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/examples/sparktwo/streaming/StatelessWordCount.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.examples.sparktwo.streaming 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.streaming.{ OutputMode, Trigger } 5 | 6 | /** 7 | * Created by madhu on 24/07/17. 8 | */ 9 | object StatelessWordCount { 10 | 11 | def main(args: Array[String]): Unit = { 12 | val sparkSession = SparkSession.builder 13 | .master("local") 14 | .appName("example") 15 | .getOrCreate() 16 | 17 | //create stream from socket 18 | 19 | val socketStreamDf = sparkSession.readStream 20 | .format("socket") 21 | .option("host", "localhost") 22 | .option("port", 50050) 23 | .load() 24 | 25 | import sparkSession.implicits._ 26 | val socketDs = socketStreamDf.as[String] 27 | val wordsDs = socketDs.flatMap(value ⇒ value.split(" ")) 28 | 29 | val countDs = wordsDs.groupByKey(value => value).flatMapGroups{ 30 | case (value, iter) ⇒ Iterator((value, iter.length)) 31 | }.toDF("value", "count") 32 | 33 | val query = 34 | countDs.writeStream.format("console").outputMode(OutputMode.Append()). 
35 | trigger(Trigger.ProcessingTime("5 seconds")) 36 | 37 | query.start().awaitTermination() 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/examples/sparktwo/streaming/StreamJoin.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.examples.sparktwo.streaming 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.streaming.OutputMode 5 | 6 | /** 7 | * Created by madhu on 24/07/17. 8 | */ 9 | object StreamJoin { 10 | 11 | case class Sales( 12 | transactionId: String, 13 | customerId: String, 14 | itemId: String, 15 | amountPaid: Double) 16 | case class Customer(customerId: String, customerName: String) 17 | def main(args: Array[String]): Unit = { 18 | val sparkSession = SparkSession.builder 19 | .master("local") 20 | .appName("example") 21 | .getOrCreate() 22 | 23 | //create stream from socket 24 | val socketStreamDf = sparkSession.readStream 25 | .format("socket") 26 | .option("host", "localhost") 27 | .option("port", 50050) 28 | .load() 29 | 30 | import sparkSession.implicits._ 31 | //take customer data as static df 32 | val customerDs = sparkSession.read 33 | .format("csv") 34 | .option("header", true) 35 | .load("src/main/resources/customers.csv") 36 | .as[Customer] 37 | 38 | import sparkSession.implicits._ 39 | val dataDf = socketStreamDf.as[String].flatMap(value ⇒ value.split(" ")) 40 | val salesDs = dataDf 41 | .as[String] 42 | .map(value ⇒ { 43 | val values = value.split(",") 44 | Sales(values(0), values(1), values(2), values(3).toDouble) 45 | }) 46 | 47 | val joinedDs = salesDs 48 | .join(customerDs, "customerId") 49 | //create sales schema 50 | val query = 51 | joinedDs.writeStream.format("console").outputMode(OutputMode.Append()) 52 | 53 | query.start().awaitTermination() 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/examples/sparktwo/streaming/WaterMarkExample.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.examples.sparktwo.streaming 2 | 3 | import java.sql.Timestamp 4 | 5 | import org.apache.spark.sql.SparkSession 6 | import org.apache.spark.sql.functions._ 7 | import org.apache.spark.sql.streaming.OutputMode 8 | 9 | object WaterMarkExample { 10 | 11 | case class Stock(time: Timestamp, symbol: String, value: Double) 12 | 13 | def main(args: Array[String]): Unit = { 14 | val sparkSession = SparkSession.builder 15 | .master("local") 16 | .appName("example") 17 | .getOrCreate() 18 | //create stream from socket 19 | 20 | import sparkSession.implicits._ 21 | sparkSession.sparkContext.setLogLevel("ERROR") 22 | val socketStreamDs = sparkSession.readStream 23 | .format("socket") 24 | .option("host", "localhost") 25 | .option("port", 50050) 26 | .load() 27 | .as[String] 28 | 29 | // read as stock 30 | val stockDs = socketStreamDs.map(value => { 31 | val columns = value.split(",") 32 | Stock(new Timestamp(columns(0).toLong), columns(1), columns(2).toDouble) 33 | }) 34 | 35 | val windowedCount = stockDs 36 | .withWatermark("time", "500 milliseconds") 37 | .groupBy( 38 | window($"time", "10 seconds") 39 | ) 40 | .sum("value") 41 | 42 | val query = 43 | windowedCount.writeStream 44 | .format("console") 45 | .option("truncate", "false") 46 | .outputMode(OutputMode.Update()) 47 | 48 | query.start().awaitTermination() 49 | } 50 | } 51 | 
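Editor's note (sketch, not a file in the repository): all of the socket-based examples above read from localhost:50050. A minimal feeder for EventTimeExample and WaterMarkExample, which both expect lines of the form "epochMillis,symbol,value", could look like the code below; the symbol and values are purely illustrative. Start the feeder first and then the Spark job, since the socket source connects to this port as a client.

import java.io.PrintWriter
import java.net.ServerSocket

object StockSocketFeeder {
  def main(args: Array[String]): Unit = {
    val server = new ServerSocket(50050)
    val client = server.accept() // wait for the Spark socket source to connect
    val out = new PrintWriter(client.getOutputStream, true)
    val now = System.currentTimeMillis()
    Seq(
      s"$now,aapl,500.0",
      s"${now + 5000},aapl,600.0",
      s"${now - 11000},aapl,400.0" // a late event, interesting for the watermark case
    ).foreach(line => out.println(line))
    Thread.sleep(60000) // keep the connection open while the query runs
    client.close()
    server.close()
  }
}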
-------------------------------------------------------------------------------- /todo.txt: -------------------------------------------------------------------------------- 1 | * Recoverable State 2 | * Using Window API 3 | * Processing Time 4 | * Event Time 5 | * Custom Trigger - Sessionisation 6 | * Different kinds of sinks 7 | * Kafka integration (with offsets) 8 | * Writing custom sources --------------------------------------------------------------------------------