├── .gitignore
├── README.md
├── build.sbt
├── project
│   ├── build.properties
│   └── plugins.sbt
├── src
│   └── main
│       ├── resources
│       │   ├── adult.csv
│       │   ├── applestock.csv
│       │   ├── customers.csv
│       │   ├── data.txt
│       │   ├── eventimedata.md
│       │   └── sales.csv
│       └── scala
│           └── com
│               └── madhukaraphatak
│                   └── examples
│                       └── sparktwo
│                           ├── CatalogExample.scala
│                           ├── CustomOptimizationExample.scala
│                           ├── DataSetWordCount.scala
│                           ├── DataSourceV2Example.scala
│                           ├── DatasetVsDataFrame.scala
│                           ├── MysqlTransactionExample.scala
│                           ├── RDDToDataSet.scala
│                           ├── SparkSessionExample.scala
│                           ├── TimeWindowExample.scala
│                           ├── datasourcev2
│                           │   ├── MysqlWithTransaction.scala
│                           │   ├── SimpleCsvDataSource.scala
│                           │   ├── SimpleDataSource.scala
│                           │   ├── SimpleDataSourceWithPartitionAffinity.scala
│                           │   ├── SimpleMultiDataSource.scala
│                           │   ├── SimpleMysqlDataSource.scala
│                           │   └── SimpleMysqlWriter.scala
│                           ├── ml
│                           │   ├── MultiColumnTransformation.scala
│                           │   └── ParallelCrossValidation.scala
│                           └── streaming
│                               ├── EventTimeExample.scala
│                               ├── FileStreamExample.scala
│                               ├── IngestionTimeWindow.scala
│                               ├── ProcessingTimeWindow.scala
│                               ├── RecoverableAggregation.scala
│                               ├── SessionisationExample.scala
│                               ├── SocketMiniBatchExample.scala
│                               ├── SocketReadExample.scala
│                               ├── SocketWordCount.scala
│                               ├── StatelessWordCount.scala
│                               ├── StreamJoin.scala
│                               └── WaterMarkExample.scala
└── todo.txt

/.gitignore:
--------------------------------------------------------------------------------
1 | *.iml
2 | .idea/
3 | *.iml
4 | target/
5 | project/target
6 | dependency-reduced-pom.xml
7 | *.pdf
8 | *.swp
9 | *.sw*
10 | metastore_db
11 | *.log
12 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | Examples for Spark 2.0 release.
2 | 3 | # Build 4 | 5 | sbt clean package -------------------------------------------------------------------------------- /build.sbt: -------------------------------------------------------------------------------- 1 | name := "SparkTwoExperiments" 2 | 3 | version := "1.0" 4 | 5 | scalaVersion := "2.11.8" 6 | 7 | val sparkVersion = "2.3.0" 8 | 9 | 10 | resolvers ++= Seq( 11 | "apache-snapshots" at "http://repository.apache.org/snapshots/" 12 | ) 13 | 14 | libraryDependencies ++= Seq( 15 | "org.apache.spark" %% "spark-core" % sparkVersion, 16 | "org.apache.spark" %% "spark-sql" % sparkVersion, 17 | "org.apache.spark" %% "spark-mllib" % sparkVersion, 18 | "org.apache.spark" %% "spark-streaming" % sparkVersion, 19 | "org.apache.spark" %% "spark-hive" % sparkVersion, 20 | "mysql" % "mysql-connector-java" % "5.1.6" 21 | ) 22 | 23 | -------------------------------------------------------------------------------- /project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version = 0.13.8 -------------------------------------------------------------------------------- /project/plugins.sbt: -------------------------------------------------------------------------------- 1 | logLevel := Level.Warn -------------------------------------------------------------------------------- /src/main/resources/adult.csv: -------------------------------------------------------------------------------- 1 | age,workclass,fnlwgt,education,education_num,martial_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,salary 2 | 39, State-gov, 77516, Bachelors, 13, Never-married, Adm-clerical, Not-in-family, White, Male, 2174, 0, 40, United-States, <=50K 3 | 50, Self-emp-not-inc, 83311, Bachelors, 13, Married-civ-spouse, Exec-managerial, Husband, White, Male, 0, 0, 13, United-States, <=50K 4 | 38, Private, 215646, HS-grad, 9, Divorced, Handlers-cleaners, Not-in-family, White, Male, 0, 0, 40, United-States, <=50K 5 | 53, Private, 234721, 11th, 7, Married-civ-spouse, Handlers-cleaners, Husband, Black, Male, 0, 0, 40, United-States, <=50K 6 | 28, Private, 338409, Bachelors, 13, Married-civ-spouse, Prof-specialty, Wife, Black, Female, 0, 0, 40, Cuba, <=50K 7 | 37, Private, 284582, Masters, 14, Married-civ-spouse, Exec-managerial, Wife, White, Female, 0, 0, 40, United-States, <=50K 8 | 49, Private, 160187, 9th, 5, Married-spouse-absent, Other-service, Not-in-family, Black, Female, 0, 0, 16, Jamaica, <=50K 9 | 52, Self-emp-not-inc, 209642, HS-grad, 9, Married-civ-spouse, Exec-managerial, Husband, White, Male, 0, 0, 45, United-States, >50K 10 | 31, Private, 45781, Masters, 14, Never-married, Prof-specialty, Not-in-family, White, Female, 14084, 0, 50, United-States, >50K 11 | 42, Private, 159449, Bachelors, 13, Married-civ-spouse, Exec-managerial, Husband, White, Male, 5178, 0, 40, United-States, >50K 12 | 37, Private, 280464, Some-college, 10, Married-civ-spouse, Exec-managerial, Husband, Black, Male, 0, 0, 80, United-States, >50K 13 | 30, State-gov, 141297, Bachelors, 13, Married-civ-spouse, Prof-specialty, Husband, Asian-Pac-Islander, Male, 0, 0, 40, India, >50K 14 | 23, Private, 122272, Bachelors, 13, Never-married, Adm-clerical, Own-child, White, Female, 0, 0, 30, United-States, <=50K 15 | 32, Private, 205019, Assoc-acdm, 12, Never-married, Sales, Not-in-family, Black, Male, 0, 0, 50, United-States, <=50K 16 | 40, Private, 121772, Assoc-voc, 11, Married-civ-spouse, Craft-repair, Husband, Asian-Pac-Islander, Male, 0, 0, 40, ?, 
>50K 17 | 34, Private, 245487, 7th-8th, 4, Married-civ-spouse, Transport-moving, Husband, Amer-Indian-Eskimo, Male, 0, 0, 45, Mexico, <=50K 18 | 25, Self-emp-not-inc, 176756, HS-grad, 9, Never-married, Farming-fishing, Own-child, White, Male, 0, 0, 35, United-States, <=50K 19 | 32, Private, 186824, HS-grad, 9, Never-married, Machine-op-inspct, Unmarried, White, Male, 0, 0, 40, United-States, <=50K 20 | 38, Private, 28887, 11th, 7, Married-civ-spouse, Sales, Husband, White, Male, 0, 0, 50, United-States, <=50K 21 | 43, Self-emp-not-inc, 292175, Masters, 14, Divorced, Exec-managerial, Unmarried, White, Female, 0, 0, 45, United-States, >50K 22 | 40, Private, 193524, Doctorate, 16, Married-civ-spouse, Prof-specialty, Husband, White, Male, 0, 0, 60, United-States, >50K 23 | 54, Private, 302146, HS-grad, 9, Separated, Other-service, Unmarried, Black, Female, 0, 0, 20, United-States, <=50K 24 | 35, Federal-gov, 76845, 9th, 5, Married-civ-spouse, Farming-fishing, Husband, Black, Male, 0, 0, 40, United-States, <=50K 25 | 43, Private, 117037, 11th, 7, Married-civ-spouse, Transport-moving, Husband, White, Male, 0, 2042, 40, United-States, <=50K 26 | 59, Private, 109015, HS-grad, 9, Divorced, Tech-support, Unmarried, White, Female, 0, 0, 40, United-States, <=50K 27 | 56, Local-gov, 216851, Bachelors, 13, Married-civ-spouse, Tech-support, Husband, White, Male, 0, 0, 40, United-States, >50K 28 | 19, Private, 168294, HS-grad, 9, Never-married, Craft-repair, Own-child, White, Male, 0, 0, 40, United-States, <=50K 29 | 54, ?, 180211, Some-college, 10, Married-civ-spouse, ?, Husband, Asian-Pac-Islander, Male, 0, 0, 60, South, >50K 30 | 39, Private, 367260, HS-grad, 9, Divorced, Exec-managerial, Not-in-family, White, Male, 0, 0, 80, United-States, <=50K 31 | 49, Private, 193366, HS-grad, 9, Married-civ-spouse, Craft-repair, Husband, White, Male, 0, 0, 40, United-States, <=50K 32 | 23, Local-gov, 190709, Assoc-acdm, 12, Never-married, Protective-serv, Not-in-family, White, Male, 0, 0, 52, United-States, <=50K 33 | 20, Private, 266015, Some-college, 10, Never-married, Sales, Own-child, Black, Male, 0, 0, 44, United-States, <=50K 34 | 45, Private, 386940, Bachelors, 13, Divorced, Exec-managerial, Own-child, White, Male, 0, 1408, 40, United-States, <=50K 35 | 30, Federal-gov, 59951, Some-college, 10, Married-civ-spouse, Adm-clerical, Own-child, White, Male, 0, 0, 40, United-States, <=50K 36 | 22, State-gov, 311512, Some-college, 10, Married-civ-spouse, Other-service, Husband, Black, Male, 0, 0, 15, United-States, <=50K 37 | 48, Private, 242406, 11th, 7, Never-married, Machine-op-inspct, Unmarried, White, Male, 0, 0, 40, Puerto-Rico, <=50K 38 | 21, Private, 197200, Some-college, 10, Never-married, Machine-op-inspct, Own-child, White, Male, 0, 0, 40, United-States, <=50K 39 | 19, Private, 544091, HS-grad, 9, Married-AF-spouse, Adm-clerical, Wife, White, Female, 0, 0, 25, United-States, <=50K 40 | 31, Private, 84154, Some-college, 10, Married-civ-spouse, Sales, Husband, White, Male, 0, 0, 38, ?, >50K 41 | 48, Self-emp-not-inc, 265477, Assoc-acdm, 12, Married-civ-spouse, Prof-specialty, Husband, White, Male, 0, 0, 40, United-States, <=50K 42 | 31, Private, 507875, 9th, 5, Married-civ-spouse, Machine-op-inspct, Husband, White, Male, 0, 0, 43, United-States, <=50K 43 | 53, Self-emp-not-inc, 88506, Bachelors, 13, Married-civ-spouse, Prof-specialty, Husband, White, Male, 0, 0, 40, United-States, <=50K 44 | 24, Private, 172987, Bachelors, 13, Married-civ-spouse, Tech-support, Husband, White, Male, 0, 0, 50, United-States, 
<=50K 45 | 49, Private, 94638, HS-grad, 9, Separated, Adm-clerical, Unmarried, White, Female, 0, 0, 40, United-States, <=50K 46 | 25, Private, 289980, HS-grad, 9, Never-married, Handlers-cleaners, Not-in-family, White, Male, 0, 0, 35, United-States, <=50K 47 | 57, Federal-gov, 337895, Bachelors, 13, Married-civ-spouse, Prof-specialty, Husband, Black, Male, 0, 0, 40, United-States, >50K 48 | 53, Private, 144361, HS-grad, 9, Married-civ-spouse, Machine-op-inspct, Husband, White, Male, 0, 0, 38, United-States, <=50K 49 | 44, Private, 128354, Masters, 14, Divorced, Exec-managerial, Unmarried, White, Female, 0, 0, 40, United-States, <=50K 50 | 41, State-gov, 101603, Assoc-voc, 11, Married-civ-spouse, Craft-repair, Husband, White, Male, 0, 0, 40, United-States, <=50K 51 | 29, Private, 271466, Assoc-voc, 11, Never-married, Prof-specialty, Not-in-family, White, Male, 0, 0, 43, United-States, <=50K 52 | 25, Private, 32275, Some-college, 10, Married-civ-spouse, Exec-managerial, Wife, Other, Female, 0, 0, 40, United-States, <=50K 53 | 18, Private, 226956, HS-grad, 9, Never-married, Other-service, Own-child, White, Female, 0, 0, 30, ?, <=50K 54 | 47, Private, 51835, Prof-school, 15, Married-civ-spouse, Prof-specialty, Wife, White, Female, 0, 1902, 60, Honduras, >50K 55 | 50, Federal-gov, 251585, Bachelors, 13, Divorced, Exec-managerial, Not-in-family, White, Male, 0, 0, 55, United-States, >50K 56 | 47, Self-emp-inc, 109832, HS-grad, 9, Divorced, Exec-managerial, Not-in-family, White, Male, 0, 0, 60, United-States, <=50K 57 | 43, Private, 237993, Some-college, 10, Married-civ-spouse, Tech-support, Husband, White, Male, 0, 0, 40, United-States, >50K 58 | 46, Private, 216666, 5th-6th, 3, Married-civ-spouse, Machine-op-inspct, Husband, White, Male, 0, 0, 40, Mexico, <=50K 59 | 35, Private, 56352, Assoc-voc, 11, Married-civ-spouse, Other-service, Husband, White, Male, 0, 0, 40, Puerto-Rico, <=50K 60 | 41, Private, 147372, HS-grad, 9, Married-civ-spouse, Adm-clerical, Husband, White, Male, 0, 0, 48, United-States, <=50K 61 | 30, Private, 188146, HS-grad, 9, Married-civ-spouse, Machine-op-inspct, Husband, White, Male, 5013, 0, 40, United-States, <=50K 62 | 30, Private, 59496, Bachelors, 13, Married-civ-spouse, Sales, Husband, White, Male, 2407, 0, 40, United-States, <=50K 63 | 32, ?, 293936, 7th-8th, 4, Married-spouse-absent, ?, Not-in-family, White, Male, 0, 0, 40, ?, <=50K 64 | 48, Private, 149640, HS-grad, 9, Married-civ-spouse, Transport-moving, Husband, White, Male, 0, 0, 40, United-States, <=50K 65 | 42, Private, 116632, Doctorate, 16, Married-civ-spouse, Prof-specialty, Husband, White, Male, 0, 0, 45, United-States, >50K 66 | 29, Private, 105598, Some-college, 10, Divorced, Tech-support, Not-in-family, White, Male, 0, 0, 58, United-States, <=50K 67 | 36, Private, 155537, HS-grad, 9, Married-civ-spouse, Craft-repair, Husband, White, Male, 0, 0, 40, United-States, <=50K 68 | 28, Private, 183175, Some-college, 10, Divorced, Adm-clerical, Not-in-family, White, Female, 0, 0, 40, United-States, <=50K 69 | 53, Private, 169846, HS-grad, 9, Married-civ-spouse, Adm-clerical, Wife, White, Female, 0, 0, 40, United-States, >50K 70 | 49, Self-emp-inc, 191681, Some-college, 10, Married-civ-spouse, Exec-managerial, Husband, White, Male, 0, 0, 50, United-States, >50K 71 | 25, ?, 200681, Some-college, 10, Never-married, ?, Own-child, White, Male, 0, 0, 40, United-States, <=50K 72 | 19, Private, 101509, Some-college, 10, Never-married, Prof-specialty, Own-child, White, Male, 0, 0, 32, United-States, <=50K 73 | 31, 
Private, 309974, Bachelors, 13, Separated, Sales, Own-child, Black, Female, 0, 0, 40, United-States, <=50K 74 | 29, Self-emp-not-inc, 162298, Bachelors, 13, Married-civ-spouse, Sales, Husband, White, Male, 0, 0, 70, United-States, >50K 75 | 23, Private, 211678, Some-college, 10, Never-married, Machine-op-inspct, Not-in-family, White, Male, 0, 0, 40, United-States, <=50K 76 | 79, Private, 124744, Some-college, 10, Married-civ-spouse, Prof-specialty, Other-relative, White, Male, 0, 0, 20, United-States, <=50K 77 | 27, Private, 213921, HS-grad, 9, Never-married, Other-service, Own-child, White, Male, 0, 0, 40, Mexico, <=50K 78 | 40, Private, 32214, Assoc-acdm, 12, Married-civ-spouse, Adm-clerical, Husband, White, Male, 0, 0, 40, United-States, <=50K 79 | 67, ?, 212759, 10th, 6, Married-civ-spouse, ?, Husband, White, Male, 0, 0, 2, United-States, <=50K 80 | 18, Private, 309634, 11th, 7, Never-married, Other-service, Own-child, White, Female, 0, 0, 22, United-States, <=50K 81 | 31, Local-gov, 125927, 7th-8th, 4, Married-civ-spouse, Farming-fishing, Husband, White, Male, 0, 0, 40, United-States, <=50K 82 | 18, Private, 446839, HS-grad, 9, Never-married, Sales, Not-in-family, White, Male, 0, 0, 30, United-States, <=50K 83 | 52, Private, 276515, Bachelors, 13, Married-civ-spouse, Other-service, Husband, White, Male, 0, 0, 40, Cuba, <=50K 84 | 46, Private, 51618, HS-grad, 9, Married-civ-spouse, Other-service, Wife, White, Female, 0, 0, 40, United-States, <=50K 85 | 59, Private, 159937, HS-grad, 9, Married-civ-spouse, Sales, Husband, White, Male, 0, 0, 48, United-States, <=50K 86 | 44, Private, 343591, HS-grad, 9, Divorced, Craft-repair, Not-in-family, White, Female, 14344, 0, 40, United-States, >50K 87 | 53, Private, 346253, HS-grad, 9, Divorced, Sales, Own-child, White, Female, 0, 0, 35, United-States, <=50K 88 | 49, Local-gov, 268234, HS-grad, 9, Married-civ-spouse, Protective-serv, Husband, White, Male, 0, 0, 40, United-States, >50K 89 | 33, Private, 202051, Masters, 14, Married-civ-spouse, Prof-specialty, Husband, White, Male, 0, 0, 50, United-States, <=50K 90 | 30, Private, 54334, 9th, 5, Never-married, Sales, Not-in-family, White, Male, 0, 0, 40, United-States, <=50K 91 | 43, Federal-gov, 410867, Doctorate, 16, Never-married, Prof-specialty, Not-in-family, White, Female, 0, 0, 50, United-States, >50K 92 | 57, Private, 249977, Assoc-voc, 11, Married-civ-spouse, Prof-specialty, Husband, White, Male, 0, 0, 40, United-States, <=50K 93 | 37, Private, 286730, Some-college, 10, Divorced, Craft-repair, Unmarried, White, Female, 0, 0, 40, United-States, <=50K 94 | 28, Private, 212563, Some-college, 10, Divorced, Machine-op-inspct, Unmarried, Black, Female, 0, 0, 25, United-States, <=50K 95 | 30, Private, 117747, HS-grad, 9, Married-civ-spouse, Sales, Wife, Asian-Pac-Islander, Female, 0, 1573, 35, ?, <=50K 96 | 34, Local-gov, 226296, Bachelors, 13, Married-civ-spouse, Protective-serv, Husband, White, Male, 0, 0, 40, United-States, >50K 97 | 29, Local-gov, 115585, Some-college, 10, Never-married, Handlers-cleaners, Not-in-family, White, Male, 0, 0, 50, United-States, <=50K 98 | 48, Self-emp-not-inc, 191277, Doctorate, 16, Married-civ-spouse, Prof-specialty, Husband, White, Male, 0, 1902, 60, United-States, >50K 99 | 37, Private, 202683, Some-college, 10, Married-civ-spouse, Sales, Husband, White, Male, 0, 0, 48, United-States, >50K 100 | 48, Private, 171095, Assoc-acdm, 12, Divorced, Exec-managerial, Unmarried, White, Female, 0, 0, 40, England, <=50K 101 | 32, Federal-gov, 249409, HS-grad, 9, 
Never-married, Other-service, Own-child, Black, Male, 0, 0, 40, United-States, <=50K 102 | 76, Private, 124191, Masters, 14, Married-civ-spouse, Exec-managerial, Husband, White, Male, 0, 0, 40, United-States, >50K 103 | -------------------------------------------------------------------------------- /src/main/resources/customers.csv: -------------------------------------------------------------------------------- 1 | customerId,customerName 2 | 1,John 3 | 2,Clerk 4 | 3,Micheal 5 | 4,Sample 6 | -------------------------------------------------------------------------------- /src/main/resources/data.txt: -------------------------------------------------------------------------------- 1 | hello how are you 2 | hello how are you -------------------------------------------------------------------------------- /src/main/resources/eventimedata.md: -------------------------------------------------------------------------------- 1 | ## Event Time Example 2 | //The first records is for time Wed, 27 Apr 2016 11:34:22 GMT. 3 | 4 | 1461756862000,"aapl",500.0 5 | 6 | // Event after 5 seconds 7 | 8 | 1461756867001,"aapl",600.0 9 | 10 | // Event after 11 seconds 11 | 12 | 1461756872000,"aapl",400.0 13 | 14 | 15 | ## Late Events Example 16 | 17 | //It’s an event is for Wed, 27 Apr 2016 11:34:27 which is 5 seconds before the last event. 18 | 19 | 1461756867001,"aapl",200.0 20 | 21 | 22 | ## Session Window Input 23 | 24 | // start two sessions 25 | 26 | session1,100 27 | session2,200 28 | 29 | // Additional Event for Session 1 30 | 31 | session1,200 32 | 33 | // End Session 1 34 | 35 | session1,200,end 36 | 37 | 38 | // Starting new session1 and updating existing session 2 39 | 40 | session1,100 41 | session2,200 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | -------------------------------------------------------------------------------- /src/main/resources/sales.csv: -------------------------------------------------------------------------------- 1 | transactionId,customerId,itemId,amountPaid 2 | 111,1,1,100.0 3 | 112,2,2,505.0 4 | 113,3,3,510.0 5 | 114,4,4,600.0 6 | 115,1,2,500.0 7 | 116,1,2,500.0 8 | 117,1,2,500.0 9 | 118,1,2,500.0 10 | 119,2,3,500.0 11 | 120,1,2,500.0 12 | 121,1,4,500.0 13 | 122,1,2,500.0 14 | 123,1,4,500.0 15 | 124,1,2,500.0 -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/examples/sparktwo/CatalogExample.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.examples.sparktwo 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | 6 | /** 7 | * Catalogue Example 8 | */ 9 | object CatalogExample { 10 | 11 | def main(args: Array[String]) { 12 | 13 | val sparkSession = SparkSession.builder. 
14 | master("local") 15 | .appName("example") 16 | .getOrCreate() 17 | 18 | 19 | val df = sparkSession.read.csv("src/main/resources/sales.csv") 20 | df.createTempView("sales") 21 | 22 | //interacting with catalogue 23 | 24 | val catalog = sparkSession.catalog 25 | 26 | //print the databases 27 | 28 | catalog.listDatabases().select("name").show() 29 | 30 | // print all the tables 31 | 32 | catalog.listTables().select("name").show() 33 | 34 | // is cached 35 | println(catalog.isCached("sales")) 36 | df.cache() 37 | println(catalog.isCached("sales")) 38 | 39 | // drop the table 40 | catalog.dropTempView("sales") 41 | catalog.listTables().select("name").show() 42 | 43 | // list functions 44 | catalog.listFunctions().select("name","description","className","isTemporary").show(100) 45 | } 46 | 47 | } 48 | -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/examples/sparktwo/CustomOptimizationExample.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.examples.sparktwo 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.catalyst.expressions.{Literal, Multiply} 5 | import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan 6 | import org.apache.spark.sql.catalyst.rules.Rule 7 | 8 | /** 9 | * User Defined Optimization 10 | */ 11 | object CustomOptimizationExample { 12 | 13 | object MultiplyOptimizationRule extends Rule[LogicalPlan] { 14 | def apply(plan: LogicalPlan): LogicalPlan = plan transformAllExpressions { 15 | case Multiply(left,right) if right.isInstanceOf[Literal] && 16 | right.asInstanceOf[Literal].value.asInstanceOf[Double] == 1.0 => 17 | println("optimization of one applied") 18 | left 19 | } 20 | } 21 | 22 | 23 | def main(args: Array[String]) { 24 | 25 | val sparkSession = SparkSession.builder. 26 | master("local") 27 | .appName("example") 28 | .getOrCreate() 29 | 30 | 31 | val df = sparkSession.read.option("header","true").csv("src/main/resources/sales.csv") 32 | val multipliedDF = df.selectExpr("amountPaid * 1") 33 | println(multipliedDF.queryExecution.optimizedPlan.numberedTreeString) 34 | 35 | //add our custom optimization 36 | sparkSession.experimental.extraOptimizations = Seq(MultiplyOptimizationRule) 37 | val multipliedDFWithOptimization = df.selectExpr("amountPaid * 1") 38 | println("after optimization") 39 | 40 | println(multipliedDFWithOptimization.queryExecution.optimizedPlan.numberedTreeString) 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/examples/sparktwo/DataSetWordCount.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.examples.sparktwo 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | /** 6 | * Created by madhu on 6/5/16. 7 | */ 8 | object DataSetWordCount { 9 | 10 | def main(args: Array[String]) { 11 | 12 | val sparkSession = SparkSession.builder. 
13 | master("local") 14 | .appName("example") 15 | .getOrCreate() 16 | 17 | import sparkSession.implicits._ 18 | val data = sparkSession.read.text("src/main/resources/data.txt").as[String] 19 | 20 | val words = data.flatMap(value => value.split("\\s+")) 21 | 22 | val groupedWords = words.groupByKey(_.toLowerCase) 23 | 24 | val counts = groupedWords.count() 25 | 26 | counts.show() 27 | 28 | 29 | } 30 | 31 | } 32 | -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/examples/sparktwo/DataSourceV2Example.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.examples.sparktwo.datasourcev2 2 | 3 | import org.apache.spark.Partition 4 | import org.apache.spark.sql.SparkSession 5 | 6 | object DataSourceV2Example { 7 | 8 | def main(args: Array[String]) { 9 | 10 | val sparkSession = SparkSession.builder 11 | .master("local[2]") 12 | .appName("example") 13 | .getOrCreate() 14 | 15 | val simpleDf = sparkSession.read 16 | .format("com.madhukaraphatak.examples.sparktwo.datasourcev2.simple") 17 | .load() 18 | 19 | simpleDf.show() 20 | println( 21 | "number of partitions in simple source is " + simpleDf.rdd.getNumPartitions) 22 | 23 | val simpleMultiDf = sparkSession.read 24 | .format("com.madhukaraphatak.examples.sparktwo.datasourcev2.simplemulti") 25 | .load() 26 | 27 | simpleMultiDf.show() 28 | println( 29 | "number of partitions in simple multi source is " + simpleMultiDf.rdd.getNumPartitions) 30 | 31 | val simpleCsvDf = sparkSession.read 32 | .format("com.madhukaraphatak.examples.sparktwo.datasourcev2.simplecsv") 33 | .load("src/main/resources/sales.csv") 34 | 35 | simpleCsvDf.printSchema() 36 | simpleCsvDf.show() 37 | println( 38 | "number of partitions in simple csv source is " + simpleCsvDf.rdd.getNumPartitions) 39 | 40 | val simpleMysqlDf = sparkSession.read 41 | .format("com.madhukaraphatak.examples.sparktwo.datasourcev2.simplemysql") 42 | .load() 43 | 44 | simpleMysqlDf.printSchema() 45 | simpleMysqlDf.filter("user=\"root\"").show() 46 | println( 47 | "number of partitions in simple mysql source is " + simpleMysqlDf.rdd.getNumPartitions) 48 | 49 | //write examples 50 | simpleMysqlDf.write 51 | .format( 52 | "com.madhukaraphatak.examples.sparktwo.datasourcev2.simplemysqlwriter") 53 | .save() 54 | simpleMysqlDf.write 55 | .format( 56 | "com.madhukaraphatak.examples.sparktwo.datasourcev2.mysqlwithtransaction") 57 | .save() 58 | 59 | val simplePartitoningDf = sparkSession.read 60 | .format( 61 | "com.madhukaraphatak.examples.sparktwo.datasourcev2.partitionaffinity") 62 | .load() 63 | 64 | val dfRDD = simplePartitoningDf.rdd 65 | val baseRDD = 66 | dfRDD.dependencies.head.rdd.dependencies.head.rdd.dependencies.head.rdd 67 | 68 | val partition = baseRDD.partitions(0) 69 | val getPrefferedLocationDef = baseRDD.getClass 70 | .getMethod("getPreferredLocations", classOf[Partition]) 71 | val preferredLocation = getPrefferedLocationDef 72 | .invoke(baseRDD, partition) 73 | .asInstanceOf[Seq[String]] 74 | println("preferred location is " + preferredLocation) 75 | 76 | sparkSession.stop() 77 | 78 | } 79 | } 80 | -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/examples/sparktwo/DatasetVsDataFrame.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.examples.sparktwo 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | /** 6 | * Logical Plans 
for Dataframe and Dataset 7 | */ 8 | object DatasetVsDataFrame { 9 | 10 | case class Sales(transactionId:Int,customerId:Int,itemId:Int,amountPaid:Double) 11 | 12 | def main(args: Array[String]) { 13 | 14 | val sparkSession = SparkSession.builder. 15 | master("local") 16 | .appName("example") 17 | .getOrCreate() 18 | 19 | val sparkContext = sparkSession.sparkContext 20 | import sparkSession.implicits._ 21 | 22 | 23 | //read data from text file 24 | 25 | val df = sparkSession.read.option("header","true").option("inferSchema","true").csv("src/main/resources/sales.csv") 26 | val ds = sparkSession.read.option("header","true").option("inferSchema","true").csv("src/main/resources/sales.csv").as[Sales] 27 | 28 | 29 | val selectedDF = df.select("itemId") 30 | 31 | val selectedDS = ds.map(_.itemId) 32 | 33 | println(selectedDF.queryExecution.optimizedPlan.numberedTreeString) 34 | 35 | println(selectedDS.queryExecution.optimizedPlan.numberedTreeString) 36 | 37 | 38 | } 39 | 40 | } 41 | -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/examples/sparktwo/MysqlTransactionExample.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.examples.sparktwo 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | object MysqlTransactionExample { 6 | 7 | def main(args: Array[String]) { 8 | 9 | val sparkSession = SparkSession.builder 10 | .master("local[2]") 11 | .appName("example") 12 | .getOrCreate() 13 | 14 | import sparkSession.implicits._ 15 | val simpleDf = sparkSession.read 16 | .format("com.madhukaraphatak.examples.sparktwo.datasourcev2.simple") 17 | .load() 18 | .as[String] 19 | 20 | val errorDf = simpleDf.map(value => { 21 | if (value == "3") throw new IllegalArgumentException("value cannot be 3") 22 | else value 23 | }) 24 | 25 | //errorDf.show() 26 | 27 | // results in partial writes 28 | errorDf.write 29 | .format( 30 | "com.madhukaraphatak.examples.sparktwo.datasourcev2.simplemysqlwriter") 31 | .save() 32 | 33 | //use transactional ones 34 | 35 | errorDf.write 36 | .format( 37 | "com.madhukaraphatak.examples.sparktwo.datasourcev2.mysqlwithtransaction") 38 | .save() 39 | 40 | sparkSession.stop() 41 | 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/examples/sparktwo/RDDToDataSet.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.examples.sparktwo 2 | 3 | import org.apache.spark.sql.types.{DoubleType, StringType, StructField, StructType} 4 | import org.apache.spark.sql.{Row, SparkSession} 5 | 6 | /** 7 | * RDD API to Dataset API 8 | */ 9 | object RDDToDataSet { 10 | 11 | def main(args: Array[String]) { 12 | 13 | val sparkSession = SparkSession.builder. 
14 | master("local") 15 | .appName("example") 16 | .getOrCreate() 17 | 18 | val sparkContext = sparkSession.sparkContext 19 | import sparkSession.implicits._ 20 | 21 | 22 | //read data from text file 23 | val rdd = sparkContext.textFile("src/main/resources/data.txt") 24 | val ds = sparkSession.read.text("src/main/resources/data.txt").as[String] 25 | 26 | 27 | // do count 28 | println("count ") 29 | println(rdd.count()) 30 | println(ds.count()) 31 | 32 | // wordcount 33 | println(" wordcount ") 34 | 35 | val wordsRDD = rdd.flatMap(value => value.split("\\s+")) 36 | val wordsPair = wordsRDD.map(word => (word,1)) 37 | val wordCount = wordsPair.reduceByKey(_+_) 38 | println(wordCount.collect.toList) 39 | 40 | val wordsDs = ds.flatMap(value => value.split("\\s+")) 41 | val wordsPairDs = wordsDs.groupByKey(value => value) 42 | val wordCountDs = wordsPairDs.count 43 | wordCountDs.show() 44 | 45 | //cache 46 | rdd.cache() 47 | ds.cache() 48 | 49 | //filter 50 | 51 | val filteredRDD = wordsRDD.filter(value => value =="hello") 52 | println(filteredRDD.collect().toList) 53 | 54 | val filteredDS = wordsDs.filter(value => value =="hello") 55 | filteredDS.show() 56 | 57 | 58 | //map partitions 59 | 60 | val mapPartitionsRDD = rdd.mapPartitions(iterator => List(iterator.count(value => true)).iterator) 61 | println(s" the count each partition is ${mapPartitionsRDD.collect().toList}") 62 | 63 | val mapPartitionsDs = ds.mapPartitions(iterator => List(iterator.count(value => true)).iterator) 64 | mapPartitionsDs.show() 65 | 66 | //converting to each other 67 | val dsToRDD = ds.rdd 68 | println(dsToRDD.collect()) 69 | 70 | val rddStringToRowRDD = rdd.map(value => Row(value)) 71 | val dfschema = StructType(Array(StructField("value",StringType))) 72 | val rddToDF = sparkSession.createDataFrame(rddStringToRowRDD,dfschema) 73 | val rDDToDataSet = rddToDF.as[String] 74 | rDDToDataSet.show() 75 | 76 | // double based operation 77 | 78 | val doubleRDD = sparkContext.makeRDD(List(1.0,5.0,8.9,9.0)) 79 | val rddSum =doubleRDD.sum() 80 | val rddMean = doubleRDD.mean() 81 | 82 | println(s"sum is $rddSum") 83 | println(s"mean is $rddMean") 84 | 85 | val rowRDD = doubleRDD.map(value => Row.fromSeq(List(value))) 86 | val schema = StructType(Array(StructField("value",DoubleType))) 87 | val doubleDS = sparkSession.createDataFrame(rowRDD,schema) 88 | 89 | import org.apache.spark.sql.functions._ 90 | doubleDS.agg(sum("value")).show() 91 | doubleDS.agg(mean("value")).show() 92 | 93 | //reduceByKey API 94 | val reduceCountByRDD = wordsPair.reduceByKey(_+_) 95 | val reduceCountByDs = wordsPairDs.mapGroups((key,values) =>(key,values.length)) 96 | 97 | println(reduceCountByRDD.collect().toList) 98 | println(reduceCountByDs.collect().toList) 99 | 100 | //reduce function 101 | val rddReduce = doubleRDD.reduce((a,b) => a +b) 102 | val dsReduce = doubleDS.reduce((row1,row2) =>Row(row1.getDouble(0) + row2.getDouble(0))) 103 | 104 | println("rdd reduce is " +rddReduce +" dataset reduce "+dsReduce) 105 | 106 | } 107 | 108 | } 109 | -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/examples/sparktwo/SparkSessionExample.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.examples.sparktwo 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | /** 6 | * Spark Session example 7 | * 8 | */ 9 | object SparkSessionExample { 10 | 11 | def main(args: Array[String]) { 12 | 13 | val sparkSession = 
SparkSession.builder 14 | .master("local") 15 | .appName("spark session example") 16 | .getOrCreate() 17 | 18 | val df = sparkSession.read 19 | .option("header", "true") 20 | .csv("src/main/resources/sales.csv") 21 | 22 | df.show() 23 | 24 | } 25 | 26 | } 27 | -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/examples/sparktwo/TimeWindowExample.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.examples.sparktwo 2 | 3 | import org.apache.spark.sql.{DataFrame, SparkSession} 4 | import org.apache.spark.sql.functions._ 5 | 6 | /** 7 | * Time window Example 8 | */ 9 | object TimeWindowExample { 10 | 11 | 12 | def printWindow(windowDF:DataFrame, aggCol:String) ={ 13 | windowDF.sort("window.start").select("window.start","window.end",s"$aggCol"). 14 | show(truncate = false) 15 | } 16 | 17 | def main(args: Array[String]) { 18 | 19 | val sparkSession = SparkSession.builder. 20 | master("local") 21 | .appName("time window example") 22 | .getOrCreate() 23 | 24 | 25 | sparkSession.sparkContext.setLogLevel("ERROR") 26 | val stocksDF = sparkSession.read.option("header","true"). 27 | option("inferSchema","true") 28 | .csv("src/main/resources/applestock.csv") 29 | 30 | //weekly average of 2016 31 | 32 | val stocks2016 = stocksDF.filter("year(Date)==2016") 33 | 34 | val tumblingWindowDS = stocks2016.groupBy(window(stocks2016.col("Date"),"1 week")) 35 | .agg(avg("Close").as("weekly_average")) 36 | println("weekly average in 2016 using tumbling window is") 37 | printWindow(tumblingWindowDS,"weekly_average") 38 | 39 | 40 | val windowWithStartTime = stocks2016.groupBy(window(stocks2016.col("Date"),"1 week","1 week", "4 days")). 41 | agg(avg("Close").as("weekly_average")) 42 | println("weekly average in 2016 using sliding window is") 43 | printWindow(windowWithStartTime,"weekly_average") 44 | 45 | val filteredWindow = windowWithStartTime.filter("year(window.start)=2016") 46 | println("weekly average in 2016 after filtering is") 47 | printWindow(filteredWindow,"weekly_average") 48 | 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/examples/sparktwo/datasourcev2/MysqlWithTransaction.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.examples.sparktwo.datasourcev2.mysqlwithtransaction 2 | 3 | import org.apache.spark.sql.sources.v2._ 4 | import org.apache.spark.sql.Row 5 | import org.apache.spark.sql.types._ 6 | import org.apache.spark.sql.sources.v2.writer._ 7 | import scala.collection.JavaConverters._ 8 | import org.apache.spark.sql.SparkSession 9 | import org.apache.spark.sql.sources._ 10 | import java.util.Optional 11 | import org.apache.spark.sql.SaveMode 12 | import java.sql.{Connection, DriverManager} 13 | 14 | class DefaultSource extends DataSourceV2 with WriteSupport { 15 | 16 | def createWriter(jobId: String, 17 | schema: StructType, 18 | mode: SaveMode, 19 | options: DataSourceOptions): Optional[DataSourceWriter] = { 20 | Optional.of(new MysqlDataSourceWriter()) 21 | 22 | } 23 | } 24 | 25 | class MysqlDataSourceWriter extends DataSourceWriter { 26 | 27 | override def createWriterFactory(): DataWriterFactory[Row] = { 28 | new MysqlDataWriterFactory() 29 | } 30 | 31 | override def commit(messages: Array[WriterCommitMessage]) = {} 32 | 33 | override def abort(messages: Array[WriterCommitMessage]) = { 34 | println("abort 
is called in data source writer") 35 | } 36 | 37 | } 38 | 39 | class MysqlDataWriterFactory extends DataWriterFactory[Row] { 40 | override def createDataWriter(partitionId: Int, 41 | attemptNumber: Int): DataWriter[Row] = { 42 | new MysqlDataWriter() 43 | } 44 | } 45 | 46 | class MysqlDataWriter extends DataWriter[Row] { 47 | 48 | val url = "jdbc:mysql://localhost/test" 49 | val user = "root" 50 | val password = "abc123" 51 | val table = "userwrite" 52 | 53 | val connection = DriverManager.getConnection(url, user, password) 54 | connection.setAutoCommit(false) 55 | val statement = s"insert into $table (user) values (?)" 56 | val preparedStatement = connection.prepareStatement(statement) 57 | 58 | def write(record: Row) = { 59 | val value = record.getString(0) 60 | preparedStatement.setString(1, value) 61 | preparedStatement.executeUpdate() 62 | } 63 | 64 | def commit(): WriterCommitMessage = { 65 | connection.commit() 66 | WriteSucceeded 67 | } 68 | 69 | def abort() = { 70 | println("abort is called in data writer") 71 | } 72 | 73 | object WriteSucceeded extends WriterCommitMessage 74 | 75 | } 76 | -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/examples/sparktwo/datasourcev2/SimpleCsvDataSource.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.examples.sparktwo.datasourcev2.simplecsv 2 | 3 | import org.apache.spark.sql.sources.v2._ 4 | import org.apache.spark.sql.Row 5 | import org.apache.spark.sql.types._ 6 | import org.apache.spark.sql.sources.v2.reader._ 7 | import scala.collection.JavaConverters._ 8 | import org.apache.spark.sql.SparkSession 9 | 10 | class DefaultSource extends DataSourceV2 with ReadSupport { 11 | 12 | def createReader(options: DataSourceOptions) = { 13 | val path = options.get("path").get 14 | new SimpleCsvDataSourceReader(path) 15 | } 16 | } 17 | 18 | class SimpleCsvDataSourceReader(path: String) extends DataSourceReader { 19 | 20 | def readSchema() = { 21 | val sparkContext = SparkSession.builder.getOrCreate().sparkContext 22 | val firstLine = sparkContext.textFile(path).first() 23 | val columnNames = firstLine.split(",") 24 | val structFields = columnNames.map(value ⇒ StructField(value, StringType)) 25 | StructType(structFields) 26 | } 27 | 28 | def createDataReaderFactories = { 29 | val sparkContext = SparkSession.builder.getOrCreate().sparkContext 30 | val rdd = sparkContext.textFile(path) 31 | 32 | val factoryList = new java.util.ArrayList[DataReaderFactory[Row]] 33 | (0 to rdd.getNumPartitions - 1).foreach(value ⇒ 34 | factoryList.add(new SimpleCsvDataSourceReaderFactory(value, path))) 35 | factoryList 36 | } 37 | 38 | } 39 | 40 | class SimpleCsvDataSourceReaderFactory(partitionNumber: Int, filePath: String, hasHeader: Boolean = true) extends DataReaderFactory[Row] { 41 | 42 | def createDataReader = new SimpleCsvDataReader(partitionNumber, filePath, hasHeader) 43 | } 44 | 45 | class SimpleCsvDataReader(partitionNumber: Int, filePath: String, hasHeader: Boolean = true) extends DataReader[Row] { 46 | 47 | var iterator: Iterator[String] = null 48 | 49 | @transient 50 | def next = { 51 | if (iterator == null) { 52 | val sparkContext = SparkSession.builder.getOrCreate().sparkContext 53 | val rdd = sparkContext.textFile(filePath) 54 | val filterRDD = if (hasHeader) { 55 | val firstLine = rdd.first 56 | rdd.filter(_ != firstLine) 57 | } 58 | else rdd 59 | val partition = filterRDD.partitions(partitionNumber) 60 | iterator = 
filterRDD.iterator(partition, org.apache.spark.TaskContext.get()) 61 | } 62 | iterator.hasNext 63 | } 64 | 65 | def get = { 66 | println("calling get") 67 | val line = iterator.next() 68 | Row.fromSeq(line.split(",")) 69 | } 70 | def close() = Unit 71 | } 72 | 73 | -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/examples/sparktwo/datasourcev2/SimpleDataSource.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.examples.sparktwo.datasourcev2.simple 2 | 3 | import org.apache.spark.sql.sources.v2._ 4 | import org.apache.spark.sql.Row 5 | import org.apache.spark.sql.types._ 6 | import org.apache.spark.sql.sources.v2.reader._ 7 | import scala.collection.JavaConverters._ 8 | 9 | class DefaultSource extends DataSourceV2 with ReadSupport { 10 | 11 | def createReader(options: DataSourceOptions) = new SimpleDataSourceReader() 12 | 13 | } 14 | 15 | class SimpleDataSourceReader extends DataSourceReader { 16 | 17 | def readSchema() = StructType(Array(StructField("value", StringType))) 18 | 19 | def createDataReaderFactories = { 20 | val factoryList = new java.util.ArrayList[DataReaderFactory[Row]] 21 | factoryList.add(new SimpleDataSourceReaderFactory()) 22 | factoryList 23 | } 24 | 25 | } 26 | 27 | class SimpleDataSourceReaderFactory extends DataReaderFactory[Row] with DataReader[Row] { 28 | def createDataReader = new SimpleDataSourceReaderFactory() 29 | 30 | val values = Array("1", "2", "3", "4", "5") 31 | 32 | var index = 0 33 | 34 | def next = index < values.length 35 | 36 | def get = { 37 | val row = Row(values(index)) 38 | index = index + 1 39 | row 40 | } 41 | 42 | def close() = Unit 43 | } 44 | 45 | -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/examples/sparktwo/datasourcev2/SimpleDataSourceWithPartitionAffinity.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.examples.sparktwo.datasourcev2.partitionaffinity 2 | 3 | import org.apache.spark.sql.Row 4 | import org.apache.spark.sql.sources.v2._ 5 | import org.apache.spark.sql.sources.v2.reader._ 6 | import org.apache.spark.sql.types._ 7 | 8 | class DefaultSource extends DataSourceV2 with ReadSupport { 9 | 10 | def createReader(options: DataSourceOptions) = new SimpleDataSourceReader() 11 | 12 | } 13 | 14 | class SimpleDataSourceReader extends DataSourceReader { 15 | 16 | def readSchema() = StructType(Array(StructField("value", StringType))) 17 | 18 | def createDataReaderFactories = { 19 | val factoryList = new java.util.ArrayList[DataReaderFactory[Row]] 20 | factoryList.add(new SimpleDataSourceReaderFactory(0, 4)) 21 | factoryList.add(new SimpleDataSourceReaderFactory(5, 9)) 22 | factoryList 23 | } 24 | 25 | } 26 | 27 | class SimpleDataSourceReaderFactory(var start: Int, var end: Int) 28 | extends DataReaderFactory[Row] { 29 | def createDataReader = new SimpleDataReader(start, end) 30 | 31 | override def preferredLocations(): Array[String] = Array("sample-hostname") 32 | } 33 | 34 | class SimpleDataReader(var start: Int, end: Int) extends DataReader[Row] { 35 | 36 | val values = Array("1", "2", "3", "4", "5", "6", "7", "8", "9", "10") 37 | 38 | var index = 0 39 | 40 | def next = start <= end 41 | 42 | def get = { 43 | val row = Row(values(start)) 44 | start = start + 1 45 | row 46 | } 47 | 48 | def close() = Unit 49 | } 50 | 
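The partition-affinity source above registers two reader factories, one producing the values "1" to "5" and the other "6" to "10", and each factory advertises "sample-hostname" as a preferred location. A minimal usage sketch for it follows; the object name and the glom()-based inspection are illustrative additions rather than part of the repository, and the advisory hostname is simply ignored when no executor runs on such a host.

```scala
import org.apache.spark.sql.SparkSession

// Sketch: load the partition-affinity source defined above and inspect how the two
// reader factories split the rows across partitions. Assumes this project's compiled
// classes are on the application classpath.
object PartitionAffinityUsage {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder
      .master("local[2]")
      .appName("partition-affinity-usage")
      .getOrCreate()

    val df = spark.read
      .format("com.madhukaraphatak.examples.sparktwo.datasourcev2.partitionaffinity")
      .load()

    // glom() groups each partition's rows into an array, so the expected split
    // ("1".."5" in partition 0, "6".."10" in partition 1) becomes directly visible.
    df.rdd.glom().collect().zipWithIndex.foreach { case (rows, index) =>
      println(s"partition $index -> ${rows.map(_.getString(0)).mkString(",")}")
    }

    spark.stop()
  }
}
```

DataSourceV2Example.scala goes one step further and reads the preferred location back through reflection on the underlying RDD.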
-------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/examples/sparktwo/datasourcev2/SimpleMultiDataSource.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.examples.sparktwo.datasourcev2.simplemulti 2 | 3 | import org.apache.spark.sql.sources.v2._ 4 | import org.apache.spark.sql.Row 5 | import org.apache.spark.sql.types._ 6 | import org.apache.spark.sql.sources.v2.reader._ 7 | import scala.collection.JavaConverters._ 8 | 9 | class DefaultSource extends DataSourceV2 with ReadSupport { 10 | 11 | def createReader(options: DataSourceOptions) = new SimpleDataSourceReader() 12 | 13 | } 14 | 15 | class SimpleDataSourceReader extends DataSourceReader { 16 | 17 | def readSchema() = StructType(Array(StructField("value", StringType))) 18 | 19 | def createDataReaderFactories = { 20 | val factoryList = new java.util.ArrayList[DataReaderFactory[Row]] 21 | factoryList.add(new SimpleDataSourceReaderFactory(0, 4)) 22 | factoryList.add(new SimpleDataSourceReaderFactory(5, 9)) 23 | factoryList 24 | } 25 | 26 | } 27 | 28 | class SimpleDataSourceReaderFactory(var start: Int, var end: Int) extends DataReaderFactory[Row] with DataReader[Row] { 29 | 30 | def createDataReader = new SimpleDataSourceReaderFactory(start, end) 31 | 32 | val values = Array("1", "2", "3", "4", "5", "6", "7", "8", "9", "10") 33 | 34 | var index = 0 35 | 36 | def next = start <= end 37 | 38 | def get = { 39 | val row = Row(values(start)) 40 | start = start + 1 41 | row 42 | } 43 | 44 | def close() = Unit 45 | } 46 | 47 | -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/examples/sparktwo/datasourcev2/SimpleMysqlDataSource.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.examples.sparktwo.datasourcev2.simplemysql 2 | 3 | import org.apache.spark.sql.sources.v2._ 4 | import org.apache.spark.sql.Row 5 | import org.apache.spark.sql.types._ 6 | import org.apache.spark.sql.sources.v2.reader._ 7 | import scala.collection.JavaConverters._ 8 | import org.apache.spark.sql.SparkSession 9 | import org.apache.spark.sql.sources._ 10 | 11 | class DefaultSource extends DataSourceV2 with ReadSupport { 12 | 13 | def createReader(options: DataSourceOptions) = { 14 | new SimpleMysqlDataSourceReader() 15 | } 16 | } 17 | 18 | class SimpleMysqlDataSourceReader() 19 | extends DataSourceReader 20 | with SupportsPushDownFilters { 21 | 22 | var pushedFilters: Array[Filter] = Array[Filter]() 23 | def readSchema() = { 24 | val columnNames = Array("user") 25 | val structFields = columnNames.map(value ⇒ StructField(value, StringType)) 26 | StructType(structFields) 27 | } 28 | 29 | def pushFilters(filters: Array[Filter]) = { 30 | println("Filters " + filters.toList) 31 | pushedFilters = filters 32 | pushedFilters 33 | } 34 | 35 | def createDataReaderFactories = { 36 | val sparkContext = SparkSession.builder.getOrCreate().sparkContext 37 | 38 | val factoryList = new java.util.ArrayList[DataReaderFactory[Row]] 39 | factoryList.add(new SimpleMysqlDataSourceReaderFactory(pushedFilters)) 40 | factoryList 41 | } 42 | 43 | } 44 | 45 | class SimpleMysqlDataSourceReaderFactory(pushedFilters: Array[Filter]) 46 | extends DataReaderFactory[Row] { 47 | 48 | def createDataReader = new SimpleMysqlDataReader(pushedFilters: Array[Filter]) 49 | } 50 | 51 | class SimpleMysqlDataReader(pushedFilters: Array[Filter]) 52 | 
extends DataReader[Row] { 53 | 54 | var iterator: Iterator[Row] = null 55 | 56 | val getQuery: String = { 57 | if (pushedFilters == null || pushedFilters.isEmpty) 58 | "(select user from user)a" 59 | else { 60 | pushedFilters(1) match { 61 | case filter: EqualTo => 62 | val condition = s"${filter.attribute} = '${filter.value}'" 63 | s"(select user from user where $condition)a" 64 | case _ => "(select user from user)a" 65 | } 66 | } 67 | } 68 | 69 | def next = { 70 | if (iterator == null) { 71 | val url = "jdbc:mysql://localhost/mysql" 72 | val user = "root" 73 | val password = "abc123" 74 | 75 | val properties = new java.util.Properties() 76 | properties.setProperty("user", user) 77 | properties.setProperty("password", password) 78 | 79 | val sparkSession = SparkSession.builder.getOrCreate() 80 | val df = sparkSession.read.jdbc(url, getQuery, properties) 81 | val rdd = df.rdd 82 | val partition = rdd.partitions(0) 83 | iterator = rdd.iterator(partition, org.apache.spark.TaskContext.get()) 84 | } 85 | iterator.hasNext 86 | } 87 | 88 | def get = { 89 | iterator.next() 90 | } 91 | def close() = Unit 92 | } 93 | -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/examples/sparktwo/datasourcev2/SimpleMysqlWriter.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.examples.sparktwo.datasourcev2.simplemysqlwriter 2 | 3 | import org.apache.spark.sql.sources.v2._ 4 | import org.apache.spark.sql.Row 5 | import org.apache.spark.sql.types._ 6 | import org.apache.spark.sql.sources.v2.writer._ 7 | import scala.collection.JavaConverters._ 8 | import org.apache.spark.sql.SparkSession 9 | import org.apache.spark.sql.sources._ 10 | import java.util.Optional 11 | import org.apache.spark.sql.SaveMode 12 | import java.sql.{Connection,DriverManager} 13 | 14 | class DefaultSource extends DataSourceV2 with WriteSupport { 15 | 16 | def createWriter(jobId: String, schema: StructType, mode: SaveMode, 17 | options: DataSourceOptions): Optional[DataSourceWriter] = { 18 | Optional.of(new MysqlDataSourceWriter()) 19 | 20 | } 21 | } 22 | 23 | class MysqlDataSourceWriter extends DataSourceWriter { 24 | 25 | override def createWriterFactory(): DataWriterFactory[Row] = { 26 | new MysqlDataWriterFactory() 27 | } 28 | 29 | override def commit(messages: Array[WriterCommitMessage]) = { 30 | 31 | } 32 | 33 | override def abort(messages: Array[WriterCommitMessage]) = { 34 | 35 | } 36 | 37 | } 38 | 39 | class MysqlDataWriterFactory extends DataWriterFactory[Row] { 40 | override def createDataWriter(partitionId: Int, attemptNumber: Int): DataWriter[Row] = { 41 | new MysqlDataWriter() 42 | } 43 | } 44 | 45 | class MysqlDataWriter extends DataWriter[Row] { 46 | 47 | val url = "jdbc:mysql://localhost/test" 48 | val user = "root" 49 | val password = "abc123" 50 | val table ="userwrite" 51 | 52 | val connection = DriverManager.getConnection(url,user,password) 53 | val statement = "insert into userwrite (user) values (?)" 54 | val preparedStatement = connection.prepareStatement(statement) 55 | 56 | 57 | def write(record: Row) = { 58 | val value = record.getString(0) 59 | preparedStatement.setString(1,value) 60 | preparedStatement.executeUpdate() 61 | } 62 | 63 | def commit(): WriterCommitMessage = { 64 | WriteSucceeded 65 | } 66 | 67 | def abort() = { 68 | 69 | } 70 | 71 | object WriteSucceeded extends WriterCommitMessage 72 | 73 | } 74 | 75 | 
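Both MySQL writers above (SimpleMysqlWriter and MysqlWithTransaction) insert into a userwrite table with a single user column in the local test database, using the hard-coded root credentials. A minimal setup sketch for that table is shown below; only the connection values, table name and column name come from the writers, while the varchar length is an assumed choice, so adapt it to your local MySQL instance.

```scala
import java.sql.DriverManager

// Sketch: create the table that SimpleMysqlWriter and MysqlWithTransaction write into.
// URL, user and password mirror the values hard-coded in the writers above.
object CreateUserWriteTable {
  def main(args: Array[String]): Unit = {
    val connection = DriverManager.getConnection("jdbc:mysql://localhost/test", "root", "abc123")
    try {
      val statement = connection.createStatement()
      // Column name matches "insert into userwrite (user) values (?)"; varchar(255) is assumed.
      statement.executeUpdate("create table if not exists userwrite (user varchar(255))")
    } finally {
      connection.close()
    }
  }
}
```

The difference between the two writers is what MysqlTransactionExample.scala demonstrates: the plain writer commits every executeUpdate immediately and can leave partial rows behind when a task fails, whereas the transactional writer disables auto-commit and only commits in commit().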
-------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/examples/sparktwo/ml/MultiColumnTransformation.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.examples.sparktwo.ml 2 | 3 | import org.apache.spark.ml.Pipeline 4 | import org.apache.spark.ml.feature.{OneHotEncoderEstimator, StringIndexer} 5 | import org.apache.spark.sql.SparkSession 6 | 7 | object MultiColumnTransformation { 8 | 9 | def main(args: Array[String]): Unit = { 10 | 11 | val sparkSession = SparkSession.builder. 12 | master("local") 13 | .appName("example") 14 | .getOrCreate() 15 | 16 | 17 | val salaryDf = sparkSession.read.format("csv").option("header", "true").load("src/main/resources/adult.csv") 18 | 19 | val stringColumns = Array("workclass", "occupation", "sex") 20 | 21 | val outputColumns = stringColumns.map(_ + "_onehot") 22 | 23 | val indexers = stringColumns.map(column => { 24 | val indexer = new StringIndexer() 25 | indexer.setInputCol(column) 26 | indexer.setOutputCol(column + "_index") 27 | }) 28 | 29 | val singleOneHotEncoder = new OneHotEncoderEstimator() 30 | singleOneHotEncoder.setInputCols(stringColumns.map(_ + "_index")) 31 | singleOneHotEncoder.setOutputCols(outputColumns) 32 | 33 | val pipeline = new Pipeline() 34 | pipeline.setStages(indexers ++ Array(singleOneHotEncoder)) 35 | 36 | val outputDf = pipeline.fit(salaryDf).transform(salaryDf) 37 | 38 | outputDf.select(outputColumns.head, outputColumns.tail: _*).show() 39 | 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/examples/sparktwo/ml/ParallelCrossValidation.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.examples.sparktwo.ml 2 | 3 | import org.apache.spark.ml.Pipeline 4 | import org.apache.spark.ml.classification.LogisticRegression 5 | import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator 6 | import org.apache.spark.ml.feature.{OneHotEncoderEstimator, StringIndexer, VectorAssembler} 7 | import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder} 8 | import org.apache.spark.sql.SparkSession 9 | 10 | object ParallelCrossValidation { 11 | 12 | def main(args: Array[String]): Unit = { 13 | 14 | 15 | val sparkSession = SparkSession.builder. 
16 | master("local[*]") 17 | .appName("example") 18 | .getOrCreate() 19 | 20 | 21 | val salaryDf = sparkSession.read.format("csv") 22 | .option("header", "true") 23 | .option("inferSchema", "true") 24 | .load("src/main/resources/adult.csv") 25 | 26 | val stringColumns = Array("workclass", "occupation", "sex", "education", "martial_status", "relationship", 27 | "race", "native_country") 28 | 29 | val numericalColumns = Array("age", "fnlwgt", "capital_loss", "capital_gain") 30 | 31 | val labelColumn = "salary" 32 | val outputColumns = stringColumns.map(_ + "_onehot") 33 | 34 | val indexers = stringColumns.map(column => { 35 | val indexer = new StringIndexer() 36 | indexer.setInputCol(column) 37 | indexer.setHandleInvalid("keep") 38 | indexer.setOutputCol(column + "_index") 39 | }) 40 | 41 | val singleOneHotEncoder = new OneHotEncoderEstimator() 42 | singleOneHotEncoder.setInputCols(stringColumns.map(_ + "_index")) 43 | singleOneHotEncoder.setOutputCols(outputColumns) 44 | 45 | val vectorAssembler = new VectorAssembler() 46 | vectorAssembler.setInputCols(outputColumns ++ numericalColumns) 47 | vectorAssembler.setOutputCol("features") 48 | 49 | val labelIndexer = new StringIndexer() 50 | labelIndexer.setInputCol("salary") 51 | labelIndexer.setOutputCol("label") 52 | 53 | val logisticRegression = new LogisticRegression() 54 | 55 | 56 | val pipeline = new Pipeline() 57 | pipeline.setStages(indexers ++ Array(singleOneHotEncoder) 58 | ++ Array(vectorAssembler) ++ Array(labelIndexer) ++ Array(logisticRegression)) 59 | 60 | val paramMap = new ParamGridBuilder() 61 | .addGrid(logisticRegression.maxIter, Array(1, 2, 3)).build() 62 | 63 | 64 | val crossValidator = new CrossValidator() 65 | crossValidator.setEstimator(pipeline) 66 | crossValidator.setEvaluator(new BinaryClassificationEvaluator()) 67 | crossValidator.setEstimatorParamMaps(paramMap) 68 | crossValidator.setParallelism(3) 69 | 70 | crossValidator.fit(salaryDf) 71 | 72 | } 73 | 74 | } 75 | -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/examples/sparktwo/streaming/EventTimeExample.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.examples.sparktwo.streaming 2 | 3 | import java.sql.Timestamp 4 | 5 | import org.apache.spark.sql.SparkSession 6 | import org.apache.spark.sql.functions._ 7 | import org.apache.spark.sql.streaming.OutputMode 8 | 9 | object EventTimeExample { 10 | 11 | case class Stock(time: Timestamp, symbol: String, value: Double) 12 | 13 | def main(args: Array[String]): Unit = { 14 | val sparkSession = SparkSession.builder 15 | .master("local") 16 | .appName("example") 17 | .getOrCreate() 18 | //create stream from socket 19 | 20 | import sparkSession.implicits._ 21 | sparkSession.sparkContext.setLogLevel("ERROR") 22 | val socketStreamDs = sparkSession.readStream 23 | .format("socket") 24 | .option("host", "localhost") 25 | .option("port", 50050) 26 | .load() 27 | .as[String] 28 | 29 | // read as stock 30 | val stockDs = socketStreamDs.map(value => { 31 | val columns = value.split(",") 32 | Stock(new Timestamp(columns(0).toLong), columns(1), columns(2).toDouble) 33 | }) 34 | 35 | val windowedCount = stockDs 36 | .groupBy( 37 | window($"time", "10 seconds") 38 | ) 39 | .sum("value") 40 | 41 | 42 | val query = 43 | windowedCount.writeStream 44 | .format("console") 45 | .option("truncate", "false") 46 | .outputMode(OutputMode.Complete()) 47 | 48 | query.start().awaitTermination() 49 | } 50 | } 51 | 
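EventTimeExample above groups by a 10-second event-time window and writes in Complete output mode, so Spark retains state for every window it has ever seen and late records (such as the late-event sample in eventimedata.md) are folded in with no bound on how late they may arrive. Bounding that state with a watermark is the subject of the repository's WaterMarkExample.scala; the sketch below is an illustrative variant of the pipeline above rather than the contents of that file, and the 5-second watermark threshold is an assumed value.

```scala
import java.sql.Timestamp

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._
import org.apache.spark.sql.streaming.OutputMode

// Sketch: the event-time aggregation from EventTimeExample with a watermark, so state
// for windows older than (max event time - 5 seconds) can be dropped. Port and record
// layout match the example above.
object EventTimeWithWatermarkSketch {

  case class Stock(time: Timestamp, symbol: String, value: Double)

  def main(args: Array[String]): Unit = {
    val sparkSession = SparkSession.builder
      .master("local")
      .appName("example")
      .getOrCreate()
    sparkSession.sparkContext.setLogLevel("ERROR")

    import sparkSession.implicits._
    val socketStreamDs = sparkSession.readStream
      .format("socket")
      .option("host", "localhost")
      .option("port", 50050)
      .load()
      .as[String]

    val stockDs = socketStreamDs.map { value =>
      val columns = value.split(",")
      Stock(new Timestamp(columns(0).toLong), columns(1), columns(2).toDouble)
    }

    // Records arriving more than 5 seconds behind the latest observed event time are
    // treated as too late once their window has been finalized.
    val windowedSum = stockDs
      .withWatermark("time", "5 seconds")
      .groupBy(window($"time", "10 seconds"))
      .sum("value")

    val query = windowedSum.writeStream
      .format("console")
      .option("truncate", "false")
      .outputMode(OutputMode.Update())
      .start()

    query.awaitTermination()
  }
}
```

Update output mode is used here because watermark-based state cleanup does not apply in Complete mode, which must keep every window.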
-------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/examples/sparktwo/streaming/FileStreamExample.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.examples.sparktwo.streaming 2 | 3 | import java.util.concurrent.TimeUnit 4 | 5 | import org.apache.spark.sql.SparkSession 6 | import org.apache.spark.sql.streaming.{OutputMode, Trigger} 7 | import org.apache.spark.sql.types.{StringType, StructField, StructType} 8 | 9 | /** 10 | * Created by madhu on 24/07/17. 11 | */ 12 | object FileStreamExample { 13 | 14 | def main(args: Array[String]): Unit = { 15 | 16 | val sparkSession = SparkSession.builder 17 | .master("local") 18 | .appName("example") 19 | .getOrCreate() 20 | 21 | val schema = StructType( 22 | Array(StructField("transactionId", StringType), 23 | StructField("customerId", StringType), 24 | StructField("itemId", StringType), 25 | StructField("amountPaid", StringType))) 26 | 27 | //create stream from folder 28 | val fileStreamDf = sparkSession.readStream 29 | .option("header", "true") 30 | .schema(schema) 31 | .csv("/tmp/input") 32 | 33 | val query = fileStreamDf.writeStream 34 | .format("console") 35 | .outputMode(OutputMode.Append()).start() 36 | 37 | query.awaitTermination() 38 | 39 | } 40 | 41 | } 42 | -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/examples/sparktwo/streaming/IngestionTimeWindow.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.examples.sparktwo.streaming 2 | 3 | import java.sql.Timestamp 4 | 5 | import org.apache.spark.sql.SparkSession 6 | import org.apache.spark.sql.functions._ 7 | import org.apache.spark.sql.streaming.OutputMode 8 | 9 | object IngestionTimeWindow { 10 | 11 | def main(args: Array[String]): Unit = { 12 | val sparkSession = SparkSession.builder 13 | .master("local") 14 | .appName("example") 15 | .getOrCreate() 16 | //create stream from socket 17 | sparkSession.sparkContext.setLogLevel("ERROR") 18 | val socketStreamDf = sparkSession.readStream 19 | .format("socket") 20 | .option("host", "localhost") 21 | .option("port", 50050) 22 | .option("includeTimestamp", true) 23 | .load() 24 | import sparkSession.implicits._ 25 | val socketDs = socketStreamDf.as[(String, Timestamp)] 26 | val wordsDs = socketDs 27 | .flatMap(line => line._1.split(" ").map(word => { 28 | Thread.sleep(15000) 29 | (word, line._2) 30 | })) 31 | .toDF("word", "timestamp") 32 | 33 | val windowedCount = wordsDs 34 | .groupBy( 35 | window($"timestamp", "15 seconds") 36 | ) 37 | .count() 38 | .orderBy("window") 39 | 40 | 41 | val query = 42 | windowedCount.writeStream 43 | .format("console").option("truncate","false") 44 | .outputMode(OutputMode.Complete()).start() 45 | 46 | 47 | query.awaitTermination() 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/examples/sparktwo/streaming/ProcessingTimeWindow.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.examples.sparktwo.streaming 2 | import java.sql.Timestamp 3 | 4 | import org.apache.spark.sql.SparkSession 5 | import org.apache.spark.sql.functions._ 6 | 7 | import org.apache.spark.sql.streaming.OutputMode 8 | 9 | object ProcessingTimeWindow { 10 | 11 | def main(args: Array[String]): Unit = { 12 | val sparkSession = 
SparkSession.builder 13 | .master("local") 14 | .appName("example") 15 | .getOrCreate() 16 | //create stream from socket 17 | sparkSession.sparkContext.setLogLevel("ERROR") 18 | val socketStreamDf = sparkSession.readStream 19 | .format("socket") 20 | .option("host", "localhost") 21 | .option("port", 50050) 22 | .load() 23 | val currentTimeDf = socketStreamDf.withColumn("processingTime",current_timestamp()) 24 | import sparkSession.implicits._ 25 | val socketDs = currentTimeDf.as[(String, Timestamp)] 26 | val wordsDs = socketDs 27 | .flatMap(line => line._1.split(" ").map(word => (word, line._2))) 28 | .toDF("word", "processingTime") 29 | 30 | val windowedCount = wordsDs 31 | .groupBy( 32 | window($"processingTime", "15 seconds") 33 | ) 34 | .count() 35 | .orderBy("window") 36 | 37 | val query = 38 | windowedCount.writeStream 39 | .format("console") 40 | .option("truncate","false") 41 | .outputMode(OutputMode.Complete()) 42 | 43 | query.start().awaitTermination() 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/examples/sparktwo/streaming/RecoverableAggregation.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.examples.sparktwo.streaming 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.streaming.OutputMode 5 | import org.apache.spark.sql.types.{ 6 | DoubleType, 7 | StringType, 8 | StructField, 9 | StructType 10 | } 11 | 12 | object RecoverableAggregation { 13 | 14 | def main(args: Array[String]): Unit = { 15 | val sparkSession = SparkSession.builder 16 | .master("local") 17 | .appName("example") 18 | .getOrCreate() 19 | 20 | val schema = StructType( 21 | Array(StructField("transactionId", StringType), 22 | StructField("customerId", StringType), 23 | StructField("itemId", StringType), 24 | StructField("amountPaid", DoubleType))) 25 | 26 | //create stream from folder 27 | val fileStreamDf = sparkSession.readStream 28 | .option("header", "true") 29 | .schema(schema) 30 | .csv("/tmp/input") 31 | 32 | val countDs = fileStreamDf.groupBy("customerId").sum("amountPaid") 33 | val query = 34 | countDs.writeStream 35 | .format("console") 36 | .option("checkpointLocation", "/tmp/checkpoint") 37 | .outputMode(OutputMode.Complete()) 38 | 39 | query.start().awaitTermination() 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/examples/sparktwo/streaming/SessionisationExample.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.examples.sparktwo.streaming 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.streaming.{ GroupState, GroupStateTimeout } 5 | 6 | import scala.util.Try 7 | 8 | case class Session(sessionId: String, value: Double, endSignal: Option[String]) 9 | 10 | case class SessionInfo( 11 | totalSum: Double) 12 | 13 | case class SessionUpdate( 14 | id: String, 15 | totalSum: Double, 16 | expired: Boolean) 17 | 18 | object SessionisationExample { 19 | 20 | def main(args: Array[String]): Unit = { 21 | val sparkSession = SparkSession.builder 22 | .master("local") 23 | .appName("example") 24 | .getOrCreate() 25 | //create stream from socket 26 | sparkSession.sparkContext.setLogLevel("ERROR") 27 | val socketStreamDf = sparkSession.readStream 28 | .format("socket") 29 | .option("host", "localhost") 30 | .option("port", 50050) 31 | 
.load() 32 | import sparkSession.implicits._ 33 | val socketDs = socketStreamDf.as[String] 34 | 35 | // events 36 | val events = socketDs.map(line ⇒ { 37 | val columns = line.split(",") 38 | val endSignal = Try(Some(columns(2))).getOrElse(None) 39 | Session(columns(0), columns(1).toDouble, endSignal) 40 | }) 41 | 42 | val sessionUpdates = events.groupByKey(_.sessionId) 43 | .mapGroupsWithState[SessionInfo, SessionUpdate](GroupStateTimeout.NoTimeout()) { 44 | case (sessionId: String, eventsIter: Iterator[Session], state: GroupState[SessionInfo]) ⇒ 45 | val events = eventsIter.toSeq 46 | val updatedSession = if (state.exists) { 47 | val existingState = state.get 48 | val updatedEvents = SessionInfo(existingState.totalSum + events.map(event ⇒ event.value).reduce(_ + _)) 49 | updatedEvents 50 | } 51 | else { 52 | SessionInfo(events.map(event => event.value).reduce(_+_)) 53 | } 54 | state.update(updatedSession) 55 | //check did we get end signal or not 56 | val isEndSignal = events.filter(value ⇒ value.endSignal.isDefined).length > 0 57 | if (isEndSignal) { 58 | state.remove() 59 | SessionUpdate(sessionId, updatedSession.totalSum, true) 60 | } 61 | else { 62 | SessionUpdate(sessionId, updatedSession.totalSum, false) 63 | } 64 | } 65 | 66 | val query = sessionUpdates 67 | .writeStream 68 | .outputMode("update") 69 | .format("console") 70 | .start() 71 | 72 | query.awaitTermination() 73 | 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/examples/sparktwo/streaming/SocketMiniBatchExample.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.examples.sparktwo.streaming 2 | 3 | import java.util.concurrent.TimeUnit 4 | 5 | import org.apache.spark.sql.SparkSession 6 | import org.apache.spark.sql.streaming.{OutputMode, Trigger} 7 | 8 | /** 9 | * Created by madhu on 24/07/17. 10 | */ 11 | object SocketMiniBatchExample { 12 | 13 | def main(args: Array[String]): Unit = { 14 | 15 | val sparkSession = SparkSession.builder. 16 | master("local") 17 | .appName("example") 18 | .getOrCreate() 19 | 20 | //create stream from socket 21 | 22 | val socketStreamDf = sparkSession.readStream. 23 | format("socket") 24 | .option("host", "localhost") 25 | .option("port", 50050).load() 26 | 27 | val query = socketStreamDf.writeStream.format("console").outputMode(OutputMode.Append()).trigger( 28 | Trigger.ProcessingTime(10, TimeUnit.SECONDS) 29 | ).start() 30 | 31 | query.awaitTermination() 32 | } 33 | 34 | } 35 | -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/examples/sparktwo/streaming/SocketReadExample.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.examples.sparktwo.streaming 2 | 3 | import org.apache.spark.sql.streaming.OutputMode 4 | import org.apache.spark.sql.{SaveMode, SparkSession} 5 | 6 | /** 7 | * Created by madhu on 24/07/17. 
8 | */ 9 | object SocketReadExample { 10 | 11 | def main(args: Array[String]): Unit = { 12 | 13 | val sparkSession = SparkSession.builder 14 | .master("local") 15 | .appName("example") 16 | .getOrCreate() 17 | 18 | //create stream from socket 19 | 20 | val socketStreamDf = sparkSession.readStream 21 | .format("socket") 22 | .option("host", "localhost") 23 | .option("port", 50050) 24 | .load() 25 | 26 | val consoleDataFrameWriter = socketStreamDf.writeStream 27 | .format("console") 28 | .outputMode(OutputMode.Append()) 29 | 30 | val query = consoleDataFrameWriter.start() 31 | 32 | query.awaitTermination() 33 | 34 | } 35 | 36 | } 37 | -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/examples/sparktwo/streaming/SocketWordCount.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.examples.sparktwo.streaming 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.streaming.OutputMode 5 | 6 | /** 7 | * Created by madhu on 24/07/17. 8 | */ 9 | object SocketWordCount { 10 | 11 | def main(args: Array[String]): Unit = { 12 | val sparkSession = SparkSession.builder 13 | .master("local") 14 | .appName("example") 15 | .getOrCreate() 16 | 17 | //create stream from socket 18 | 19 | val socketStreamDf = sparkSession.readStream 20 | .format("socket") 21 | .option("host", "localhost") 22 | .option("port", 50050) 23 | .load() 24 | 25 | import sparkSession.implicits._ 26 | val socketDs = socketStreamDf.as[String] 27 | val wordsDs = socketDs.flatMap(value => value.split(" ")) 28 | val countDs = wordsDs.groupBy("value").count() 29 | 30 | val query = 31 | countDs.writeStream.format("console").outputMode(OutputMode.Complete()) 32 | 33 | query.start().awaitTermination() 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/examples/sparktwo/streaming/StatelessWordCount.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.examples.sparktwo.streaming 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.streaming.{ OutputMode, Trigger } 5 | 6 | /** 7 | * Created by madhu on 24/07/17. 8 | */ 9 | object StatelessWordCount { 10 | 11 | def main(args: Array[String]): Unit = { 12 | val sparkSession = SparkSession.builder 13 | .master("local") 14 | .appName("example") 15 | .getOrCreate() 16 | 17 | //create stream from socket 18 | 19 | val socketStreamDf = sparkSession.readStream 20 | .format("socket") 21 | .option("host", "localhost") 22 | .option("port", 50050) 23 | .load() 24 | 25 | import sparkSession.implicits._ 26 | val socketDs = socketStreamDf.as[String] 27 | val wordsDs = socketDs.flatMap(value ⇒ value.split(" ")) 28 | 29 | val countDs = wordsDs.groupByKey(value => value).flatMapGroups{ 30 | case (value, iter) ⇒ Iterator((value, iter.length)) 31 | }.toDF("value", "count") 32 | 33 | val query = 34 | countDs.writeStream.format("console").outputMode(OutputMode.Append()). 
35 | trigger(Trigger.ProcessingTime("5 seconds")) 36 | 37 | query.start().awaitTermination() 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/examples/sparktwo/streaming/StreamJoin.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.examples.sparktwo.streaming 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.streaming.OutputMode 5 | 6 | /** 7 | * Created by madhu on 24/07/17. 8 | */ 9 | object StreamJoin { 10 | 11 | case class Sales( 12 | transactionId: String, 13 | customerId: String, 14 | itemId: String, 15 | amountPaid: Double) 16 | case class Customer(customerId: String, customerName: String) 17 | def main(args: Array[String]): Unit = { 18 | val sparkSession = SparkSession.builder 19 | .master("local") 20 | .appName("example") 21 | .getOrCreate() 22 | 23 | //create stream from socket 24 | val socketStreamDf = sparkSession.readStream 25 | .format("socket") 26 | .option("host", "localhost") 27 | .option("port", 50050) 28 | .load() 29 | 30 | import sparkSession.implicits._ 31 | //take customer data as static df 32 | val customerDs = sparkSession.read 33 | .format("csv") 34 | .option("header", true) 35 | .load("src/main/resources/customers.csv") 36 | .as[Customer] 37 | 38 | import sparkSession.implicits._ 39 | val dataDf = socketStreamDf.as[String].flatMap(value ⇒ value.split(" ")) 40 | val salesDs = dataDf 41 | .as[String] 42 | .map(value ⇒ { 43 | val values = value.split(",") 44 | Sales(values(0), values(1), values(2), values(3).toDouble) 45 | }) 46 | 47 | val joinedDs = salesDs 48 | .join(customerDs, "customerId") 49 | //create sales schema 50 | val query = 51 | joinedDs.writeStream.format("console").outputMode(OutputMode.Append()) 52 | 53 | query.start().awaitTermination() 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/examples/sparktwo/streaming/WaterMarkExample.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.examples.sparktwo.streaming 2 | 3 | import java.sql.Timestamp 4 | 5 | import org.apache.spark.sql.SparkSession 6 | import org.apache.spark.sql.functions._ 7 | import org.apache.spark.sql.streaming.OutputMode 8 | 9 | object WaterMarkExample { 10 | 11 | case class Stock(time: Timestamp, symbol: String, value: Double) 12 | 13 | def main(args: Array[String]): Unit = { 14 | val sparkSession = SparkSession.builder 15 | .master("local") 16 | .appName("example") 17 | .getOrCreate() 18 | //create stream from socket 19 | 20 | import sparkSession.implicits._ 21 | sparkSession.sparkContext.setLogLevel("ERROR") 22 | val socketStreamDs = sparkSession.readStream 23 | .format("socket") 24 | .option("host", "localhost") 25 | .option("port", 50050) 26 | .load() 27 | .as[String] 28 | 29 | // read as stock 30 | val stockDs = socketStreamDs.map(value => { 31 | val columns = value.split(",") 32 | Stock(new Timestamp(columns(0).toLong), columns(1), columns(2).toDouble) 33 | }) 34 | 35 | val windowedCount = stockDs 36 | .withWatermark("time", "500 milliseconds") 37 | .groupBy( 38 | window($"time", "10 seconds") 39 | ) 40 | .sum("value") 41 | 42 | val query = 43 | windowedCount.writeStream 44 | .format("console") 45 | .option("truncate", "false") 46 | .outputMode(OutputMode.Update()) 47 | 48 | query.start().awaitTermination() 49 | } 50 | } 51 | 
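Editor's note (sketch, not a file in the repository): all of the socket-based examples above read from localhost:50050. A minimal feeder for EventTimeExample and WaterMarkExample, which both expect lines of the form "epochMillis,symbol,value", could look like the code below; the symbol and values are purely illustrative. Start the feeder first and then the Spark job, since the socket source connects to this port as a client.

import java.io.PrintWriter
import java.net.ServerSocket

object StockSocketFeeder {
  def main(args: Array[String]): Unit = {
    val server = new ServerSocket(50050)
    val client = server.accept() // wait for the Spark socket source to connect
    val out = new PrintWriter(client.getOutputStream, true)
    val now = System.currentTimeMillis()
    Seq(
      s"$now,aapl,500.0",
      s"${now + 5000},aapl,600.0",
      s"${now - 11000},aapl,400.0" // a late event, interesting for the watermark case
    ).foreach(line => out.println(line))
    Thread.sleep(60000) // keep the connection open while the query runs
    client.close()
    server.close()
  }
}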
-------------------------------------------------------------------------------- /todo.txt: -------------------------------------------------------------------------------- 1 | * Recoverable State 2 | * Using Window API 3 | * Processing Time 4 | * Event Time 5 | * Custom Trigger - Sessionisation 6 | * Different kinds of sinks 7 | * Kafka integration (with offsets) 8 | * Writing custom sources --------------------------------------------------------------------------------