├── .github └── workflows │ └── scala.yml ├── .gitignore ├── README.md ├── build.sbt ├── project └── build.properties └── src └── main ├── resources ├── adult.csv ├── customers.csv ├── metrics.properties ├── multicharacterseperator.csv ├── nested │ ├── a.csv │ └── folder1 │ │ └── b.csv └── sales.csv └── scala └── com └── madhukaraphatak └── spark ├── barrier ├── BarrierContextExample.scala ├── BarrierExceptionExample.scala ├── BarrierMethodExample.scala └── BarrierRddExample.scala ├── core ├── BarrierExample.scala └── plugins │ ├── custommetrics │ ├── CustomMetricExample.scala │ └── CustomMetricSparkPlugin.scala │ ├── driverplugin │ ├── CustomDriverPlugin.scala │ ├── CustomSparkPlugin.scala │ └── DriverPluginExample.scala │ ├── dynamicconfig │ ├── Configuration.scala │ ├── CustomConfigDriverPlugin.scala │ ├── CustomConfigSparkPlugin.scala │ └── DynamicConfigExample.scala │ └── rpccommunication │ ├── RpcCommunicationExample.scala │ └── RpcSparkPlugin.scala ├── ml ├── MLUtils.scala ├── MultiColumnTransformer.scala └── WeightedLogisticRegression.scala ├── sources ├── BinaryFile.scala ├── MultiCharacterDelimiterCSV.scala ├── RecursiveFolderReadExample.scala └── datasourcev2 │ ├── DataSourceV2Example.scala │ ├── SimpleCsvDataSource.scala │ ├── SimpleDataSource.scala │ ├── SimpleMultiDataSource.scala │ ├── SimpleMysqlWriterDataSource.scala │ ├── streamandbatch │ ├── DataSourceV2StreamAndBatchExample.scala │ └── SimpleStreamAndBatchDataSource.scala │ └── streaming │ ├── DataSourceV2StreamingExample.scala │ └── SimpleStreamingDataSource.scala └── sql ├── DataFrameTail.scala ├── InMemoryTableScanExample.scala ├── JoinHintsExample.scala ├── MinAndMaxByExample.scala ├── MultiColumnSampleBy.scala └── adaptive └── shuffle ├── AdaptiveShuffle.scala └── NoAdaptiveShuffle.scala /.github/workflows/scala.yml: -------------------------------------------------------------------------------- 1 | name: Scala CI 2 | 3 | on: 4 | push: 5 | branches: [ master ] 6 | pull_request: 7 | branches: [ master ] 8 | 9 | jobs: 10 | build: 11 | 12 | runs-on: ubuntu-latest 13 | 14 | steps: 15 | - uses: actions/checkout@v2 16 | - name: Set up JDK 1.8 17 | uses: actions/setup-java@v1 18 | with: 19 | java-version: 1.8 20 | - name: Compile 21 | run: sbt compile 22 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.iml 2 | .idea/ 3 | *.iml 4 | target/ 5 | project/target 6 | dependency-reduced-pom.xml 7 | *.pdf 8 | *.swp 9 | *.sw* 10 | metastore_db 11 | *.log 12 | creditcard* -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Examples for Spark 3.0 release. 
2 | 3 | # Build 4 | 5 | sbt clean package -------------------------------------------------------------------------------- /build.sbt: -------------------------------------------------------------------------------- 1 | name := "spark-three-examples" 2 | 3 | version := "0.1" 4 | 5 | scalaVersion := "2.12.4" 6 | 7 | val sparkVersion = "3.0.1" 8 | 9 | 10 | resolvers += "Spark Snapshot Repository" at "https://repository.apache.org/snapshots" 11 | 12 | libraryDependencies ++= Seq( 13 | "org.apache.spark" %% "spark-sql" % sparkVersion, 14 | "org.apache.spark" %% "spark-mllib" % sparkVersion, 15 | "mysql" % "mysql-connector-java" % "5.1.6" 16 | ) 17 | 18 | -------------------------------------------------------------------------------- /project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version = 1.3.8 -------------------------------------------------------------------------------- /src/main/resources/adult.csv: -------------------------------------------------------------------------------- 1 | age,workclass,fnlwgt,education,education_num,martial_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,salary 2 | 39, State-gov, 77516, Bachelors, 13, Never-married, Adm-clerical, Not-in-family, White, Male, 2174, 0, 40, United-States, <=50K 3 | 50, Self-emp-not-inc, 83311, Bachelors, 13, Married-civ-spouse, Exec-managerial, Husband, White, Male, 0, 0, 13, United-States, <=50K 4 | 38, Private, 215646, HS-grad, 9, Divorced, Handlers-cleaners, Not-in-family, White, Male, 0, 0, 40, United-States, <=50K 5 | 53, Private, 234721, 11th, 7, Married-civ-spouse, Handlers-cleaners, Husband, Black, Male, 0, 0, 40, United-States, <=50K 6 | 28, Private, 338409, Bachelors, 13, Married-civ-spouse, Prof-specialty, Wife, Black, Female, 0, 0, 40, Cuba, <=50K 7 | 37, Private, 284582, Masters, 14, Married-civ-spouse, Exec-managerial, Wife, White, Female, 0, 0, 40, United-States, <=50K 8 | 49, Private, 160187, 9th, 5, Married-spouse-absent, Other-service, Not-in-family, Black, Female, 0, 0, 16, Jamaica, <=50K 9 | 52, Self-emp-not-inc, 209642, HS-grad, 9, Married-civ-spouse, Exec-managerial, Husband, White, Male, 0, 0, 45, United-States, >50K 10 | 31, Private, 45781, Masters, 14, Never-married, Prof-specialty, Not-in-family, White, Female, 14084, 0, 50, United-States, >50K 11 | 42, Private, 159449, Bachelors, 13, Married-civ-spouse, Exec-managerial, Husband, White, Male, 5178, 0, 40, United-States, >50K 12 | 37, Private, 280464, Some-college, 10, Married-civ-spouse, Exec-managerial, Husband, Black, Male, 0, 0, 80, United-States, >50K 13 | 30, State-gov, 141297, Bachelors, 13, Married-civ-spouse, Prof-specialty, Husband, Asian-Pac-Islander, Male, 0, 0, 40, India, >50K 14 | 23, Private, 122272, Bachelors, 13, Never-married, Adm-clerical, Own-child, White, Female, 0, 0, 30, United-States, <=50K 15 | 32, Private, 205019, Assoc-acdm, 12, Never-married, Sales, Not-in-family, Black, Male, 0, 0, 50, United-States, <=50K 16 | 40, Private, 121772, Assoc-voc, 11, Married-civ-spouse, Craft-repair, Husband, Asian-Pac-Islander, Male, 0, 0, 40, ?, >50K 17 | 34, Private, 245487, 7th-8th, 4, Married-civ-spouse, Transport-moving, Husband, Amer-Indian-Eskimo, Male, 0, 0, 45, Mexico, <=50K 18 | 25, Self-emp-not-inc, 176756, HS-grad, 9, Never-married, Farming-fishing, Own-child, White, Male, 0, 0, 35, United-States, <=50K 19 | 32, Private, 186824, HS-grad, 9, Never-married, Machine-op-inspct, Unmarried, White, Male, 0, 0, 40, 
United-States, <=50K 20 | 38, Private, 28887, 11th, 7, Married-civ-spouse, Sales, Husband, White, Male, 0, 0, 50, United-States, <=50K 21 | 43, Self-emp-not-inc, 292175, Masters, 14, Divorced, Exec-managerial, Unmarried, White, Female, 0, 0, 45, United-States, >50K 22 | 40, Private, 193524, Doctorate, 16, Married-civ-spouse, Prof-specialty, Husband, White, Male, 0, 0, 60, United-States, >50K 23 | 54, Private, 302146, HS-grad, 9, Separated, Other-service, Unmarried, Black, Female, 0, 0, 20, United-States, <=50K 24 | 35, Federal-gov, 76845, 9th, 5, Married-civ-spouse, Farming-fishing, Husband, Black, Male, 0, 0, 40, United-States, <=50K 25 | 43, Private, 117037, 11th, 7, Married-civ-spouse, Transport-moving, Husband, White, Male, 0, 2042, 40, United-States, <=50K 26 | 59, Private, 109015, HS-grad, 9, Divorced, Tech-support, Unmarried, White, Female, 0, 0, 40, United-States, <=50K 27 | 56, Local-gov, 216851, Bachelors, 13, Married-civ-spouse, Tech-support, Husband, White, Male, 0, 0, 40, United-States, >50K 28 | 19, Private, 168294, HS-grad, 9, Never-married, Craft-repair, Own-child, White, Male, 0, 0, 40, United-States, <=50K 29 | 54, ?, 180211, Some-college, 10, Married-civ-spouse, ?, Husband, Asian-Pac-Islander, Male, 0, 0, 60, South, >50K 30 | 39, Private, 367260, HS-grad, 9, Divorced, Exec-managerial, Not-in-family, White, Male, 0, 0, 80, United-States, <=50K 31 | 49, Private, 193366, HS-grad, 9, Married-civ-spouse, Craft-repair, Husband, White, Male, 0, 0, 40, United-States, <=50K 32 | 23, Local-gov, 190709, Assoc-acdm, 12, Never-married, Protective-serv, Not-in-family, White, Male, 0, 0, 52, United-States, <=50K 33 | 20, Private, 266015, Some-college, 10, Never-married, Sales, Own-child, Black, Male, 0, 0, 44, United-States, <=50K 34 | 45, Private, 386940, Bachelors, 13, Divorced, Exec-managerial, Own-child, White, Male, 0, 1408, 40, United-States, <=50K 35 | 30, Federal-gov, 59951, Some-college, 10, Married-civ-spouse, Adm-clerical, Own-child, White, Male, 0, 0, 40, United-States, <=50K 36 | 22, State-gov, 311512, Some-college, 10, Married-civ-spouse, Other-service, Husband, Black, Male, 0, 0, 15, United-States, <=50K 37 | 48, Private, 242406, 11th, 7, Never-married, Machine-op-inspct, Unmarried, White, Male, 0, 0, 40, Puerto-Rico, <=50K 38 | 21, Private, 197200, Some-college, 10, Never-married, Machine-op-inspct, Own-child, White, Male, 0, 0, 40, United-States, <=50K 39 | 19, Private, 544091, HS-grad, 9, Married-AF-spouse, Adm-clerical, Wife, White, Female, 0, 0, 25, United-States, <=50K 40 | 31, Private, 84154, Some-college, 10, Married-civ-spouse, Sales, Husband, White, Male, 0, 0, 38, ?, >50K 41 | 48, Self-emp-not-inc, 265477, Assoc-acdm, 12, Married-civ-spouse, Prof-specialty, Husband, White, Male, 0, 0, 40, United-States, <=50K 42 | 31, Private, 507875, 9th, 5, Married-civ-spouse, Machine-op-inspct, Husband, White, Male, 0, 0, 43, United-States, <=50K 43 | 53, Self-emp-not-inc, 88506, Bachelors, 13, Married-civ-spouse, Prof-specialty, Husband, White, Male, 0, 0, 40, United-States, <=50K 44 | 24, Private, 172987, Bachelors, 13, Married-civ-spouse, Tech-support, Husband, White, Male, 0, 0, 50, United-States, <=50K 45 | 49, Private, 94638, HS-grad, 9, Separated, Adm-clerical, Unmarried, White, Female, 0, 0, 40, United-States, <=50K 46 | 25, Private, 289980, HS-grad, 9, Never-married, Handlers-cleaners, Not-in-family, White, Male, 0, 0, 35, United-States, <=50K 47 | 57, Federal-gov, 337895, Bachelors, 13, Married-civ-spouse, Prof-specialty, Husband, Black, Male, 0, 0, 40, 
United-States, >50K 48 | 53, Private, 144361, HS-grad, 9, Married-civ-spouse, Machine-op-inspct, Husband, White, Male, 0, 0, 38, United-States, <=50K 49 | 44, Private, 128354, Masters, 14, Divorced, Exec-managerial, Unmarried, White, Female, 0, 0, 40, United-States, <=50K 50 | 41, State-gov, 101603, Assoc-voc, 11, Married-civ-spouse, Craft-repair, Husband, White, Male, 0, 0, 40, United-States, <=50K 51 | 29, Private, 271466, Assoc-voc, 11, Never-married, Prof-specialty, Not-in-family, White, Male, 0, 0, 43, United-States, <=50K 52 | 25, Private, 32275, Some-college, 10, Married-civ-spouse, Exec-managerial, Wife, Other, Female, 0, 0, 40, United-States, <=50K 53 | 18, Private, 226956, HS-grad, 9, Never-married, Other-service, Own-child, White, Female, 0, 0, 30, ?, <=50K 54 | 47, Private, 51835, Prof-school, 15, Married-civ-spouse, Prof-specialty, Wife, White, Female, 0, 1902, 60, Honduras, >50K 55 | 50, Federal-gov, 251585, Bachelors, 13, Divorced, Exec-managerial, Not-in-family, White, Male, 0, 0, 55, United-States, >50K 56 | 47, Self-emp-inc, 109832, HS-grad, 9, Divorced, Exec-managerial, Not-in-family, White, Male, 0, 0, 60, United-States, <=50K 57 | 43, Private, 237993, Some-college, 10, Married-civ-spouse, Tech-support, Husband, White, Male, 0, 0, 40, United-States, >50K 58 | 46, Private, 216666, 5th-6th, 3, Married-civ-spouse, Machine-op-inspct, Husband, White, Male, 0, 0, 40, Mexico, <=50K 59 | 35, Private, 56352, Assoc-voc, 11, Married-civ-spouse, Other-service, Husband, White, Male, 0, 0, 40, Puerto-Rico, <=50K 60 | 41, Private, 147372, HS-grad, 9, Married-civ-spouse, Adm-clerical, Husband, White, Male, 0, 0, 48, United-States, <=50K 61 | 30, Private, 188146, HS-grad, 9, Married-civ-spouse, Machine-op-inspct, Husband, White, Male, 5013, 0, 40, United-States, <=50K 62 | 30, Private, 59496, Bachelors, 13, Married-civ-spouse, Sales, Husband, White, Male, 2407, 0, 40, United-States, <=50K 63 | 32, ?, 293936, 7th-8th, 4, Married-spouse-absent, ?, Not-in-family, White, Male, 0, 0, 40, ?, <=50K 64 | 48, Private, 149640, HS-grad, 9, Married-civ-spouse, Transport-moving, Husband, White, Male, 0, 0, 40, United-States, <=50K 65 | 42, Private, 116632, Doctorate, 16, Married-civ-spouse, Prof-specialty, Husband, White, Male, 0, 0, 45, United-States, >50K 66 | 29, Private, 105598, Some-college, 10, Divorced, Tech-support, Not-in-family, White, Male, 0, 0, 58, United-States, <=50K 67 | 36, Private, 155537, HS-grad, 9, Married-civ-spouse, Craft-repair, Husband, White, Male, 0, 0, 40, United-States, <=50K 68 | 28, Private, 183175, Some-college, 10, Divorced, Adm-clerical, Not-in-family, White, Female, 0, 0, 40, United-States, <=50K 69 | 53, Private, 169846, HS-grad, 9, Married-civ-spouse, Adm-clerical, Wife, White, Female, 0, 0, 40, United-States, >50K 70 | 49, Self-emp-inc, 191681, Some-college, 10, Married-civ-spouse, Exec-managerial, Husband, White, Male, 0, 0, 50, United-States, >50K 71 | 25, ?, 200681, Some-college, 10, Never-married, ?, Own-child, White, Male, 0, 0, 40, United-States, <=50K 72 | 19, Private, 101509, Some-college, 10, Never-married, Prof-specialty, Own-child, White, Male, 0, 0, 32, United-States, <=50K 73 | 31, Private, 309974, Bachelors, 13, Separated, Sales, Own-child, Black, Female, 0, 0, 40, United-States, <=50K 74 | 29, Self-emp-not-inc, 162298, Bachelors, 13, Married-civ-spouse, Sales, Husband, White, Male, 0, 0, 70, United-States, >50K 75 | 23, Private, 211678, Some-college, 10, Never-married, Machine-op-inspct, Not-in-family, White, Male, 0, 0, 40, United-States, <=50K 
76 | 79, Private, 124744, Some-college, 10, Married-civ-spouse, Prof-specialty, Other-relative, White, Male, 0, 0, 20, United-States, <=50K 77 | 27, Private, 213921, HS-grad, 9, Never-married, Other-service, Own-child, White, Male, 0, 0, 40, Mexico, <=50K 78 | 40, Private, 32214, Assoc-acdm, 12, Married-civ-spouse, Adm-clerical, Husband, White, Male, 0, 0, 40, United-States, <=50K 79 | 67, ?, 212759, 10th, 6, Married-civ-spouse, ?, Husband, White, Male, 0, 0, 2, United-States, <=50K 80 | 18, Private, 309634, 11th, 7, Never-married, Other-service, Own-child, White, Female, 0, 0, 22, United-States, <=50K 81 | 31, Local-gov, 125927, 7th-8th, 4, Married-civ-spouse, Farming-fishing, Husband, White, Male, 0, 0, 40, United-States, <=50K 82 | 18, Private, 446839, HS-grad, 9, Never-married, Sales, Not-in-family, White, Male, 0, 0, 30, United-States, <=50K 83 | 52, Private, 276515, Bachelors, 13, Married-civ-spouse, Other-service, Husband, White, Male, 0, 0, 40, Cuba, <=50K 84 | 46, Private, 51618, HS-grad, 9, Married-civ-spouse, Other-service, Wife, White, Female, 0, 0, 40, United-States, <=50K 85 | 59, Private, 159937, HS-grad, 9, Married-civ-spouse, Sales, Husband, White, Male, 0, 0, 48, United-States, <=50K 86 | 44, Private, 343591, HS-grad, 9, Divorced, Craft-repair, Not-in-family, White, Female, 14344, 0, 40, United-States, >50K 87 | 53, Private, 346253, HS-grad, 9, Divorced, Sales, Own-child, White, Female, 0, 0, 35, United-States, <=50K 88 | 49, Local-gov, 268234, HS-grad, 9, Married-civ-spouse, Protective-serv, Husband, White, Male, 0, 0, 40, United-States, >50K 89 | 33, Private, 202051, Masters, 14, Married-civ-spouse, Prof-specialty, Husband, White, Male, 0, 0, 50, United-States, <=50K 90 | 30, Private, 54334, 9th, 5, Never-married, Sales, Not-in-family, White, Male, 0, 0, 40, United-States, <=50K 91 | 43, Federal-gov, 410867, Doctorate, 16, Never-married, Prof-specialty, Not-in-family, White, Female, 0, 0, 50, United-States, >50K 92 | 57, Private, 249977, Assoc-voc, 11, Married-civ-spouse, Prof-specialty, Husband, White, Male, 0, 0, 40, United-States, <=50K 93 | 37, Private, 286730, Some-college, 10, Divorced, Craft-repair, Unmarried, White, Female, 0, 0, 40, United-States, <=50K 94 | 28, Private, 212563, Some-college, 10, Divorced, Machine-op-inspct, Unmarried, Black, Female, 0, 0, 25, United-States, <=50K 95 | 30, Private, 117747, HS-grad, 9, Married-civ-spouse, Sales, Wife, Asian-Pac-Islander, Female, 0, 1573, 35, ?, <=50K 96 | 34, Local-gov, 226296, Bachelors, 13, Married-civ-spouse, Protective-serv, Husband, White, Male, 0, 0, 40, United-States, >50K 97 | 29, Local-gov, 115585, Some-college, 10, Never-married, Handlers-cleaners, Not-in-family, White, Male, 0, 0, 50, United-States, <=50K 98 | 48, Self-emp-not-inc, 191277, Doctorate, 16, Married-civ-spouse, Prof-specialty, Husband, White, Male, 0, 1902, 60, United-States, >50K 99 | 37, Private, 202683, Some-college, 10, Married-civ-spouse, Sales, Husband, White, Male, 0, 0, 48, United-States, >50K 100 | 48, Private, 171095, Assoc-acdm, 12, Divorced, Exec-managerial, Unmarried, White, Female, 0, 0, 40, England, <=50K 101 | 32, Federal-gov, 249409, HS-grad, 9, Never-married, Other-service, Own-child, Black, Male, 0, 0, 40, United-States, <=50K 102 | 76, Private, 124191, Masters, 14, Married-civ-spouse, Exec-managerial, Husband, White, Male, 0, 0, 40, United-States, >50K 103 | -------------------------------------------------------------------------------- /src/main/resources/customers.csv: 
-------------------------------------------------------------------------------- 1 | customerId,customerName 2 | 1,John 3 | 2,Clerk 4 | 3,Micheal 5 | 4,Sample 6 | -------------------------------------------------------------------------------- /src/main/resources/metrics.properties: -------------------------------------------------------------------------------- 1 | *.sink.console.class=org.apache.spark.metrics.sink.ConsoleSink -------------------------------------------------------------------------------- /src/main/resources/multicharacterseperator.csv: -------------------------------------------------------------------------------- 1 | a||b||c||d 2 | 1||2||3||4 3 | 5||6||7||8 -------------------------------------------------------------------------------- /src/main/resources/nested/a.csv: -------------------------------------------------------------------------------- 1 | a||b||c||d 2 | 1||2||3||4 3 | 5||6||7||8 -------------------------------------------------------------------------------- /src/main/resources/nested/folder1/b.csv: -------------------------------------------------------------------------------- 1 | a||b||c||d 2 | 1||2||3||4 3 | 5||6||7||8 -------------------------------------------------------------------------------- /src/main/resources/sales.csv: -------------------------------------------------------------------------------- 1 | transactionId,customerId,itemId,amountPaid 2 | 111,1,1,100.0 3 | 112,2,2,505.0 4 | 113,3,3,510.0 5 | 114,4,4,600.0 6 | 115,1,2,500.0 7 | 116,1,2,500.0 8 | 117,1,2,500.0 9 | 118,1,2,500.0 10 | 119,2,3,500.0 11 | 120,1,2,500.0 12 | 121,1,4,500.0 13 | 122,1,2,500.0 14 | 123,1,4,500.0 15 | 124,1,2,500.0 -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/spark/barrier/BarrierContextExample.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.spark.barrier 2 | 3 | import org.apache.spark.{BarrierTaskContext, TaskContext} 4 | import org.apache.spark.sql.SparkSession 5 | 6 | object BarrierContextExample { 7 | 8 | def main(args: Array[String]): Unit = { 9 | 10 | val sparkSession = SparkSession.builder. 11 | master("local[4]") 12 | .appName("example") 13 | .getOrCreate() 14 | 15 | val df = sparkSession.range(0,100).repartition(4) 16 | 17 | val barrierRdd = df.rdd.barrier() 18 | 19 | val mappedRDD = barrierRdd.mapPartitionsWithIndex{ case (index,iterator) => { 20 | val taskContext = BarrierTaskContext.get() 21 | val taskInfos = taskContext.getTaskInfos().map(_.address) 22 | println(taskInfos) 23 | iterator 24 | }} 25 | 26 | mappedRDD.count() 27 | 28 | } 29 | 30 | } 31 | -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/spark/barrier/BarrierExceptionExample.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.spark.barrier 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | object BarrierExceptionExample { 6 | 7 | def main(args: Array[String]): Unit = { 8 | 9 | val sparkSession = SparkSession.builder. 
10 | master("local") 11 | .appName("example") 12 | .getOrCreate() 13 | 14 | val df = sparkSession.range(0,100).repartition(4) 15 | 16 | val barrierRdd = df.rdd.barrier() 17 | 18 | //fails running as it needs minimum four cores for four partitions 19 | val count = barrierRdd.mapPartitions(v => v).count() 20 | 21 | println("count is " + count) 22 | 23 | } 24 | 25 | } 26 | -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/spark/barrier/BarrierMethodExample.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.spark.barrier 2 | 3 | import org.apache.spark.BarrierTaskContext 4 | import org.apache.spark.sql.SparkSession 5 | 6 | object BarrierMethodExample { 7 | 8 | def main(args: Array[String]): Unit = { 9 | 10 | val sparkSession = SparkSession.builder. 11 | master("local[4]") 12 | .appName("example") 13 | .getOrCreate() 14 | 15 | val df = sparkSession.range(0,100).repartition(4) 16 | 17 | val barrierRdd = df.rdd.barrier() 18 | 19 | val mappedRDD = barrierRdd.mapPartitionsWithIndex{ case (index,iterator) => { 20 | val taskContext = BarrierTaskContext.get() 21 | taskContext.barrier() 22 | println("barrier context completed") 23 | iterator 24 | }} 25 | 26 | mappedRDD.count() 27 | 28 | } 29 | 30 | } 31 | -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/spark/barrier/BarrierRddExample.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.spark.barrier 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | object BarrierRddExample { 6 | 7 | def main(args: Array[String]): Unit = { 8 | 9 | val sparkSession = SparkSession.builder. 10 | master("local[4]") 11 | .appName("example") 12 | .getOrCreate() 13 | 14 | val df = sparkSession.range(0,100).repartition(4) 15 | 16 | val barrierRdd = df.rdd.barrier() 17 | 18 | val count = barrierRdd.mapPartitions(v => v).count() 19 | 20 | println("count is " + count) 21 | 22 | } 23 | 24 | } 25 | -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/spark/core/BarrierExample.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.spark.core 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | object BarrierExample { 6 | 7 | def main(args: Array[String]): Unit = { 8 | 9 | val sparkSession = SparkSession.builder(). 
10 | appName("simple").master("local[4]").getOrCreate() 11 | 12 | val df = sparkSession.range(100).repartition(2) 13 | 14 | //run barrier mode 15 | 16 | val barrierRDD = df.rdd.barrier() 17 | 18 | barrierRDD.mapPartitionsWithIndex{ 19 | case (index,value) => { 20 | //first wait for 10s 21 | Thread.sleep(10000) 22 | if(index == 1) throw new IllegalArgumentException 23 | Thread.sleep(100000) 24 | value 25 | }}.count() 26 | 27 | 28 | 29 | 30 | } 31 | 32 | } 33 | -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/spark/core/plugins/custommetrics/CustomMetricExample.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.spark.core.plugins.custommetrics 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.sql.SparkSession 5 | 6 | object CustomMetricExample { 7 | 8 | def main(args: Array[String]): Unit = { 9 | 10 | val sparkConf = new SparkConf() 11 | .setMaster("local[4]") 12 | .set("spark.plugins","com.madhukaraphatak.spark.core.plugins.custommetrics.CustomMetricSparkPlugin") 13 | .set("spark.metrics.conf","src/main/resources/metrics.properties") 14 | .setAppName("executor plugin example") 15 | 16 | 17 | val sparkSession = SparkSession.builder.config(sparkConf).getOrCreate() 18 | 19 | import sparkSession.implicits._ 20 | 21 | val df = sparkSession.range(5000).repartition(5) 22 | 23 | val incrementedDf = df.mapPartitions(iterator => { 24 | var evenCount = 0 25 | val incrementedIterator = iterator.toList.map(value => { 26 | if(value % 2 == 0) evenCount = evenCount +1 27 | value +1 28 | }).toIterator 29 | CustomMetricSparkPlugin.value.inc(evenCount) 30 | incrementedIterator 31 | }) 32 | 33 | 34 | incrementedDf.count() 35 | 36 | 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/spark/core/plugins/custommetrics/CustomMetricSparkPlugin.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.spark.core.plugins.custommetrics 2 | 3 | import java.util 4 | 5 | import com.codahale.metrics.Counter 6 | import org.apache.spark.api.plugin.{DriverPlugin, ExecutorPlugin, PluginContext, SparkPlugin} 7 | 8 | object CustomMetricSparkPlugin { 9 | val value = new Counter 10 | } 11 | 12 | class CustomMetricSparkPlugin extends SparkPlugin{ 13 | 14 | override def driverPlugin(): DriverPlugin = null 15 | override def executorPlugin(): ExecutorPlugin = new ExecutorPlugin { 16 | override def init(ctx: PluginContext, extraConf: util.Map[String, String]): Unit = { 17 | val metricRegistry = ctx.metricRegistry() 18 | metricRegistry.register("evenMetrics",CustomMetricSparkPlugin.value) 19 | } 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/spark/core/plugins/driverplugin/CustomDriverPlugin.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.spark.core.plugins.driverplugin 2 | import java.net.ServerSocket 3 | import java.util 4 | 5 | import org.apache.spark.SparkContext 6 | import org.apache.spark.api.plugin.{DriverPlugin, PluginContext} 7 | import org.apache.spark.sql.SparkSession 8 | 9 | class CustomDriverPlugin extends DriverPlugin{ 10 | 11 | var sparkContext:SparkContext =null 12 | var runningThread:Thread = null 13 | 14 | class ServerSocketListener { 15 | var port = 9999 16 | 
val listener = new ServerSocket(port) 17 | while (true) { 18 | val socket = listener.accept() 19 | new Thread(){ 20 | override def run(): Unit = { 21 | println(" got client " + socket.getInetAddress) 22 | val sparkSession = SparkSession.builder().getOrCreate() 23 | sparkSession.catalog.uncacheTable("test") 24 | } 25 | socket.close() 26 | }.start() 27 | } 28 | } 29 | 30 | override def init(sc: SparkContext, pluginContext: PluginContext): util.Map[String, String] = { 31 | println("########### called init of custom driver plugin") 32 | this.sparkContext =sparkContext 33 | 34 | runningThread = new Thread(){ 35 | override def run(): Unit = { 36 | new ServerSocketListener() 37 | } 38 | } 39 | runningThread.start() 40 | 41 | super.init(sc, pluginContext) 42 | } 43 | override def shutdown(): Unit = { 44 | println("############ called shutdown") 45 | runningThread.interrupt() 46 | System.exit(0) 47 | super.shutdown() 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/spark/core/plugins/driverplugin/CustomSparkPlugin.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.spark.core.plugins.driverplugin 2 | 3 | import org.apache.spark.api.plugin.{DriverPlugin, ExecutorPlugin, SparkPlugin} 4 | 5 | class CustomSparkPlugin extends SparkPlugin{ 6 | override def driverPlugin(): DriverPlugin = new CustomDriverPlugin 7 | 8 | override def executorPlugin(): ExecutorPlugin = null 9 | } 10 | -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/spark/core/plugins/driverplugin/DriverPluginExample.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.spark.core.plugins.driverplugin 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.sql.SparkSession 5 | 6 | object DriverPluginExample { 7 | 8 | def main(args: Array[String]): Unit = { 9 | 10 | val sparkConf = new SparkConf() 11 | .setMaster("local[2]") 12 | .set("spark.plugins","com.madhukaraphatak.spark.core.plugins.driverplugin.CustomSparkPlugin") 13 | .setAppName("executor plugin example") 14 | 15 | 16 | val sparkSession = SparkSession.builder.config(sparkConf).getOrCreate() 17 | val df = sparkSession.range(5000) 18 | 19 | //cache the table 20 | df.createOrReplaceTempView("test") 21 | sparkSession.catalog.cacheTable("test") 22 | 23 | df.count() 24 | 25 | Thread.sleep(10000) 26 | 27 | sparkSession.stop() 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/spark/core/plugins/dynamicconfig/Configuration.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.spark.core.plugins.dynamicconfig 2 | 3 | object Configuration { 4 | 5 | private var value = 10 6 | 7 | def getConfig: Int = value 8 | 9 | def changeConfig(newValue : Int):Int = {value = newValue; value} 10 | 11 | } 12 | -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/spark/core/plugins/dynamicconfig/CustomConfigDriverPlugin.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.spark.core.plugins.dynamicconfig 2 | 3 | import java.io.PrintWriter 4 | import java.net.ServerSocket 5 | import java.util 6 | 7 | import org.apache.spark.SparkContext 8 
| import org.apache.spark.api.plugin.{DriverPlugin, PluginContext} 9 | 10 | 11 | class CustomConfigDriverPlugin extends DriverPlugin { 12 | 13 | var sparkContext: SparkContext = null 14 | var runningThread: Thread = null 15 | 16 | class ServerSocketListener { 17 | var port = 9999 18 | val listener = new ServerSocket(port) 19 | while (true) { 20 | val socket = listener.accept() 21 | new Thread() { 22 | override def run(): Unit = { 23 | val currentValue = Configuration.getConfig 24 | Configuration.changeConfig(currentValue + 10) 25 | val response = "HTTP/1.1 200 OK \r\n\r\n "+s" the latest configuration is ${Configuration.getConfig}" 26 | socket.getOutputStream().write(response.getBytes("UTF-8")) 27 | socket.getOutputStream.flush() 28 | socket.close() 29 | } 30 | }.start() 31 | } 32 | } 33 | 34 | override def init(sc: SparkContext, pluginContext: PluginContext): util.Map[String, String] = { 35 | this.sparkContext = sparkContext 36 | 37 | runningThread = new Thread() { 38 | override def run(): Unit = { 39 | new ServerSocketListener() 40 | } 41 | } 42 | runningThread.start() 43 | 44 | super.init(sc, pluginContext) 45 | } 46 | 47 | override def shutdown(): Unit = { 48 | runningThread.interrupt() 49 | System.exit(0) 50 | super.shutdown() 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/spark/core/plugins/dynamicconfig/CustomConfigSparkPlugin.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.spark.core.plugins.dynamicconfig 2 | 3 | import org.apache.spark.api.plugin.{DriverPlugin, ExecutorPlugin, SparkPlugin} 4 | 5 | class CustomConfigSparkPlugin extends SparkPlugin{ 6 | override def driverPlugin(): DriverPlugin = new CustomConfigDriverPlugin 7 | 8 | override def executorPlugin(): ExecutorPlugin = null 9 | } 10 | -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/spark/core/plugins/dynamicconfig/DynamicConfigExample.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.spark.core.plugins.dynamicconfig 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.sql.SparkSession 5 | import org.apache.spark.sql.streaming.{OutputMode, Trigger} 6 | 7 | object DynamicConfigExample { 8 | 9 | def main(args: Array[String]): Unit = { 10 | 11 | val sparkConf = new SparkConf() 12 | .setMaster("local[2]") 13 | .set("spark.plugins","com.madhukaraphatak.spark.core.plugins.dynamicconfig.CustomConfigSparkPlugin") 14 | .setAppName("executor plugin example") 15 | 16 | 17 | 18 | val sparkSession = SparkSession.builder.config(sparkConf).getOrCreate() 19 | 20 | sparkSession.sparkContext.setLogLevel("ERROR") 21 | 22 | import sparkSession.implicits._ 23 | 24 | val df = sparkSession.readStream 25 | .format("socket") 26 | .option("host","localhost") 27 | .option("port",8888).load().as[String] 28 | 29 | val returnDf = df.map(value => value + Configuration.getConfig) 30 | 31 | val query = returnDf.writeStream. 
32 | queryName("something") 33 | .format("console") 34 | .outputMode(OutputMode.Append()) 35 | 36 | query.start().awaitTermination() 37 | 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/spark/core/plugins/rpccommunication/RpcCommunicationExample.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.spark.core.plugins.rpccommunication 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.sql.SparkSession 5 | 6 | object RpcCommunicationExample { 7 | 8 | def main(args: Array[String]): Unit = { 9 | 10 | val sparkConf = new SparkConf() 11 | .setMaster("local[2]") 12 | .set("spark.plugins","com.madhukaraphatak.spark.core.plugins.rpccommunication.RpcSparkPlugin") 13 | .setAppName("rpc communication example") 14 | 15 | val sparkSession = SparkSession.builder.config(sparkConf).getOrCreate() 16 | sparkSession.stop() 17 | 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/spark/core/plugins/rpccommunication/RpcSparkPlugin.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.spark.core.plugins.rpccommunication 2 | 3 | import java.util 4 | 5 | import com.codahale.metrics.Counter 6 | import org.apache.spark.api.plugin.{DriverPlugin, ExecutorPlugin, PluginContext, SparkPlugin} 7 | 8 | 9 | case object InitialConfigRequest extends Serializable 10 | case class InitialConfigResponse(value:Int) extends Serializable 11 | 12 | case class FinalValueResponse(value : Int) extends Serializable 13 | case class RpcMessage(message:String) extends Serializable 14 | 15 | 16 | class RpcSparkPlugin extends SparkPlugin{ 17 | override def driverPlugin(): DriverPlugin = new DriverPlugin { 18 | override def receive(message: scala.Any): AnyRef = { 19 | message match { 20 | case InitialConfigRequest => InitialConfigResponse(10) 21 | case FinalValueResponse(value) => println("the final value is "+ value); Unit 22 | 23 | } 24 | } 25 | } 26 | 27 | override def executorPlugin(): ExecutorPlugin = new ExecutorPlugin { 28 | var pluginContext:PluginContext = null 29 | var initialConfiguration:Int = 0 30 | 31 | override def init(ctx: PluginContext, extraConf: util.Map[String, String]): Unit = { 32 | pluginContext = ctx 33 | initialConfiguration = pluginContext.ask(InitialConfigRequest).asInstanceOf[InitialConfigResponse].value 34 | println("the initial configuration is " + initialConfiguration) 35 | } 36 | 37 | override def shutdown(): Unit = { 38 | val rpcMessage = FinalValueResponse(10 * initialConfiguration) 39 | pluginContext.send(rpcMessage) 40 | } 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/spark/ml/MLUtils.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.spark.ml 2 | 3 | import org.apache.spark.mllib.evaluation.MulticlassMetrics 4 | import org.apache.spark.sql.DataFrame 5 | 6 | object MLUtils { 7 | 8 | def accuracyScore(df: DataFrame, label: String, predictCol: String) = { 9 | val rdd = df.select(predictCol,label).rdd.map(row ⇒ (row.getDouble(0), row.getInt(1).toDouble)) 10 | new MulticlassMetrics(rdd).accuracy 11 | } 12 | def recall(df: DataFrame, labelCol: String, predictCol: String, labelValue:Double) = { 13 | val rdd = 
df.select(predictCol,labelCol).rdd.map(row ⇒ (row.getDouble(0), row.getInt(1).toDouble)) 14 | new MulticlassMetrics(rdd).recall(labelValue) 15 | } 16 | 17 | def trainTestSplit(df:DataFrame, testSize:Double = 0.3):(DataFrame,DataFrame) = { 18 | val dfs = df.randomSplit(Array(1-testSize, testSize)) 19 | val trainDf = dfs(0) 20 | val crossDf = dfs(1) 21 | (trainDf,crossDf) 22 | } 23 | 24 | } 25 | 26 | -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/spark/ml/MultiColumnTransformer.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.spark.ml 2 | 3 | import org.apache.spark.ml.feature.StringIndexer 4 | import org.apache.spark.sql.SparkSession 5 | 6 | object MultiColumnTransformer { 7 | def main(args: Array[String]): Unit = { 8 | 9 | val sparkSession = SparkSession.builder. 10 | master("local") 11 | .appName("example") 12 | .getOrCreate() 13 | 14 | 15 | val salaryDf = sparkSession.read.format("csv").option("header", "true").load("src/main/resources/adult.csv") 16 | 17 | val inputColumns = Array("workclass","education") 18 | 19 | val outputColumns = Array("workclass_indexed", "education_indexed") 20 | 21 | 22 | // indexer multiple column 23 | val stringIndexer = new StringIndexer() 24 | stringIndexer.setInputCols(inputColumns) 25 | stringIndexer.setOutputCols(outputColumns) 26 | val indexedDf = stringIndexer.fit(salaryDf).transform(salaryDf) 27 | 28 | indexedDf.select(outputColumns.head, outputColumns.tail:_*).show() 29 | 30 | } 31 | 32 | } 33 | -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/spark/ml/WeightedLogisticRegression.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.spark.ml 2 | 3 | import org.apache.spark.ml._ 4 | import org.apache.spark.ml.classification.LogisticRegression 5 | import org.apache.spark.ml.feature._ 6 | import org.apache.spark.sql.{DataFrame, SparkSession} 7 | import MLUtils._ 8 | import org.apache.spark.sql.functions._ 9 | 10 | /** 11 | * Weighted Logistic Regression for Credit Card Fraud 12 | * 13 | */ 14 | object WeightedLogisticRegression { 15 | 16 | def main(args: Array[String]) { 17 | 18 | val sparkSession = SparkSession.builder. 
19 | master("local[4]") 20 | .appName("example") 21 | .getOrCreate() 22 | 23 | sparkSession.sparkContext.setLogLevel("ERROR") 24 | //load train df 25 | // Download the data from : https://www.kaggle.com/dalpozz/creditcardfraud/downloads/creditcard.csv 26 | val df = sparkSession.read.option("header", "true").option("inferSchema", "true").csv("src/main/resources/creditcard.csv") 27 | df.printSchema() 28 | 29 | val amountVectorAssembler = new VectorAssembler().setInputCols(Array("Amount")).setOutputCol("Amount_vector") 30 | val standarScaler = new StandardScaler().setInputCol("Amount_vector").setOutputCol("Amount_scaled") 31 | val dropColumns = Array("Time","Amount","Class") 32 | 33 | val cols = df.columns.filter( column => !dropColumns.contains(column)) ++ Array("Amount_scaled") 34 | val vectorAssembler = new VectorAssembler().setInputCols(cols).setOutputCol("features") 35 | 36 | // pipeline 37 | val logisticRegression = new LogisticRegression().setLabelCol("Class") 38 | val trainPipeline = new Pipeline().setStages(Array(amountVectorAssembler,standarScaler,vectorAssembler,logisticRegression)) 39 | 40 | println("for imbalanced data") 41 | runPipeline(trainPipeline, df) 42 | 43 | // add weight column 44 | val ratioOfFraud = getRatio(df) 45 | val fraudWeight = 1 - ratioOfFraud 46 | val nonFraudWeight = ratioOfFraud 47 | 48 | val weightedDF = df.withColumn("weight", 49 | when(df.col("Class").===("1.0"),fraudWeight) 50 | .otherwise(nonFraudWeight)) 51 | 52 | logisticRegression.setWeightCol("weight") 53 | println("for balanced data") 54 | val balancedModel = runPipeline(trainPipeline, weightedDF) 55 | 56 | println("balanced model for full data") 57 | printScores(balancedModel, weightedDF) 58 | 59 | } 60 | 61 | 62 | def getRatio(df:DataFrame) = { 63 | val fraudDf = df.filter("Class=1.0") 64 | val sampleRatio = fraudDf.count().toDouble / df.count().toDouble 65 | sampleRatio 66 | } 67 | 68 | def runPipeline(pipeline:Pipeline, df:DataFrame):PipelineModel = { 69 | val (trainDf,crossDf) = trainTestSplit(df) 70 | val model = pipeline.fit(trainDf) 71 | printScores(model, crossDf) 72 | model 73 | } 74 | 75 | def printScores(model:PipelineModel, df:DataFrame) = { 76 | println("test accuracy with pipeline " + accuracyScore(model.transform(df), "Class", "prediction")) 77 | println("test recall for 1.0 is " + recall(model.transform(df), "Class", "prediction", 1.0)) 78 | } 79 | } 80 | 81 | -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/spark/sources/BinaryFile.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.spark.sources 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | object BinaryFile { 6 | 7 | def main(args: Array[String]): Unit = { 8 | 9 | val sparkSession = SparkSession.builder(). 
10 | appName("simple").master("local").getOrCreate() 11 | 12 | val df = sparkSession.read.format("binaryFile") 13 | .load("/home/madhu/Downloads/IMG_20190506_210110-EFFECTS.jpg") 14 | 15 | df.select("content").show() 16 | 17 | } 18 | 19 | } 20 | -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/spark/sources/MultiCharacterDelimiterCSV.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.spark.sources 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | object MultiCharacterDelimiterCSV { 6 | 7 | def main(args: Array[String]): Unit = { 8 | 9 | val sparkSession = SparkSession.builder. 10 | master("local") 11 | .appName("csvexample") 12 | .getOrCreate() 13 | 14 | 15 | // throws java.lang.IllegalArgumentException: Delimiter cannot be more than one character: || 16 | // in spark 2.x 17 | 18 | val df = sparkSession.read 19 | .option("delimiter","||") 20 | .option("header","true") 21 | .csv("src/main/resources/multicharacterseperator.csv") 22 | 23 | df.show() 24 | 25 | } 26 | 27 | } 28 | -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/spark/sources/RecursiveFolderReadExample.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.spark.sources 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | object RecursiveFolderReadExample { 6 | 7 | def main(args: Array[String]): Unit = { 8 | 9 | val sparkSession = SparkSession.builder. 10 | master("local") 11 | .appName("csvexample") 12 | .getOrCreate() 13 | 14 | 15 | // normal read 16 | 17 | val df = sparkSession.read 18 | .option("delimiter","||") 19 | .option("header","true") 20 | .csv("src/main/resources/nested") 21 | 22 | assert(df.count() == 2) 23 | 24 | // recursive read 25 | val recursiveDf = sparkSession.read 26 | .option("delimiter","||") 27 | .option("recursiveFileLookup","true") 28 | .option("header","true") 29 | .csv("src/main/resources/nested") 30 | 31 | assert(recursiveDf.count() == 4) 32 | 33 | } 34 | 35 | } 36 | -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/spark/sources/datasourcev2/DataSourceV2Example.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.spark.sources.datasourcev2 2 | 3 | import org.apache.spark.Partition 4 | import org.apache.spark.sql.{SaveMode, SparkSession} 5 | import shapeless.Tuple 6 | 7 | object DataSourceV2Example { 8 | 9 | def main(args: Array[String]) { 10 | 11 | val sparkSession = SparkSession.builder 12 | .master("local[2]") 13 | .appName("example") 14 | .getOrCreate() 15 | 16 | val simpleDf = sparkSession.read 17 | .format("com.madhukaraphatak.spark.sources.datasourcev2.simple") 18 | .load() 19 | 20 | simpleDf.show() 21 | println( 22 | "number of partitions in simple source is " + simpleDf.rdd.getNumPartitions) 23 | 24 | 25 | val simpleMultiDf = sparkSession.read 26 | .format("com.madhukaraphatak.spark.sources.datasourcev2.simplemulti") 27 | .load() 28 | 29 | simpleMultiDf.show() 30 | println( 31 | "number of partitions in simple multi source is " + simpleMultiDf.rdd.getNumPartitions) 32 | 33 | 34 | val simpleCsvDf = sparkSession.read 35 | .format("com.madhukaraphatak.spark.sources.datasourcev2.simplecsv") 36 | .load("src/main/resources/adult.csv") 37 | 38 | simpleCsvDf.printSchema() 39 | 
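// the csv source infers its schema from the header row of adult.csv
// (see SchemaUtils.getSchema in SimpleCsvDataSource); every column is read as StringType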
simpleCsvDf.show() 40 | println( 41 | "number of partitions in simple csv source is " + simpleCsvDf.rdd.getNumPartitions) 42 | 43 | 44 | 45 | 46 | val simpleMysqlDf = sparkSession.createDataFrame(Seq( 47 | Tuple1("test1"), 48 | Tuple1("test2") 49 | )).toDF("user") 50 | 51 | //write examples 52 | simpleMysqlDf.write 53 | .format( 54 | "com.madhukaraphatak.spark.sources.datasourcev2.simplemysqlwriter") 55 | .mode(SaveMode.Append) 56 | .save() 57 | 58 | /* simpleMysqlDf.write 59 | .format( 60 | "com.madhukaraphatak.examples.sparktwo.datasourcev2.mysqlwithtransaction") 61 | .save() 62 | 63 | val simplePartitoningDf = sparkSession.read 64 | .format( 65 | "com.madhukaraphatak.examples.sparktwo.datasourcev2.partitionaffinity") 66 | .load() 67 | 68 | val dfRDD = simplePartitoningDf.rdd 69 | val baseRDD = 70 | dfRDD.dependencies.head.rdd.dependencies.head.rdd.dependencies.head.rdd 71 | 72 | val partition = baseRDD.partitions(0) 73 | val getPrefferedLocationDef = baseRDD.getClass 74 | .getMethod("getPreferredLocations", classOf[Partition]) 75 | val preferredLocation = getPrefferedLocationDef 76 | .invoke(baseRDD, partition) 77 | .asInstanceOf[Seq[String]] 78 | println("preferred location is " + preferredLocation) 79 | 80 | */ 81 | 82 | sparkSession.stop() 83 | 84 | } 85 | } 86 | -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/spark/sources/datasourcev2/SimpleCsvDataSource.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.spark.sources.datasourcev2.simplecsv 2 | 3 | import java.util 4 | 5 | import org.apache.spark.sql.SparkSession 6 | import org.apache.spark.sql.catalyst.InternalRow 7 | import org.apache.spark.sql.connector.catalog.{SupportsRead, Table, TableCapability, TableProvider} 8 | import org.apache.spark.sql.connector.expressions.Transform 9 | import org.apache.spark.sql.connector.read._ 10 | import org.apache.spark.sql.types.{StringType, StructField, StructType} 11 | import org.apache.spark.sql.util.CaseInsensitiveStringMap 12 | import org.apache.spark.unsafe.types.UTF8String 13 | 14 | import scala.collection.JavaConverters._ 15 | 16 | /* 17 | * Default source should some kind of relation provider 18 | */ 19 | class DefaultSource extends TableProvider{ 20 | 21 | override def inferSchema(caseInsensitiveStringMap: CaseInsensitiveStringMap): StructType = 22 | getTable(null,Array.empty[Transform],caseInsensitiveStringMap.asCaseSensitiveMap()).schema() 23 | 24 | override def getTable(structType: StructType, transforms: Array[Transform], map: util.Map[String, String]): Table ={ 25 | val path = map.get("path") 26 | new CsvBatchTable(path) 27 | } 28 | 29 | } 30 | 31 | object SchemaUtils { 32 | def getSchema(path:String):StructType = { 33 | val sparkContext = SparkSession.builder.getOrCreate().sparkContext 34 | val firstLine = sparkContext.textFile(path).first() 35 | val columnNames = firstLine.split(",") 36 | val structFields = columnNames.map(value ⇒ StructField(value, StringType)) 37 | StructType(structFields) 38 | } 39 | } 40 | /* 41 | Defines Read Support and Initial Schema 42 | */ 43 | 44 | class CsvBatchTable(path:String) extends Table with SupportsRead { 45 | override def name(): String = this.getClass.toString 46 | 47 | override def schema(): StructType = SchemaUtils.getSchema(path) 48 | 49 | override def capabilities(): util.Set[TableCapability] = Set(TableCapability.BATCH_READ).asJava 50 | 51 | override def newScanBuilder(options: 
CaseInsensitiveStringMap): ScanBuilder = new CsvScanBuilder(path) 52 | } 53 | 54 | 55 | /* 56 | Scan object with no mixins 57 | */ 58 | class CsvScanBuilder(path:String) extends ScanBuilder { 59 | override def build(): Scan = new CsvScan(path) 60 | } 61 | 62 | 63 | // simple class to organise the partition 64 | case class CsvPartition(val partitionNumber:Int, path:String, header:Boolean=true) extends InputPartition 65 | 66 | 67 | /* 68 | Batch Reading Support 69 | 70 | The schema is repeated here as it can change after column pruning etc 71 | */ 72 | 73 | class CsvScan(path:String) extends Scan with Batch{ 74 | override def readSchema(): StructType = SchemaUtils.getSchema(path) 75 | 76 | override def toBatch: Batch = this 77 | 78 | override def planInputPartitions(): Array[InputPartition] = { 79 | val sparkContext = SparkSession.builder.getOrCreate().sparkContext 80 | val rdd = sparkContext.textFile(path) 81 | val partitions = ( 0 to rdd.partitions.length - 1).map(value => CsvPartition(value, path)) 82 | partitions.toArray 83 | 84 | } 85 | override def createReaderFactory(): PartitionReaderFactory = new CsvPartitionReaderFactory() 86 | } 87 | 88 | 89 | // reader factory 90 | class CsvPartitionReaderFactory extends PartitionReaderFactory { 91 | override def createReader(partition: InputPartition): PartitionReader[InternalRow] = new 92 | CsvPartitionReader(partition.asInstanceOf[CsvPartition]) 93 | } 94 | 95 | 96 | // parathion reader 97 | class CsvPartitionReader(inputPartition: CsvPartition) extends PartitionReader[InternalRow] { 98 | 99 | var iterator: Iterator[String] = null 100 | 101 | @transient 102 | def next = { 103 | if (iterator == null) { 104 | val sparkContext = SparkSession.builder.getOrCreate().sparkContext 105 | val rdd = sparkContext.textFile(inputPartition.path) 106 | val filterRDD = if (inputPartition.header) { 107 | val firstLine = rdd.first 108 | rdd.filter(_ != firstLine) 109 | } 110 | else rdd 111 | val partition = filterRDD.partitions(inputPartition.partitionNumber) 112 | iterator = filterRDD.iterator(partition, org.apache.spark.TaskContext.get()) 113 | } 114 | iterator.hasNext 115 | } 116 | 117 | def get = { 118 | val line = iterator.next() 119 | InternalRow.fromSeq(line.split(",").map(value => UTF8String.fromString(value))) 120 | } 121 | 122 | def close() = Unit 123 | 124 | } 125 | 126 | 127 | 128 | 129 | -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/spark/sources/datasourcev2/SimpleDataSource.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.spark.sources.datasourcev2.simple 2 | 3 | import java.util 4 | 5 | import org.apache.spark.sql.catalyst.InternalRow 6 | import org.apache.spark.sql.connector.catalog.{SupportsRead, Table, TableCapability, TableProvider} 7 | import org.apache.spark.sql.connector.expressions.Transform 8 | import org.apache.spark.sql.connector.read._ 9 | import org.apache.spark.sql.types.{StringType, StructField, StructType} 10 | import org.apache.spark.sql.util.CaseInsensitiveStringMap 11 | import org.apache.spark.unsafe.types.UTF8String 12 | 13 | import scala.collection.JavaConverters._ 14 | 15 | /* 16 | * Default source should some kind of relation provider 17 | */ 18 | class DefaultSource extends TableProvider{ 19 | 20 | override def inferSchema(caseInsensitiveStringMap: CaseInsensitiveStringMap): StructType = 21 | getTable(null,Array.empty[Transform],caseInsensitiveStringMap.asCaseSensitiveMap()).schema() 
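// Note: the read path below chains TableProvider -> SimpleBatchTable (SupportsRead)
// -> SimpleScanBuilder -> SimpleScan (Scan with Batch) -> SimplePartition
// -> SimplePartitionReaderFactory -> SimplePartitionReader, which emits the
// hard-coded values "1".."5" from a single partition. The source is loaded with
// spark.read.format("com.madhukaraphatak.spark.sources.datasourcev2.simple").load()
// (see DataSourceV2Example).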
22 | 23 | override def getTable(structType: StructType, transforms: Array[Transform], map: util.Map[String, String]): Table = 24 | new SimpleBatchTable() 25 | } 26 | 27 | 28 | /* 29 | Defines Read Support and Initial Schema 30 | */ 31 | 32 | class SimpleBatchTable extends Table with SupportsRead { 33 | override def name(): String = this.getClass.toString 34 | 35 | override def schema(): StructType = StructType(Array(StructField("value", StringType))) 36 | 37 | override def capabilities(): util.Set[TableCapability] = Set(TableCapability.BATCH_READ).asJava 38 | 39 | override def newScanBuilder(options: CaseInsensitiveStringMap): ScanBuilder = new SimpleScanBuilder() 40 | } 41 | 42 | 43 | 44 | /* 45 | Scan object with no mixins 46 | */ 47 | class SimpleScanBuilder extends ScanBuilder { 48 | override def build(): Scan = new SimpleScan 49 | } 50 | 51 | /* 52 | Batch Reading Support 53 | 54 | The schema is repeated here as it can change after column pruning etc 55 | */ 56 | 57 | class SimpleScan extends Scan with Batch{ 58 | override def readSchema(): StructType = StructType(Array(StructField("value", StringType))) 59 | 60 | override def toBatch: Batch = this 61 | 62 | override def planInputPartitions(): Array[InputPartition] = { 63 | Array(new SimplePartition()) 64 | } 65 | override def createReaderFactory(): PartitionReaderFactory = new SimplePartitionReaderFactory() 66 | } 67 | 68 | // simple class to organise the partition 69 | class SimplePartition extends InputPartition 70 | 71 | // reader factory 72 | class SimplePartitionReaderFactory extends PartitionReaderFactory { 73 | override def createReader(partition: InputPartition): PartitionReader[InternalRow] = new SimplePartitionReader 74 | } 75 | 76 | 77 | // parathion reader 78 | class SimplePartitionReader extends PartitionReader[InternalRow] { 79 | 80 | val values = Array("1", "2", "3", "4", "5") 81 | 82 | var index = 0 83 | 84 | def next = index < values.length 85 | 86 | def get = { 87 | val stringValue = values(index) 88 | val stringUtf = UTF8String.fromString(stringValue) 89 | val row = InternalRow(stringUtf) 90 | index = index + 1 91 | row 92 | } 93 | 94 | def close() = Unit 95 | 96 | } 97 | 98 | 99 | 100 | 101 | -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/spark/sources/datasourcev2/SimpleMultiDataSource.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.spark.sources.datasourcev2.simplemulti 2 | 3 | import java.util 4 | 5 | import org.apache.spark.sql.catalyst.InternalRow 6 | import org.apache.spark.sql.connector.catalog.{SupportsRead, Table, TableCapability, TableProvider} 7 | import org.apache.spark.sql.connector.expressions.Transform 8 | import org.apache.spark.sql.connector.read._ 9 | import org.apache.spark.sql.types.{StringType, StructField, StructType} 10 | import org.apache.spark.sql.util.CaseInsensitiveStringMap 11 | import org.apache.spark.unsafe.types.UTF8String 12 | 13 | import scala.collection.JavaConverters._ 14 | 15 | /* 16 | * Default source should some kind of relation provider 17 | */ 18 | class DefaultSource extends TableProvider{ 19 | 20 | override def inferSchema(caseInsensitiveStringMap: CaseInsensitiveStringMap): StructType = 21 | getTable(null,Array.empty[Transform],caseInsensitiveStringMap.asCaseSensitiveMap()).schema() 22 | 23 | override def getTable(structType: StructType, transforms: Array[Transform], map: util.Map[String, String]): Table = 24 | new 
SimpleBatchTable() 25 | 26 | } 27 | 28 | 29 | /* 30 | Defines Read Support and Initial Schema 31 | */ 32 | 33 | class SimpleBatchTable extends Table with SupportsRead { 34 | override def name(): String = this.getClass.toString 35 | 36 | override def schema(): StructType = StructType(Array(StructField("value", StringType))) 37 | 38 | override def capabilities(): util.Set[TableCapability] = Set(TableCapability.BATCH_READ).asJava 39 | 40 | override def newScanBuilder(options: CaseInsensitiveStringMap): ScanBuilder = new SimpleScanBuilder() 41 | } 42 | 43 | 44 | /* 45 | Scan object with no mixins 46 | */ 47 | class SimpleScanBuilder extends ScanBuilder { 48 | override def build(): Scan = new SimpleScan 49 | } 50 | 51 | /* 52 | Batch Reading Support 53 | 54 | The schema is repeated here as it can change after column pruning etc 55 | */ 56 | 57 | class SimpleScan extends Scan with Batch{ 58 | override def readSchema(): StructType = StructType(Array(StructField("value", StringType))) 59 | 60 | override def toBatch: Batch = this 61 | 62 | override def planInputPartitions(): Array[InputPartition] = { 63 | Array(new SimplePartition(0,4), 64 | new SimplePartition(5,9)) 65 | } 66 | override def createReaderFactory(): PartitionReaderFactory = new SimplePartitionReaderFactory() 67 | } 68 | 69 | // simple class to organise the partition 70 | class SimplePartition(val start:Int, val end:Int) extends InputPartition 71 | 72 | // reader factory 73 | class SimplePartitionReaderFactory extends PartitionReaderFactory { 74 | override def createReader(partition: InputPartition): PartitionReader[InternalRow] = new 75 | SimplePartitionReader(partition.asInstanceOf[SimplePartition]) 76 | } 77 | 78 | 79 | // parathion reader 80 | class SimplePartitionReader(inputPartition: SimplePartition) extends PartitionReader[InternalRow] { 81 | 82 | val values = Array("1", "2", "3", "4", "5","6","7","8","9","10") 83 | 84 | var index = inputPartition.start 85 | 86 | def next = index <= inputPartition.end 87 | 88 | def get = { 89 | val stringValue = values(index) 90 | val stringUtf = UTF8String.fromString(stringValue) 91 | val row = InternalRow(stringUtf) 92 | index = index + 1 93 | row 94 | } 95 | 96 | def close() = Unit 97 | 98 | } 99 | 100 | 101 | 102 | 103 | -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/spark/sources/datasourcev2/SimpleMysqlWriterDataSource.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.spark.sources.datasourcev2.simplemysqlwriter 2 | 3 | import java.sql.DriverManager 4 | import java.util 5 | 6 | import org.apache.spark.sql.catalyst.InternalRow 7 | import org.apache.spark.sql.connector.catalog._ 8 | import org.apache.spark.sql.connector.expressions.Transform 9 | import org.apache.spark.sql.connector.write._ 10 | import org.apache.spark.sql.types.{StringType, StructType} 11 | import org.apache.spark.sql.util.CaseInsensitiveStringMap 12 | 13 | import scala.collection.JavaConverters._ 14 | 15 | /* 16 | * Default source should some kind of relation provider 17 | * 18 | */ 19 | 20 | class DefaultSource extends TableProvider{ 21 | override def inferSchema(caseInsensitiveStringMap: CaseInsensitiveStringMap): StructType = 22 | getTable(null,Array.empty[Transform],caseInsensitiveStringMap.asCaseSensitiveMap()).schema() 23 | 24 | override def getTable(structType: StructType, transforms: Array[Transform], map: util.Map[String, String]): Table ={ 25 | new MysqlTable 26 | } 27 | } 
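// Note: the write path below chains SupportsWrite -> MysqlWriterBuilder -> MysqlBatchWriter
// -> MysqlDataWriterFactory -> MysqlWriter, which inserts each incoming row into the
// `userwrite` table over JDBC. The connection url, user, password and table name are
// hard-coded in MysqlWriter, so adjust them for your environment. Used from
// DataSourceV2Example as:
// df.write.format("com.madhukaraphatak.spark.sources.datasourcev2.simplemysqlwriter").mode(SaveMode.Append).save()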
28 | 29 | 30 | class MysqlTable extends SupportsWrite{ 31 | 32 | private val tableSchema = new StructType().add("user", StringType) 33 | 34 | 35 | override def name(): String = this.getClass.toString 36 | 37 | override def schema(): StructType = tableSchema 38 | 39 | override def capabilities(): util.Set[TableCapability] = Set(TableCapability.BATCH_WRITE, 40 | TableCapability.TRUNCATE).asJava 41 | 42 | override def newWriteBuilder(logicalWriteInfo: LogicalWriteInfo): WriteBuilder = new MysqlWriterBuilder 43 | } 44 | 45 | class MysqlWriterBuilder extends WriteBuilder{ 46 | override def buildForBatch(): BatchWrite = new MysqlBatchWriter() 47 | } 48 | 49 | class MysqlBatchWriter extends BatchWrite{ 50 | override def createBatchWriterFactory(physicalWriteInfo: PhysicalWriteInfo): DataWriterFactory = new 51 | MysqlDataWriterFactory 52 | 53 | override def commit(writerCommitMessages: Array[WriterCommitMessage]): Unit = {} 54 | 55 | override def abort(writerCommitMessages: Array[WriterCommitMessage]): Unit = {} 56 | } 57 | 58 | class MysqlDataWriterFactory extends DataWriterFactory { 59 | override def createWriter(partitionId: Int, taskId:Long): DataWriter[InternalRow] = new MysqlWriter() 60 | } 61 | 62 | 63 | 64 | object WriteSucceeded extends WriterCommitMessage 65 | 66 | class MysqlWriter extends DataWriter[InternalRow] { 67 | val url = "jdbc:mysql://localhost/test" 68 | val user = "root" 69 | val password = "abc123" 70 | val table = "userwrite" 71 | 72 | val connection = DriverManager.getConnection(url,user,password) 73 | val statement = s"insert into $table (user) values (?)" 74 | val preparedStatement = connection.prepareStatement(statement) 75 | 76 | 77 | override def write(record: InternalRow): Unit = { 78 | val value = record.getString(0) 79 | preparedStatement.setString(1,value) 80 | preparedStatement.executeUpdate() 81 | } 82 | 83 | override def commit(): WriterCommitMessage = WriteSucceeded 84 | 85 | override def abort(): Unit = {} 86 | // release the JDBC resources once the task is done 87 | override def close(): Unit = { preparedStatement.close(); connection.close() } 88 | } 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/spark/sources/datasourcev2/streamandbatch/DataSourceV2StreamAndBatchExample.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.spark.sources.datasourcev2.streamandbatch 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.streaming.OutputMode 5 | 6 | object DataSourceV2StreamAndBatchExample { 7 | def main(args: Array[String]): Unit = { 8 | val sparkSession = SparkSession.builder. 9 | master("local[2]") 10 | .appName("stream and batch example") 11 | .getOrCreate() 12 | 13 | 14 | val dataSource = "com.madhukaraphatak.spark.sources.datasourcev2.streamandbatch.simple" 15 | 16 | val batchDf = sparkSession 17 | .read 18 | .format(dataSource) 19 | .load() 20 | 21 | batchDf.show() 22 | 23 | val streamingDf = sparkSession. 24 | readStream.
25 | format(dataSource) 26 | .load() 27 | 28 | val query = streamingDf.writeStream 29 | .format("console") 30 | .queryName("simple_source") 31 | .outputMode(OutputMode.Append()) 32 | 33 | query.start().awaitTermination() 34 | 35 | } 36 | 37 | } 38 | -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/spark/sources/datasourcev2/streamandbatch/SimpleStreamAndBatchDataSource.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.spark.sources.datasourcev2.streamandbatch.simple 2 | 3 | import java.util 4 | 5 | import org.apache.spark.sql.catalyst.InternalRow 6 | import org.apache.spark.sql.connector.catalog.{SupportsRead, Table, TableCapability, TableProvider} 7 | import org.apache.spark.sql.connector.expressions.Transform 8 | import org.apache.spark.sql.connector.read._ 9 | import org.apache.spark.sql.connector.read.streaming.{MicroBatchStream, Offset} 10 | import org.apache.spark.sql.types.{StringType, StructField, StructType} 11 | import org.apache.spark.sql.util.CaseInsensitiveStringMap 12 | import org.apache.spark.unsafe.types.UTF8String 13 | 14 | import scala.collection.JavaConverters._ 15 | 16 | /* 17 | * Default source should be some kind of relation provider 18 | */ 19 | class DefaultSource extends TableProvider{ 20 | 21 | override def inferSchema(caseInsensitiveStringMap: CaseInsensitiveStringMap): StructType = 22 | getTable(null,Array.empty[Transform],caseInsensitiveStringMap.asCaseSensitiveMap()).schema() 23 | 24 | override def getTable(structType: StructType, transforms: Array[Transform], map: util.Map[String, String]): Table = 25 | new SimpleStreamingTable() 26 | } 27 | 28 | 29 | /* 30 | Defines Read Support and Initial Schema 31 | */ 32 | 33 | class SimpleStreamingTable extends Table with SupportsRead { 34 | override def name(): String = this.getClass.toString 35 | 36 | override def schema(): StructType = StructType(Array(StructField("value", StringType))) 37 | 38 | override def capabilities(): util.Set[TableCapability] = Set(TableCapability.MICRO_BATCH_READ, 39 | TableCapability.BATCH_READ).asJava 40 | 41 | override def newScanBuilder(options: CaseInsensitiveStringMap): ScanBuilder = new SimpleScanBuilder() 42 | } 43 | 44 | 45 | /* 46 | Scan object with no mixins 47 | */ 48 | class SimpleScanBuilder extends ScanBuilder { 49 | override def build(): Scan = new SimpleScan 50 | } 51 | 52 | /* 53 | Batch and Micro-Batch Reading Support 54 | 55 | The schema is repeated here as it can change after column pruning etc 56 | */ 57 | 58 | class SimpleScan extends Scan{ 59 | override def readSchema(): StructType = StructType(Array(StructField("value", StringType))) 60 | 61 | override def toMicroBatchStream(checkpointLocation: String): MicroBatchStream = new SimpleMicroBatchStream() 62 | 63 | override def toBatch: Batch = new SimpleBatch 64 | } 65 | 66 | class SimpleBatch extends Batch{ 67 | override def planInputPartitions(): Array[InputPartition] = Array(new SimplePartition) 68 | 69 | override def createReaderFactory(): PartitionReaderFactory = new SimplePartitionReaderFactory 70 | } 71 | 72 | class SimpleOffset(value:Int) extends Offset { 73 | override def json(): String = s"""{"value":"$value"}""" 74 | } 75 | 76 | class SimpleMicroBatchStream extends MicroBatchStream { 77 | var latestOffsetValue = 0 78 | 79 | override def latestOffset(): Offset = { 80 | latestOffsetValue += 10 81 | new SimpleOffset(latestOffsetValue) 82 | } 83 | 84 | override def planInputPartitions(offset:
Offset, offset1: Offset): Array[InputPartition] = Array(new SimplePartition) 85 | 86 | override def createReaderFactory(): PartitionReaderFactory = new SimplePartitionReaderFactory() 87 | 88 | override def initialOffset(): Offset = new SimpleOffset(latestOffsetValue) 89 | 90 | override def deserializeOffset(s: String): Offset = new SimpleOffset(latestOffsetValue) 91 | 92 | override def commit(offset: Offset): Unit = {} 93 | 94 | override def stop(): Unit = {} 95 | } 96 | 97 | 98 | // simple class to organise the partition 99 | class SimplePartition extends InputPartition 100 | 101 | // reader factory 102 | class SimplePartitionReaderFactory extends PartitionReaderFactory { 103 | override def createReader(partition: InputPartition): PartitionReader[InternalRow] = new SimplePartitionReader 104 | } 105 | 106 | 107 | // partition reader 108 | class SimplePartitionReader extends PartitionReader[InternalRow] { 109 | 110 | val values = Array("1", "2", "3", "4", "5") 111 | 112 | var index = 0 113 | 114 | def next = index < values.length 115 | 116 | def get = { 117 | val stringValue = values(index) 118 | val stringUtf = UTF8String.fromString(stringValue) 119 | val row = InternalRow(stringUtf) 120 | index = index + 1 121 | row 122 | } 123 | 124 | def close(): Unit = {} 125 | 126 | } 127 | 128 | 129 | 130 | 131 | -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/spark/sources/datasourcev2/streaming/DataSourceV2StreamingExample.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.spark.sources.datasourcev2.streaming 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.streaming.OutputMode 5 | 6 | object DataSourceV2StreamingExample { 7 | def main(args: Array[String]): Unit = { 8 | val sparkSession = SparkSession.builder. 9 | master("local[2]") 10 | .appName("streaming example") 11 | .getOrCreate() 12 | 13 | val streamingDf = sparkSession. 14 | readStream.
15 | format("com.madhukaraphatak.spark.sources.datasourcev2.streaming.simple") 16 | .load() 17 | 18 | 19 | 20 | val query = streamingDf.writeStream 21 | .format("console") 22 | .queryName("simple_source") 23 | .outputMode(OutputMode.Append()) 24 | 25 | query.start().awaitTermination() 26 | 27 | } 28 | 29 | } 30 | -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/spark/sources/datasourcev2/streaming/SimpleStreamingDataSource.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.spark.sources.datasourcev2.streaming.simple 2 | 3 | import java.util 4 | 5 | import org.apache.spark.sql.catalyst.InternalRow 6 | import org.apache.spark.sql.connector.catalog.{SupportsRead, Table, TableCapability, TableProvider} 7 | import org.apache.spark.sql.connector.expressions.Transform 8 | import org.apache.spark.sql.connector.read._ 9 | import org.apache.spark.sql.connector.read.streaming.{MicroBatchStream, Offset} 10 | import org.apache.spark.sql.types.{StringType, StructField, StructType} 11 | import org.apache.spark.sql.util.CaseInsensitiveStringMap 12 | import org.apache.spark.unsafe.types.UTF8String 13 | 14 | import scala.collection.JavaConverters._ 15 | 16 | /* 17 | * Default source should be some kind of relation provider 18 | */ 19 | class DefaultSource extends TableProvider{ 20 | 21 | override def inferSchema(caseInsensitiveStringMap: CaseInsensitiveStringMap): StructType = 22 | getTable(null,Array.empty[Transform],caseInsensitiveStringMap.asCaseSensitiveMap()).schema() 23 | 24 | override def getTable(structType: StructType, transforms: Array[Transform], map: util.Map[String, String]): Table = 25 | new SimpleStreamingTable() 26 | } 27 | 28 | 29 | /* 30 | Defines Read Support and Initial Schema 31 | */ 32 | 33 | class SimpleStreamingTable extends Table with SupportsRead { 34 | override def name(): String = this.getClass.toString 35 | 36 | override def schema(): StructType = StructType(Array(StructField("value", StringType))) 37 | 38 | override def capabilities(): util.Set[TableCapability] = Set(TableCapability.MICRO_BATCH_READ).asJava 39 | 40 | override def newScanBuilder(options: CaseInsensitiveStringMap): ScanBuilder = new SimpleScanBuilder() 41 | } 42 | 43 | 44 | /* 45 | Scan object with no mixins 46 | */ 47 | class SimpleScanBuilder extends ScanBuilder { 48 | override def build(): Scan = new SimpleScan 49 | } 50 | 51 | /* 52 | Micro-Batch Reading Support 53 | 54 | The schema is repeated here as it can change after column pruning etc 55 | */ 56 | 57 | class SimpleScan extends Scan{ 58 | override def readSchema(): StructType = StructType(Array(StructField("value", StringType))) 59 | 60 | override def toMicroBatchStream(checkpointLocation: String): MicroBatchStream = new SimpleMicroBatchStream() 61 | } 62 | 63 | class SimpleOffset(value:Int) extends Offset { 64 | override def json(): String = s"""{"value":"$value"}""" 65 | } 66 | 67 | class SimpleMicroBatchStream extends MicroBatchStream { 68 | var latestOffsetValue = 0 69 | 70 | override def latestOffset(): Offset = { 71 | latestOffsetValue += 10 72 | new SimpleOffset(latestOffsetValue) 73 | } 74 | 75 | override def planInputPartitions(offset: Offset, offset1: Offset): Array[InputPartition] = Array(new SimplePartition) 76 | 77 | override def createReaderFactory(): PartitionReaderFactory = new SimplePartitionReaderFactory() 78 | 79 | override def initialOffset(): Offset = new SimpleOffset(latestOffsetValue) 80 | 81 | override
def deserializeOffset(s: String): Offset = new SimpleOffset(latestOffsetValue) 82 | 83 | override def commit(offset: Offset): Unit = {} 84 | 85 | override def stop(): Unit = {} 86 | } 87 | 88 | 89 | // simple class to organise the partition 90 | class SimplePartition extends InputPartition 91 | 92 | // reader factory 93 | class SimplePartitionReaderFactory extends PartitionReaderFactory { 94 | override def createReader(partition: InputPartition): PartitionReader[InternalRow] = new SimplePartitionReader 95 | } 96 | 97 | 98 | // partition reader 99 | class SimplePartitionReader extends PartitionReader[InternalRow] { 100 | 101 | val values = Array("1", "2", "3", "4", "5") 102 | 103 | var index = 0 104 | 105 | def next = index < values.length 106 | 107 | def get = { 108 | val stringValue = values(index) 109 | val stringUtf = UTF8String.fromString(stringValue) 110 | val row = InternalRow(stringUtf) 111 | index = index + 1 112 | row 113 | } 114 | 115 | def close(): Unit = {} 116 | 117 | } 118 | 119 | 120 | 121 | 122 | -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/spark/sql/DataFrameTail.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.spark.sql 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | object DataFrameTail { 6 | 7 | def main(args: Array[String]): Unit = { 8 | 9 | val sparkSession = SparkSession.builder(). 10 | appName("example").master("local").getOrCreate() 11 | 12 | val df = sparkSession.range(100) 13 | 14 | //head 15 | println(df.head(2).toList) 16 | //tail 17 | println(df.tail(5).toList) 18 | 19 | 20 | } 21 | 22 | } 23 | -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/spark/sql/InMemoryTableScanExample.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.spark.sql 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | object InMemoryTableScanExample { 6 | 7 | def main(args: Array[String]): Unit = { 8 | 9 | val sparkSession = SparkSession.builder. 10 | master("local[2]") 11 | .appName("in memory table in UI example") 12 | .getOrCreate() 13 | 14 | 15 | val firstDF = sparkSession.createDataFrame(Seq( 16 | ("1", 10), 17 | ("2", 20) 18 | )).toDF("id", "sales") 19 | 20 | firstDF.createOrReplaceTempView("firstDf") 21 | sparkSession.catalog.cacheTable("firstDf") 22 | 23 | val secondDF = sparkSession.createDataFrame(Seq( 24 | ("1", 40), 25 | ("2", 50) 26 | )).toDF("id", "volume") 27 | 28 | secondDF.createOrReplaceTempView("secondDf") 29 | sparkSession.catalog.cacheTable("secondDf") 30 | 31 | val thirdDF = sparkSession.createDataFrame(Seq( 32 | ("1", 70), 33 | ("2", 80) 34 | )).toDF("id", "value") 35 | 36 | thirdDF.createOrReplaceTempView("thirdDf") 37 | sparkSession.catalog.cacheTable("thirdDf") 38 | 39 | val joinDF = firstDF.join(secondDF, "id").join(thirdDF,"id") 40 | 41 | joinDF.count() 42 | 43 | } 44 | 45 | } 46 | -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/spark/sql/JoinHintsExample.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.spark.sql 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | object JoinHintsExample { 6 | def main(args: Array[String]): Unit = { 7 | 8 | val sparkSession = SparkSession.builder.
9 | master("local") 10 | .appName("join hints example") 11 | .getOrCreate() 12 | 13 | val salesDf = sparkSession.read. 14 | format("csv") 15 | .option("header", "true") 16 | .option("inferSchema", "true") 17 | .load("src/main/resources/sales.csv") 18 | 19 | 20 | val customerDf = sparkSession.read. 21 | format("csv") 22 | .option("header", "true") 23 | .option("inferSchema", "true") 24 | .load("src/main/resources/customers.csv") 25 | 26 | 27 | //broadcast hint 28 | 29 | val broadcastJoin = salesDf.hint("broadcast").join(customerDf,"customerId") 30 | broadcastJoin.show() 31 | 32 | // merge join 33 | 34 | val mergeJoin = salesDf.hint("merge").join(customerDf, "customerId") 35 | mergeJoin.show() 36 | 37 | // shuffle_hash 38 | 39 | val shuffleHashJoin = salesDf.hint("shuffle_hash").join(customerDf,"customerId") 40 | shuffleHashJoin.show() 41 | 42 | //shuffle_replicate_nl 43 | val cartesianProduct = salesDf.hint("shuffle_replicate_nl").join(customerDf) 44 | cartesianProduct.show() 45 | 46 | 47 | //Thread.sleep(1000000) 48 | 49 | 50 | } 51 | 52 | } 53 | -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/spark/sql/MinAndMaxByExample.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.spark.sql 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | 6 | object MinAndMaxByExample { 7 | 8 | def main(args: Array[String]): Unit = { 9 | 10 | val sparkSession = SparkSession.builder. 11 | master("local") 12 | .appName("min and max by example") 13 | .getOrCreate() 14 | 15 | 16 | val df = sparkSession.createDataFrame(Seq( 17 | ("1", 10), 18 | ("2", 20), 19 | ("3", 30), 20 | ("4", 40) 21 | )).toDF("id","value") 22 | df.createOrReplaceTempView("table") 23 | 24 | 25 | // find the id with minimum value using a window function (pre 3.0 approach) 26 | 27 | import org.apache.spark.sql.expressions.Window 28 | import org.apache.spark.sql.functions.dense_rank 29 | 30 | val orderedDf = Window.orderBy(df.col("value")) 31 | val rankedDf = df.withColumn("rank", dense_rank.over(orderedDf)) 32 | val minDf = rankedDf.filter("rank == 1") 33 | minDf.show() 34 | 35 | 36 | 37 | // find the ids with maximum and minimum value using max_by and min_by 38 | 39 | val resultDf = sparkSession.sql("select max_by(id,value) max_id, min_by(id,value) min_id from table") 40 | 41 | resultDf.show() 42 | 43 | 44 | 45 | 46 | } 47 | 48 | } 49 | -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/spark/sql/MultiColumnSampleBy.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.spark.sql 2 | 3 | import org.apache.spark.sql.{Row, SparkSession} 4 | 5 | object MultiColumnSampleBy { 6 | 7 | def main(args: Array[String]): Unit = { 8 | 9 | val sparkSession = SparkSession.builder.
10 | master("local") 11 | .appName("multi column sample") 12 | .getOrCreate() 13 | 14 | val df = sparkSession.createDataFrame(Seq( 15 | (1, "p1", "s1", 20), 16 | (1, "p2", "s1", 30), 17 | (1, "p1", "s2", 40), 18 | (1, "p2", "s2", 50), 19 | (2, "p1", "s1", 20), 20 | (2, "p2", "s1", 30), 21 | (2, "p1", "s2", 40), 22 | (2, "p2", "s2", 50))) 23 | .toDF("day", "product", "store", "sales") 24 | 25 | //single column sampleBy on product 26 | 27 | val singleFractions = Map("p1" -> 0.5, "p2" -> 0.5) 28 | val singleSampleDf = df.stat.sampleBy("product", singleFractions, -1) 29 | singleSampleDf.sort("product").show() 30 | 31 | // multi column sampleBy on product and store 32 | import org.apache.spark.sql.functions.struct 33 | val multipleFractions = Map(Row("p1", "s1") -> 0.5, 34 | Row("p2", "s1") -> 0.5, 35 | Row("p1", "s2") -> 0.5, 36 | Row("p2", "s2") -> 0.5) 37 | val multiSampleDf = df.stat.sampleBy(struct("product", "store"), multipleFractions, -1) 38 | multiSampleDf.show() 39 | 40 | 41 | } 42 | 43 | } 44 | -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/spark/sql/adaptive/shuffle/AdaptiveShuffle.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.spark.sql.adaptive.shuffle 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.sql.SparkSession 5 | 6 | object AdaptiveShuffle { 7 | 8 | def main(args: Array[String]): Unit = { 9 | 10 | val conf = new SparkConf() 11 | .setAppName("adaptive shuffle") 12 | .setMaster("local[2]") 13 | .set("spark.sql.adaptive.enabled", "true") 14 | .set("spark.sql.adaptive.coalescePartitions.enabled", "true") 15 | 16 | val sparkSession = SparkSession.builder().config(conf).getOrCreate() 17 | 18 | val df = sparkSession.read. 19 | format("csv") 20 | .option("header", "true") 21 | .option("inferSchema", "true") 22 | .load("src/main/resources/sales.csv").repartition(500) 23 | 24 | df.groupBy("customerId").count().count() 25 | 26 | //Thread.sleep(1000000) 27 | 28 | 29 | } 30 | 31 | } 32 | -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/spark/sql/adaptive/shuffle/NoAdaptiveShuffle.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.spark.sql.adaptive.shuffle 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.sql.SparkSession 5 | 6 | object NoAdaptiveShuffle { 7 | 8 | def main(args: Array[String]): Unit = { 9 | 10 | val conf = new SparkConf() 11 | .setAppName("no adaptive shuffle") 12 | .setMaster("local[2]") 13 | 14 | val sparkSession = SparkSession.builder().config(conf).getOrCreate() 15 | 16 | val df = sparkSession.read. 17 | format("csv") 18 | .option("header", "true") 19 | .option("inferSchema", "true") 20 | .load("src/main/resources/sales.csv").repartition(500) 21 | 22 | 23 | df.groupBy("customerId").count().count() 24 | 25 | 26 | //Thread.sleep(1000000) 27 | 28 | 29 | } 30 | 31 | } 32 | --------------------------------------------------------------------------------