├── .github └── workflows │ └── scala.yml ├── .gitignore ├── README.md ├── build.sbt ├── project └── build.properties └── src └── main ├── resources ├── adult.csv ├── customers.csv ├── metrics.properties ├── multicharacterseperator.csv ├── nested │ ├── a.csv │ └── folder1 │ │ └── b.csv └── sales.csv └── scala └── com └── madhukaraphatak └── spark ├── barrier ├── BarrierContextExample.scala ├── BarrierExceptionExample.scala ├── BarrierMethodExample.scala └── BarrierRddExample.scala ├── core ├── BarrierExample.scala └── plugins │ ├── custommetrics │ ├── CustomMetricExample.scala │ └── CustomMetricSparkPlugin.scala │ ├── driverplugin │ ├── CustomDriverPlugin.scala │ ├── CustomSparkPlugin.scala │ └── DriverPluginExample.scala │ ├── dynamicconfig │ ├── Configuration.scala │ ├── CustomConfigDriverPlugin.scala │ ├── CustomConfigSparkPlugin.scala │ └── DynamicConfigExample.scala │ └── rpccommunication │ ├── RpcCommunicationExample.scala │ └── RpcSparkPlugin.scala ├── ml ├── MLUtils.scala ├── MultiColumnTransformer.scala └── WeightedLogisticRegression.scala ├── sources ├── BinaryFile.scala ├── MultiCharacterDelimiterCSV.scala ├── RecursiveFolderReadExample.scala └── datasourcev2 │ ├── DataSourceV2Example.scala │ ├── SimpleCsvDataSource.scala │ ├── SimpleDataSource.scala │ ├── SimpleMultiDataSource.scala │ ├── SimpleMysqlWriterDataSource.scala │ ├── streamandbatch │ ├── DataSourceV2StreamAndBatchExample.scala │ └── SimpleStreamAndBatchDataSource.scala │ └── streaming │ ├── DataSourceV2StreamingExample.scala │ └── SimpleStreamingDataSource.scala └── sql ├── DataFrameTail.scala ├── InMemoryTableScanExample.scala ├── JoinHintsExample.scala ├── MinAndMaxByExample.scala ├── MultiColumnSampleBy.scala └── adaptive └── shuffle ├── AdaptiveShuffle.scala └── NoAdaptiveShuffle.scala /.github/workflows/scala.yml: -------------------------------------------------------------------------------- 1 | name: Scala CI 2 | 3 | on: 4 | push: 5 | branches: [ master ] 6 | pull_request: 7 | branches: [ master ] 8 | 9 | jobs: 10 | build: 11 | 12 | runs-on: ubuntu-latest 13 | 14 | steps: 15 | - uses: actions/checkout@v2 16 | - name: Set up JDK 1.8 17 | uses: actions/setup-java@v1 18 | with: 19 | java-version: 1.8 20 | - name: Compile 21 | run: sbt compile 22 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.iml 2 | .idea/ 3 | *.iml 4 | target/ 5 | project/target 6 | dependency-reduced-pom.xml 7 | *.pdf 8 | *.swp 9 | *.sw* 10 | metastore_db 11 | *.log 12 | creditcard* -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Examples for Spark 3.0 release. 
2 | 3 | # Build 4 | 5 | sbt clean package -------------------------------------------------------------------------------- /build.sbt: -------------------------------------------------------------------------------- 1 | name := "spark-three-examples" 2 | 3 | version := "0.1" 4 | 5 | scalaVersion := "2.12.4" 6 | 7 | val sparkVersion = "3.0.1" 8 | 9 | 10 | resolvers += "Spark Snapshot Repository" at "https://repository.apache.org/snapshots" 11 | 12 | libraryDependencies ++= Seq( 13 | "org.apache.spark" %% "spark-sql" % sparkVersion, 14 | "org.apache.spark" %% "spark-mllib" % sparkVersion, 15 | "mysql" % "mysql-connector-java" % "5.1.6" 16 | ) 17 | 18 | -------------------------------------------------------------------------------- /project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version = 1.3.8 -------------------------------------------------------------------------------- /src/main/resources/adult.csv: -------------------------------------------------------------------------------- 1 | age,workclass,fnlwgt,education,education_num,martial_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,salary 2 | 39, State-gov, 77516, Bachelors, 13, Never-married, Adm-clerical, Not-in-family, White, Male, 2174, 0, 40, United-States, <=50K 3 | 50, Self-emp-not-inc, 83311, Bachelors, 13, Married-civ-spouse, Exec-managerial, Husband, White, Male, 0, 0, 13, United-States, <=50K 4 | 38, Private, 215646, HS-grad, 9, Divorced, Handlers-cleaners, Not-in-family, White, Male, 0, 0, 40, United-States, <=50K 5 | 53, Private, 234721, 11th, 7, Married-civ-spouse, Handlers-cleaners, Husband, Black, Male, 0, 0, 40, United-States, <=50K 6 | 28, Private, 338409, Bachelors, 13, Married-civ-spouse, Prof-specialty, Wife, Black, Female, 0, 0, 40, Cuba, <=50K 7 | 37, Private, 284582, Masters, 14, Married-civ-spouse, Exec-managerial, Wife, White, Female, 0, 0, 40, United-States, <=50K 8 | 49, Private, 160187, 9th, 5, Married-spouse-absent, Other-service, Not-in-family, Black, Female, 0, 0, 16, Jamaica, <=50K 9 | 52, Self-emp-not-inc, 209642, HS-grad, 9, Married-civ-spouse, Exec-managerial, Husband, White, Male, 0, 0, 45, United-States, >50K 10 | 31, Private, 45781, Masters, 14, Never-married, Prof-specialty, Not-in-family, White, Female, 14084, 0, 50, United-States, >50K 11 | 42, Private, 159449, Bachelors, 13, Married-civ-spouse, Exec-managerial, Husband, White, Male, 5178, 0, 40, United-States, >50K 12 | 37, Private, 280464, Some-college, 10, Married-civ-spouse, Exec-managerial, Husband, Black, Male, 0, 0, 80, United-States, >50K 13 | 30, State-gov, 141297, Bachelors, 13, Married-civ-spouse, Prof-specialty, Husband, Asian-Pac-Islander, Male, 0, 0, 40, India, >50K 14 | 23, Private, 122272, Bachelors, 13, Never-married, Adm-clerical, Own-child, White, Female, 0, 0, 30, United-States, <=50K 15 | 32, Private, 205019, Assoc-acdm, 12, Never-married, Sales, Not-in-family, Black, Male, 0, 0, 50, United-States, <=50K 16 | 40, Private, 121772, Assoc-voc, 11, Married-civ-spouse, Craft-repair, Husband, Asian-Pac-Islander, Male, 0, 0, 40, ?, >50K 17 | 34, Private, 245487, 7th-8th, 4, Married-civ-spouse, Transport-moving, Husband, Amer-Indian-Eskimo, Male, 0, 0, 45, Mexico, <=50K 18 | 25, Self-emp-not-inc, 176756, HS-grad, 9, Never-married, Farming-fishing, Own-child, White, Male, 0, 0, 35, United-States, <=50K 19 | 32, Private, 186824, HS-grad, 9, Never-married, Machine-op-inspct, Unmarried, White, Male, 0, 0, 40, 
United-States, <=50K 20 | 38, Private, 28887, 11th, 7, Married-civ-spouse, Sales, Husband, White, Male, 0, 0, 50, United-States, <=50K 21 | 43, Self-emp-not-inc, 292175, Masters, 14, Divorced, Exec-managerial, Unmarried, White, Female, 0, 0, 45, United-States, >50K 22 | 40, Private, 193524, Doctorate, 16, Married-civ-spouse, Prof-specialty, Husband, White, Male, 0, 0, 60, United-States, >50K 23 | 54, Private, 302146, HS-grad, 9, Separated, Other-service, Unmarried, Black, Female, 0, 0, 20, United-States, <=50K 24 | 35, Federal-gov, 76845, 9th, 5, Married-civ-spouse, Farming-fishing, Husband, Black, Male, 0, 0, 40, United-States, <=50K 25 | 43, Private, 117037, 11th, 7, Married-civ-spouse, Transport-moving, Husband, White, Male, 0, 2042, 40, United-States, <=50K 26 | 59, Private, 109015, HS-grad, 9, Divorced, Tech-support, Unmarried, White, Female, 0, 0, 40, United-States, <=50K 27 | 56, Local-gov, 216851, Bachelors, 13, Married-civ-spouse, Tech-support, Husband, White, Male, 0, 0, 40, United-States, >50K 28 | 19, Private, 168294, HS-grad, 9, Never-married, Craft-repair, Own-child, White, Male, 0, 0, 40, United-States, <=50K 29 | 54, ?, 180211, Some-college, 10, Married-civ-spouse, ?, Husband, Asian-Pac-Islander, Male, 0, 0, 60, South, >50K 30 | 39, Private, 367260, HS-grad, 9, Divorced, Exec-managerial, Not-in-family, White, Male, 0, 0, 80, United-States, <=50K 31 | 49, Private, 193366, HS-grad, 9, Married-civ-spouse, Craft-repair, Husband, White, Male, 0, 0, 40, United-States, <=50K 32 | 23, Local-gov, 190709, Assoc-acdm, 12, Never-married, Protective-serv, Not-in-family, White, Male, 0, 0, 52, United-States, <=50K 33 | 20, Private, 266015, Some-college, 10, Never-married, Sales, Own-child, Black, Male, 0, 0, 44, United-States, <=50K 34 | 45, Private, 386940, Bachelors, 13, Divorced, Exec-managerial, Own-child, White, Male, 0, 1408, 40, United-States, <=50K 35 | 30, Federal-gov, 59951, Some-college, 10, Married-civ-spouse, Adm-clerical, Own-child, White, Male, 0, 0, 40, United-States, <=50K 36 | 22, State-gov, 311512, Some-college, 10, Married-civ-spouse, Other-service, Husband, Black, Male, 0, 0, 15, United-States, <=50K 37 | 48, Private, 242406, 11th, 7, Never-married, Machine-op-inspct, Unmarried, White, Male, 0, 0, 40, Puerto-Rico, <=50K 38 | 21, Private, 197200, Some-college, 10, Never-married, Machine-op-inspct, Own-child, White, Male, 0, 0, 40, United-States, <=50K 39 | 19, Private, 544091, HS-grad, 9, Married-AF-spouse, Adm-clerical, Wife, White, Female, 0, 0, 25, United-States, <=50K 40 | 31, Private, 84154, Some-college, 10, Married-civ-spouse, Sales, Husband, White, Male, 0, 0, 38, ?, >50K 41 | 48, Self-emp-not-inc, 265477, Assoc-acdm, 12, Married-civ-spouse, Prof-specialty, Husband, White, Male, 0, 0, 40, United-States, <=50K 42 | 31, Private, 507875, 9th, 5, Married-civ-spouse, Machine-op-inspct, Husband, White, Male, 0, 0, 43, United-States, <=50K 43 | 53, Self-emp-not-inc, 88506, Bachelors, 13, Married-civ-spouse, Prof-specialty, Husband, White, Male, 0, 0, 40, United-States, <=50K 44 | 24, Private, 172987, Bachelors, 13, Married-civ-spouse, Tech-support, Husband, White, Male, 0, 0, 50, United-States, <=50K 45 | 49, Private, 94638, HS-grad, 9, Separated, Adm-clerical, Unmarried, White, Female, 0, 0, 40, United-States, <=50K 46 | 25, Private, 289980, HS-grad, 9, Never-married, Handlers-cleaners, Not-in-family, White, Male, 0, 0, 35, United-States, <=50K 47 | 57, Federal-gov, 337895, Bachelors, 13, Married-civ-spouse, Prof-specialty, Husband, Black, Male, 0, 0, 40, 
United-States, >50K 48 | 53, Private, 144361, HS-grad, 9, Married-civ-spouse, Machine-op-inspct, Husband, White, Male, 0, 0, 38, United-States, <=50K 49 | 44, Private, 128354, Masters, 14, Divorced, Exec-managerial, Unmarried, White, Female, 0, 0, 40, United-States, <=50K 50 | 41, State-gov, 101603, Assoc-voc, 11, Married-civ-spouse, Craft-repair, Husband, White, Male, 0, 0, 40, United-States, <=50K 51 | 29, Private, 271466, Assoc-voc, 11, Never-married, Prof-specialty, Not-in-family, White, Male, 0, 0, 43, United-States, <=50K 52 | 25, Private, 32275, Some-college, 10, Married-civ-spouse, Exec-managerial, Wife, Other, Female, 0, 0, 40, United-States, <=50K 53 | 18, Private, 226956, HS-grad, 9, Never-married, Other-service, Own-child, White, Female, 0, 0, 30, ?, <=50K 54 | 47, Private, 51835, Prof-school, 15, Married-civ-spouse, Prof-specialty, Wife, White, Female, 0, 1902, 60, Honduras, >50K 55 | 50, Federal-gov, 251585, Bachelors, 13, Divorced, Exec-managerial, Not-in-family, White, Male, 0, 0, 55, United-States, >50K 56 | 47, Self-emp-inc, 109832, HS-grad, 9, Divorced, Exec-managerial, Not-in-family, White, Male, 0, 0, 60, United-States, <=50K 57 | 43, Private, 237993, Some-college, 10, Married-civ-spouse, Tech-support, Husband, White, Male, 0, 0, 40, United-States, >50K 58 | 46, Private, 216666, 5th-6th, 3, Married-civ-spouse, Machine-op-inspct, Husband, White, Male, 0, 0, 40, Mexico, <=50K 59 | 35, Private, 56352, Assoc-voc, 11, Married-civ-spouse, Other-service, Husband, White, Male, 0, 0, 40, Puerto-Rico, <=50K 60 | 41, Private, 147372, HS-grad, 9, Married-civ-spouse, Adm-clerical, Husband, White, Male, 0, 0, 48, United-States, <=50K 61 | 30, Private, 188146, HS-grad, 9, Married-civ-spouse, Machine-op-inspct, Husband, White, Male, 5013, 0, 40, United-States, <=50K 62 | 30, Private, 59496, Bachelors, 13, Married-civ-spouse, Sales, Husband, White, Male, 2407, 0, 40, United-States, <=50K 63 | 32, ?, 293936, 7th-8th, 4, Married-spouse-absent, ?, Not-in-family, White, Male, 0, 0, 40, ?, <=50K 64 | 48, Private, 149640, HS-grad, 9, Married-civ-spouse, Transport-moving, Husband, White, Male, 0, 0, 40, United-States, <=50K 65 | 42, Private, 116632, Doctorate, 16, Married-civ-spouse, Prof-specialty, Husband, White, Male, 0, 0, 45, United-States, >50K 66 | 29, Private, 105598, Some-college, 10, Divorced, Tech-support, Not-in-family, White, Male, 0, 0, 58, United-States, <=50K 67 | 36, Private, 155537, HS-grad, 9, Married-civ-spouse, Craft-repair, Husband, White, Male, 0, 0, 40, United-States, <=50K 68 | 28, Private, 183175, Some-college, 10, Divorced, Adm-clerical, Not-in-family, White, Female, 0, 0, 40, United-States, <=50K 69 | 53, Private, 169846, HS-grad, 9, Married-civ-spouse, Adm-clerical, Wife, White, Female, 0, 0, 40, United-States, >50K 70 | 49, Self-emp-inc, 191681, Some-college, 10, Married-civ-spouse, Exec-managerial, Husband, White, Male, 0, 0, 50, United-States, >50K 71 | 25, ?, 200681, Some-college, 10, Never-married, ?, Own-child, White, Male, 0, 0, 40, United-States, <=50K 72 | 19, Private, 101509, Some-college, 10, Never-married, Prof-specialty, Own-child, White, Male, 0, 0, 32, United-States, <=50K 73 | 31, Private, 309974, Bachelors, 13, Separated, Sales, Own-child, Black, Female, 0, 0, 40, United-States, <=50K 74 | 29, Self-emp-not-inc, 162298, Bachelors, 13, Married-civ-spouse, Sales, Husband, White, Male, 0, 0, 70, United-States, >50K 75 | 23, Private, 211678, Some-college, 10, Never-married, Machine-op-inspct, Not-in-family, White, Male, 0, 0, 40, United-States, <=50K 
76 | 79, Private, 124744, Some-college, 10, Married-civ-spouse, Prof-specialty, Other-relative, White, Male, 0, 0, 20, United-States, <=50K 77 | 27, Private, 213921, HS-grad, 9, Never-married, Other-service, Own-child, White, Male, 0, 0, 40, Mexico, <=50K 78 | 40, Private, 32214, Assoc-acdm, 12, Married-civ-spouse, Adm-clerical, Husband, White, Male, 0, 0, 40, United-States, <=50K 79 | 67, ?, 212759, 10th, 6, Married-civ-spouse, ?, Husband, White, Male, 0, 0, 2, United-States, <=50K 80 | 18, Private, 309634, 11th, 7, Never-married, Other-service, Own-child, White, Female, 0, 0, 22, United-States, <=50K 81 | 31, Local-gov, 125927, 7th-8th, 4, Married-civ-spouse, Farming-fishing, Husband, White, Male, 0, 0, 40, United-States, <=50K 82 | 18, Private, 446839, HS-grad, 9, Never-married, Sales, Not-in-family, White, Male, 0, 0, 30, United-States, <=50K 83 | 52, Private, 276515, Bachelors, 13, Married-civ-spouse, Other-service, Husband, White, Male, 0, 0, 40, Cuba, <=50K 84 | 46, Private, 51618, HS-grad, 9, Married-civ-spouse, Other-service, Wife, White, Female, 0, 0, 40, United-States, <=50K 85 | 59, Private, 159937, HS-grad, 9, Married-civ-spouse, Sales, Husband, White, Male, 0, 0, 48, United-States, <=50K 86 | 44, Private, 343591, HS-grad, 9, Divorced, Craft-repair, Not-in-family, White, Female, 14344, 0, 40, United-States, >50K 87 | 53, Private, 346253, HS-grad, 9, Divorced, Sales, Own-child, White, Female, 0, 0, 35, United-States, <=50K 88 | 49, Local-gov, 268234, HS-grad, 9, Married-civ-spouse, Protective-serv, Husband, White, Male, 0, 0, 40, United-States, >50K 89 | 33, Private, 202051, Masters, 14, Married-civ-spouse, Prof-specialty, Husband, White, Male, 0, 0, 50, United-States, <=50K 90 | 30, Private, 54334, 9th, 5, Never-married, Sales, Not-in-family, White, Male, 0, 0, 40, United-States, <=50K 91 | 43, Federal-gov, 410867, Doctorate, 16, Never-married, Prof-specialty, Not-in-family, White, Female, 0, 0, 50, United-States, >50K 92 | 57, Private, 249977, Assoc-voc, 11, Married-civ-spouse, Prof-specialty, Husband, White, Male, 0, 0, 40, United-States, <=50K 93 | 37, Private, 286730, Some-college, 10, Divorced, Craft-repair, Unmarried, White, Female, 0, 0, 40, United-States, <=50K 94 | 28, Private, 212563, Some-college, 10, Divorced, Machine-op-inspct, Unmarried, Black, Female, 0, 0, 25, United-States, <=50K 95 | 30, Private, 117747, HS-grad, 9, Married-civ-spouse, Sales, Wife, Asian-Pac-Islander, Female, 0, 1573, 35, ?, <=50K 96 | 34, Local-gov, 226296, Bachelors, 13, Married-civ-spouse, Protective-serv, Husband, White, Male, 0, 0, 40, United-States, >50K 97 | 29, Local-gov, 115585, Some-college, 10, Never-married, Handlers-cleaners, Not-in-family, White, Male, 0, 0, 50, United-States, <=50K 98 | 48, Self-emp-not-inc, 191277, Doctorate, 16, Married-civ-spouse, Prof-specialty, Husband, White, Male, 0, 1902, 60, United-States, >50K 99 | 37, Private, 202683, Some-college, 10, Married-civ-spouse, Sales, Husband, White, Male, 0, 0, 48, United-States, >50K 100 | 48, Private, 171095, Assoc-acdm, 12, Divorced, Exec-managerial, Unmarried, White, Female, 0, 0, 40, England, <=50K 101 | 32, Federal-gov, 249409, HS-grad, 9, Never-married, Other-service, Own-child, Black, Male, 0, 0, 40, United-States, <=50K 102 | 76, Private, 124191, Masters, 14, Married-civ-spouse, Exec-managerial, Husband, White, Male, 0, 0, 40, United-States, >50K 103 | -------------------------------------------------------------------------------- /src/main/resources/customers.csv: 
-------------------------------------------------------------------------------- 1 | customerId,customerName 2 | 1,John 3 | 2,Clerk 4 | 3,Micheal 5 | 4,Sample 6 | -------------------------------------------------------------------------------- /src/main/resources/metrics.properties: -------------------------------------------------------------------------------- 1 | *.sink.console.class=org.apache.spark.metrics.sink.ConsoleSink -------------------------------------------------------------------------------- /src/main/resources/multicharacterseperator.csv: -------------------------------------------------------------------------------- 1 | a||b||c||d 2 | 1||2||3||4 3 | 5||6||7||8 -------------------------------------------------------------------------------- /src/main/resources/nested/a.csv: -------------------------------------------------------------------------------- 1 | a||b||c||d 2 | 1||2||3||4 3 | 5||6||7||8 -------------------------------------------------------------------------------- /src/main/resources/nested/folder1/b.csv: -------------------------------------------------------------------------------- 1 | a||b||c||d 2 | 1||2||3||4 3 | 5||6||7||8 -------------------------------------------------------------------------------- /src/main/resources/sales.csv: -------------------------------------------------------------------------------- 1 | transactionId,customerId,itemId,amountPaid 2 | 111,1,1,100.0 3 | 112,2,2,505.0 4 | 113,3,3,510.0 5 | 114,4,4,600.0 6 | 115,1,2,500.0 7 | 116,1,2,500.0 8 | 117,1,2,500.0 9 | 118,1,2,500.0 10 | 119,2,3,500.0 11 | 120,1,2,500.0 12 | 121,1,4,500.0 13 | 122,1,2,500.0 14 | 123,1,4,500.0 15 | 124,1,2,500.0 -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/spark/barrier/BarrierContextExample.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.spark.barrier 2 | 3 | import org.apache.spark.{BarrierTaskContext, TaskContext} 4 | import org.apache.spark.sql.SparkSession 5 | 6 | object BarrierContextExample { 7 | 8 | def main(args: Array[String]): Unit = { 9 | 10 | val sparkSession = SparkSession.builder. 11 | master("local[4]") 12 | .appName("example") 13 | .getOrCreate() 14 | 15 | val df = sparkSession.range(0,100).repartition(4) 16 | 17 | val barrierRdd = df.rdd.barrier() 18 | 19 | val mappedRDD = barrierRdd.mapPartitionsWithIndex{ case (index,iterator) => { 20 | val taskContext = BarrierTaskContext.get() 21 | val taskInfos = taskContext.getTaskInfos().map(_.address) 22 | println(taskInfos) 23 | iterator 24 | }} 25 | 26 | mappedRDD.count() 27 | 28 | } 29 | 30 | } 31 | -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/spark/barrier/BarrierExceptionExample.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.spark.barrier 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | object BarrierExceptionExample { 6 | 7 | def main(args: Array[String]): Unit = { 8 | 9 | val sparkSession = SparkSession.builder. 
10 | master("local") 11 | .appName("example") 12 | .getOrCreate() 13 | 14 | val df = sparkSession.range(0,100).repartition(4) 15 | 16 | val barrierRdd = df.rdd.barrier() 17 | 18 | //fails running as it needs minimum four cores for four partitions 19 | val count = barrierRdd.mapPartitions(v => v).count() 20 | 21 | println("count is " + count) 22 | 23 | } 24 | 25 | } 26 | -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/spark/barrier/BarrierMethodExample.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.spark.barrier 2 | 3 | import org.apache.spark.BarrierTaskContext 4 | import org.apache.spark.sql.SparkSession 5 | 6 | object BarrierMethodExample { 7 | 8 | def main(args: Array[String]): Unit = { 9 | 10 | val sparkSession = SparkSession.builder. 11 | master("local[4]") 12 | .appName("example") 13 | .getOrCreate() 14 | 15 | val df = sparkSession.range(0,100).repartition(4) 16 | 17 | val barrierRdd = df.rdd.barrier() 18 | 19 | val mappedRDD = barrierRdd.mapPartitionsWithIndex{ case (index,iterator) => { 20 | val taskContext = BarrierTaskContext.get() 21 | taskContext.barrier() 22 | println("barrier context completed") 23 | iterator 24 | }} 25 | 26 | mappedRDD.count() 27 | 28 | } 29 | 30 | } 31 | -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/spark/barrier/BarrierRddExample.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.spark.barrier 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | object BarrierRddExample { 6 | 7 | def main(args: Array[String]): Unit = { 8 | 9 | val sparkSession = SparkSession.builder. 10 | master("local[4]") 11 | .appName("example") 12 | .getOrCreate() 13 | 14 | val df = sparkSession.range(0,100).repartition(4) 15 | 16 | val barrierRdd = df.rdd.barrier() 17 | 18 | val count = barrierRdd.mapPartitions(v => v).count() 19 | 20 | println("count is " + count) 21 | 22 | } 23 | 24 | } 25 | -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/spark/core/BarrierExample.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.spark.core 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | object BarrierExample { 6 | 7 | def main(args: Array[String]): Unit = { 8 | 9 | val sparkSession = SparkSession.builder(). 
10 | appName("simple").master("local[4]").getOrCreate() 11 | 12 | val df = sparkSession.range(100).repartition(2) 13 | 14 | //run barrier mode 15 | 16 | val barrierRDD = df.rdd.barrier() 17 | 18 | barrierRDD.mapPartitionsWithIndex{ 19 | case (index,value) => { 20 | //first wait for 10s 21 | Thread.sleep(10000) 22 | if(index == 1) throw new IllegalArgumentException 23 | Thread.sleep(100000) 24 | value 25 | }}.count() 26 | 27 | 28 | 29 | 30 | } 31 | 32 | } 33 | -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/spark/core/plugins/custommetrics/CustomMetricExample.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.spark.core.plugins.custommetrics 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.sql.SparkSession 5 | 6 | object CustomMetricExample { 7 | 8 | def main(args: Array[String]): Unit = { 9 | 10 | val sparkConf = new SparkConf() 11 | .setMaster("local[4]") 12 | .set("spark.plugins","com.madhukaraphatak.spark.core.plugins.custommetrics.CustomMetricSparkPlugin") 13 | .set("spark.metrics.conf","src/main/resources/metrics.properties") 14 | .setAppName("executor plugin example") 15 | 16 | 17 | val sparkSession = SparkSession.builder.config(sparkConf).getOrCreate() 18 | 19 | import sparkSession.implicits._ 20 | 21 | val df = sparkSession.range(5000).repartition(5) 22 | 23 | val incrementedDf = df.mapPartitions(iterator => { 24 | var evenCount = 0 25 | val incrementedIterator = iterator.toList.map(value => { 26 | if(value % 2 == 0) evenCount = evenCount +1 27 | value +1 28 | }).toIterator 29 | CustomMetricSparkPlugin.value.inc(evenCount) 30 | incrementedIterator 31 | }) 32 | 33 | 34 | incrementedDf.count() 35 | 36 | 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/spark/core/plugins/custommetrics/CustomMetricSparkPlugin.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.spark.core.plugins.custommetrics 2 | 3 | import java.util 4 | 5 | import com.codahale.metrics.Counter 6 | import org.apache.spark.api.plugin.{DriverPlugin, ExecutorPlugin, PluginContext, SparkPlugin} 7 | 8 | object CustomMetricSparkPlugin { 9 | val value = new Counter 10 | } 11 | 12 | class CustomMetricSparkPlugin extends SparkPlugin{ 13 | 14 | override def driverPlugin(): DriverPlugin = null 15 | override def executorPlugin(): ExecutorPlugin = new ExecutorPlugin { 16 | override def init(ctx: PluginContext, extraConf: util.Map[String, String]): Unit = { 17 | val metricRegistry = ctx.metricRegistry() 18 | metricRegistry.register("evenMetrics",CustomMetricSparkPlugin.value) 19 | } 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/spark/core/plugins/driverplugin/CustomDriverPlugin.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.spark.core.plugins.driverplugin 2 | import java.net.ServerSocket 3 | import java.util 4 | 5 | import org.apache.spark.SparkContext 6 | import org.apache.spark.api.plugin.{DriverPlugin, PluginContext} 7 | import org.apache.spark.sql.SparkSession 8 | 9 | class CustomDriverPlugin extends DriverPlugin{ 10 | 11 | var sparkContext:SparkContext =null 12 | var runningThread:Thread = null 13 | 14 | class ServerSocketListener { 15 | var port = 9999 16 | 
val listener = new ServerSocket(port) 17 | while (true) { 18 | val socket = listener.accept() 19 | new Thread(){ 20 | override def run(): Unit = { 21 | println(" got client " + socket.getInetAddress) 22 | val sparkSession = SparkSession.builder().getOrCreate() 23 | sparkSession.catalog.uncacheTable("test") 24 | } 25 | socket.close() 26 | }.start() 27 | } 28 | } 29 | 30 | override def init(sc: SparkContext, pluginContext: PluginContext): util.Map[String, String] = { 31 | println("########### called init of custom driver plugin") 32 | this.sparkContext =sparkContext 33 | 34 | runningThread = new Thread(){ 35 | override def run(): Unit = { 36 | new ServerSocketListener() 37 | } 38 | } 39 | runningThread.start() 40 | 41 | super.init(sc, pluginContext) 42 | } 43 | override def shutdown(): Unit = { 44 | println("############ called shutdown") 45 | runningThread.interrupt() 46 | System.exit(0) 47 | super.shutdown() 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/spark/core/plugins/driverplugin/CustomSparkPlugin.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.spark.core.plugins.driverplugin 2 | 3 | import org.apache.spark.api.plugin.{DriverPlugin, ExecutorPlugin, SparkPlugin} 4 | 5 | class CustomSparkPlugin extends SparkPlugin{ 6 | override def driverPlugin(): DriverPlugin = new CustomDriverPlugin 7 | 8 | override def executorPlugin(): ExecutorPlugin = null 9 | } 10 | -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/spark/core/plugins/driverplugin/DriverPluginExample.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.spark.core.plugins.driverplugin 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.sql.SparkSession 5 | 6 | object DriverPluginExample { 7 | 8 | def main(args: Array[String]): Unit = { 9 | 10 | val sparkConf = new SparkConf() 11 | .setMaster("local[2]") 12 | .set("spark.plugins","com.madhukaraphatak.spark.core.plugins.driverplugin.CustomSparkPlugin") 13 | .setAppName("executor plugin example") 14 | 15 | 16 | val sparkSession = SparkSession.builder.config(sparkConf).getOrCreate() 17 | val df = sparkSession.range(5000) 18 | 19 | //cache the table 20 | df.createOrReplaceTempView("test") 21 | sparkSession.catalog.cacheTable("test") 22 | 23 | df.count() 24 | 25 | Thread.sleep(10000) 26 | 27 | sparkSession.stop() 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/spark/core/plugins/dynamicconfig/Configuration.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.spark.core.plugins.dynamicconfig 2 | 3 | object Configuration { 4 | 5 | private var value = 10 6 | 7 | def getConfig: Int = value 8 | 9 | def changeConfig(newValue : Int):Int = {value = newValue; value} 10 | 11 | } 12 | -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/spark/core/plugins/dynamicconfig/CustomConfigDriverPlugin.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.spark.core.plugins.dynamicconfig 2 | 3 | import java.io.PrintWriter 4 | import java.net.ServerSocket 5 | import java.util 6 | 7 | import org.apache.spark.SparkContext 8 
| import org.apache.spark.api.plugin.{DriverPlugin, PluginContext} 9 | 10 | 11 | class CustomConfigDriverPlugin extends DriverPlugin { 12 | 13 | var sparkContext: SparkContext = null 14 | var runningThread: Thread = null 15 | 16 | class ServerSocketListener { 17 | var port = 9999 18 | val listener = new ServerSocket(port) 19 | while (true) { 20 | val socket = listener.accept() 21 | new Thread() { 22 | override def run(): Unit = { 23 | val currentValue = Configuration.getConfig 24 | Configuration.changeConfig(currentValue + 10) 25 | val response = "HTTP/1.1 200 OK \r\n\r\n "+s" the latest configuration is ${Configuration.getConfig}" 26 | socket.getOutputStream().write(response.getBytes("UTF-8")) 27 | socket.getOutputStream.flush() 28 | socket.close() 29 | } 30 | }.start() 31 | } 32 | } 33 | 34 | override def init(sc: SparkContext, pluginContext: PluginContext): util.Map[String, String] = { 35 | this.sparkContext = sparkContext 36 | 37 | runningThread = new Thread() { 38 | override def run(): Unit = { 39 | new ServerSocketListener() 40 | } 41 | } 42 | runningThread.start() 43 | 44 | super.init(sc, pluginContext) 45 | } 46 | 47 | override def shutdown(): Unit = { 48 | runningThread.interrupt() 49 | System.exit(0) 50 | super.shutdown() 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/spark/core/plugins/dynamicconfig/CustomConfigSparkPlugin.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.spark.core.plugins.dynamicconfig 2 | 3 | import org.apache.spark.api.plugin.{DriverPlugin, ExecutorPlugin, SparkPlugin} 4 | 5 | class CustomConfigSparkPlugin extends SparkPlugin{ 6 | override def driverPlugin(): DriverPlugin = new CustomConfigDriverPlugin 7 | 8 | override def executorPlugin(): ExecutorPlugin = null 9 | } 10 | -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/spark/core/plugins/dynamicconfig/DynamicConfigExample.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.spark.core.plugins.dynamicconfig 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.sql.SparkSession 5 | import org.apache.spark.sql.streaming.{OutputMode, Trigger} 6 | 7 | object DynamicConfigExample { 8 | 9 | def main(args: Array[String]): Unit = { 10 | 11 | val sparkConf = new SparkConf() 12 | .setMaster("local[2]") 13 | .set("spark.plugins","com.madhukaraphatak.spark.core.plugins.dynamicconfig.CustomConfigSparkPlugin") 14 | .setAppName("executor plugin example") 15 | 16 | 17 | 18 | val sparkSession = SparkSession.builder.config(sparkConf).getOrCreate() 19 | 20 | sparkSession.sparkContext.setLogLevel("ERROR") 21 | 22 | import sparkSession.implicits._ 23 | 24 | val df = sparkSession.readStream 25 | .format("socket") 26 | .option("host","localhost") 27 | .option("port",8888).load().as[String] 28 | 29 | val returnDf = df.map(value => value + Configuration.getConfig) 30 | 31 | val query = returnDf.writeStream. 
32 | queryName("something") 33 | .format("console") 34 | .outputMode(OutputMode.Append()) 35 | 36 | query.start().awaitTermination() 37 | 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/spark/core/plugins/rpccommunication/RpcCommunicationExample.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.spark.core.plugins.rpccommunication 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.sql.SparkSession 5 | 6 | object RpcCommunicationExample { 7 | 8 | def main(args: Array[String]): Unit = { 9 | 10 | val sparkConf = new SparkConf() 11 | .setMaster("local[2]") 12 | .set("spark.plugins","com.madhukaraphatak.spark.core.plugins.rpccommunication.RpcSparkPlugin") 13 | .setAppName("rpc communication example") 14 | 15 | val sparkSession = SparkSession.builder.config(sparkConf).getOrCreate() 16 | sparkSession.stop() 17 | 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/spark/core/plugins/rpccommunication/RpcSparkPlugin.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.spark.core.plugins.rpccommunication 2 | 3 | import java.util 4 | 5 | import com.codahale.metrics.Counter 6 | import org.apache.spark.api.plugin.{DriverPlugin, ExecutorPlugin, PluginContext, SparkPlugin} 7 | 8 | 9 | case object InitialConfigRequest extends Serializable 10 | case class InitialConfigResponse(value:Int) extends Serializable 11 | 12 | case class FinalValueResponse(value : Int) extends Serializable 13 | case class RpcMessage(message:String) extends Serializable 14 | 15 | 16 | class RpcSparkPlugin extends SparkPlugin{ 17 | override def driverPlugin(): DriverPlugin = new DriverPlugin { 18 | override def receive(message: scala.Any): AnyRef = { 19 | message match { 20 | case InitialConfigRequest => InitialConfigResponse(10) 21 | case FinalValueResponse(value) => println("the final value is "+ value); Unit 22 | 23 | } 24 | } 25 | } 26 | 27 | override def executorPlugin(): ExecutorPlugin = new ExecutorPlugin { 28 | var pluginContext:PluginContext = null 29 | var initialConfiguration:Int = 0 30 | 31 | override def init(ctx: PluginContext, extraConf: util.Map[String, String]): Unit = { 32 | pluginContext = ctx 33 | initialConfiguration = pluginContext.ask(InitialConfigRequest).asInstanceOf[InitialConfigResponse].value 34 | println("the initial configuration is " + initialConfiguration) 35 | } 36 | 37 | override def shutdown(): Unit = { 38 | val rpcMessage = FinalValueResponse(10 * initialConfiguration) 39 | pluginContext.send(rpcMessage) 40 | } 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/spark/ml/MLUtils.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.spark.ml 2 | 3 | import org.apache.spark.mllib.evaluation.MulticlassMetrics 4 | import org.apache.spark.sql.DataFrame 5 | 6 | object MLUtils { 7 | 8 | def accuracyScore(df: DataFrame, label: String, predictCol: String) = { 9 | val rdd = df.select(predictCol,label).rdd.map(row ⇒ (row.getDouble(0), row.getInt(1).toDouble)) 10 | new MulticlassMetrics(rdd).accuracy 11 | } 12 | def recall(df: DataFrame, labelCol: String, predictCol: String, labelValue:Double) = { 13 | val rdd = 
df.select(predictCol,labelCol).rdd.map(row ⇒ (row.getDouble(0), row.getInt(1).toDouble)) 14 | new MulticlassMetrics(rdd).recall(labelValue) 15 | } 16 | 17 | def trainTestSplit(df:DataFrame, testSize:Double = 0.3):(DataFrame,DataFrame) = { 18 | val dfs = df.randomSplit(Array(1-testSize, testSize)) 19 | val trainDf = dfs(0) 20 | val crossDf = dfs(1) 21 | (trainDf,crossDf) 22 | } 23 | 24 | } 25 | 26 | -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/spark/ml/MultiColumnTransformer.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.spark.ml 2 | 3 | import org.apache.spark.ml.feature.StringIndexer 4 | import org.apache.spark.sql.SparkSession 5 | 6 | object MultiColumnTransformer { 7 | def main(args: Array[String]): Unit = { 8 | 9 | val sparkSession = SparkSession.builder. 10 | master("local") 11 | .appName("example") 12 | .getOrCreate() 13 | 14 | 15 | val salaryDf = sparkSession.read.format("csv").option("header", "true").load("src/main/resources/adult.csv") 16 | 17 | val inputColumns = Array("workclass","education") 18 | 19 | val outputColumns = Array("workclass_indexed", "education_indexed") 20 | 21 | 22 | // indexer multiple column 23 | val stringIndexer = new StringIndexer() 24 | stringIndexer.setInputCols(inputColumns) 25 | stringIndexer.setOutputCols(outputColumns) 26 | val indexedDf = stringIndexer.fit(salaryDf).transform(salaryDf) 27 | 28 | indexedDf.select(outputColumns.head, outputColumns.tail:_*).show() 29 | 30 | } 31 | 32 | } 33 | -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/spark/ml/WeightedLogisticRegression.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.spark.ml 2 | 3 | import org.apache.spark.ml._ 4 | import org.apache.spark.ml.classification.LogisticRegression 5 | import org.apache.spark.ml.feature._ 6 | import org.apache.spark.sql.{DataFrame, SparkSession} 7 | import MLUtils._ 8 | import org.apache.spark.sql.functions._ 9 | 10 | /** 11 | * Weighted Logistic Regression for Credit Card Fraud 12 | * 13 | */ 14 | object WeightedLogisticRegression { 15 | 16 | def main(args: Array[String]) { 17 | 18 | val sparkSession = SparkSession.builder. 
19 | master("local[4]") 20 | .appName("example") 21 | .getOrCreate() 22 | 23 | sparkSession.sparkContext.setLogLevel("ERROR") 24 | //load train df 25 | // Download the data from : https://www.kaggle.com/dalpozz/creditcardfraud/downloads/creditcard.csv 26 | val df = sparkSession.read.option("header", "true").option("inferSchema", "true").csv("src/main/resources/creditcard.csv") 27 | df.printSchema() 28 | 29 | val amountVectorAssembler = new VectorAssembler().setInputCols(Array("Amount")).setOutputCol("Amount_vector") 30 | val standarScaler = new StandardScaler().setInputCol("Amount_vector").setOutputCol("Amount_scaled") 31 | val dropColumns = Array("Time","Amount","Class") 32 | 33 | val cols = df.columns.filter( column => !dropColumns.contains(column)) ++ Array("Amount_scaled") 34 | val vectorAssembler = new VectorAssembler().setInputCols(cols).setOutputCol("features") 35 | 36 | // pipeline 37 | val logisticRegression = new LogisticRegression().setLabelCol("Class") 38 | val trainPipeline = new Pipeline().setStages(Array(amountVectorAssembler,standarScaler,vectorAssembler,logisticRegression)) 39 | 40 | println("for imbalanced data") 41 | runPipeline(trainPipeline, df) 42 | 43 | // add weight column 44 | val ratioOfFraud = getRatio(df) 45 | val fraudWeight = 1 - ratioOfFraud 46 | val nonFraudWeight = ratioOfFraud 47 | 48 | val weightedDF = df.withColumn("weight", 49 | when(df.col("Class").===("1.0"),fraudWeight) 50 | .otherwise(nonFraudWeight)) 51 | 52 | logisticRegression.setWeightCol("weight") 53 | println("for balanced data") 54 | val balancedModel = runPipeline(trainPipeline, weightedDF) 55 | 56 | println("balanced model for full data") 57 | printScores(balancedModel, weightedDF) 58 | 59 | } 60 | 61 | 62 | def getRatio(df:DataFrame) = { 63 | val fraudDf = df.filter("Class=1.0") 64 | val sampleRatio = fraudDf.count().toDouble / df.count().toDouble 65 | sampleRatio 66 | } 67 | 68 | def runPipeline(pipeline:Pipeline, df:DataFrame):PipelineModel = { 69 | val (trainDf,crossDf) = trainTestSplit(df) 70 | val model = pipeline.fit(trainDf) 71 | printScores(model, crossDf) 72 | model 73 | } 74 | 75 | def printScores(model:PipelineModel, df:DataFrame) = { 76 | println("test accuracy with pipeline " + accuracyScore(model.transform(df), "Class", "prediction")) 77 | println("test recall for 1.0 is " + recall(model.transform(df), "Class", "prediction", 1.0)) 78 | } 79 | } 80 | 81 | -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/spark/sources/BinaryFile.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.spark.sources 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | object BinaryFile { 6 | 7 | def main(args: Array[String]): Unit = { 8 | 9 | val sparkSession = SparkSession.builder(). 
10 | appName("simple").master("local").getOrCreate() 11 | 12 | val df = sparkSession.read.format("binaryFile") 13 | .load("/home/madhu/Downloads/IMG_20190506_210110-EFFECTS.jpg") 14 | 15 | df.select("content").show() 16 | 17 | } 18 | 19 | } 20 | -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/spark/sources/MultiCharacterDelimiterCSV.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.spark.sources 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | object MultiCharacterDelimiterCSV { 6 | 7 | def main(args: Array[String]): Unit = { 8 | 9 | val sparkSession = SparkSession.builder. 10 | master("local") 11 | .appName("csvexample") 12 | .getOrCreate() 13 | 14 | 15 | // throws java.lang.IllegalArgumentException: Delimiter cannot be more than one character: || 16 | // in spark 2.x 17 | 18 | val df = sparkSession.read 19 | .option("delimiter","||") 20 | .option("header","true") 21 | .csv("src/main/resources/multicharacterseperator.csv") 22 | 23 | df.show() 24 | 25 | } 26 | 27 | } 28 | -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/spark/sources/RecursiveFolderReadExample.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.spark.sources 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | object RecursiveFolderReadExample { 6 | 7 | def main(args: Array[String]): Unit = { 8 | 9 | val sparkSession = SparkSession.builder. 10 | master("local") 11 | .appName("csvexample") 12 | .getOrCreate() 13 | 14 | 15 | // normal read 16 | 17 | val df = sparkSession.read 18 | .option("delimiter","||") 19 | .option("header","true") 20 | .csv("src/main/resources/nested") 21 | 22 | assert(df.count() == 2) 23 | 24 | // recursive read 25 | val recursiveDf = sparkSession.read 26 | .option("delimiter","||") 27 | .option("recursiveFileLookup","true") 28 | .option("header","true") 29 | .csv("src/main/resources/nested") 30 | 31 | assert(recursiveDf.count() == 4) 32 | 33 | } 34 | 35 | } 36 | -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/spark/sources/datasourcev2/DataSourceV2Example.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.spark.sources.datasourcev2 2 | 3 | import org.apache.spark.Partition 4 | import org.apache.spark.sql.{SaveMode, SparkSession} 5 | import shapeless.Tuple 6 | 7 | object DataSourceV2Example { 8 | 9 | def main(args: Array[String]) { 10 | 11 | val sparkSession = SparkSession.builder 12 | .master("local[2]") 13 | .appName("example") 14 | .getOrCreate() 15 | 16 | val simpleDf = sparkSession.read 17 | .format("com.madhukaraphatak.spark.sources.datasourcev2.simple") 18 | .load() 19 | 20 | simpleDf.show() 21 | println( 22 | "number of partitions in simple source is " + simpleDf.rdd.getNumPartitions) 23 | 24 | 25 | val simpleMultiDf = sparkSession.read 26 | .format("com.madhukaraphatak.spark.sources.datasourcev2.simplemulti") 27 | .load() 28 | 29 | simpleMultiDf.show() 30 | println( 31 | "number of partitions in simple multi source is " + simpleMultiDf.rdd.getNumPartitions) 32 | 33 | 34 | val simpleCsvDf = sparkSession.read 35 | .format("com.madhukaraphatak.spark.sources.datasourcev2.simplecsv") 36 | .load("src/main/resources/adult.csv") 37 | 38 | simpleCsvDf.printSchema() 39 | 
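// the csv source infers its schema from the header row of adult.csv
// (see SchemaUtils.getSchema in SimpleCsvDataSource); every column is read as StringType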
simpleCsvDf.show() 40 | println( 41 | "number of partitions in simple csv source is " + simpleCsvDf.rdd.getNumPartitions) 42 | 43 | 44 | 45 | 46 | val simpleMysqlDf = sparkSession.createDataFrame(Seq( 47 | Tuple1("test1"), 48 | Tuple1("test2") 49 | )).toDF("user") 50 | 51 | //write examples 52 | simpleMysqlDf.write 53 | .format( 54 | "com.madhukaraphatak.spark.sources.datasourcev2.simplemysqlwriter") 55 | .mode(SaveMode.Append) 56 | .save() 57 | 58 | /* simpleMysqlDf.write 59 | .format( 60 | "com.madhukaraphatak.examples.sparktwo.datasourcev2.mysqlwithtransaction") 61 | .save() 62 | 63 | val simplePartitoningDf = sparkSession.read 64 | .format( 65 | "com.madhukaraphatak.examples.sparktwo.datasourcev2.partitionaffinity") 66 | .load() 67 | 68 | val dfRDD = simplePartitoningDf.rdd 69 | val baseRDD = 70 | dfRDD.dependencies.head.rdd.dependencies.head.rdd.dependencies.head.rdd 71 | 72 | val partition = baseRDD.partitions(0) 73 | val getPrefferedLocationDef = baseRDD.getClass 74 | .getMethod("getPreferredLocations", classOf[Partition]) 75 | val preferredLocation = getPrefferedLocationDef 76 | .invoke(baseRDD, partition) 77 | .asInstanceOf[Seq[String]] 78 | println("preferred location is " + preferredLocation) 79 | 80 | */ 81 | 82 | sparkSession.stop() 83 | 84 | } 85 | } 86 | -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/spark/sources/datasourcev2/SimpleCsvDataSource.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.spark.sources.datasourcev2.simplecsv 2 | 3 | import java.util 4 | 5 | import org.apache.spark.sql.SparkSession 6 | import org.apache.spark.sql.catalyst.InternalRow 7 | import org.apache.spark.sql.connector.catalog.{SupportsRead, Table, TableCapability, TableProvider} 8 | import org.apache.spark.sql.connector.expressions.Transform 9 | import org.apache.spark.sql.connector.read._ 10 | import org.apache.spark.sql.types.{StringType, StructField, StructType} 11 | import org.apache.spark.sql.util.CaseInsensitiveStringMap 12 | import org.apache.spark.unsafe.types.UTF8String 13 | 14 | import scala.collection.JavaConverters._ 15 | 16 | /* 17 | * Default source should some kind of relation provider 18 | */ 19 | class DefaultSource extends TableProvider{ 20 | 21 | override def inferSchema(caseInsensitiveStringMap: CaseInsensitiveStringMap): StructType = 22 | getTable(null,Array.empty[Transform],caseInsensitiveStringMap.asCaseSensitiveMap()).schema() 23 | 24 | override def getTable(structType: StructType, transforms: Array[Transform], map: util.Map[String, String]): Table ={ 25 | val path = map.get("path") 26 | new CsvBatchTable(path) 27 | } 28 | 29 | } 30 | 31 | object SchemaUtils { 32 | def getSchema(path:String):StructType = { 33 | val sparkContext = SparkSession.builder.getOrCreate().sparkContext 34 | val firstLine = sparkContext.textFile(path).first() 35 | val columnNames = firstLine.split(",") 36 | val structFields = columnNames.map(value ⇒ StructField(value, StringType)) 37 | StructType(structFields) 38 | } 39 | } 40 | /* 41 | Defines Read Support and Initial Schema 42 | */ 43 | 44 | class CsvBatchTable(path:String) extends Table with SupportsRead { 45 | override def name(): String = this.getClass.toString 46 | 47 | override def schema(): StructType = SchemaUtils.getSchema(path) 48 | 49 | override def capabilities(): util.Set[TableCapability] = Set(TableCapability.BATCH_READ).asJava 50 | 51 | override def newScanBuilder(options: 
CaseInsensitiveStringMap): ScanBuilder = new CsvScanBuilder(path) 52 | } 53 | 54 | 55 | /* 56 | Scan object with no mixins 57 | */ 58 | class CsvScanBuilder(path:String) extends ScanBuilder { 59 | override def build(): Scan = new CsvScan(path) 60 | } 61 | 62 | 63 | // simple class to organise the partition 64 | case class CsvPartition(val partitionNumber:Int, path:String, header:Boolean=true) extends InputPartition 65 | 66 | 67 | /* 68 | Batch Reading Support 69 | 70 | The schema is repeated here as it can change after column pruning etc 71 | */ 72 | 73 | class CsvScan(path:String) extends Scan with Batch{ 74 | override def readSchema(): StructType = SchemaUtils.getSchema(path) 75 | 76 | override def toBatch: Batch = this 77 | 78 | override def planInputPartitions(): Array[InputPartition] = { 79 | val sparkContext = SparkSession.builder.getOrCreate().sparkContext 80 | val rdd = sparkContext.textFile(path) 81 | val partitions = ( 0 to rdd.partitions.length - 1).map(value => CsvPartition(value, path)) 82 | partitions.toArray 83 | 84 | } 85 | override def createReaderFactory(): PartitionReaderFactory = new CsvPartitionReaderFactory() 86 | } 87 | 88 | 89 | // reader factory 90 | class CsvPartitionReaderFactory extends PartitionReaderFactory { 91 | override def createReader(partition: InputPartition): PartitionReader[InternalRow] = new 92 | CsvPartitionReader(partition.asInstanceOf[CsvPartition]) 93 | } 94 | 95 | 96 | // parathion reader 97 | class CsvPartitionReader(inputPartition: CsvPartition) extends PartitionReader[InternalRow] { 98 | 99 | var iterator: Iterator[String] = null 100 | 101 | @transient 102 | def next = { 103 | if (iterator == null) { 104 | val sparkContext = SparkSession.builder.getOrCreate().sparkContext 105 | val rdd = sparkContext.textFile(inputPartition.path) 106 | val filterRDD = if (inputPartition.header) { 107 | val firstLine = rdd.first 108 | rdd.filter(_ != firstLine) 109 | } 110 | else rdd 111 | val partition = filterRDD.partitions(inputPartition.partitionNumber) 112 | iterator = filterRDD.iterator(partition, org.apache.spark.TaskContext.get()) 113 | } 114 | iterator.hasNext 115 | } 116 | 117 | def get = { 118 | val line = iterator.next() 119 | InternalRow.fromSeq(line.split(",").map(value => UTF8String.fromString(value))) 120 | } 121 | 122 | def close() = Unit 123 | 124 | } 125 | 126 | 127 | 128 | 129 | -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/spark/sources/datasourcev2/SimpleDataSource.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.spark.sources.datasourcev2.simple 2 | 3 | import java.util 4 | 5 | import org.apache.spark.sql.catalyst.InternalRow 6 | import org.apache.spark.sql.connector.catalog.{SupportsRead, Table, TableCapability, TableProvider} 7 | import org.apache.spark.sql.connector.expressions.Transform 8 | import org.apache.spark.sql.connector.read._ 9 | import org.apache.spark.sql.types.{StringType, StructField, StructType} 10 | import org.apache.spark.sql.util.CaseInsensitiveStringMap 11 | import org.apache.spark.unsafe.types.UTF8String 12 | 13 | import scala.collection.JavaConverters._ 14 | 15 | /* 16 | * Default source should some kind of relation provider 17 | */ 18 | class DefaultSource extends TableProvider{ 19 | 20 | override def inferSchema(caseInsensitiveStringMap: CaseInsensitiveStringMap): StructType = 21 | getTable(null,Array.empty[Transform],caseInsensitiveStringMap.asCaseSensitiveMap()).schema() 
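// Note: the read path below chains TableProvider -> SimpleBatchTable (SupportsRead)
// -> SimpleScanBuilder -> SimpleScan (Scan with Batch) -> SimplePartition
// -> SimplePartitionReaderFactory -> SimplePartitionReader, which emits the
// hard-coded values "1".."5" from a single partition. The source is loaded with
// spark.read.format("com.madhukaraphatak.spark.sources.datasourcev2.simple").load()
// (see DataSourceV2Example).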
22 | 23 | override def getTable(structType: StructType, transforms: Array[Transform], map: util.Map[String, String]): Table = 24 | new SimpleBatchTable() 25 | } 26 | 27 | 28 | /* 29 | Defines Read Support and Initial Schema 30 | */ 31 | 32 | class SimpleBatchTable extends Table with SupportsRead { 33 | override def name(): String = this.getClass.toString 34 | 35 | override def schema(): StructType = StructType(Array(StructField("value", StringType))) 36 | 37 | override def capabilities(): util.Set[TableCapability] = Set(TableCapability.BATCH_READ).asJava 38 | 39 | override def newScanBuilder(options: CaseInsensitiveStringMap): ScanBuilder = new SimpleScanBuilder() 40 | } 41 | 42 | 43 | 44 | /* 45 | Scan object with no mixins 46 | */ 47 | class SimpleScanBuilder extends ScanBuilder { 48 | override def build(): Scan = new SimpleScan 49 | } 50 | 51 | /* 52 | Batch Reading Support 53 | 54 | The schema is repeated here as it can change after column pruning etc 55 | */ 56 | 57 | class SimpleScan extends Scan with Batch{ 58 | override def readSchema(): StructType = StructType(Array(StructField("value", StringType))) 59 | 60 | override def toBatch: Batch = this 61 | 62 | override def planInputPartitions(): Array[InputPartition] = { 63 | Array(new SimplePartition()) 64 | } 65 | override def createReaderFactory(): PartitionReaderFactory = new SimplePartitionReaderFactory() 66 | } 67 | 68 | // simple class to organise the partition 69 | class SimplePartition extends InputPartition 70 | 71 | // reader factory 72 | class SimplePartitionReaderFactory extends PartitionReaderFactory { 73 | override def createReader(partition: InputPartition): PartitionReader[InternalRow] = new SimplePartitionReader 74 | } 75 | 76 | 77 | // parathion reader 78 | class SimplePartitionReader extends PartitionReader[InternalRow] { 79 | 80 | val values = Array("1", "2", "3", "4", "5") 81 | 82 | var index = 0 83 | 84 | def next = index < values.length 85 | 86 | def get = { 87 | val stringValue = values(index) 88 | val stringUtf = UTF8String.fromString(stringValue) 89 | val row = InternalRow(stringUtf) 90 | index = index + 1 91 | row 92 | } 93 | 94 | def close() = Unit 95 | 96 | } 97 | 98 | 99 | 100 | 101 | -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/spark/sources/datasourcev2/SimpleMultiDataSource.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.spark.sources.datasourcev2.simplemulti 2 | 3 | import java.util 4 | 5 | import org.apache.spark.sql.catalyst.InternalRow 6 | import org.apache.spark.sql.connector.catalog.{SupportsRead, Table, TableCapability, TableProvider} 7 | import org.apache.spark.sql.connector.expressions.Transform 8 | import org.apache.spark.sql.connector.read._ 9 | import org.apache.spark.sql.types.{StringType, StructField, StructType} 10 | import org.apache.spark.sql.util.CaseInsensitiveStringMap 11 | import org.apache.spark.unsafe.types.UTF8String 12 | 13 | import scala.collection.JavaConverters._ 14 | 15 | /* 16 | * Default source should some kind of relation provider 17 | */ 18 | class DefaultSource extends TableProvider{ 19 | 20 | override def inferSchema(caseInsensitiveStringMap: CaseInsensitiveStringMap): StructType = 21 | getTable(null,Array.empty[Transform],caseInsensitiveStringMap.asCaseSensitiveMap()).schema() 22 | 23 | override def getTable(structType: StructType, transforms: Array[Transform], map: util.Map[String, String]): Table = 24 | new 
SimpleBatchTable() 25 | 26 | } 27 | 28 | 29 | /* 30 | Defines Read Support and Initial Schema 31 | */ 32 | 33 | class SimpleBatchTable extends Table with SupportsRead { 34 | override def name(): String = this.getClass.toString 35 | 36 | override def schema(): StructType = StructType(Array(StructField("value", StringType))) 37 | 38 | override def capabilities(): util.Set[TableCapability] = Set(TableCapability.BATCH_READ).asJava 39 | 40 | override def newScanBuilder(options: CaseInsensitiveStringMap): ScanBuilder = new SimpleScanBuilder() 41 | } 42 | 43 | 44 | /* 45 | Scan object with no mixins 46 | */ 47 | class SimpleScanBuilder extends ScanBuilder { 48 | override def build(): Scan = new SimpleScan 49 | } 50 | 51 | /* 52 | Batch Reading Support 53 | 54 | The schema is repeated here as it can change after column pruning etc 55 | */ 56 | 57 | class SimpleScan extends Scan with Batch{ 58 | override def readSchema(): StructType = StructType(Array(StructField("value", StringType))) 59 | 60 | override def toBatch: Batch = this 61 | 62 | override def planInputPartitions(): Array[InputPartition] = { 63 | Array(new SimplePartition(0,4), 64 | new SimplePartition(5,9)) 65 | } 66 | override def createReaderFactory(): PartitionReaderFactory = new SimplePartitionReaderFactory() 67 | } 68 | 69 | // simple class to organise the partition 70 | class SimplePartition(val start:Int, val end:Int) extends InputPartition 71 | 72 | // reader factory 73 | class SimplePartitionReaderFactory extends PartitionReaderFactory { 74 | override def createReader(partition: InputPartition): PartitionReader[InternalRow] = new 75 | SimplePartitionReader(partition.asInstanceOf[SimplePartition]) 76 | } 77 | 78 | 79 | // parathion reader 80 | class SimplePartitionReader(inputPartition: SimplePartition) extends PartitionReader[InternalRow] { 81 | 82 | val values = Array("1", "2", "3", "4", "5","6","7","8","9","10") 83 | 84 | var index = inputPartition.start 85 | 86 | def next = index <= inputPartition.end 87 | 88 | def get = { 89 | val stringValue = values(index) 90 | val stringUtf = UTF8String.fromString(stringValue) 91 | val row = InternalRow(stringUtf) 92 | index = index + 1 93 | row 94 | } 95 | 96 | def close() = Unit 97 | 98 | } 99 | 100 | 101 | 102 | 103 | -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/spark/sources/datasourcev2/SimpleMysqlWriterDataSource.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.spark.sources.datasourcev2.simplemysqlwriter 2 | 3 | import java.sql.DriverManager 4 | import java.util 5 | 6 | import org.apache.spark.sql.catalyst.InternalRow 7 | import org.apache.spark.sql.connector.catalog._ 8 | import org.apache.spark.sql.connector.expressions.Transform 9 | import org.apache.spark.sql.connector.write._ 10 | import org.apache.spark.sql.types.{StringType, StructType} 11 | import org.apache.spark.sql.util.CaseInsensitiveStringMap 12 | 13 | import scala.collection.JavaConverters._ 14 | 15 | /* 16 | * Default source should some kind of relation provider 17 | * 18 | */ 19 | 20 | class DefaultSource extends TableProvider{ 21 | override def inferSchema(caseInsensitiveStringMap: CaseInsensitiveStringMap): StructType = 22 | getTable(null,Array.empty[Transform],caseInsensitiveStringMap.asCaseSensitiveMap()).schema() 23 | 24 | override def getTable(structType: StructType, transforms: Array[Transform], map: util.Map[String, String]): Table ={ 25 | new MysqlTable 26 | } 27 | } 
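// Note: the write path below chains SupportsWrite -> MysqlWriterBuilder -> MysqlBatchWriter
// -> MysqlDataWriterFactory -> MysqlWriter, which inserts each incoming row into the
// `userwrite` table over JDBC. The connection url, user, password and table name are
// hard-coded in MysqlWriter, so adjust them for your environment. Used from
// DataSourceV2Example as:
// df.write.format("com.madhukaraphatak.spark.sources.datasourcev2.simplemysqlwriter").mode(SaveMode.Append).save()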
28 | 29 | 30 | class MysqlTable extends SupportsWrite{ 31 | 32 | private val tableSchema = new StructType().add("user", StringType) 33 | 34 | 35 | override def name(): String = this.getClass.toString 36 | 37 | override def schema(): StructType = tableSchema 38 | 39 | override def capabilities(): util.Set[TableCapability] = Set(TableCapability.BATCH_WRITE, 40 | TableCapability.TRUNCATE).asJava 41 | 42 | override def newWriteBuilder(logicalWriteInfo: LogicalWriteInfo): WriteBuilder = new MysqlWriterBuilder 43 | } 44 | 45 | class MysqlWriterBuilder extends WriteBuilder{ 46 | override def buildForBatch(): BatchWrite = new MysqlBatchWriter() 47 | } 48 | 49 | class MysqlBatchWriter extends BatchWrite{ 50 | override def createBatchWriterFactory(physicalWriteInfo: PhysicalWriteInfo): DataWriterFactory = new 51 | MysqlDataWriterFactory 52 | 53 | override def commit(writerCommitMessages: Array[WriterCommitMessage]): Unit = {} 54 | 55 | override def abort(writerCommitMessages: Array[WriterCommitMessage]): Unit = {} 56 | } 57 | 58 | class MysqlDataWriterFactory extends DataWriterFactory { 59 | override def createWriter(partitionId: Int, taskId:Long): DataWriter[InternalRow] = new MysqlWriter() 60 | } 61 | 62 | 63 | 64 | object WriteSucceeded extends WriterCommitMessage 65 | 66 | class MysqlWriter extends DataWriter[InternalRow] { 67 | val url = "jdbc:mysql://localhost/test" 68 | val user = "root" 69 | val password = "abc123" 70 | val table = "userwrite" 71 | 72 | val connection = DriverManager.getConnection(url,user,password) 73 | val statement = s"insert into $table (user) values (?)" 74 | val preparedStatement = connection.prepareStatement(statement) 75 | 76 | 77 | override def write(record: InternalRow): Unit = { 78 | val value = record.getString(0) 79 | preparedStatement.setString(1,value) 80 | preparedStatement.executeUpdate() 81 | } 82 | 83 | override def commit(): WriterCommitMessage = WriteSucceeded 84 | 85 | override def abort(): Unit = {} 86 | // release the JDBC resources once the task is done 87 | override def close(): Unit = { preparedStatement.close(); connection.close() } 88 | } 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/spark/sources/datasourcev2/streamandbatch/DataSourceV2StreamAndBatchExample.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.spark.sources.datasourcev2.streamandbatch 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.streaming.OutputMode 5 | 6 | object DataSourceV2StreamAndBatchExample { 7 | def main(args: Array[String]): Unit = { 8 | val sparkSession = SparkSession.builder. 9 | master("local[2]") 10 | .appName("stream and batch example") 11 | .getOrCreate() 12 | 13 | 14 | val dataSource = "com.madhukaraphatak.spark.sources.datasourcev2.streamandbatch.simple" 15 | 16 | val batchDf = sparkSession 17 | .read 18 | .format(dataSource) 19 | .load() 20 | 21 | batchDf.show() 22 | 23 | val streamingDf = sparkSession. 24 | readStream.
25 | format(dataSource) 26 | .load() 27 | 28 | val query = streamingDf.writeStream 29 | .format("console") 30 | .queryName("simple_source") 31 | .outputMode(OutputMode.Append()) 32 | 33 | query.start().awaitTermination() 34 | 35 | } 36 | 37 | } 38 | -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/spark/sources/datasourcev2/streamandbatch/SimpleStreamAndBatchDataSource.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.spark.sources.datasourcev2.streamandbatch.simple 2 | 3 | import java.util 4 | 5 | import org.apache.spark.sql.catalyst.InternalRow 6 | import org.apache.spark.sql.connector.catalog.{SupportsRead, Table, TableCapability, TableProvider} 7 | import org.apache.spark.sql.connector.expressions.Transform 8 | import org.apache.spark.sql.connector.read._ 9 | import org.apache.spark.sql.connector.read.streaming.{MicroBatchStream, Offset} 10 | import org.apache.spark.sql.types.{StringType, StructField, StructType} 11 | import org.apache.spark.sql.util.CaseInsensitiveStringMap 12 | import org.apache.spark.unsafe.types.UTF8String 13 | 14 | import scala.collection.JavaConverters._ 15 | 16 | /* 17 | * Default source should be some kind of relation provider 18 | */ 19 | class DefaultSource extends TableProvider{ 20 | 21 | override def inferSchema(caseInsensitiveStringMap: CaseInsensitiveStringMap): StructType = 22 | getTable(null,Array.empty[Transform],caseInsensitiveStringMap.asCaseSensitiveMap()).schema() 23 | 24 | override def getTable(structType: StructType, transforms: Array[Transform], map: util.Map[String, String]): Table = 25 | new SimpleStreamingTable() 26 | } 27 | 28 | 29 | /* 30 | Defines Read Support and Initial Schema 31 | */ 32 | 33 | class SimpleStreamingTable extends Table with SupportsRead { 34 | override def name(): String = this.getClass.toString 35 | 36 | override def schema(): StructType = StructType(Array(StructField("value", StringType))) 37 | 38 | override def capabilities(): util.Set[TableCapability] = Set(TableCapability.MICRO_BATCH_READ, 39 | TableCapability.BATCH_READ).asJava 40 | 41 | override def newScanBuilder(options: CaseInsensitiveStringMap): ScanBuilder = new SimpleScanBuilder() 42 | } 43 | 44 | 45 | /* 46 | Scan object with no mixins 47 | */ 48 | class SimpleScanBuilder extends ScanBuilder { 49 | override def build(): Scan = new SimpleScan 50 | } 51 | 52 | /* 53 | Batch and Micro-Batch Reading Support 54 | 55 | The schema is repeated here as it can change after column pruning etc 56 | */ 57 | 58 | class SimpleScan extends Scan{ 59 | override def readSchema(): StructType = StructType(Array(StructField("value", StringType))) 60 | 61 | override def toMicroBatchStream(checkpointLocation: String): MicroBatchStream = new SimpleMicroBatchStream() 62 | 63 | override def toBatch: Batch = new SimpleBatch 64 | } 65 | 66 | class SimpleBatch extends Batch{ 67 | override def planInputPartitions(): Array[InputPartition] = Array(new SimplePartition) 68 | 69 | override def createReaderFactory(): PartitionReaderFactory = new SimplePartitionReaderFactory 70 | } 71 | 72 | class SimpleOffset(value:Int) extends Offset { 73 | override def json(): String = s"""{"value":"$value"}""" 74 | } 75 | 76 | class SimpleMicroBatchStream extends MicroBatchStream { 77 | var latestOffsetValue = 0 78 | 79 | override def latestOffset(): Offset = { 80 | latestOffsetValue += 10 81 | new SimpleOffset(latestOffsetValue) 82 | } 83 | 84 | override def planInputPartitions(offset:
Offset, offset1: Offset): Array[InputPartition] = Array(new SimplePartition) 85 | 86 | override def createReaderFactory(): PartitionReaderFactory = new SimplePartitionReaderFactory() 87 | 88 | override def initialOffset(): Offset = new SimpleOffset(latestOffsetValue) 89 | 90 | override def deserializeOffset(s: String): Offset = new SimpleOffset(latestOffsetValue) 91 | 92 | override def commit(offset: Offset): Unit = {} 93 | 94 | override def stop(): Unit = {} 95 | } 96 | 97 | 98 | // simple class to organise the partition 99 | class SimplePartition extends InputPartition 100 | 101 | // reader factory 102 | class SimplePartitionReaderFactory extends PartitionReaderFactory { 103 | override def createReader(partition: InputPartition): PartitionReader[InternalRow] = new SimplePartitionReader 104 | } 105 | 106 | 107 | // partition reader 108 | class SimplePartitionReader extends PartitionReader[InternalRow] { 109 | 110 | val values = Array("1", "2", "3", "4", "5") 111 | 112 | var index = 0 113 | 114 | def next = index < values.length 115 | 116 | def get = { 117 | val stringValue = values(index) 118 | val stringUtf = UTF8String.fromString(stringValue) 119 | val row = InternalRow(stringUtf) 120 | index = index + 1 121 | row 122 | } 123 | 124 | def close(): Unit = {} 125 | 126 | } 127 | 128 | 129 | 130 | 131 | -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/spark/sources/datasourcev2/streaming/DataSourceV2StreamingExample.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.spark.sources.datasourcev2.streaming 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.streaming.OutputMode 5 | 6 | object DataSourceV2StreamingExample { 7 | def main(args: Array[String]): Unit = { 8 | val sparkSession = SparkSession.builder. 9 | master("local[2]") 10 | .appName("streaming example") 11 | .getOrCreate() 12 | 13 | val streamingDf = sparkSession. 14 | readStream.
15 | format("com.madhukaraphatak.spark.sources.datasourcev2.streaming.simple") 16 | .load() 17 | 18 | 19 | 20 | val query = streamingDf.writeStream 21 | .format("console") 22 | .queryName("simple_source") 23 | .outputMode(OutputMode.Append()) 24 | 25 | query.start().awaitTermination() 26 | 27 | } 28 | 29 | } 30 | -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/spark/sources/datasourcev2/streaming/SimpleStreamingDataSource.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.spark.sources.datasourcev2.streaming.simple 2 | 3 | import java.util 4 | 5 | import org.apache.spark.sql.catalyst.InternalRow 6 | import org.apache.spark.sql.connector.catalog.{SupportsRead, Table, TableCapability, TableProvider} 7 | import org.apache.spark.sql.connector.expressions.Transform 8 | import org.apache.spark.sql.connector.read._ 9 | import org.apache.spark.sql.connector.read.streaming.{MicroBatchStream, Offset} 10 | import org.apache.spark.sql.types.{StringType, StructField, StructType} 11 | import org.apache.spark.sql.util.CaseInsensitiveStringMap 12 | import org.apache.spark.unsafe.types.UTF8String 13 | 14 | import scala.collection.JavaConverters._ 15 | 16 | /* 17 | * Default source should be some kind of relation provider 18 | */ 19 | class DefaultSource extends TableProvider{ 20 | 21 | override def inferSchema(caseInsensitiveStringMap: CaseInsensitiveStringMap): StructType = 22 | getTable(null,Array.empty[Transform],caseInsensitiveStringMap.asCaseSensitiveMap()).schema() 23 | 24 | override def getTable(structType: StructType, transforms: Array[Transform], map: util.Map[String, String]): Table = 25 | new SimpleStreamingTable() 26 | } 27 | 28 | 29 | /* 30 | Defines Read Support and Initial Schema 31 | */ 32 | 33 | class SimpleStreamingTable extends Table with SupportsRead { 34 | override def name(): String = this.getClass.toString 35 | 36 | override def schema(): StructType = StructType(Array(StructField("value", StringType))) 37 | 38 | override def capabilities(): util.Set[TableCapability] = Set(TableCapability.MICRO_BATCH_READ).asJava 39 | 40 | override def newScanBuilder(options: CaseInsensitiveStringMap): ScanBuilder = new SimpleScanBuilder() 41 | } 42 | 43 | 44 | /* 45 | Scan object with no mixins 46 | */ 47 | class SimpleScanBuilder extends ScanBuilder { 48 | override def build(): Scan = new SimpleScan 49 | } 50 | 51 | /* 52 | Micro-Batch Reading Support 53 | 54 | The schema is repeated here as it can change after column pruning etc 55 | */ 56 | 57 | class SimpleScan extends Scan{ 58 | override def readSchema(): StructType = StructType(Array(StructField("value", StringType))) 59 | 60 | override def toMicroBatchStream(checkpointLocation: String): MicroBatchStream = new SimpleMicroBatchStream() 61 | } 62 | 63 | class SimpleOffset(value:Int) extends Offset { 64 | override def json(): String = s"""{"value":"$value"}""" 65 | } 66 | 67 | class SimpleMicroBatchStream extends MicroBatchStream { 68 | var latestOffsetValue = 0 69 | 70 | override def latestOffset(): Offset = { 71 | latestOffsetValue += 10 72 | new SimpleOffset(latestOffsetValue) 73 | } 74 | 75 | override def planInputPartitions(offset: Offset, offset1: Offset): Array[InputPartition] = Array(new SimplePartition) 76 | 77 | override def createReaderFactory(): PartitionReaderFactory = new SimplePartitionReaderFactory() 78 | 79 | override def initialOffset(): Offset = new SimpleOffset(latestOffsetValue) 80 | 81 | override
def deserializeOffset(s: String): Offset = new SimpleOffset(latestOffsetValue) 82 | 83 | override def commit(offset: Offset): Unit = {} 84 | 85 | override def stop(): Unit = {} 86 | } 87 | 88 | 89 | // simple class to organise the partition 90 | class SimplePartition extends InputPartition 91 | 92 | // reader factory 93 | class SimplePartitionReaderFactory extends PartitionReaderFactory { 94 | override def createReader(partition: InputPartition): PartitionReader[InternalRow] = new SimplePartitionReader 95 | } 96 | 97 | 98 | // partition reader 99 | class SimplePartitionReader extends PartitionReader[InternalRow] { 100 | 101 | val values = Array("1", "2", "3", "4", "5") 102 | 103 | var index = 0 104 | 105 | def next = index < values.length 106 | 107 | def get = { 108 | val stringValue = values(index) 109 | val stringUtf = UTF8String.fromString(stringValue) 110 | val row = InternalRow(stringUtf) 111 | index = index + 1 112 | row 113 | } 114 | 115 | def close(): Unit = {} 116 | 117 | } 118 | 119 | 120 | 121 | 122 | -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/spark/sql/DataFrameTail.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.spark.sql 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | object DataFrameTail { 6 | 7 | def main(args: Array[String]): Unit = { 8 | 9 | val sparkSession = SparkSession.builder(). 10 | appName("example").master("local").getOrCreate() 11 | 12 | val df = sparkSession.range(100) 13 | 14 | //head 15 | println(df.head(2).toList) 16 | //tail 17 | println(df.tail(5).toList) 18 | 19 | 20 | } 21 | 22 | } 23 | -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/spark/sql/InMemoryTableScanExample.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.spark.sql 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | object InMemoryTableScanExample { 6 | 7 | def main(args: Array[String]): Unit = { 8 | 9 | val sparkSession = SparkSession.builder. 10 | master("local[2]") 11 | .appName("in memory table in UI example") 12 | .getOrCreate() 13 | 14 | 15 | val firstDF = sparkSession.createDataFrame(Seq( 16 | ("1", 10), 17 | ("2", 20) 18 | )).toDF("id", "sales") 19 | 20 | firstDF.createOrReplaceTempView("firstDf") 21 | sparkSession.catalog.cacheTable("firstDf") 22 | 23 | val secondDF = sparkSession.createDataFrame(Seq( 24 | ("1", 40), 25 | ("2", 50) 26 | )).toDF("id", "volume") 27 | 28 | secondDF.createOrReplaceTempView("secondDf") 29 | sparkSession.catalog.cacheTable("secondDf") 30 | 31 | val thirdDF = sparkSession.createDataFrame(Seq( 32 | ("1", 70), 33 | ("2", 80) 34 | )).toDF("id", "value") 35 | 36 | thirdDF.createOrReplaceTempView("thirdDf") 37 | sparkSession.catalog.cacheTable("thirdDf") 38 | 39 | val joinDF = firstDF.join(secondDF, "id").join(thirdDF,"id") 40 | 41 | joinDF.count() 42 | 43 | } 44 | 45 | } 46 | -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/spark/sql/JoinHintsExample.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.spark.sql 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | object JoinHintsExample { 6 | def main(args: Array[String]): Unit = { 7 | 8 | val sparkSession = SparkSession.builder.
9 | master("local") 10 | .appName("join hints example") 11 | .getOrCreate() 12 | 13 | val salesDf = sparkSession.read. 14 | format("csv") 15 | .option("header", "true") 16 | .option("inferSchema", "true") 17 | .load("src/main/resources/sales.csv") 18 | 19 | 20 | val customerDf = sparkSession.read. 21 | format("csv") 22 | .option("header", "true") 23 | .option("inferSchema", "true") 24 | .load("src/main/resources/customers.csv") 25 | 26 | 27 | //broadcast hint 28 | 29 | val broadcastJoin = salesDf.hint("broadcast").join(customerDf,"customerId") 30 | broadcastJoin.show() 31 | 32 | // merge join 33 | 34 | val mergeJoin = salesDf.hint("merge").join(customerDf, "customerId") 35 | mergeJoin.show() 36 | 37 | // shuffle_hash 38 | 39 | val shuffleHashJoin = salesDf.hint("shuffle_hash").join(customerDf,"customerId") 40 | shuffleHashJoin.show() 41 | 42 | //shuffle_replicate_nl 43 | val cartesianProduct = salesDf.hint("shuffle_replicate_nl").join(customerDf) 44 | cartesianProduct.show() 45 | 46 | 47 | //Thread.sleep(1000000) 48 | 49 | 50 | } 51 | 52 | } 53 | -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/spark/sql/MinAndMaxByExample.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.spark.sql 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | 6 | object MinAndMaxByExample { 7 | 8 | def main(args: Array[String]): Unit = { 9 | 10 | val sparkSession = SparkSession.builder. 11 | master("local") 12 | .appName("min and max by example") 13 | .getOrCreate() 14 | 15 | 16 | val df = sparkSession.createDataFrame(Seq( 17 | ("1", 10), 18 | ("2", 20), 19 | ("3", 30), 20 | ("4", 40) 21 | )).toDF("id","value") 22 | df.createOrReplaceTempView("table") 23 | 24 | 25 | // find the id with minimum value using a window function (pre 3.0 approach) 26 | 27 | import org.apache.spark.sql.expressions.Window 28 | import org.apache.spark.sql.functions.dense_rank 29 | 30 | val orderedDf = Window.orderBy(df.col("value")) 31 | val rankedDf = df.withColumn("rank", dense_rank.over(orderedDf)) 32 | val minDf = rankedDf.filter("rank == 1") 33 | minDf.show() 34 | 35 | 36 | 37 | // find the ids with maximum and minimum value using max_by and min_by 38 | 39 | val resultDf = sparkSession.sql("select max_by(id,value) max_id, min_by(id,value) min_id from table") 40 | 41 | resultDf.show() 42 | 43 | 44 | 45 | 46 | } 47 | 48 | } 49 | -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/spark/sql/MultiColumnSampleBy.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.spark.sql 2 | 3 | import org.apache.spark.sql.{Row, SparkSession} 4 | 5 | object MultiColumnSampleBy { 6 | 7 | def main(args: Array[String]): Unit = { 8 | 9 | val sparkSession = SparkSession.builder.
10 | master("local") 11 | .appName("multi column sample") 12 | .getOrCreate() 13 | 14 | val df = sparkSession.createDataFrame(Seq( 15 | (1, "p1", "s1", 20), 16 | (1, "p2", "s1", 30), 17 | (1, "p1", "s2", 40), 18 | (1, "p2", "s2", 50), 19 | (2, "p1", "s1", 20), 20 | (2, "p2", "s1", 30), 21 | (2, "p1", "s2", 40), 22 | (2, "p2", "s2", 50))) 23 | .toDF("day", "product", "store", "sales") 24 | 25 | //single column sampleBy on product 26 | 27 | val singleFractions = Map("p1" -> 0.5, "p2" -> 0.5) 28 | val singleSampleDf = df.stat.sampleBy("product", singleFractions, -1) 29 | singleSampleDf.sort("product").show() 30 | 31 | // multi column sampleBy on product and store 32 | import org.apache.spark.sql.functions.struct 33 | val multipleFractions = Map(Row("p1", "s1") -> 0.5, 34 | Row("p2", "s1") -> 0.5, 35 | Row("p1", "s2") -> 0.5, 36 | Row("p2", "s2") -> 0.5) 37 | val multiSampleDf = df.stat.sampleBy(struct("product", "store"), multipleFractions, -1) 38 | multiSampleDf.show() 39 | 40 | 41 | } 42 | 43 | } 44 | -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/spark/sql/adaptive/shuffle/AdaptiveShuffle.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.spark.sql.adaptive.shuffle 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.sql.SparkSession 5 | 6 | object AdaptiveShuffle { 7 | 8 | def main(args: Array[String]): Unit = { 9 | 10 | val conf = new SparkConf() 11 | .setAppName("adaptive shuffle") 12 | .setMaster("local[2]") 13 | .set("spark.sql.adaptive.enabled", "true") 14 | .set("spark.sql.adaptive.coalescePartitions.enabled", "true") 15 | 16 | val sparkSession = SparkSession.builder().config(conf).getOrCreate() 17 | 18 | val df = sparkSession.read. 19 | format("csv") 20 | .option("header", "true") 21 | .option("inferSchema", "true") 22 | .load("src/main/resources/sales.csv").repartition(500) 23 | 24 | df.groupBy("customerId").count().count() 25 | 26 | //Thread.sleep(1000000) 27 | 28 | 29 | } 30 | 31 | } 32 | -------------------------------------------------------------------------------- /src/main/scala/com/madhukaraphatak/spark/sql/adaptive/shuffle/NoAdaptiveShuffle.scala: -------------------------------------------------------------------------------- 1 | package com.madhukaraphatak.spark.sql.adaptive.shuffle 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.sql.SparkSession 5 | 6 | object NoAdaptiveShuffle { 7 | 8 | def main(args: Array[String]): Unit = { 9 | 10 | val conf = new SparkConf() 11 | .setAppName("no adaptive shuffle") 12 | .setMaster("local[2]") 13 | 14 | val sparkSession = SparkSession.builder().config(conf).getOrCreate() 15 | 16 | val df = sparkSession.read. 17 | format("csv") 18 | .option("header", "true") 19 | .option("inferSchema", "true") 20 | .load("src/main/resources/sales.csv").repartition(500) 21 | 22 | 23 | df.groupBy("customerId").count().count() 24 | 25 | 26 | //Thread.sleep(1000000) 27 | 28 | 29 | } 30 | 31 | } 32 | --------------------------------------------------------------------------------