└── SparkStreamingPOC ├── build.sbt └── src └── main ├── resources ├── department.csv └── employee.csv └── scala ├── entity ├── entities.scala └── package.scala ├── listener ├── KafkaMetrics.scala └── SparkListenerKafkaWriter.scala ├── sink └── JDBCSink.scala └── streaming ├── MultiStreamTODO.scala ├── Rate2Console.scala ├── SparkListenerKafkaNotifier.scala ├── jdbc └── WriteToPostgress.scala ├── join ├── staticstream │ ├── staticStreamFullOuterJoin.scala │ ├── staticStreamInnerJoin.scala │ ├── staticStreamLeftOuterJoin.scala │ └── staticStreamRightOuterJoin.scala ├── streamstatic │ ├── streamStaticFullOuterJoin.scala │ ├── streamStaticInnerJoin.scala │ ├── streamStaticLeftOuterJoin.scala │ └── streamStaticRightOuterJoin.scala └── streamstream │ ├── streamStreamFullOuterJoin.scala │ ├── streamStreamInnerJoin.scala │ ├── streamStreamLeftOuterJoin.scala │ └── streamStreamRightOuterJoin.scala ├── kafka ├── Kafka2Kafka.scala ├── Rate2Kafka.scala ├── Rate2KafkaMultiStream.scala └── Rate2KafkaSparkListener.scala ├── mode ├── Rate2ConsoleAggregateAppendMode.scala ├── Rate2ConsoleAggregateCompleteMode.scala ├── Rate2ConsoleAggregateUpdateMode.scala ├── Rate2ConsoleCompleteMode.scala ├── Rate2ConsoleDefaultMode.scala └── Rate2ConsoleUpdateMode.scala ├── state ├── Rate2SparkState.scala └── manageStateHelper.scala ├── trigger ├── Rate2ConsoleContinuousTrigger.scala ├── Rate2ConsoleDefaultTrigger.scala ├── Rate2ConsoleOnceTrigger.scala ├── Rate2ConsoleProgressTrigger.scala └── Rate2ConsoleTriggerOptions.scala ├── unsupported ├── Rate2ConsoleMultiStream.scala └── UnsupportedFeatures.scala └── watermark └── Rate2ConsoleWatermark.scala /SparkStreamingPOC/build.sbt: -------------------------------------------------------------------------------- 1 | name := "SparkStreamingSample" 2 | 3 | version := "0.1" 4 | 5 | scalaVersion := "2.11.11" 6 | 7 | // grading libraries 8 | libraryDependencies ++= Seq( 9 | "log4j" % "log4j" % "1.2.14", 10 | "org.apache.kafka" % "kafka-clients" % "0.8.2.0", 11 | "org.apache.spark" %% "spark-core" % "2.4.0", 12 | "org.apache.spark" %% "spark-sql" % "2.4.0", 13 | "org.apache.spark" %% "spark-sql-kafka-0-10" % "2.4.0" 14 | 15 | ) -------------------------------------------------------------------------------- /SparkStreamingPOC/src/main/resources/department.csv: -------------------------------------------------------------------------------- 1 | id,name 2 | 1,dept1 3 | 2,dept2 4 | 3,dept3 5 | 4,dept4 6 | 5,dept5 7 | 6,dept6 8 | 7,dept7 9 | 8,dept8 10 | 9,dept9 11 | 10,dept10 12 | 11,dept11 13 | -------------------------------------------------------------------------------- /SparkStreamingPOC/src/main/resources/employee.csv: -------------------------------------------------------------------------------- 1 | id,firstName,lastName,departmentId 2 | 1,firstName1,lastName1,1 3 | 2,firstName2,lastName2,1 4 | 3,firstName3,lastName3,1 5 | 4,firstName4,lastName4,1 6 | 5,firstName5,lastName5,1 7 | 6,firstName6,lastName6,2 8 | 7,firstName7,lastName7,2 9 | 8,firstName8,lastName8,2 10 | 9,firstName9,lastName9,2 11 | 10,firstName10,lastName10,2 12 | 11,firstName11,lastName11,2 13 | -------------------------------------------------------------------------------- /SparkStreamingPOC/src/main/scala/entity/entities.scala: -------------------------------------------------------------------------------- 1 | package entity 2 | 3 | import java.sql.Timestamp 4 | 5 | case class RateData(timestamp: Timestamp, value: Long) 6 | case class Employee(id: Long, firstName: String, lastName: String, eventTime: 
Timestamp) 7 | case class Department(id: Long, name: String, eventTime: Timestamp) 8 | -------------------------------------------------------------------------------- /SparkStreamingPOC/src/main/scala/entity/package.scala: -------------------------------------------------------------------------------- 1 | package object constant { 2 | val sampleJSON:String = "{\"web-app\": {\n \"servlet\": [ \n {\n \"servlet-name\": \"cofaxCDS\",\n \"servlet-class\": \"org.cofax.cds.CDSServlet\",\n \"init-param\": {\n \"configGlossary:installationAt\": \"Philadelphia, PA\",\n \"configGlossary:adminEmail\": \"ksm@pobox.com\",\n \"configGlossary:poweredBy\": \"Cofax\",\n \"configGlossary:poweredByIcon\": \"/images/cofax.gif\",\n \"configGlossary:staticPath\": \"/content/static\",\n \"templateProcessorClass\": \"org.cofax.WysiwygTemplate\",\n \"templateLoaderClass\": \"org.cofax.FilesTemplateLoader\",\n \"templatePath\": \"templates\",\n \"templateOverridePath\": \"\",\n \"defaultListTemplate\": \"listTemplate.htm\",\n \"defaultFileTemplate\": \"articleTemplate.htm\",\n \"useJSP\": false,\n \"jspListTemplate\": \"listTemplate.jsp\",\n \"jspFileTemplate\": \"articleTemplate.jsp\",\n \"cachePackageTagsTrack\": 200,\n \"cachePackageTagsStore\": 200,\n \"cachePackageTagsRefresh\": 60,\n \"cacheTemplatesTrack\": 100,\n \"cacheTemplatesStore\": 50,\n \"cacheTemplatesRefresh\": 15,\n \"cachePagesTrack\": 200,\n \"cachePagesStore\": 100,\n \"cachePagesRefresh\": 10,\n \"cachePagesDirtyRead\": 10,\n \"searchEngineListTemplate\": \"forSearchEnginesList.htm\",\n \"searchEngineFileTemplate\": \"forSearchEngines.htm\",\n \"searchEngineRobotsDb\": \"WEB-INF/robots.db\",\n \"useDataStore\": true,\n \"dataStoreClass\": \"org.cofax.SqlDataStore\",\n \"redirectionClass\": \"org.cofax.SqlRedirection\",\n \"dataStoreName\": \"cofax\",\n \"dataStoreDriver\": \"com.microsoft.jdbc.sqlserver.SQLServerDriver\",\n \"dataStoreUrl\": \"jdbc:microsoft:sqlserver://LOCALHOST:1433;DatabaseName=goon\",\n \"dataStoreUser\": \"sa\",\n \"dataStorePassword\": \"dataStoreTestQuery\",\n \"dataStoreTestQuery\": \"SET NOCOUNT ON;select test='test';\",\n \"dataStoreLogFile\": \"/usr/local/tomcat/logs/datastore.log\",\n \"dataStoreInitConns\": 10,\n \"dataStoreMaxConns\": 100,\n \"dataStoreConnUsageLimit\": 100,\n \"dataStoreLogLevel\": \"debug\",\n \"maxUrlLength\": 500}},\n {\n \"servlet-name\": \"cofaxEmail\",\n \"servlet-class\": \"org.cofax.cds.EmailServlet\",\n \"init-param\": {\n \"mailHost\": \"mail1\",\n \"mailHostOverride\": \"mail2\"}},\n {\n \"servlet-name\": \"cofaxAdmin\",\n \"servlet-class\": \"org.cofax.cds.AdminServlet\"},\n \n {\n \"servlet-name\": \"fileServlet\",\n \"servlet-class\": \"org.cofax.cds.FileServlet\"},\n {\n \"servlet-name\": \"cofaxTools\",\n \"servlet-class\": \"org.cofax.cms.CofaxToolsServlet\",\n \"init-param\": {\n \"templatePath\": \"toolstemplates/\",\n \"log\": 1,\n \"logLocation\": \"/usr/local/tomcat/logs/CofaxTools.log\",\n \"logMaxSize\": \"\",\n \"dataLog\": 1,\n \"dataLogLocation\": \"/usr/local/tomcat/logs/dataLog.log\",\n \"dataLogMaxSize\": \"\",\n \"removePageCache\": \"/content/admin/remove?cache=pages&id=\",\n \"removeTemplateCache\": \"/content/admin/remove?cache=templates&id=\",\n \"fileTransferFolder\": \"/usr/local/tomcat/webapps/content/fileTransferFolder\",\n \"lookInContext\": 1,\n \"adminGroupID\": 4,\n \"betaServer\": true}}],\n \"servlet-mapping\": {\n \"cofaxCDS\": \"/\",\n \"cofaxEmail\": \"/cofaxutil/aemail/*\",\n \"cofaxAdmin\": \"/admin/*\",\n \"fileServlet\": \"/static/*\",\n 
\"cofaxTools\": \"/tools/*\"},\n \n \"taglib\": {\n \"taglib-uri\": \"cofax.tld\",\n \"taglib-location\": \"/WEB-INF/tlds/cofax.tld\"}}}" 3 | 4 | } 5 | -------------------------------------------------------------------------------- /SparkStreamingPOC/src/main/scala/listener/KafkaMetrics.scala: -------------------------------------------------------------------------------- 1 | package listener 2 | 3 | import java.util.Properties 4 | 5 | import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord} 6 | import org.apache.kafka.common.serialization.StringSerializer 7 | import org.apache.spark.sql.streaming.StreamingQueryListener 8 | 9 | class KafkaMetrics(servers: String, metricsTopic: String, errorTopic: String) extends StreamingQueryListener { 10 | 11 | val kafkaProperties = new Properties() 12 | kafkaProperties.put("bootstrap.servers", servers) 13 | kafkaProperties.put("key.serializer", classOf[StringSerializer]) 14 | kafkaProperties.put("value.serializer", classOf[StringSerializer]) 15 | 16 | val producer = new KafkaProducer[String, String](kafkaProperties) 17 | 18 | def onQueryProgress(event: org.apache.spark.sql.streaming.StreamingQueryListener.QueryProgressEvent): Unit = { 19 | producer.send(new ProducerRecord(metricsTopic, event.progress.json)) 20 | } 21 | def onQueryStarted(event: org.apache.spark.sql.streaming.StreamingQueryListener.QueryStartedEvent): Unit = {} 22 | def onQueryTerminated(event: org.apache.spark.sql.streaming.StreamingQueryListener.QueryTerminatedEvent): Unit = { 23 | producer.send(new ProducerRecord(errorTopic, event.exception.get)) 24 | } 25 | } -------------------------------------------------------------------------------- /SparkStreamingPOC/src/main/scala/listener/SparkListenerKafkaWriter.scala: -------------------------------------------------------------------------------- 1 | package listener 2 | 3 | import java.util.Properties 4 | 5 | import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord} 6 | import org.apache.kafka.common.serialization.StringSerializer 7 | import org.apache.spark.scheduler._ 8 | 9 | class SparkListenerKafkaWriter(servers: String, metricsTopic: String, errorTopic: String) extends SparkListener { 10 | 11 | val kafkaProperties = new Properties() 12 | kafkaProperties.put("bootstrap.servers", servers) 13 | kafkaProperties.put("key.serializer", classOf[StringSerializer]) 14 | kafkaProperties.put("value.serializer", classOf[StringSerializer]) 15 | 16 | val producer = new KafkaProducer[String, String](kafkaProperties) 17 | 18 | override def onStageCompleted(stageCompleted: SparkListenerStageCompleted): Unit = { 19 | super.onStageCompleted(stageCompleted) 20 | // stageCompleted.stageInfo.taskMetrics. 
21 | // println("") 22 | } 23 | 24 | override def onStageSubmitted(stageSubmitted: SparkListenerStageSubmitted): Unit = super.onStageSubmitted(stageSubmitted) 25 | 26 | override def onTaskStart(taskStart: SparkListenerTaskStart): Unit = super.onTaskStart(taskStart) 27 | 28 | override def onTaskGettingResult(taskGettingResult: SparkListenerTaskGettingResult): Unit = super.onTaskGettingResult(taskGettingResult) 29 | 30 | override def onTaskEnd(taskEnd: SparkListenerTaskEnd): Unit = super.onTaskEnd(taskEnd) 31 | 32 | override def onJobStart(jobStart: SparkListenerJobStart): Unit = super.onJobStart(jobStart) 33 | 34 | override def onJobEnd(jobEnd: SparkListenerJobEnd): Unit = super.onJobEnd(jobEnd) 35 | 36 | override def onEnvironmentUpdate(environmentUpdate: SparkListenerEnvironmentUpdate): Unit = super.onEnvironmentUpdate(environmentUpdate) 37 | 38 | override def onBlockManagerAdded(blockManagerAdded: SparkListenerBlockManagerAdded): Unit = super.onBlockManagerAdded(blockManagerAdded) 39 | 40 | override def onBlockManagerRemoved(blockManagerRemoved: SparkListenerBlockManagerRemoved): Unit = super.onBlockManagerRemoved(blockManagerRemoved) 41 | 42 | override def onUnpersistRDD(unpersistRDD: SparkListenerUnpersistRDD): Unit = super.onUnpersistRDD(unpersistRDD) 43 | 44 | override def onApplicationStart(applicationStart: SparkListenerApplicationStart): Unit = super.onApplicationStart(applicationStart) 45 | 46 | override def onApplicationEnd(applicationEnd: SparkListenerApplicationEnd): Unit = super.onApplicationEnd(applicationEnd) 47 | 48 | override def onExecutorMetricsUpdate(executorMetricsUpdate: SparkListenerExecutorMetricsUpdate): Unit = super.onExecutorMetricsUpdate(executorMetricsUpdate) 49 | 50 | override def onExecutorAdded(executorAdded: SparkListenerExecutorAdded): Unit = super.onExecutorAdded(executorAdded) 51 | 52 | override def onExecutorRemoved(executorRemoved: SparkListenerExecutorRemoved): Unit = super.onExecutorRemoved(executorRemoved) 53 | 54 | override def onExecutorBlacklisted(executorBlacklisted: SparkListenerExecutorBlacklisted): Unit = super.onExecutorBlacklisted(executorBlacklisted) 55 | 56 | override def onExecutorUnblacklisted(executorUnblacklisted: SparkListenerExecutorUnblacklisted): Unit = super.onExecutorUnblacklisted(executorUnblacklisted) 57 | 58 | override def onNodeBlacklisted(nodeBlacklisted: SparkListenerNodeBlacklisted): Unit = super.onNodeBlacklisted(nodeBlacklisted) 59 | 60 | override def onNodeUnblacklisted(nodeUnblacklisted: SparkListenerNodeUnblacklisted): Unit = super.onNodeUnblacklisted(nodeUnblacklisted) 61 | 62 | override def onBlockUpdated(blockUpdated: SparkListenerBlockUpdated): Unit = super.onBlockUpdated(blockUpdated) 63 | 64 | override def onOtherEvent(event: SparkListenerEvent): Unit = super.onOtherEvent(event) 65 | } -------------------------------------------------------------------------------- /SparkStreamingPOC/src/main/scala/sink/JDBCSink.scala: -------------------------------------------------------------------------------- 1 | package sink 2 | 3 | import java.sql._ 4 | 5 | import org.apache.spark.sql.ForeachWriter 6 | 7 | class JDBCSink(driver: String, url: String, user: String, pwd: String) extends ForeachWriter[org.apache.spark.sql.Row] { 8 | var connection:Connection = _ 9 | var preparedStmt: PreparedStatement=_ 10 | 11 | override def open(partitionId: Long,version: Long): Boolean = { 12 | Class.forName(driver) 13 | connection = DriverManager.getConnection(url, user, pwd) 14 | true 15 | } 16 | 17 | override def process(value: 
(org.apache.spark.sql.Row)): Unit = { 18 | 19 | println("value Size" + value.size) 20 | println("value :: " + value) 21 | println("Value at 0 index :: " + value(0)) 22 | println(value(0).toString.split(",").length) 23 | var name=value(0).toString.split(",")(0) 24 | var dep=value(0).toString.split(",")(1) 25 | var mail=value(0).toString.split(",")(2) 26 | 27 | var sql :String =s"""INSERT INTO public.Employee(NAME,DEPARTMENT,MAIL) 28 | VALUES (?,?,?)"""; 29 | 30 | preparedStmt=connection.prepareStatement(sql) 31 | preparedStmt.setString(1,name) 32 | preparedStmt.setString(2,dep) 33 | preparedStmt.setString(3,mail) 34 | preparedStmt.execute() 35 | /*/*statement = connection.createStatement 36 | statement.execute(s"""INSERT INTO public.Employee(NAME,DEPARTMENT,MAIL)*/ 37 | VALUES ('wq','b','c')""")*/ 38 | 39 | 40 | } 41 | 42 | override def close(errorOrNull: Throwable): Unit = { 43 | connection.close 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /SparkStreamingPOC/src/main/scala/streaming/MultiStreamTODO.scala: -------------------------------------------------------------------------------- 1 | package streaming 2 | 3 | import entity.RateData 4 | import listener.KafkaMetrics 5 | import org.apache.log4j.{Level, LogManager} 6 | import org.apache.spark.sql.SparkSession 7 | import org.apache.spark.sql.streaming.Trigger 8 | 9 | 10 | object MultiStreamTODO extends App { 11 | 12 | val spark: SparkSession = SparkSession.builder() 13 | .appName("StreamingListenerKafkaNotifier") 14 | .master("local[*]") 15 | .config("spark.sql.streaming.metricsEnabled", true) 16 | .getOrCreate() 17 | 18 | val logger = LogManager.getRootLogger 19 | logger.setLevel(Level.ERROR) 20 | 21 | 22 | val df = spark.readStream 23 | .format("rate") 24 | .option("rowsPerSecond", 1) 25 | .option("numPartitions", 1) 26 | .option("rampUpTime", 2) 27 | .load() 28 | 29 | import spark.implicits._ 30 | 31 | val rateData = df.as[RateData] 32 | val filteredDS = rateData.where("value < 20") 33 | val greaterThanDS = rateData.where("value > 21") 34 | 35 | val errorDS = greaterThanDS.where("value > 30") 36 | .map(triggerException(_)) 37 | 38 | val stringData = filteredDS.selectExpr("CAST(timestamp AS String)", "CAST(value AS String)") 39 | 40 | val kafkaWriteStream1 = stringData.writeStream 41 | .format("kafka") 42 | .queryName("First Kafka Stream") 43 | .option("topic", "test2") 44 | .option("checkpointLocation", "sparkCheckPoint\\StreamingListenerKafkaNotifier\\cp1") 45 | .option("kafka.bootstrap.servers", "localhost:9092") 46 | .trigger(Trigger.ProcessingTime("10 seconds")) 47 | .start() 48 | 49 | val consoleDS = errorDS.selectExpr("CAST(timestamp AS String)", "CAST(value AS String)") 50 | 51 | consoleDS.writeStream.format("console") 52 | .queryName("Console stream") 53 | .trigger(Trigger.ProcessingTime("10 seconds")) 54 | .start() 55 | 56 | spark.streams.addListener(new KafkaMetrics("localhost:9092", "streamingMetrics", "streamingTermination")) 57 | spark.streams.awaitAnyTermination() 58 | 59 | def triggerException(rateData: RateData): RateData = { 60 | throw new Exception() 61 | rateData 62 | } 63 | } -------------------------------------------------------------------------------- /SparkStreamingPOC/src/main/scala/streaming/Rate2Console.scala: -------------------------------------------------------------------------------- 1 | package streaming 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.streaming.Trigger 5 | 6 | object Rate2Console extends App { 7 | 
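// Rate source to console sink: generates 2 rows per second over 2 partitions and prints
// each 10-second micro-batch to the console, checkpointing under sparkCheckPoint\Rate2Console.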
8 | val spark: SparkSession = SparkSession.builder() 9 | .appName("Rate2Console") 10 | .master("local[*]") 11 | .getOrCreate() 12 | 13 | val df = spark.readStream 14 | .format("rate") 15 | .option("rowsPerSecond", 2) 16 | .option("numPartitions", 2) 17 | .option("rampUpTime", 1) 18 | .load() 19 | 20 | val rateRawData = df.selectExpr("CAST(timestamp AS STRING)", "CAST(value AS string)") 21 | 22 | val processingTimeStream = rateRawData.writeStream 23 | .format("console") 24 | .queryName("Micro Batch") 25 | .trigger(Trigger.ProcessingTime("10 seconds")) 26 | .option("checkpointLocation", "sparkCheckPoint\\Rate2Console\\cp1") 27 | .start() 28 | 29 | spark.streams.awaitAnyTermination() 30 | } 31 | -------------------------------------------------------------------------------- /SparkStreamingPOC/src/main/scala/streaming/SparkListenerKafkaNotifier.scala: -------------------------------------------------------------------------------- 1 | package streaming 2 | 3 | import java.sql.Timestamp 4 | 5 | import listener.KafkaMetrics 6 | import org.apache.log4j.{Level, LogManager} 7 | import org.apache.spark.sql.SparkSession 8 | import org.apache.spark.sql.streaming.Trigger 9 | 10 | object SparkListenerKafkaNotifier extends App { 11 | 12 | val spark: SparkSession = SparkSession.builder() 13 | .appName("StreamingListenerKafkaNotifier") 14 | .master("local[*]") 15 | .config("spark.sql.streaming.metricsEnabled", true) 16 | .getOrCreate() 17 | 18 | case class RateData(timestamp: Timestamp, value: Long) 19 | 20 | val rawDF = spark.readStream 21 | .format("rate") 22 | .option("rowsPerSecond", 1) 23 | .option("numPartitions", 1) 24 | .option("rampUpTime", 2) 25 | .load() 26 | 27 | val rateDF = rawDF.selectExpr("CAST(timestamp AS String)", "CAST(value AS String)") 28 | 29 | val kafkaWriteStream = rateDF.writeStream 30 | .format("kafka") 31 | .queryName("First Kafka Stream") 32 | .option("topic", "test2") 33 | .option("checkpointLocation", "sparkCheckPoint\\StreamingListenerKafkaNotifier\\cp1") 34 | .option("kafka.bootstrap.servers", "localhost:9092") 35 | .trigger(Trigger.ProcessingTime("10 seconds")) 36 | .start() 37 | 38 | spark.streams.addListener(new KafkaMetrics("localhost:9092", "streamingMetrics", "streamingTermination")) 39 | spark.streams.awaitAnyTermination() 40 | } -------------------------------------------------------------------------------- /SparkStreamingPOC/src/main/scala/streaming/jdbc/WriteToPostgress.scala: -------------------------------------------------------------------------------- 1 | package streaming.jdbc 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import sink.JDBCSink 5 | 6 | object WriteToPostgreSQL extends App{ 7 | 8 | 9 | 10 | case class InputData(name:String, department:String, mail:String) 11 | 12 | val spark : SparkSession = SparkSession.builder() 13 | .appName("WriteToPostgresSQL") 14 | .master("local[*]") 15 | .getOrCreate() 16 | 17 | val df = spark.readStream.format("kafka") 18 | .option("kafka.bootstrap.servers", "localhost:9092") 19 | .option("subscribe", "ci.etl.currencyCode") 20 | .load() 21 | 22 | def splitfunc(input:String):String ={ 23 | input 24 | } 25 | 26 | val records=df.selectExpr("CAST(value AS string)") 27 | 28 | //println("Sassas" + records) 29 | 30 | val driver = "" 31 | val url: String="jdbc:postgresql://localhost:5432/sathish" 32 | val userName:String="postgres" 33 | val passWord:String="root" 34 | val writer = new JDBCSink(driver, url, userName, passWord) 35 | 36 | records.writeStream.foreach(writer).start().awaitTermination() 37 | 38 | } 39 | 
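// Note: `driver` above is an empty string, so Class.forName(driver) inside JDBCSink.open()
// throws ClassNotFoundException at runtime. For the jdbc:postgresql URL used here, the usual
// class name would be "org.postgresql.Driver" (assumption: the PostgreSQL JDBC driver would
// also need to be added as a dependency, which build.sbt does not currently declare).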
-------------------------------------------------------------------------------- /SparkStreamingPOC/src/main/scala/streaming/join/staticstream/staticStreamFullOuterJoin.scala: -------------------------------------------------------------------------------- 1 | package streaming.join.staticstream 2 | 3 | import entity.RateData 4 | import org.apache.spark.sql.SparkSession 5 | import org.apache.spark.sql.functions._ 6 | import org.apache.spark.sql.streaming.Trigger 7 | 8 | object staticStreamFullOuterJoin extends App { 9 | 10 | val spark: SparkSession = SparkSession.builder() 11 | .appName("StaticStreamFullOuterJoin") 12 | .master("local[*]") 13 | .getOrCreate() 14 | 15 | val df = spark.readStream 16 | .format("rate") 17 | .option("rowsPerSecond", 1) 18 | .option("numPartitions", 1) 19 | .option("rampUpTime", 1) 20 | .load() 21 | 22 | import spark.implicits._ 23 | 24 | val rateData = df.as[RateData] 25 | val streamingEmployeeDS = rateData.where("value % 10 != 0") 26 | .withColumn("firstName", concat(lit("firstName"), rateData.col("value"))) 27 | .withColumn("lastName", concat(lit("lastName"), rateData.col("value"))) 28 | .withColumn("departmentId", lit(floor(rateData.col("value") / 10))) 29 | // .withColumnRenamed("value", "id") 30 | 31 | val staticDepartmentDS = spark.read.format("csv").option("header", "true").load("src/main/resources/department.csv") 32 | 33 | val fullOuterJoinDS = staticDepartmentDS.join(streamingEmployeeDS, $"id" === $"departmentId", "full_outer") 34 | 35 | val fullOuterJoinStream = fullOuterJoinDS.writeStream 36 | .format("console") 37 | .queryName("InnerJoin") 38 | .trigger(Trigger.ProcessingTime("10 seconds")) 39 | .option("checkpointLocation", "sparkCheckPoint\\StaticStreamFullOuterJoin\\cp1") 40 | .start() 41 | } 42 | -------------------------------------------------------------------------------- /SparkStreamingPOC/src/main/scala/streaming/join/staticstream/staticStreamInnerJoin.scala: -------------------------------------------------------------------------------- 1 | package streaming.join.staticstream 2 | 3 | import entity.RateData 4 | import org.apache.log4j.{Level, LogManager} 5 | import org.apache.spark.sql.SparkSession 6 | import org.apache.spark.sql.functions._ 7 | import org.apache.spark.sql.streaming.Trigger 8 | import streaming.join.streamstream.streamStreamLeftOuterJoin.spark 9 | 10 | object staticStreamInnerJoin extends App { 11 | 12 | val spark: SparkSession = SparkSession.builder() 13 | .appName("StaticStreamInnerJoin") 14 | .master("local[*]") 15 | .getOrCreate() 16 | 17 | spark.conf.set("spark.sql.shuffle.partitions", "1") 18 | 19 | val logger = LogManager.getRootLogger 20 | logger.setLevel(Level.ERROR) 21 | 22 | val df = spark.readStream 23 | .format("rate") 24 | .option("rowsPerSecond", 1) 25 | .option("numPartitions", 1) 26 | .option("rampUpTime", 1) 27 | .load() 28 | 29 | import spark.implicits._ 30 | 31 | val rateData = df.as[RateData] 32 | val streamingEmployeeDS = rateData.where("value % 10 != 0") 33 | .withColumn("firstName", concat(lit("firstName"),rateData.col("value"))) 34 | .withColumn("lastName", concat(lit("lastName"),rateData.col("value"))) 35 | .withColumn("departmentId", lit(floor(rateData.col("value")/10))) 36 | 37 | val staticDepartmentDS = spark.read.format("csv").option("header","true").load("src/main/resources/department.csv") 38 | 39 | val innerJoinDS = staticDepartmentDS.join(streamingEmployeeDS, $"id" === $"departmentId") 40 | 41 | val innerJoinStream = innerJoinDS.writeStream 42 | .format("console") 43 | 
.queryName("InnerJoin") 44 | .trigger(Trigger.ProcessingTime("20 seconds")) 45 | .option("checkpointLocation", "sparkCheckPoint\\StaticStreamInnerJoin\\cp1") 46 | .start() 47 | 48 | spark.streams.awaitAnyTermination() 49 | } 50 | -------------------------------------------------------------------------------- /SparkStreamingPOC/src/main/scala/streaming/join/staticstream/staticStreamLeftOuterJoin.scala: -------------------------------------------------------------------------------- 1 | package streaming.join.staticstream 2 | 3 | import entity.RateData 4 | import org.apache.spark.sql.SparkSession 5 | import org.apache.spark.sql.functions._ 6 | import org.apache.spark.sql.streaming.Trigger 7 | 8 | object staticStreamLeftOuterJoin extends App { 9 | 10 | val spark: SparkSession = SparkSession.builder() 11 | .appName("StaticStreamLeftOuterJoin") 12 | .master("local[*]") 13 | .getOrCreate() 14 | 15 | val df = spark.readStream 16 | .format("rate") 17 | .option("rowsPerSecond", 1) 18 | .option("numPartitions", 1) 19 | .option("rampUpTime", 1) 20 | .load() 21 | 22 | import spark.implicits._ 23 | 24 | val rateData = df.as[RateData] 25 | val streamingEmployeeDS = rateData.where("value % 10 != 0") 26 | .withColumn("firstName", concat(lit("firstName"),rateData.col("value"))) 27 | .withColumn("lastName", concat(lit("lastName"),rateData.col("value"))) 28 | .withColumn("departmentId", lit(floor(rateData.col("value")/10))) 29 | // .withColumnRenamed("value", "id") 30 | 31 | val staticDepartmentDS = spark.read.format("csv").option("header","true").load("src/main/resources/department.csv") 32 | 33 | val leftOuterJoinDS = staticDepartmentDS.join(streamingEmployeeDS, $"id" === $"departmentId", "left_outer") 34 | 35 | val leftOuterJoinStream = leftOuterJoinDS.writeStream 36 | .format("console") 37 | .queryName("LeftOuterJoin") 38 | .trigger(Trigger.ProcessingTime("10 seconds")) 39 | .option("checkpointLocation", "sparkCheckPoint\\StaticStreamLeftOuterJoin\\cp2") 40 | .start() 41 | 42 | spark.streams.awaitAnyTermination() 43 | } 44 | -------------------------------------------------------------------------------- /SparkStreamingPOC/src/main/scala/streaming/join/staticstream/staticStreamRightOuterJoin.scala: -------------------------------------------------------------------------------- 1 | package streaming.join.staticstream 2 | 3 | import entity.RateData 4 | import org.apache.spark.sql.SparkSession 5 | import org.apache.spark.sql.functions._ 6 | import org.apache.spark.sql.streaming.Trigger 7 | 8 | object staticStreamRightOuterJoin extends App { 9 | 10 | val spark: SparkSession = SparkSession.builder() 11 | .appName("StaticStreamRightOuterJoin") 12 | .master("local[*]") 13 | .getOrCreate() 14 | 15 | val df = spark.readStream 16 | .format("rate") 17 | .option("rowsPerSecond", 1) 18 | .option("numPartitions", 1) 19 | .option("rampUpTime", 1) 20 | .load() 21 | 22 | import spark.implicits._ 23 | 24 | val rateData = df.as[RateData] 25 | val streamingEmployeeDS = rateData.where("value % 10 != 0") 26 | .withColumn("firstName", concat(lit("firstName"),rateData.col("value"))) 27 | .withColumn("lastName", concat(lit("lastName"),rateData.col("value"))) 28 | .withColumn("departmentId", lit(floor(rateData.col("value")/10))) 29 | // .withColumnRenamed("value", "id") 30 | 31 | val staticDepartmentDS = spark.read.format("csv").option("header","true").load("src/main/resources/department.csv") 32 | 33 | val rightOuterJoinDS = staticDepartmentDS 34 | .join(streamingEmployeeDS, $"id" === $"departmentId", "right_outer") 35 | 36 | 
val rightOuterJoinStream = rightOuterJoinDS.writeStream 37 | .format("console") 38 | .queryName("InnerJoin") 39 | .trigger(Trigger.ProcessingTime("10 seconds")) 40 | .option("checkpointLocation", "sparkCheckPoint\\StaticStreamRightOuterJoin\\cp1") 41 | .start() 42 | 43 | spark.streams.awaitAnyTermination() 44 | } 45 | -------------------------------------------------------------------------------- /SparkStreamingPOC/src/main/scala/streaming/join/streamstatic/streamStaticFullOuterJoin.scala: -------------------------------------------------------------------------------- 1 | package streaming.join.streamstatic 2 | 3 | import entity.RateData 4 | import org.apache.spark.sql.SparkSession 5 | import org.apache.spark.sql.functions._ 6 | import org.apache.spark.sql.streaming.Trigger 7 | 8 | object streamStaticFullOuterJoin extends App { 9 | 10 | val spark: SparkSession = SparkSession.builder() 11 | .appName("streamStaticFullOuterJoin") 12 | .master("local[*]") 13 | .getOrCreate() 14 | 15 | val df = spark.readStream 16 | .format("rate") 17 | .option("rowsPerSecond", 1) 18 | .option("numPartitions", 1) 19 | .option("rampUpTime", 1) 20 | .load() 21 | 22 | import spark.implicits._ 23 | 24 | val rateData = df.as[RateData] 25 | val streamingEmployeeDS = rateData.where("value % 10 != 0") 26 | .withColumn("firstName", concat(lit("firstName"),rateData.col("value"))) 27 | .withColumn("lastName", concat(lit("lastName"),rateData.col("value"))) 28 | .withColumn("departmentId", lit(floor(rateData.col("value")/10))) 29 | // .withColumnRenamed("value", "id") 30 | 31 | val staticDepartmentDS = spark.read.format("csv").option("header","true").load("src/main/resources/department.csv") 32 | 33 | val fullOuterJoinDS = streamingEmployeeDS.join(staticDepartmentDS, $"departmentId" === $"id", "full_outer") 34 | 35 | val fullOuterJoinStream = fullOuterJoinDS.writeStream 36 | .format("console") 37 | .queryName("InnerJoin") 38 | .trigger(Trigger.ProcessingTime("10 seconds")) 39 | .option("checkpointLocation", "sparkCheckPoint\\streamStaticFullOuterJoin\\cp1") 40 | .start() 41 | 42 | 43 | } 44 | -------------------------------------------------------------------------------- /SparkStreamingPOC/src/main/scala/streaming/join/streamstatic/streamStaticInnerJoin.scala: -------------------------------------------------------------------------------- 1 | package streaming.join.streamstatic 2 | 3 | import entity.RateData 4 | import org.apache.spark.sql.SparkSession 5 | import org.apache.spark.sql.functions._ 6 | import org.apache.spark.sql.streaming.Trigger 7 | 8 | object streamStaticInnerJoin extends App { 9 | 10 | val spark: SparkSession = SparkSession.builder() 11 | .appName("streamStaticInnerJoin") 12 | .master("local[*]") 13 | .getOrCreate() 14 | 15 | val df = spark.readStream 16 | .format("rate") 17 | .option("rowsPerSecond", 1) 18 | .option("numPartitions", 1) 19 | .option("rampUpTime", 1) 20 | .load() 21 | 22 | import spark.implicits._ 23 | 24 | val rateData = df.as[RateData] 25 | val streamingEmployeeDS = rateData.where("value % 10 != 0") 26 | .withColumn("firstName", concat(lit("firstName"),rateData.col("value"))) 27 | .withColumn("lastName", concat(lit("lastName"),rateData.col("value"))) 28 | .withColumn("departmentId", lit(floor(rateData.col("value")/10))) 29 | // .withColumnRenamed("value", "id") 30 | 31 | val staticDepartmentDS = spark.read.format("csv").option("header","true").load("src/main/resources/department.csv") 32 | 33 | val innerJoinDS = streamingEmployeeDS.join(staticDepartmentDS, $"departmentId" === $"id") 
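// Stream-static inner joins are stateless: the static department CSV is joined against
// every micro-batch of the rate-based employee stream, so no watermark is required here.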
34 | 35 | val innerJoinStream = innerJoinDS.writeStream 36 | .format("console") 37 | .queryName("InnerJoin") 38 | .trigger(Trigger.ProcessingTime("10 seconds")) 39 | .option("checkpointLocation", "sparkCheckPoint\\streamStaticInnerJoin\\cp1") 40 | .start() 41 | 42 | spark.streams.awaitAnyTermination() 43 | } 44 | -------------------------------------------------------------------------------- /SparkStreamingPOC/src/main/scala/streaming/join/streamstatic/streamStaticLeftOuterJoin.scala: -------------------------------------------------------------------------------- 1 | package streaming.join.streamstatic 2 | 3 | import entity.RateData 4 | import org.apache.spark.sql.SparkSession 5 | import org.apache.spark.sql.functions._ 6 | import org.apache.spark.sql.streaming.Trigger 7 | 8 | object streamStaticLeftOuterJoin extends App { 9 | 10 | val spark: SparkSession = SparkSession.builder() 11 | .appName("streamStaticLeftOuterJoin") 12 | .master("local[*]") 13 | .getOrCreate() 14 | 15 | val df = spark.readStream 16 | .format("rate") 17 | .option("rowsPerSecond", 1) 18 | .option("numPartitions", 1) 19 | .option("rampUpTime", 1) 20 | .load() 21 | 22 | import spark.implicits._ 23 | 24 | val rateData = df.as[RateData] 25 | val streamingEmployeeDS = rateData.where("value % 10 != 0") 26 | .withColumn("firstName", concat(lit("firstName"),rateData.col("value"))) 27 | .withColumn("lastName", concat(lit("lastName"),rateData.col("value"))) 28 | .withColumn("departmentId", lit(floor(rateData.col("value")/10))) 29 | // .withColumnRenamed("value", "id") 30 | 31 | val staticDepartmentDS = spark.read.format("csv").option("header","true").load("src/main/resources/department.csv") 32 | 33 | val leftOuterJoinDS = streamingEmployeeDS 34 | .join(staticDepartmentDS, $"departmentId" === $"id", "left_outer") 35 | 36 | val leftOuterJoinStream = leftOuterJoinDS.writeStream 37 | .format("console") 38 | .queryName("LeftOuterJoin") 39 | .trigger(Trigger.ProcessingTime("10 seconds")) 40 | .option("checkpointLocation", "sparkCheckPoint\\streamStaticLeftOuterJoin\\cp2") 41 | .start() 42 | 43 | spark.streams.awaitAnyTermination() 44 | } 45 | -------------------------------------------------------------------------------- /SparkStreamingPOC/src/main/scala/streaming/join/streamstatic/streamStaticRightOuterJoin.scala: -------------------------------------------------------------------------------- 1 | package streaming.join.streamstatic 2 | 3 | import entity.RateData 4 | import org.apache.spark.sql.SparkSession 5 | import org.apache.spark.sql.functions._ 6 | import org.apache.spark.sql.streaming.Trigger 7 | 8 | object streamStaticRightOuterJoin extends App { 9 | 10 | val spark: SparkSession = SparkSession.builder() 11 | .appName("streamStaticRightOuterJoin") 12 | .master("local[*]") 13 | .getOrCreate() 14 | 15 | val df = spark.readStream 16 | .format("rate") 17 | .option("rowsPerSecond", 1) 18 | .option("numPartitions", 1) 19 | .option("rampUpTime", 1) 20 | .load() 21 | 22 | import spark.implicits._ 23 | 24 | val rateData = df.as[RateData] 25 | val streamingEmployeeDS = rateData.where("value % 10 != 0") 26 | .withColumn("firstName", concat(lit("firstName"),rateData.col("value"))) 27 | .withColumn("lastName", concat(lit("lastName"),rateData.col("value"))) 28 | .withColumn("departmentId", lit(floor(rateData.col("value")/10))) 29 | // .withColumnRenamed("value", "id") 30 | 31 | val staticDepartmentDS = spark.read.format("csv").option("header","true").load("src/main/resources/department.csv") 32 | 33 | val rightOuterJoinDS = 
streamingEmployeeDS.join(staticDepartmentDS, $"departmentId" === $"id", "right_outer") 34 | 35 | val rightOuterJoinStream = rightOuterJoinDS.writeStream 36 | .format("console") 37 | .queryName("InnerJoin") 38 | .trigger(Trigger.ProcessingTime("10 seconds")) 39 | .option("checkpointLocation", "sparkCheckPoint\\streamStaticRightOuterJoin\\cp1") 40 | .start() 41 | 42 | spark.streams.awaitAnyTermination() 43 | } 44 | -------------------------------------------------------------------------------- /SparkStreamingPOC/src/main/scala/streaming/join/streamstream/streamStreamFullOuterJoin.scala: -------------------------------------------------------------------------------- 1 | package streaming.join.streamstream 2 | 3 | import entity.RateData 4 | import org.apache.log4j.{Level, LogManager} 5 | import org.apache.spark.sql.SparkSession 6 | import org.apache.spark.sql.functions._ 7 | import org.apache.spark.sql.streaming.Trigger 8 | 9 | object streamStreamFullOuterJoin extends App { 10 | 11 | val spark: SparkSession = SparkSession.builder() 12 | .appName("streamStreamFullOuterJoin") 13 | .master("local[*]") 14 | .getOrCreate() 15 | 16 | spark.conf.set("spark.sql.shuffle.partitions", "1") 17 | 18 | val logger = LogManager.getRootLogger 19 | logger.setLevel(Level.ERROR) 20 | 21 | val df = spark.readStream 22 | .format("rate") 23 | .option("rowsPerSecond", 1) 24 | .option("numPartitions", 1) 25 | .option("rampUpTime", 1) 26 | .load() 27 | 28 | import spark.implicits._ 29 | 30 | val rateData = df.as[RateData] 31 | val employeeStreamDS = rateData.where("value % 10 != 0") 32 | .withColumn("firstName", concat(lit("firstName"),rateData.col("value"))) 33 | .withColumn("lastName", concat(lit("lastName"),rateData.col("value"))) 34 | .withColumn("departmentId", lit(floor(rateData.col("value")/10))) 35 | .withColumnRenamed("timestamp", "empTimestamp") 36 | .withWatermark("empTimestamp", "10 seconds") 37 | // .withColumnRenamed("value", "id") 38 | 39 | val departmentStreamDS = rateData.where("value % 10 == 0") 40 | .withColumn("name", concat(lit("name"),floor(rateData.col("value")/10))) 41 | .withColumn("Id", lit(floor(rateData.col("value")/10))) 42 | .drop("value") 43 | .withColumnRenamed("timestamp", "depTimestamp") 44 | .withWatermark("depTimestamp", "10 seconds") 45 | 46 | val joinedDS = departmentStreamDS 47 | .join(employeeStreamDS, expr(""" 48 | id = departmentId AND 49 | empTimestamp >= depTimestamp - interval 1 minutes AND 50 | empTimestamp <= depTimestamp + interval 1 minutes 51 | """ 52 | ), "full_outer") 53 | 54 | val joinedStream = joinedDS.writeStream 55 | .format("console") 56 | .queryName("joinedTable") 57 | .option("checkpointLocation", "sparkCheckPoint\\streamStreamFullOuterJoin\\joinedTable") 58 | .trigger(Trigger.ProcessingTime("5 seconds")) 59 | .start() 60 | 61 | spark.streams.awaitAnyTermination() 62 | } 63 | -------------------------------------------------------------------------------- /SparkStreamingPOC/src/main/scala/streaming/join/streamstream/streamStreamInnerJoin.scala: -------------------------------------------------------------------------------- 1 | package streaming.join.streamstream 2 | 3 | import entity.RateData 4 | import org.apache.log4j.{Level, LogManager} 5 | import org.apache.spark.sql.SparkSession 6 | import org.apache.spark.sql.functions._ 7 | import org.apache.spark.sql.streaming.Trigger 8 | 9 | object streamStreamInnerJoin extends App { 10 | 11 | val spark: SparkSession = SparkSession.builder() 12 | .appName("streamStreamInnerJoin") 13 | .master("local[*]") 14 | 
.getOrCreate() 15 | 16 | spark.conf.set("spark.sql.shuffle.partitions", "1") 17 | 18 | val logger = LogManager.getRootLogger 19 | logger.setLevel(Level.ERROR) 20 | 21 | val df = spark.readStream 22 | .format("rate") 23 | .option("rowsPerSecond", 1) 24 | .option("numPartitions", 1) 25 | .option("rampUpTime", 1) 26 | .load() 27 | 28 | import spark.implicits._ 29 | 30 | val rateData = df.as[RateData] 31 | val employeeDS = rateData.where("value % 10 != 0") 32 | .withColumn("firstName", concat(lit("firstName"),rateData.col("value"))) 33 | .withColumn("lastName", concat(lit("lastName"),rateData.col("value"))) 34 | .withColumn("departmentId", lit(floor(rateData.col("value")/10))) 35 | .withColumnRenamed("value", "id") 36 | 37 | val departmentDS = rateData.where("value % 10 == 0") 38 | .withColumn("name", concat(lit("name"),floor(rateData.col("value")/10))) 39 | .withColumn("departmentId", lit(floor(rateData.col("value")/10))) 40 | .drop("value") 41 | 42 | val joinedDS = departmentDS.join(employeeDS,"departmentId") 43 | 44 | val joinedStream = joinedDS.writeStream 45 | .format("console") 46 | .queryName("joinedTable") 47 | .option("checkpointLocation", "sparkCheckPoint\\streamStreamInnerJoin\\joinedTable") 48 | .trigger(Trigger.ProcessingTime("5 seconds")) 49 | .start() 50 | 51 | spark.streams.awaitAnyTermination() 52 | } 53 | -------------------------------------------------------------------------------- /SparkStreamingPOC/src/main/scala/streaming/join/streamstream/streamStreamLeftOuterJoin.scala: -------------------------------------------------------------------------------- 1 | package streaming.join.streamstream 2 | 3 | import entity.RateData 4 | import org.apache.log4j.{Level, LogManager} 5 | import org.apache.spark.sql.SparkSession 6 | import org.apache.spark.sql.functions._ 7 | import org.apache.spark.sql.streaming.Trigger 8 | 9 | object streamStreamLeftOuterJoin extends App { 10 | 11 | val spark: SparkSession = SparkSession.builder() 12 | .appName("streamStreamLeftOuterJoin") 13 | .master("local[*]") 14 | .getOrCreate() 15 | 16 | spark.conf.set("spark.sql.shuffle.partitions", "1") 17 | 18 | val logger = LogManager.getRootLogger 19 | logger.setLevel(Level.ERROR) 20 | 21 | val rateSource = spark.readStream 22 | .format("rate") 23 | .option("rowsPerSecond", 10000) 24 | .option("numPartitions", 1) 25 | .option("rampUpTime", 1) 26 | .load() 27 | 28 | import spark.implicits._ 29 | 30 | val rateSourceData = rateSource.as[RateData] 31 | val employeeStreamDS = rateSourceData.where("value % 10 != 0") 32 | .withColumn("firstName", concat(lit("firstName"),rateSourceData.col("value"))) 33 | .withColumn("lastName", concat(lit("lastName"),rateSourceData.col("value"))) 34 | .withColumn("departmentId", lit(floor(rateSourceData.col("value")/10))) 35 | .withColumnRenamed("timestamp", "empTimestamp") 36 | .withWatermark("empTimestamp", "10 seconds") 37 | // .withColumnRenamed("value", "id") 38 | 39 | val departmentStreamDS = rateSourceData.where("value % 10 == 0") 40 | .withColumn("name", concat(lit("name"),floor(rateSourceData.col("value")/10))) 41 | .withColumn("Id", lit(floor(rateSourceData.col("value")/10))) 42 | .drop("value") 43 | .withColumnRenamed("timestamp", "depTimestamp") 44 | // .withWatermark("depTimestamp", "10 seconds") 45 | 46 | val joinedDS = departmentStreamDS 47 | .join(employeeStreamDS, expr(""" 48 | id = departmentId AND 49 | empTimestamp >= depTimestamp - interval 1 minutes AND 50 | empTimestamp <= depTimestamp + interval 1 minutes 51 | """ 52 | ), "left_outer") 53 | 54 | val 
joinedStream = joinedDS.writeStream 55 | .format("console") 56 | .queryName("joinedTable") 57 | .option("checkpointLocation", "sparkCheckPoint\\streamStreamLeftOuterJoin\\joinedTable") 58 | .trigger(Trigger.ProcessingTime("20 seconds")) 59 | .start() 60 | 61 | spark.streams.awaitAnyTermination() 62 | } 63 | -------------------------------------------------------------------------------- /SparkStreamingPOC/src/main/scala/streaming/join/streamstream/streamStreamRightOuterJoin.scala: -------------------------------------------------------------------------------- 1 | package streaming.join.streamstream 2 | 3 | import entity.RateData 4 | import org.apache.log4j.{Level, LogManager} 5 | import org.apache.spark.sql.SparkSession 6 | import org.apache.spark.sql.functions._ 7 | import org.apache.spark.sql.streaming.Trigger 8 | 9 | object streamStreamRightOuterJoin extends App { 10 | 11 | val spark: SparkSession = SparkSession.builder() 12 | .appName("streamStreamRightOuterJoin") 13 | .master("local[*]") 14 | .getOrCreate() 15 | 16 | spark.conf.set("spark.sql.shuffle.partitions", "1") 17 | 18 | val logger = LogManager.getRootLogger 19 | logger.setLevel(Level.ERROR) 20 | 21 | val df = spark.readStream 22 | .format("rate") 23 | .option("rowsPerSecond", 1) 24 | .option("numPartitions", 1) 25 | .option("rampUpTime", 1) 26 | .load() 27 | 28 | import spark.implicits._ 29 | 30 | val rateData = df.as[RateData] 31 | val employeeStreamDS = rateData.where("value % 10 != 0") 32 | .withColumn("firstName", concat(lit("firstName"),rateData.col("value"))) 33 | .withColumn("lastName", concat(lit("lastName"),rateData.col("value"))) 34 | .withColumn("departmentId", lit(floor(rateData.col("value")/10))) 35 | .withColumnRenamed("timestamp", "empTimestamp") 36 | // .withWatermark("empTimestamp", "10 seconds") 37 | // .withColumnRenamed("value", "id") 38 | 39 | val departmentStreamDS = rateData.where("value % 10 == 0") 40 | .withColumn("name", concat(lit("name"),floor(rateData.col("value")/10))) 41 | .withColumn("Id", lit(floor(rateData.col("value")/10))) 42 | .drop("value") 43 | .withColumnRenamed("timestamp", "depTimestamp") 44 | .withWatermark("depTimestamp", "10 seconds") 45 | 46 | val joinedDS = departmentStreamDS 47 | .join(employeeStreamDS, expr(""" 48 | id = departmentId AND 49 | empTimestamp >= depTimestamp - interval 1 minutes AND 50 | empTimestamp <= depTimestamp + interval 1 minutes 51 | """ 52 | ), "right_outer") 53 | 54 | val joinedStream = joinedDS.writeStream 55 | .format("console") 56 | .queryName("joinedTable") 57 | .option("checkpointLocation", "sparkCheckPoint\\streamStreamRightOuterJoin\\joinedTable") 58 | .trigger(Trigger.ProcessingTime("5 seconds")) 59 | .start() 60 | 61 | spark.streams.awaitAnyTermination() 62 | } 63 | -------------------------------------------------------------------------------- /SparkStreamingPOC/src/main/scala/streaming/kafka/Kafka2Kafka.scala: -------------------------------------------------------------------------------- 1 | package streaming.kafka 2 | 3 | import org.apache.log4j.{Level, LogManager} 4 | import org.apache.spark.sql.SparkSession 5 | 6 | object Kafka2Kafka extends App{ 7 | 8 | val spark : SparkSession = SparkSession.builder() 9 | .appName("Kafka2Kafka") 10 | .master("local[*]") 11 | .getOrCreate() 12 | 13 | val logger = LogManager.getRootLogger 14 | logger.setLevel(Level.ERROR) 15 | 16 | val df = spark.readStream.format("kafka") 17 | .option("kafka.bootstrap.servers", "localhost:9092").option("subscribe", "test") 18 | .load() 19 | 20 | val kafkaRawData = 
df.selectExpr("CAST(key AS STRING)", "CAST(value AS string)", "topic", "partition", "offset", "timestamp","timestampType") 21 | val kafkaWriteStream = kafkaRawData.writeStream.format("kafka").option("topic", "test2") 22 | .option("checkpointLocation","sparkCheckPoint\\Kafka2Kafka") 23 | .option("kafka.bootstrap.servers", "localhost:9092") 24 | .start().awaitTermination() 25 | } 26 | -------------------------------------------------------------------------------- /SparkStreamingPOC/src/main/scala/streaming/kafka/Rate2Kafka.scala: -------------------------------------------------------------------------------- 1 | package streaming.kafka 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.streaming.Trigger 5 | 6 | object Rate2Kafka extends App { 7 | 8 | val spark: SparkSession = SparkSession.builder() 9 | .appName("Rate2Kafka") 10 | .master("local[*]") 11 | .getOrCreate() 12 | 13 | val df = spark.readStream 14 | .format("rate") 15 | .option("rowsPerSecond", 2) 16 | .option("numPartitions", 2) 17 | .option("rampUpTime", 1) 18 | .load() 19 | 20 | val rateRawData = df.selectExpr("CAST(timestamp AS STRING)", "CAST(value AS string)") 21 | val kafkaStream = rateRawData.writeStream 22 | .format("kafka") 23 | .queryName("First Kafka Stream") 24 | .option("topic", "test2") 25 | .option("checkpointLocation", "sparkCheckPoint\\Rate2Kafka\\cp1").option("kafka.bootstrap.servers", "localhost:9092") 26 | .trigger(Trigger.ProcessingTime("10 seconds")) 27 | .start() 28 | 29 | kafkaStream.awaitTermination() 30 | } 31 | -------------------------------------------------------------------------------- /SparkStreamingPOC/src/main/scala/streaming/kafka/Rate2KafkaMultiStream.scala: -------------------------------------------------------------------------------- 1 | package streaming.kafka 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.streaming.Trigger 5 | 6 | object Rate2KafkaMultiStream extends App { 7 | 8 | val spark: SparkSession = SparkSession.builder() 9 | .appName("Rate2KafkaMultiStream") 10 | .master("local[*]") 11 | .getOrCreate() 12 | 13 | val df = spark.readStream 14 | .format("rate") 15 | .option("rowsPerSecond", 20000) 16 | .option("numPartitions", 2) 17 | .option("rampUpTime", 1) 18 | .load() 19 | 20 | val rateRawData = df.selectExpr("CAST(timestamp AS STRING)", "CAST(value AS string)") 21 | val kafkaWriteStream1 = rateRawData.writeStream 22 | .queryName("First Kafka Stream") 23 | .format("kafka") 24 | .option("topic", "test2") 25 | .option("checkpointLocation", "sparkCheckPoint\\Rate2KafkaMultiStream\\cp1").option("kafka.bootstrap.servers", "localhost:9092") 26 | .trigger(Trigger.ProcessingTime("10 seconds")) 27 | .start() 28 | 29 | val kafkaWriteStream2 = rateRawData.writeStream 30 | .queryName("Second Kafka Stream") 31 | .format("kafka") 32 | .option("topic", "test2") 33 | .option("checkpointLocation", "sparkCheckPoint\\Rate2KafkaMultiStream\\cp2") 34 | .option("kafka.bootstrap.servers", "localhost:9092") 35 | .trigger(Trigger.ProcessingTime("10 seconds")) 36 | .start() 37 | 38 | spark.streams.awaitAnyTermination() 39 | } 40 | -------------------------------------------------------------------------------- /SparkStreamingPOC/src/main/scala/streaming/kafka/Rate2KafkaSparkListener.scala: -------------------------------------------------------------------------------- 1 | package streaming.kafka 2 | 3 | import org.apache.log4j.{Level, LogManager} 4 | import org.apache.spark.sql.SparkSession 5 | import 
org.apache.spark.sql.streaming.{StreamingQueryListener, Trigger} 6 | 7 | 8 | object Rate2KafkaSparkListener extends App { 9 | 10 | val spark: SparkSession = SparkSession.builder() 11 | .appName("Rate2KafkaSparkListener") 12 | .master("local[*]") 13 | .config("spark.sql.streaming.metricsEnabled", true) 14 | .getOrCreate() 15 | 16 | val logger = LogManager.getRootLogger 17 | logger.setLevel(Level.DEBUG) 18 | 19 | val df = spark.readStream 20 | .format("rate") 21 | // .option("rowsPerSecond", 1) 22 | // .option("numPartitions", 10) 23 | // .option("rampUpTime", 2) 24 | .load() 25 | 26 | val rateRawData = df.selectExpr("CAST(timestamp AS STRING)", "CAST(value AS string)") 27 | val kafkaWriteStream1 = rateRawData.writeStream 28 | .format("kafka") 29 | .queryName("First Kafka Stream") 30 | .option("topic", "test2") 31 | .option("checkpointLocation", "sparkCheckPoint\\Rate2KafkaSparkListener\\cp1").option("kafka.bootstrap.servers", "localhost:9092") 32 | .trigger(Trigger.ProcessingTime("10 seconds")) 33 | .start() 34 | 35 | // val kafkaWriteStream2 = rateRawData.writeStream 36 | // .format("kafka") 37 | // .queryName("Second Kafka Stream") 38 | // .option("topic", "test2") 39 | // .option("checkpointLocation", "sparkCheckPoint\\Rate2KafkaSparkListener\\cp2").option("kafka.bootstrap.servers", "localhost:9092") 40 | // .trigger(Trigger.ProcessingTime("20 seconds")) 41 | // .start() 42 | 43 | val chartListener = new StreamingQueryListener() { 44 | val MaxDataPoints = 100 45 | // a mutable reference to an immutable container to buffer n data points 46 | override def onQueryStarted(event: StreamingQueryListener.QueryStartedEvent): Unit = () 47 | 48 | override def onQueryProgress(event: StreamingQueryListener.QueryProgressEvent): Unit = { 49 | val queryProgress = event.progress 50 | 51 | if(queryProgress.numInputRows > 0) { 52 | val time = queryProgress.timestamp 53 | val inputRowsPerSecond = queryProgress.numInputRows 54 | val name = queryProgress.name 55 | val processedRowsPerSecond = queryProgress.processedRowsPerSecond 56 | 57 | println("Metrics name "+ name+" time "+ time + " inputRows "+ inputRowsPerSecond + " processedRowsPerSecond "+ processedRowsPerSecond) 58 | } 59 | } 60 | 61 | override def onQueryTerminated(event: StreamingQueryListener.QueryTerminatedEvent): Unit = () 62 | } 63 | 64 | spark.streams.addListener(chartListener) 65 | spark.streams.awaitAnyTermination() 66 | 67 | } -------------------------------------------------------------------------------- /SparkStreamingPOC/src/main/scala/streaming/mode/Rate2ConsoleAggregateAppendMode.scala: -------------------------------------------------------------------------------- 1 | package streaming.mode 2 | 3 | import org.apache.log4j.{Level, LogManager} 4 | import org.apache.spark.sql.SparkSession 5 | import org.apache.spark.sql.streaming.Trigger 6 | 7 | object Rate2ConsoleAggregateAppendMode extends App { 8 | 9 | val spark: SparkSession = SparkSession.builder() 10 | .appName("Rate2ConsoleAggregateAppendMode") 11 | .master("local[*]") 12 | .getOrCreate() 13 | 14 | val logger = LogManager.getRootLogger 15 | logger.setLevel(Level.ERROR) 16 | 17 | // spark.conf.set("spark.sql.shuffle.partitions", "1") 18 | 19 | val df = spark.readStream 20 | .format("rate") 21 | .option("rowsPerSecond", 1) 22 | // .option("numPartitions", 1) 23 | .option("rampUpTime", 1) 24 | .load() 25 | 26 | val rateRawData = df.selectExpr("CAST(timestamp AS STRING)", "CAST(value AS string)") 27 | 28 | val transformedData = rateRawData.withColumn("key", 
rateRawData.col("timestamp").substr(15, 2)) 29 | val countData = transformedData.groupBy("key").count() 30 | 31 | val defaultStream = countData.writeStream 32 | .format("console") 33 | .queryName("Append Mode") 34 | .trigger(Trigger.ProcessingTime("10 seconds")) 35 | // .outputMode("complete") 36 | .option("checkpointLocation", "sparkCheckPoint\\Rate2ConsoleAggregateAppendMode\\cp1") 37 | .start() 38 | 39 | spark.streams.awaitAnyTermination() 40 | } 41 | -------------------------------------------------------------------------------- /SparkStreamingPOC/src/main/scala/streaming/mode/Rate2ConsoleAggregateCompleteMode.scala: -------------------------------------------------------------------------------- 1 | package streaming.mode 2 | 3 | import org.apache.log4j.{Level, LogManager} 4 | import org.apache.spark.sql.SparkSession 5 | import org.apache.spark.sql.streaming.Trigger 6 | 7 | object Rate2ConsoleAggregateCompleteMode extends App { 8 | 9 | val spark: SparkSession = SparkSession.builder() 10 | .appName("Rate2ConsoleAggregateCompleteMode") 11 | .master("local[*]") 12 | .getOrCreate() 13 | 14 | val logger = LogManager.getRootLogger 15 | logger.setLevel(Level.ERROR) 16 | 17 | spark.conf.set("spark.sql.shuffle.partitions", "1") 18 | 19 | val df = spark.readStream 20 | .format("rate") 21 | .option("rowsPerSecond", 1) 22 | // .option("numPartitions", 1) 23 | .option("rampUpTime", 1) 24 | .load() 25 | 26 | val rateRawData = df.selectExpr("CAST(timestamp AS STRING)", "CAST(value AS string)") 27 | 28 | val transformedData = rateRawData.withColumn("key", rateRawData.col("timestamp").substr(15, 2)) 29 | val countData = transformedData.groupBy("key").count() 30 | 31 | val defaultStream = countData.writeStream 32 | .format("console") 33 | .queryName("Complete Mode") 34 | .trigger(Trigger.ProcessingTime("10 seconds")) 35 | .outputMode("complete") 36 | .option("checkpointLocation", "sparkCheckPoint\\Rate2ConsoleAggregateCompleteMode\\cp1") 37 | .start() 38 | 39 | spark.streams.awaitAnyTermination() 40 | } 41 | -------------------------------------------------------------------------------- /SparkStreamingPOC/src/main/scala/streaming/mode/Rate2ConsoleAggregateUpdateMode.scala: -------------------------------------------------------------------------------- 1 | package streaming.mode 2 | 3 | import org.apache.log4j.{Level, LogManager} 4 | import org.apache.spark.sql.SparkSession 5 | import org.apache.spark.sql.streaming.Trigger 6 | 7 | object Rate2ConsoleAggregateUpdateMode extends App { 8 | 9 | val spark: SparkSession = SparkSession.builder() 10 | .appName("Rate2ConsoleAggregateUpdateMode") 11 | .master("local[*]") 12 | .getOrCreate() 13 | 14 | val logger = LogManager.getRootLogger 15 | logger.setLevel(Level.ERROR) 16 | 17 | spark.conf.set("spark.sql.shuffle.partitions", "1") 18 | 19 | val df = spark.readStream 20 | .format("rate") 21 | .option("rowsPerSecond", 1) 22 | // .option("numPartitions", 1) 23 | .option("rampUpTime", 1) 24 | .load() 25 | 26 | val rateRawData = df.selectExpr("CAST(timestamp AS STRING)", "CAST(value AS string)") 27 | 28 | val transformedData = rateRawData.withColumn("key", rateRawData.col("timestamp").substr(15, 2)) 29 | val countData = transformedData.groupBy("key").count() 30 | 31 | val defaultStream = countData.writeStream 32 | .format("console") 33 | .queryName("Append Mode") 34 | .trigger(Trigger.ProcessingTime("10 seconds")) 35 | .outputMode("update") 36 | .option("checkpointLocation", "sparkCheckPoint\\Rate2ConsoleAggregateUpdateMode\\cp1") 37 | .start() 38 | 39 | 
spark.streams.awaitAnyTermination() 40 | } 41 | -------------------------------------------------------------------------------- /SparkStreamingPOC/src/main/scala/streaming/mode/Rate2ConsoleCompleteMode.scala: -------------------------------------------------------------------------------- 1 | package streaming.mode 2 | 3 | import org.apache.log4j.{Level, LogManager} 4 | import org.apache.spark.sql.SparkSession 5 | import org.apache.spark.sql.streaming.Trigger 6 | 7 | object Rate2ConsoleCompleteMode extends App { 8 | 9 | val spark: SparkSession = SparkSession.builder() 10 | .appName("Rate2ConsoleCompleteMode") 11 | .master("local[*]") 12 | .getOrCreate() 13 | 14 | val logger = LogManager.getRootLogger 15 | logger.setLevel(Level.ERROR) 16 | 17 | val df = spark.readStream 18 | .format("rate") 19 | .option("rowsPerSecond", 1) 20 | // .option("numPartitions", 1) 21 | .option("rampUpTime", 1) 22 | .load() 23 | 24 | val rateRawData = df.selectExpr("CAST(timestamp AS STRING)", "CAST(value AS string)") 25 | val defaultStream = rateRawData.writeStream 26 | .format("console") 27 | .queryName("Complete Mode") 28 | .trigger(Trigger.ProcessingTime("10 seconds")) 29 | .outputMode("complete") 30 | .option("checkpointLocation", "sparkCheckPoint\\Rate2ConsoleCompleteMode\\cp1") 31 | .start() 32 | 33 | spark.streams.awaitAnyTermination() 34 | } 35 | -------------------------------------------------------------------------------- /SparkStreamingPOC/src/main/scala/streaming/mode/Rate2ConsoleDefaultMode.scala: -------------------------------------------------------------------------------- 1 | package streaming.mode 2 | 3 | import org.apache.log4j.{Level, LogManager} 4 | import org.apache.spark.sql.SparkSession 5 | import org.apache.spark.sql.streaming.Trigger 6 | 7 | object Rate2ConsoleDefaultMode extends App { 8 | 9 | val spark: SparkSession = SparkSession.builder() 10 | .appName("Rate2ConsoleDefaultMode") 11 | .master("local[*]") 12 | .getOrCreate() 13 | 14 | val logger = LogManager.getRootLogger 15 | logger.setLevel(Level.ERROR) 16 | 17 | val df = spark.readStream 18 | .format("rate") 19 | .option("rowsPerSecond", 1) 20 | // .option("numPartitions", 1) 21 | .option("rampUpTime", 1) 22 | .load() 23 | 24 | val rateRawData = df.selectExpr("CAST(timestamp AS STRING)", "CAST(value AS string)") 25 | val defaultStream = rateRawData.writeStream 26 | .format("console") 27 | .queryName("Default Mode") 28 | .trigger(Trigger.ProcessingTime("10 seconds")) 29 | .option("checkpointLocation", "sparkCheckPoint\\Rate2ConsoleDefaultMode\\cp1") 30 | .start() 31 | 32 | spark.streams.awaitAnyTermination() 33 | } 34 | -------------------------------------------------------------------------------- /SparkStreamingPOC/src/main/scala/streaming/mode/Rate2ConsoleUpdateMode.scala: -------------------------------------------------------------------------------- 1 | package streaming.mode 2 | 3 | import org.apache.log4j.{Level, LogManager} 4 | import org.apache.spark.sql.SparkSession 5 | import org.apache.spark.sql.streaming.Trigger 6 | 7 | object Rate2ConsoleUpdateMode extends App { 8 | 9 | val spark: SparkSession = SparkSession.builder() 10 | .appName("Rate2ConsoleUpdateMode") 11 | .master("local[*]") 12 | .getOrCreate() 13 | 14 | val logger = LogManager.getRootLogger 15 | logger.setLevel(Level.ERROR) 16 | 17 | val df = spark.readStream 18 | .format("rate") 19 | .option("rowsPerSecond", 1) 20 | // .option("numPartitions", 1) 21 | .option("rampUpTime", 1) 22 | .load() 23 | 24 | val rateRawData = df.selectExpr("CAST(timestamp AS 
STRING)", "CAST(value AS string)") 25 | val defaultStream = rateRawData.writeStream 26 | .format("console") 27 | .queryName("Update Mode") 28 | .trigger(Trigger.ProcessingTime("10 seconds")) 29 | .outputMode("update") 30 | .option("checkpointLocation", "sparkCheckPoint\\Rate2ConsoleUpdateMode\\cp1") 31 | .start() 32 | 33 | spark.streams.awaitAnyTermination() 34 | } 35 | -------------------------------------------------------------------------------- /SparkStreamingPOC/src/main/scala/streaming/state/Rate2SparkState.scala: -------------------------------------------------------------------------------- 1 | package streaming.state 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.functions._ 5 | import org.apache.spark.sql.streaming.{GroupStateTimeout, Trigger} 6 | 7 | case class InputValue(timestamp: String, value: String, key: String, 8 | data1: String, data2: String, data3: String) 9 | 10 | object Rate2SparkState extends App with manageStateHelper { 11 | 12 | 13 | val spark: SparkSession = SparkSession.builder() 14 | .appName("Rate2SparkState") 15 | .master("local[*]") 16 | .getOrCreate() 17 | 18 | val df = spark.readStream 19 | .format("rate") 20 | .option("rowsPerSecond", 2) 21 | .option("numPartitions", 2) 22 | .option("rampUpTime", 1) 23 | .load() 24 | 25 | val rateRawData = df.selectExpr("CAST(timestamp AS STRING)", "CAST(value AS string)") 26 | 27 | import spark.implicits._ 28 | 29 | val transformedData = rateRawData 30 | .withColumn("key", lit(1)) 31 | .withColumn("data1", lit(constant.sampleJSON)) 32 | .withColumn("data2", lit(constant.sampleJSON)) 33 | .withColumn("data3", lit(constant.sampleJSON)).as[InputValue] 34 | 35 | import spark.implicits._ 36 | 37 | val stateManagement = transformedData 38 | .groupByKey(_.key) 39 | .mapGroupsWithState(GroupStateTimeout.NoTimeout())(manageState) 40 | val processingTimeStream = stateManagement.writeStream 41 | .format("console") 42 | .outputMode("update") 43 | .queryName("Micro Batch") 44 | .trigger(Trigger.ProcessingTime("10 seconds")) 45 | .option("checkpointLocation", "sparkCheckPoint\\Rate2SparkState\\cp1") 46 | .option("truncate", false) 47 | .start() 48 | 49 | spark.streams.awaitAnyTermination() 50 | 51 | } 52 | -------------------------------------------------------------------------------- /SparkStreamingPOC/src/main/scala/streaming/state/manageStateHelper.scala: -------------------------------------------------------------------------------- 1 | package streaming.state 2 | 3 | import org.apache.spark.sql.streaming.GroupState 4 | 5 | 6 | case class StateEvents(key:String=null,value:List[InputValue]=List()) 7 | 8 | trait manageStateHelper { 9 | 10 | var state :StateEvents = _ 11 | 12 | def manageState(key: String, inputEvents: Iterator[InputValue], groupState: GroupState[StateEvents]): List[InputValue] = { 13 | println("Key ::" + key) 14 | println("inputEvents ::" + inputEvents.toList) 15 | //println("groupState ::" + groupState.) 
16 | 17 | val inputEventList = inputEvents.toList 18 | println("inputEvents ::" + inputEventList) 19 | state = groupState.getOption.getOrElse(StateEvents()) 20 | 21 | // fold this micro-batch's events into the accumulated state, then persist it for the next trigger 22 | state = inputEventList.foldLeft(state)(updateState) 23 | groupState.update(state) 24 | 25 | 26 | 27 | if (groupState.exists) groupState.get.value else List() 28 | } 29 | 30 | def updateState(stateData: StateEvents, inputEvent: InputValue): StateEvents = { 31 | val updated = stateData.copy(value = stateData.value :+ inputEvent) 32 | println("State :: " + updated) 33 | updated 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /SparkStreamingPOC/src/main/scala/streaming/trigger/Rate2ConsoleContinuousTrigger.scala: -------------------------------------------------------------------------------- 1 | package streaming.trigger 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.streaming.Trigger 5 | 6 | object Rate2ConsoleContinuousTrigger extends App { 7 | 8 | val spark: SparkSession = SparkSession.builder() 9 | .appName("Rate2ConsoleContinuousTrigger") 10 | .master("local[*]") 11 | .getOrCreate() 12 | 13 | spark.sparkContext.setLogLevel("ERROR") 14 | 15 | val df = spark.readStream 16 | .format("rate") 17 | .option("rowsPerSecond", 1) 18 | .option("numPartitions", 1) 19 | .option("rampUpTime", 1) 20 | .load() 21 | 22 | val rateRawData = df.selectExpr("CAST(timestamp AS STRING)", "CAST(value AS string)") 23 | val firstContinuousStream = rateRawData.writeStream 24 | .format("console") 25 | .queryName("First Continuous Stream") 26 | .trigger(Trigger.Continuous("1 seconds")) 27 | .option("checkpointLocation", "sparkCheckPoint\\Rate2ConsoleContinuousTrigger\\cp1") 28 | .start() 29 | 30 | val secondContinuousStream = rateRawData.writeStream 31 | .format("console") 32 | .queryName("Second Continuous Stream") 33 | .trigger(Trigger.Continuous("1 seconds")) 34 | .option("checkpointLocation", "sparkCheckPoint\\Rate2ConsoleContinuousTrigger\\cp2") 35 | .start() 36 | 37 | spark.streams.awaitAnyTermination() 38 | } 39 | -------------------------------------------------------------------------------- /SparkStreamingPOC/src/main/scala/streaming/trigger/Rate2ConsoleDefaultTrigger.scala: -------------------------------------------------------------------------------- 1 | package streaming.trigger 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | object Rate2ConsoleDefaultTrigger extends App { 6 | 7 | val spark: SparkSession = SparkSession.builder() 8 | .appName("Rate2ConsoleDefaultTrigger") 9 | .master("local[*]") 10 | .getOrCreate() 11 | 12 | val df = spark.readStream 13 | .format("rate") 14 | .option("rowsPerSecond", 90000) 15 | // .option("numPartitions", 1) 16 | .option("rampUpTime", 1) 17 | .load() 18 | 19 | val rateRawData = df.selectExpr("CAST(timestamp AS STRING)", "CAST(value AS string)") 20 | val defaultStream = rateRawData.writeStream 21 | .format("console") 22 | .queryName("Default") 23 | .option("checkpointLocation", "sparkCheckPoint\\Rate2ConsoleDefaultTrigger\\cp1") 24 | .start() 25 | 26 | spark.streams.awaitAnyTermination() 27 | } 28 | -------------------------------------------------------------------------------- /SparkStreamingPOC/src/main/scala/streaming/trigger/Rate2ConsoleOnceTrigger.scala: -------------------------------------------------------------------------------- 1 | package streaming.trigger 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import
org.apache.spark.sql.streaming.Trigger 5 | 6 | object Rate2ConsoleOnceTrigger extends App { 7 | 8 | val spark: SparkSession = SparkSession.builder() 9 | .appName("Rate2ConsoleOnceTrigger") 10 | .master("local[*]") 11 | .getOrCreate() 12 | 13 | val df = spark.readStream 14 | .format("rate") 15 | .option("rowsPerSecond", 1000) 16 | .option("numPartitions", 4) 17 | .option("rampUpTime", 1) 18 | .load() 19 | 20 | val rateRawData = df.selectExpr("CAST(timestamp AS STRING)", "CAST(value AS string)") 21 | val onceStream = rateRawData.writeStream 22 | .format("console") 23 | .queryName("Once") 24 | .trigger(Trigger.Once()) 25 | .option("checkpointLocation", "sparkCheckPoint\\Rate2ConsoleOnceTrigger\\cp1") 26 | .start() 27 | 28 | 29 | spark.streams.awaitAnyTermination(100000) 30 | } 31 | -------------------------------------------------------------------------------- /SparkStreamingPOC/src/main/scala/streaming/trigger/Rate2ConsoleProgressTrigger.scala: -------------------------------------------------------------------------------- 1 | package streaming.trigger 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.streaming.Trigger 5 | 6 | object Rate2ConsoleProgressTrigger extends App { 7 | 8 | val spark: SparkSession = SparkSession.builder() 9 | .appName("Rate2ConsoleProgressTrigger") 10 | .master("local[*]") 11 | .getOrCreate() 12 | 13 | val df = spark.readStream 14 | .format("rate") 15 | .option("rowsPerSecond", 90000) 16 | // .option("numPartitions", 1) 17 | .option("rampUpTime", 1) 18 | .load() 19 | 20 | val rateRawData = df.selectExpr("CAST(timestamp AS STRING)", "CAST(value AS string)") 21 | val processingTimeStream = rateRawData.writeStream 22 | .format("console") 23 | .queryName("Micro Batch") 24 | .trigger(Trigger.ProcessingTime("20 seconds")) 25 | .option("checkpointLocation", "sparkCheckPoint\\Rate2ConsoleProgressTrigger\\cp1") 26 | .start() 27 | 28 | spark.streams.awaitAnyTermination() 29 | } 30 | -------------------------------------------------------------------------------- /SparkStreamingPOC/src/main/scala/streaming/trigger/Rate2ConsoleTriggerOptions.scala: -------------------------------------------------------------------------------- 1 | package streaming.trigger 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.streaming.Trigger 5 | 6 | object Rate2ConsoleTriggerOptions extends App { 7 | 8 | val spark: SparkSession = SparkSession.builder() 9 | .appName("Rate2ConsoleTriggerOptions") 10 | .master("local[*]") 11 | .getOrCreate() 12 | 13 | val df = spark.readStream 14 | .format("rate") 15 | .option("rowsPerSecond", 100000) 16 | .option("numPartitions", 1) 17 | .option("rampUpTime", 1) 18 | .load() 19 | 20 | val rateRawData = df.selectExpr("CAST(timestamp AS STRING)", "CAST(value AS string)") 21 | val defaultStream = rateRawData.writeStream 22 | .format("console") 23 | .queryName("Default") 24 | .option("checkpointLocation", "sparkCheckPoint\\Rate2ConsoleTriggerOptions\\cp1") 25 | .start() 26 | 27 | val onceStream = rateRawData.writeStream 28 | .format("console") 29 | .queryName("Once") 30 | .trigger(Trigger.Once()) 31 | .option("checkpointLocation", "sparkCheckPoint\\Rate2ConsoleTriggerOptions\\cp2") 32 | .start() 33 | 34 | val processingTimeStream = rateRawData.writeStream 35 | .format("console") 36 | .queryName("Micro Batch") 37 | .trigger(Trigger.ProcessingTime("20 seconds")) 38 | .option("checkpointLocation", "sparkCheckPoint\\Rate2ConsoleTriggerOptions\\cp3") 39 | .start() 40 | 41 | val continuousTimeStream =
rateRawData.writeStream 42 | .format("console") 43 | .queryName("Continuous") 44 | // .trigger(Trigger.Continuous("20 seconds")) 45 | .option("checkpointLocation", "sparkCheckPoint\\Rate2ConsoleTriggerOptions\\cp4") 46 | .start() 47 | 48 | defaultStream.awaitTermination() 49 | onceStream.awaitTermination() 50 | processingTimeStream.awaitTermination() 51 | continuousTimeStream.awaitTermination() 52 | // spark.streams.awaitAnyTermination() 53 | } 54 | -------------------------------------------------------------------------------- /SparkStreamingPOC/src/main/scala/streaming/unsupported/Rate2ConsoleMultiStream.scala: -------------------------------------------------------------------------------- 1 | package streaming.unsupported 2 | 3 | import org.apache.log4j.{Level, LogManager} 4 | import org.apache.spark.sql.SparkSession 5 | import org.apache.spark.sql.streaming.Trigger 6 | 7 | object Rate2ConsoleMultiStream extends App { 8 | 9 | val spark: SparkSession = SparkSession.builder() 10 | .appName("Rate2ConsoleMultiStream") 11 | .master("local[*]") 12 | .getOrCreate() 13 | 14 | val logger = LogManager.getRootLogger 15 | logger.setLevel(Level.ERROR) 16 | 17 | val df = spark.readStream 18 | .format("rate") 19 | .option("rowsPerSecond", 1) 20 | // .option("numPartitions", 1) 21 | .option("rampUpTime", 1) 22 | .load() 23 | 24 | val rateRawData = df.selectExpr("CAST(timestamp AS STRING)", "CAST(value AS string)") 25 | 26 | val transformedData = rateRawData.withColumn("key", rateRawData.col("timestamp").substr(15,2)) 27 | val groupedData = transformedData.groupBy("key").count() 28 | groupedData.createTempView("countTable") 29 | val countData = spark.sql("select count(*) from countTable") 30 | 31 | val defaultStream = countData.writeStream 32 | .format("console") 33 | .queryName("Complete Mode") 34 | .trigger(Trigger.ProcessingTime("10 seconds")) 35 | .outputMode("complete") 36 | .option("checkpointLocation", "sparkCheckPoint\\Rate2ConsoleMultiStream\\cp1") 37 | .start() 38 | 39 | spark.streams.awaitAnyTermination() 40 | } 41 | -------------------------------------------------------------------------------- /SparkStreamingPOC/src/main/scala/streaming/unsupported/UnsupportedFeatures.scala: -------------------------------------------------------------------------------- 1 | package streaming.unsupported 2 | 3 | import entity.RateData 4 | import org.apache.spark.sql.SparkSession 5 | import org.apache.spark.sql.functions._ 6 | import org.apache.spark.sql.streaming.Trigger 7 | 8 | object UnsupportedFeatures extends App { 9 | 10 | val spark: SparkSession = SparkSession.builder() 11 | .appName("UnsupportedFeatures") 12 | .master("local[*]") 13 | .getOrCreate() 14 | 15 | val df = spark.readStream 16 | .format("rate") 17 | .option("rowsPerSecond", 1) 18 | .option("numPartitions", 1) 19 | .option("rampUpTime", 1) 20 | .load() 21 | 22 | import spark.implicits._ 23 | 24 | val rateData = df.as[RateData] 25 | val employeeDS = rateData.where("value % 10 != 0") 26 | .withColumn("firstName", concat(lit("firstName"),rateData.col("value"))) 27 | .withColumn("lastName", concat(lit("lastName"),rateData.col("value"))) 28 | .withColumn("departmentId", lit(floor(rateData.col("value")/10))) 29 | // .withColumnRenamed("value", "id") 30 | 31 | val departmentDS = rateData.where("value % 10 == 0") 32 | .withColumn("name", concat(lit("name"),floor(rateData.col("value")/10))) 33 | .withColumn("id", lit(floor(rateData.col("value")/10))) 34 | .drop("value") 35 | 36 | val targetDS = departmentDS.join(employeeDS, $"id" === $"departmentId") 37 | 38 | // val targetDS =
departmentDS.join(employeeDS) 39 | 40 | val employeeStream = employeeDS.writeStream 41 | .format("console") 42 | .queryName("Employee") 43 | .trigger(Trigger.ProcessingTime("10 seconds")) 44 | .option("checkpointLocation", "sparkCheckPoint\\UnsupportedFeatures\\employee") 45 | .start() 46 | 47 | val departmentStream = departmentDS.writeStream 48 | .format("console") 49 | .queryName("Department") 50 | .trigger(Trigger.ProcessingTime("10 seconds")) 51 | .option("checkpointLocation", "sparkCheckPoint\\UnsupportedFeatures\\department") 52 | .start() 53 | 54 | 55 | val targetStream = targetDS.writeStream 56 | .format("console") 57 | .queryName("joinedTable") 58 | .trigger(Trigger.ProcessingTime("15 seconds")) 59 | .option("checkpointLocation", "sparkCheckPoint\\UnsupportedFeatures\\joinedTable") 60 | .start() 61 | 62 | spark.streams.awaitAnyTermination() 63 | } 64 | -------------------------------------------------------------------------------- /SparkStreamingPOC/src/main/scala/streaming/watermark/Rate2ConsoleWatermark.scala: -------------------------------------------------------------------------------- 1 | package streaming.watermark 2 | 3 | import org.apache.log4j.{Level, LogManager} 4 | import org.apache.spark.sql.SparkSession 5 | import org.apache.spark.sql.streaming.Trigger 6 | 7 | object Rate2ConsoleWatermark extends App { 8 | 9 | val spark: SparkSession = SparkSession.builder() 10 | .appName("Rate2ConsoleWatermark") 11 | .master("local[*]") 12 | .getOrCreate() 13 | 14 | val logger = LogManager.getRootLogger 15 | logger.setLevel(Level.ERROR) 16 | 17 | spark.conf.set("spark.sql.shuffle.partitions", "1") 18 | 19 | val df = spark.readStream 20 | .format("rate") 21 | .option("rowsPerSecond", 1) 22 | // .option("numPartitions", 1) 23 | .option("rampUpTime", 1) 24 | .load() 25 | 26 | val rateRawData = df.selectExpr("CAST(timestamp AS STRING)", "CAST(value AS string)") 27 | 28 | val transformedData = rateRawData.withColumn("key", rateRawData.col("timestamp").substr(15, 2)) 29 | val countData = transformedData.groupBy("key").count() 30 | 31 | val defaultStream = countData.writeStream 32 | .format("console") 33 | .queryName("Complete Mode") 34 | .trigger(Trigger.ProcessingTime("10 seconds")) 35 | .outputMode("complete") 36 | .option("checkpointLocation", "sparkCheckPoint\\Rate2ConsoleWatermark\\cp1") 37 | .start() 38 | 39 | spark.streams.awaitAnyTermination() 40 | } 41 | --------------------------------------------------------------------------------
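A note on Rate2ConsoleWatermark.scala above: despite its name, the query never calls withWatermark, so it is effectively the same complete-mode aggregation shown in the mode examples. As a rough sketch only, reusing the df defined in that file (the variable names, the cp2 checkpoint folder, and the 10-second/30-second durations are illustrative and not taken from this repository), a watermarked, windowed count over the rate source could look like this:

import org.apache.spark.sql.functions.{col, window}

// tolerate events arriving up to 10 seconds late; older state can then be dropped
val watermarked = df.withWatermark("timestamp", "10 seconds")

// count rows per 30-second event-time window instead of per timestamp substring
val windowedCounts = watermarked
  .groupBy(window(col("timestamp"), "30 seconds"))
  .count()

// append mode is allowed here because the watermark bounds the aggregation state
val watermarkStream = windowedCounts.writeStream
  .format("console")
  .queryName("Watermarked Window Counts")
  .outputMode("append")
  .trigger(Trigger.ProcessingTime("10 seconds"))
  .option("checkpointLocation", "sparkCheckPoint\\Rate2ConsoleWatermark\\cp2")
  .start()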