└── SparkStreamingPOC ├── build.sbt └── src └── main ├── resources ├── department.csv └── employee.csv └── scala ├── entity ├── entities.scala └── package.scala ├── listener ├── KafkaMetrics.scala └── SparkListenerKafkaWriter.scala ├── sink └── JDBCSink.scala └── streaming ├── MultiStreamTODO.scala ├── Rate2Console.scala ├── SparkListenerKafkaNotifier.scala ├── jdbc └── WriteToPostgress.scala ├── join ├── staticstream │ ├── staticStreamFullOuterJoin.scala │ ├── staticStreamInnerJoin.scala │ ├── staticStreamLeftOuterJoin.scala │ └── staticStreamRightOuterJoin.scala ├── streamstatic │ ├── streamStaticFullOuterJoin.scala │ ├── streamStaticInnerJoin.scala │ ├── streamStaticLeftOuterJoin.scala │ └── streamStaticRightOuterJoin.scala └── streamstream │ ├── streamStreamFullOuterJoin.scala │ ├── streamStreamInnerJoin.scala │ ├── streamStreamLeftOuterJoin.scala │ └── streamStreamRightOuterJoin.scala ├── kafka ├── Kafka2Kafka.scala ├── Rate2Kafka.scala ├── Rate2KafkaMultiStream.scala └── Rate2KafkaSparkListener.scala ├── mode ├── Rate2ConsoleAggregateAppendMode.scala ├── Rate2ConsoleAggregateCompleteMode.scala ├── Rate2ConsoleAggregateUpdateMode.scala ├── Rate2ConsoleCompleteMode.scala ├── Rate2ConsoleDefaultMode.scala └── Rate2ConsoleUpdateMode.scala ├── state ├── Rate2SparkState.scala └── manageStateHelper.scala ├── trigger ├── Rate2ConsoleContinuousTrigger.scala ├── Rate2ConsoleDefaultTrigger.scala ├── Rate2ConsoleOnceTrigger.scala ├── Rate2ConsoleProgressTrigger.scala └── Rate2ConsoleTriggerOptions.scala ├── unsupported ├── Rate2ConsoleMultiStream.scala └── UnsupportedFeatures.scala └── watermark └── Rate2ConsoleWatermark.scala /SparkStreamingPOC/build.sbt: -------------------------------------------------------------------------------- 1 | name := "SparkStreamingSample" 2 | 3 | version := "0.1" 4 | 5 | scalaVersion := "2.11.11" 6 | 7 | // grading libraries 8 | libraryDependencies ++= Seq( 9 | "log4j" % "log4j" % "1.2.14", 10 | "org.apache.kafka" % "kafka-clients" % "0.8.2.0", 11 | "org.apache.spark" %% "spark-core" % "2.4.0", 12 | "org.apache.spark" %% "spark-sql" % "2.4.0", 13 | "org.apache.spark" %% "spark-sql-kafka-0-10" % "2.4.0" 14 | 15 | ) -------------------------------------------------------------------------------- /SparkStreamingPOC/src/main/resources/department.csv: -------------------------------------------------------------------------------- 1 | id,name 2 | 1,dept1 3 | 2,dept2 4 | 3,dept3 5 | 4,dept4 6 | 5,dept5 7 | 6,dept6 8 | 7,dept7 9 | 8,dept8 10 | 9,dept9 11 | 10,dept10 12 | 11,dept11 13 | -------------------------------------------------------------------------------- /SparkStreamingPOC/src/main/resources/employee.csv: -------------------------------------------------------------------------------- 1 | id,firstName,lastName,departmentId 2 | 1,firstName1,lastName1,1 3 | 2,firstName2,lastName2,1 4 | 3,firstName3,lastName3,1 5 | 4,firstName4,lastName4,1 6 | 5,firstName5,lastName5,1 7 | 6,firstName6,lastName6,2 8 | 7,firstName7,lastName7,2 9 | 8,firstName8,lastName8,2 10 | 9,firstName9,lastName9,2 11 | 10,firstName10,lastName10,2 12 | 11,firstName11,lastName11,2 13 | -------------------------------------------------------------------------------- /SparkStreamingPOC/src/main/scala/entity/entities.scala: -------------------------------------------------------------------------------- 1 | package entity 2 | 3 | import java.sql.Timestamp 4 | 5 | case class RateData(timestamp: Timestamp, value: Long) 6 | case class Employee(id: Long, firstName: String, lastName: String, eventTime: 
Timestamp) 7 | case class Department(id: Long, name: String, eventTime: Timestamp) 8 | -------------------------------------------------------------------------------- /SparkStreamingPOC/src/main/scala/entity/package.scala: -------------------------------------------------------------------------------- 1 | package object constant { 2 | val sampleJSON:String = "{\"web-app\": {\n \"servlet\": [ \n {\n \"servlet-name\": \"cofaxCDS\",\n \"servlet-class\": \"org.cofax.cds.CDSServlet\",\n \"init-param\": {\n \"configGlossary:installationAt\": \"Philadelphia, PA\",\n \"configGlossary:adminEmail\": \"ksm@pobox.com\",\n \"configGlossary:poweredBy\": \"Cofax\",\n \"configGlossary:poweredByIcon\": \"/images/cofax.gif\",\n \"configGlossary:staticPath\": \"/content/static\",\n \"templateProcessorClass\": \"org.cofax.WysiwygTemplate\",\n \"templateLoaderClass\": \"org.cofax.FilesTemplateLoader\",\n \"templatePath\": \"templates\",\n \"templateOverridePath\": \"\",\n \"defaultListTemplate\": \"listTemplate.htm\",\n \"defaultFileTemplate\": \"articleTemplate.htm\",\n \"useJSP\": false,\n \"jspListTemplate\": \"listTemplate.jsp\",\n \"jspFileTemplate\": \"articleTemplate.jsp\",\n \"cachePackageTagsTrack\": 200,\n \"cachePackageTagsStore\": 200,\n \"cachePackageTagsRefresh\": 60,\n \"cacheTemplatesTrack\": 100,\n \"cacheTemplatesStore\": 50,\n \"cacheTemplatesRefresh\": 15,\n \"cachePagesTrack\": 200,\n \"cachePagesStore\": 100,\n \"cachePagesRefresh\": 10,\n \"cachePagesDirtyRead\": 10,\n \"searchEngineListTemplate\": \"forSearchEnginesList.htm\",\n \"searchEngineFileTemplate\": \"forSearchEngines.htm\",\n \"searchEngineRobotsDb\": \"WEB-INF/robots.db\",\n \"useDataStore\": true,\n \"dataStoreClass\": \"org.cofax.SqlDataStore\",\n \"redirectionClass\": \"org.cofax.SqlRedirection\",\n \"dataStoreName\": \"cofax\",\n \"dataStoreDriver\": \"com.microsoft.jdbc.sqlserver.SQLServerDriver\",\n \"dataStoreUrl\": \"jdbc:microsoft:sqlserver://LOCALHOST:1433;DatabaseName=goon\",\n \"dataStoreUser\": \"sa\",\n \"dataStorePassword\": \"dataStoreTestQuery\",\n \"dataStoreTestQuery\": \"SET NOCOUNT ON;select test='test';\",\n \"dataStoreLogFile\": \"/usr/local/tomcat/logs/datastore.log\",\n \"dataStoreInitConns\": 10,\n \"dataStoreMaxConns\": 100,\n \"dataStoreConnUsageLimit\": 100,\n \"dataStoreLogLevel\": \"debug\",\n \"maxUrlLength\": 500}},\n {\n \"servlet-name\": \"cofaxEmail\",\n \"servlet-class\": \"org.cofax.cds.EmailServlet\",\n \"init-param\": {\n \"mailHost\": \"mail1\",\n \"mailHostOverride\": \"mail2\"}},\n {\n \"servlet-name\": \"cofaxAdmin\",\n \"servlet-class\": \"org.cofax.cds.AdminServlet\"},\n \n {\n \"servlet-name\": \"fileServlet\",\n \"servlet-class\": \"org.cofax.cds.FileServlet\"},\n {\n \"servlet-name\": \"cofaxTools\",\n \"servlet-class\": \"org.cofax.cms.CofaxToolsServlet\",\n \"init-param\": {\n \"templatePath\": \"toolstemplates/\",\n \"log\": 1,\n \"logLocation\": \"/usr/local/tomcat/logs/CofaxTools.log\",\n \"logMaxSize\": \"\",\n \"dataLog\": 1,\n \"dataLogLocation\": \"/usr/local/tomcat/logs/dataLog.log\",\n \"dataLogMaxSize\": \"\",\n \"removePageCache\": \"/content/admin/remove?cache=pages&id=\",\n \"removeTemplateCache\": \"/content/admin/remove?cache=templates&id=\",\n \"fileTransferFolder\": \"/usr/local/tomcat/webapps/content/fileTransferFolder\",\n \"lookInContext\": 1,\n \"adminGroupID\": 4,\n \"betaServer\": true}}],\n \"servlet-mapping\": {\n \"cofaxCDS\": \"/\",\n \"cofaxEmail\": \"/cofaxutil/aemail/*\",\n \"cofaxAdmin\": \"/admin/*\",\n \"fileServlet\": \"/static/*\",\n 
\"cofaxTools\": \"/tools/*\"},\n \n \"taglib\": {\n \"taglib-uri\": \"cofax.tld\",\n \"taglib-location\": \"/WEB-INF/tlds/cofax.tld\"}}}" 3 | 4 | } 5 | -------------------------------------------------------------------------------- /SparkStreamingPOC/src/main/scala/listener/KafkaMetrics.scala: -------------------------------------------------------------------------------- 1 | package listener 2 | 3 | import java.util.Properties 4 | 5 | import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord} 6 | import org.apache.kafka.common.serialization.StringSerializer 7 | import org.apache.spark.sql.streaming.StreamingQueryListener 8 | 9 | class KafkaMetrics(servers: String, metricsTopic: String, errorTopic: String) extends StreamingQueryListener { 10 | 11 | val kafkaProperties = new Properties() 12 | kafkaProperties.put("bootstrap.servers", servers) 13 | kafkaProperties.put("key.serializer", classOf[StringSerializer]) 14 | kafkaProperties.put("value.serializer", classOf[StringSerializer]) 15 | 16 | val producer = new KafkaProducer[String, String](kafkaProperties) 17 | 18 | def onQueryProgress(event: org.apache.spark.sql.streaming.StreamingQueryListener.QueryProgressEvent): Unit = { 19 | producer.send(new ProducerRecord(metricsTopic, event.progress.json)) 20 | } 21 | def onQueryStarted(event: org.apache.spark.sql.streaming.StreamingQueryListener.QueryStartedEvent): Unit = {} 22 | def onQueryTerminated(event: org.apache.spark.sql.streaming.StreamingQueryListener.QueryTerminatedEvent): Unit = { 23 | producer.send(new ProducerRecord(errorTopic, event.exception.get)) 24 | } 25 | } -------------------------------------------------------------------------------- /SparkStreamingPOC/src/main/scala/listener/SparkListenerKafkaWriter.scala: -------------------------------------------------------------------------------- 1 | package listener 2 | 3 | import java.util.Properties 4 | 5 | import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord} 6 | import org.apache.kafka.common.serialization.StringSerializer 7 | import org.apache.spark.scheduler._ 8 | 9 | class SparkListenerKafkaWriter(servers: String, metricsTopic: String, errorTopic: String) extends SparkListener { 10 | 11 | val kafkaProperties = new Properties() 12 | kafkaProperties.put("bootstrap.servers", servers) 13 | kafkaProperties.put("key.serializer", classOf[StringSerializer]) 14 | kafkaProperties.put("value.serializer", classOf[StringSerializer]) 15 | 16 | val producer = new KafkaProducer[String, String](kafkaProperties) 17 | 18 | override def onStageCompleted(stageCompleted: SparkListenerStageCompleted): Unit = { 19 | super.onStageCompleted(stageCompleted) 20 | // stageCompleted.stageInfo.taskMetrics. 
21 | // println("") 22 | } 23 | 24 | override def onStageSubmitted(stageSubmitted: SparkListenerStageSubmitted): Unit = super.onStageSubmitted(stageSubmitted) 25 | 26 | override def onTaskStart(taskStart: SparkListenerTaskStart): Unit = super.onTaskStart(taskStart) 27 | 28 | override def onTaskGettingResult(taskGettingResult: SparkListenerTaskGettingResult): Unit = super.onTaskGettingResult(taskGettingResult) 29 | 30 | override def onTaskEnd(taskEnd: SparkListenerTaskEnd): Unit = super.onTaskEnd(taskEnd) 31 | 32 | override def onJobStart(jobStart: SparkListenerJobStart): Unit = super.onJobStart(jobStart) 33 | 34 | override def onJobEnd(jobEnd: SparkListenerJobEnd): Unit = super.onJobEnd(jobEnd) 35 | 36 | override def onEnvironmentUpdate(environmentUpdate: SparkListenerEnvironmentUpdate): Unit = super.onEnvironmentUpdate(environmentUpdate) 37 | 38 | override def onBlockManagerAdded(blockManagerAdded: SparkListenerBlockManagerAdded): Unit = super.onBlockManagerAdded(blockManagerAdded) 39 | 40 | override def onBlockManagerRemoved(blockManagerRemoved: SparkListenerBlockManagerRemoved): Unit = super.onBlockManagerRemoved(blockManagerRemoved) 41 | 42 | override def onUnpersistRDD(unpersistRDD: SparkListenerUnpersistRDD): Unit = super.onUnpersistRDD(unpersistRDD) 43 | 44 | override def onApplicationStart(applicationStart: SparkListenerApplicationStart): Unit = super.onApplicationStart(applicationStart) 45 | 46 | override def onApplicationEnd(applicationEnd: SparkListenerApplicationEnd): Unit = super.onApplicationEnd(applicationEnd) 47 | 48 | override def onExecutorMetricsUpdate(executorMetricsUpdate: SparkListenerExecutorMetricsUpdate): Unit = super.onExecutorMetricsUpdate(executorMetricsUpdate) 49 | 50 | override def onExecutorAdded(executorAdded: SparkListenerExecutorAdded): Unit = super.onExecutorAdded(executorAdded) 51 | 52 | override def onExecutorRemoved(executorRemoved: SparkListenerExecutorRemoved): Unit = super.onExecutorRemoved(executorRemoved) 53 | 54 | override def onExecutorBlacklisted(executorBlacklisted: SparkListenerExecutorBlacklisted): Unit = super.onExecutorBlacklisted(executorBlacklisted) 55 | 56 | override def onExecutorUnblacklisted(executorUnblacklisted: SparkListenerExecutorUnblacklisted): Unit = super.onExecutorUnblacklisted(executorUnblacklisted) 57 | 58 | override def onNodeBlacklisted(nodeBlacklisted: SparkListenerNodeBlacklisted): Unit = super.onNodeBlacklisted(nodeBlacklisted) 59 | 60 | override def onNodeUnblacklisted(nodeUnblacklisted: SparkListenerNodeUnblacklisted): Unit = super.onNodeUnblacklisted(nodeUnblacklisted) 61 | 62 | override def onBlockUpdated(blockUpdated: SparkListenerBlockUpdated): Unit = super.onBlockUpdated(blockUpdated) 63 | 64 | override def onOtherEvent(event: SparkListenerEvent): Unit = super.onOtherEvent(event) 65 | } -------------------------------------------------------------------------------- /SparkStreamingPOC/src/main/scala/sink/JDBCSink.scala: -------------------------------------------------------------------------------- 1 | package sink 2 | 3 | import java.sql._ 4 | 5 | import org.apache.spark.sql.ForeachWriter 6 | 7 | class JDBCSink(driver: String, url: String, user: String, pwd: String) extends ForeachWriter[org.apache.spark.sql.Row] { 8 | var connection:Connection = _ 9 | var preparedStmt: PreparedStatement=_ 10 | 11 | override def open(partitionId: Long,version: Long): Boolean = { 12 | Class.forName(driver) 13 | connection = DriverManager.getConnection(url, user, pwd) 14 | true 15 | } 16 | 17 | override def process(value: 
(org.apache.spark.sql.Row)): Unit = { 18 | 19 | println("value Size" + value.size) 20 | println("value :: " + value) 21 | println("Value at 0 index :: " + value(0)) 22 | println(value(0).toString.split(",").length) 23 | var name=value(0).toString.split(",")(0) 24 | var dep=value(0).toString.split(",")(1) 25 | var mail=value(0).toString.split(",")(2) 26 | 27 | var sql :String =s"""INSERT INTO public.Employee(NAME,DEPARTMENT,MAIL) 28 | VALUES (?,?,?)"""; 29 | 30 | preparedStmt=connection.prepareStatement(sql) 31 | preparedStmt.setString(1,name) 32 | preparedStmt.setString(2,dep) 33 | preparedStmt.setString(3,mail) 34 | preparedStmt.execute() 35 | /*/*statement = connection.createStatement 36 | statement.execute(s"""INSERT INTO public.Employee(NAME,DEPARTMENT,MAIL)*/ 37 | VALUES ('wq','b','c')""")*/ 38 | 39 | 40 | } 41 | 42 | override def close(errorOrNull: Throwable): Unit = { 43 | connection.close 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /SparkStreamingPOC/src/main/scala/streaming/MultiStreamTODO.scala: -------------------------------------------------------------------------------- 1 | package streaming 2 | 3 | import entity.RateData 4 | import listener.KafkaMetrics 5 | import org.apache.log4j.{Level, LogManager} 6 | import org.apache.spark.sql.SparkSession 7 | import org.apache.spark.sql.streaming.Trigger 8 | 9 | 10 | object MultiStreamTODO extends App { 11 | 12 | val spark: SparkSession = SparkSession.builder() 13 | .appName("StreamingListenerKafkaNotifier") 14 | .master("local[*]") 15 | .config("spark.sql.streaming.metricsEnabled", true) 16 | .getOrCreate() 17 | 18 | val logger = LogManager.getRootLogger 19 | logger.setLevel(Level.ERROR) 20 | 21 | 22 | val df = spark.readStream 23 | .format("rate") 24 | .option("rowsPerSecond", 1) 25 | .option("numPartitions", 1) 26 | .option("rampUpTime", 2) 27 | .load() 28 | 29 | import spark.implicits._ 30 | 31 | val rateData = df.as[RateData] 32 | val filteredDS = rateData.where("value < 20") 33 | val greaterThanDS = rateData.where("value > 21") 34 | 35 | val errorDS = greaterThanDS.where("value > 30") 36 | .map(triggerException(_)) 37 | 38 | val stringData = filteredDS.selectExpr("CAST(timestamp AS String)", "CAST(value AS String)") 39 | 40 | val kafkaWriteStream1 = stringData.writeStream 41 | .format("kafka") 42 | .queryName("First Kafka Stream") 43 | .option("topic", "test2") 44 | .option("checkpointLocation", "sparkCheckPoint\\StreamingListenerKafkaNotifier\\cp1") 45 | .option("kafka.bootstrap.servers", "localhost:9092") 46 | .trigger(Trigger.ProcessingTime("10 seconds")) 47 | .start() 48 | 49 | val consoleDS = errorDS.selectExpr("CAST(timestamp AS String)", "CAST(value AS String)") 50 | 51 | consoleDS.writeStream.format("console") 52 | .queryName("Console stream") 53 | .trigger(Trigger.ProcessingTime("10 seconds")) 54 | .start() 55 | 56 | spark.streams.addListener(new KafkaMetrics("localhost:9092", "streamingMetrics", "streamingTermination")) 57 | spark.streams.awaitAnyTermination() 58 | 59 | def triggerException(rateData: RateData): RateData = { 60 | throw new Exception() 61 | rateData 62 | } 63 | } -------------------------------------------------------------------------------- /SparkStreamingPOC/src/main/scala/streaming/Rate2Console.scala: -------------------------------------------------------------------------------- 1 | package streaming 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.streaming.Trigger 5 | 6 | object Rate2Console extends App { 7 | 
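// Rate source to console sink: generates 2 rows per second over 2 partitions and prints
// each 10-second micro-batch to the console, checkpointing under sparkCheckPoint\Rate2Console.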
8 | val spark: SparkSession = SparkSession.builder() 9 | .appName("Rate2Console") 10 | .master("local[*]") 11 | .getOrCreate() 12 | 13 | val df = spark.readStream 14 | .format("rate") 15 | .option("rowsPerSecond", 2) 16 | .option("numPartitions", 2) 17 | .option("rampUpTime", 1) 18 | .load() 19 | 20 | val rateRawData = df.selectExpr("CAST(timestamp AS STRING)", "CAST(value AS string)") 21 | 22 | val processingTimeStream = rateRawData.writeStream 23 | .format("console") 24 | .queryName("Micro Batch") 25 | .trigger(Trigger.ProcessingTime("10 seconds")) 26 | .option("checkpointLocation", "sparkCheckPoint\\Rate2Console\\cp1") 27 | .start() 28 | 29 | spark.streams.awaitAnyTermination() 30 | } 31 | -------------------------------------------------------------------------------- /SparkStreamingPOC/src/main/scala/streaming/SparkListenerKafkaNotifier.scala: -------------------------------------------------------------------------------- 1 | package streaming 2 | 3 | import java.sql.Timestamp 4 | 5 | import listener.KafkaMetrics 6 | import org.apache.log4j.{Level, LogManager} 7 | import org.apache.spark.sql.SparkSession 8 | import org.apache.spark.sql.streaming.Trigger 9 | 10 | object SparkListenerKafkaNotifier extends App { 11 | 12 | val spark: SparkSession = SparkSession.builder() 13 | .appName("StreamingListenerKafkaNotifier") 14 | .master("local[*]") 15 | .config("spark.sql.streaming.metricsEnabled", true) 16 | .getOrCreate() 17 | 18 | case class RateData(timestamp: Timestamp, value: Long) 19 | 20 | val rawDF = spark.readStream 21 | .format("rate") 22 | .option("rowsPerSecond", 1) 23 | .option("numPartitions", 1) 24 | .option("rampUpTime", 2) 25 | .load() 26 | 27 | val rateDF = rawDF.selectExpr("CAST(timestamp AS String)", "CAST(value AS String)") 28 | 29 | val kafkaWriteStream = rateDF.writeStream 30 | .format("kafka") 31 | .queryName("First Kafka Stream") 32 | .option("topic", "test2") 33 | .option("checkpointLocation", "sparkCheckPoint\\StreamingListenerKafkaNotifier\\cp1") 34 | .option("kafka.bootstrap.servers", "localhost:9092") 35 | .trigger(Trigger.ProcessingTime("10 seconds")) 36 | .start() 37 | 38 | spark.streams.addListener(new KafkaMetrics("localhost:9092", "streamingMetrics", "streamingTermination")) 39 | spark.streams.awaitAnyTermination() 40 | } -------------------------------------------------------------------------------- /SparkStreamingPOC/src/main/scala/streaming/jdbc/WriteToPostgress.scala: -------------------------------------------------------------------------------- 1 | package streaming.jdbc 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import sink.JDBCSink 5 | 6 | object WriteToPostgreSQL extends App{ 7 | 8 | 9 | 10 | case class InputData(name:String, department:String, mail:String) 11 | 12 | val spark : SparkSession = SparkSession.builder() 13 | .appName("WriteToPostgresSQL") 14 | .master("local[*]") 15 | .getOrCreate() 16 | 17 | val df = spark.readStream.format("kafka") 18 | .option("kafka.bootstrap.servers", "localhost:9092") 19 | .option("subscribe", "ci.etl.currencyCode") 20 | .load() 21 | 22 | def splitfunc(input:String):String ={ 23 | input 24 | } 25 | 26 | val records=df.selectExpr("CAST(value AS string)") 27 | 28 | //println("Sassas" + records) 29 | 30 | val driver = "" 31 | val url: String="jdbc:postgresql://localhost:5432/sathish" 32 | val userName:String="postgres" 33 | val passWord:String="root" 34 | val writer = new JDBCSink(driver, url, userName, passWord) 35 | 36 | records.writeStream.foreach(writer).start().awaitTermination() 37 | 38 | } 39 | 
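// Note: `driver` above is an empty string, so Class.forName(driver) inside JDBCSink.open()
// throws ClassNotFoundException at runtime. For the jdbc:postgresql URL used here, the usual
// class name would be "org.postgresql.Driver" (assumption: the PostgreSQL JDBC driver would
// also need to be added as a dependency, which build.sbt does not currently declare).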
-------------------------------------------------------------------------------- /SparkStreamingPOC/src/main/scala/streaming/join/staticstream/staticStreamFullOuterJoin.scala: -------------------------------------------------------------------------------- 1 | package streaming.join.staticstream 2 | 3 | import entity.RateData 4 | import org.apache.spark.sql.SparkSession 5 | import org.apache.spark.sql.functions._ 6 | import org.apache.spark.sql.streaming.Trigger 7 | 8 | object staticStreamFullOuterJoin extends App { 9 | 10 | val spark: SparkSession = SparkSession.builder() 11 | .appName("StaticStreamFullOuterJoin") 12 | .master("local[*]") 13 | .getOrCreate() 14 | 15 | val df = spark.readStream 16 | .format("rate") 17 | .option("rowsPerSecond", 1) 18 | .option("numPartitions", 1) 19 | .option("rampUpTime", 1) 20 | .load() 21 | 22 | import spark.implicits._ 23 | 24 | val rateData = df.as[RateData] 25 | val streamingEmployeeDS = rateData.where("value % 10 != 0") 26 | .withColumn("firstName", concat(lit("firstName"), rateData.col("value"))) 27 | .withColumn("lastName", concat(lit("lastName"), rateData.col("value"))) 28 | .withColumn("departmentId", lit(floor(rateData.col("value") / 10))) 29 | // .withColumnRenamed("value", "id") 30 | 31 | val staticDepartmentDS = spark.read.format("csv").option("header", "true").load("src/main/resources/department.csv") 32 | 33 | val fullOuterJoinDS = staticDepartmentDS.join(streamingEmployeeDS, $"id" === $"departmentId", "full_outer") 34 | 35 | val fullOuterJoinStream = fullOuterJoinDS.writeStream 36 | .format("console") 37 | .queryName("InnerJoin") 38 | .trigger(Trigger.ProcessingTime("10 seconds")) 39 | .option("checkpointLocation", "sparkCheckPoint\\StaticStreamFullOuterJoin\\cp1") 40 | .start() 41 | } 42 | -------------------------------------------------------------------------------- /SparkStreamingPOC/src/main/scala/streaming/join/staticstream/staticStreamInnerJoin.scala: -------------------------------------------------------------------------------- 1 | package streaming.join.staticstream 2 | 3 | import entity.RateData 4 | import org.apache.log4j.{Level, LogManager} 5 | import org.apache.spark.sql.SparkSession 6 | import org.apache.spark.sql.functions._ 7 | import org.apache.spark.sql.streaming.Trigger 8 | import streaming.join.streamstream.streamStreamLeftOuterJoin.spark 9 | 10 | object staticStreamInnerJoin extends App { 11 | 12 | val spark: SparkSession = SparkSession.builder() 13 | .appName("StaticStreamInnerJoin") 14 | .master("local[*]") 15 | .getOrCreate() 16 | 17 | spark.conf.set("spark.sql.shuffle.partitions", "1") 18 | 19 | val logger = LogManager.getRootLogger 20 | logger.setLevel(Level.ERROR) 21 | 22 | val df = spark.readStream 23 | .format("rate") 24 | .option("rowsPerSecond", 1) 25 | .option("numPartitions", 1) 26 | .option("rampUpTime", 1) 27 | .load() 28 | 29 | import spark.implicits._ 30 | 31 | val rateData = df.as[RateData] 32 | val streamingEmployeeDS = rateData.where("value % 10 != 0") 33 | .withColumn("firstName", concat(lit("firstName"),rateData.col("value"))) 34 | .withColumn("lastName", concat(lit("lastName"),rateData.col("value"))) 35 | .withColumn("departmentId", lit(floor(rateData.col("value")/10))) 36 | 37 | val staticDepartmentDS = spark.read.format("csv").option("header","true").load("src/main/resources/department.csv") 38 | 39 | val innerJoinDS = staticDepartmentDS.join(streamingEmployeeDS, $"id" === $"departmentId") 40 | 41 | val innerJoinStream = innerJoinDS.writeStream 42 | .format("console") 43 | 
.queryName("InnerJoin") 44 | .trigger(Trigger.ProcessingTime("20 seconds")) 45 | .option("checkpointLocation", "sparkCheckPoint\\StaticStreamInnerJoin\\cp1") 46 | .start() 47 | 48 | spark.streams.awaitAnyTermination() 49 | } 50 | -------------------------------------------------------------------------------- /SparkStreamingPOC/src/main/scala/streaming/join/staticstream/staticStreamLeftOuterJoin.scala: -------------------------------------------------------------------------------- 1 | package streaming.join.staticstream 2 | 3 | import entity.RateData 4 | import org.apache.spark.sql.SparkSession 5 | import org.apache.spark.sql.functions._ 6 | import org.apache.spark.sql.streaming.Trigger 7 | 8 | object staticStreamLeftOuterJoin extends App { 9 | 10 | val spark: SparkSession = SparkSession.builder() 11 | .appName("StaticStreamLeftOuterJoin") 12 | .master("local[*]") 13 | .getOrCreate() 14 | 15 | val df = spark.readStream 16 | .format("rate") 17 | .option("rowsPerSecond", 1) 18 | .option("numPartitions", 1) 19 | .option("rampUpTime", 1) 20 | .load() 21 | 22 | import spark.implicits._ 23 | 24 | val rateData = df.as[RateData] 25 | val streamingEmployeeDS = rateData.where("value % 10 != 0") 26 | .withColumn("firstName", concat(lit("firstName"),rateData.col("value"))) 27 | .withColumn("lastName", concat(lit("lastName"),rateData.col("value"))) 28 | .withColumn("departmentId", lit(floor(rateData.col("value")/10))) 29 | // .withColumnRenamed("value", "id") 30 | 31 | val staticDepartmentDS = spark.read.format("csv").option("header","true").load("src/main/resources/department.csv") 32 | 33 | val leftOuterJoinDS = staticDepartmentDS.join(streamingEmployeeDS, $"id" === $"departmentId", "left_outer") 34 | 35 | val leftOuterJoinStream = leftOuterJoinDS.writeStream 36 | .format("console") 37 | .queryName("LeftOuterJoin") 38 | .trigger(Trigger.ProcessingTime("10 seconds")) 39 | .option("checkpointLocation", "sparkCheckPoint\\StaticStreamLeftOuterJoin\\cp2") 40 | .start() 41 | 42 | spark.streams.awaitAnyTermination() 43 | } 44 | -------------------------------------------------------------------------------- /SparkStreamingPOC/src/main/scala/streaming/join/staticstream/staticStreamRightOuterJoin.scala: -------------------------------------------------------------------------------- 1 | package streaming.join.staticstream 2 | 3 | import entity.RateData 4 | import org.apache.spark.sql.SparkSession 5 | import org.apache.spark.sql.functions._ 6 | import org.apache.spark.sql.streaming.Trigger 7 | 8 | object staticStreamRightOuterJoin extends App { 9 | 10 | val spark: SparkSession = SparkSession.builder() 11 | .appName("StaticStreamRightOuterJoin") 12 | .master("local[*]") 13 | .getOrCreate() 14 | 15 | val df = spark.readStream 16 | .format("rate") 17 | .option("rowsPerSecond", 1) 18 | .option("numPartitions", 1) 19 | .option("rampUpTime", 1) 20 | .load() 21 | 22 | import spark.implicits._ 23 | 24 | val rateData = df.as[RateData] 25 | val streamingEmployeeDS = rateData.where("value % 10 != 0") 26 | .withColumn("firstName", concat(lit("firstName"),rateData.col("value"))) 27 | .withColumn("lastName", concat(lit("lastName"),rateData.col("value"))) 28 | .withColumn("departmentId", lit(floor(rateData.col("value")/10))) 29 | // .withColumnRenamed("value", "id") 30 | 31 | val staticDepartmentDS = spark.read.format("csv").option("header","true").load("src/main/resources/department.csv") 32 | 33 | val rightOuterJoinDS = staticDepartmentDS 34 | .join(streamingEmployeeDS, $"id" === $"departmentId", "right_outer") 35 | 36 | 
val rightOuterJoinStream = rightOuterJoinDS.writeStream 37 | .format("console") 38 | .queryName("InnerJoin") 39 | .trigger(Trigger.ProcessingTime("10 seconds")) 40 | .option("checkpointLocation", "sparkCheckPoint\\StaticStreamRightOuterJoin\\cp1") 41 | .start() 42 | 43 | spark.streams.awaitAnyTermination() 44 | } 45 | -------------------------------------------------------------------------------- /SparkStreamingPOC/src/main/scala/streaming/join/streamstatic/streamStaticFullOuterJoin.scala: -------------------------------------------------------------------------------- 1 | package streaming.join.streamstatic 2 | 3 | import entity.RateData 4 | import org.apache.spark.sql.SparkSession 5 | import org.apache.spark.sql.functions._ 6 | import org.apache.spark.sql.streaming.Trigger 7 | 8 | object streamStaticFullOuterJoin extends App { 9 | 10 | val spark: SparkSession = SparkSession.builder() 11 | .appName("streamStaticFullOuterJoin") 12 | .master("local[*]") 13 | .getOrCreate() 14 | 15 | val df = spark.readStream 16 | .format("rate") 17 | .option("rowsPerSecond", 1) 18 | .option("numPartitions", 1) 19 | .option("rampUpTime", 1) 20 | .load() 21 | 22 | import spark.implicits._ 23 | 24 | val rateData = df.as[RateData] 25 | val streamingEmployeeDS = rateData.where("value % 10 != 0") 26 | .withColumn("firstName", concat(lit("firstName"),rateData.col("value"))) 27 | .withColumn("lastName", concat(lit("lastName"),rateData.col("value"))) 28 | .withColumn("departmentId", lit(floor(rateData.col("value")/10))) 29 | // .withColumnRenamed("value", "id") 30 | 31 | val staticDepartmentDS = spark.read.format("csv").option("header","true").load("src/main/resources/department.csv") 32 | 33 | val fullOuterJoinDS = streamingEmployeeDS.join(staticDepartmentDS, $"departmentId" === $"id", "full_outer") 34 | 35 | val fullOuterJoinStream = fullOuterJoinDS.writeStream 36 | .format("console") 37 | .queryName("InnerJoin") 38 | .trigger(Trigger.ProcessingTime("10 seconds")) 39 | .option("checkpointLocation", "sparkCheckPoint\\streamStaticFullOuterJoin\\cp1") 40 | .start() 41 | 42 | 43 | } 44 | -------------------------------------------------------------------------------- /SparkStreamingPOC/src/main/scala/streaming/join/streamstatic/streamStaticInnerJoin.scala: -------------------------------------------------------------------------------- 1 | package streaming.join.streamstatic 2 | 3 | import entity.RateData 4 | import org.apache.spark.sql.SparkSession 5 | import org.apache.spark.sql.functions._ 6 | import org.apache.spark.sql.streaming.Trigger 7 | 8 | object streamStaticInnerJoin extends App { 9 | 10 | val spark: SparkSession = SparkSession.builder() 11 | .appName("streamStaticInnerJoin") 12 | .master("local[*]") 13 | .getOrCreate() 14 | 15 | val df = spark.readStream 16 | .format("rate") 17 | .option("rowsPerSecond", 1) 18 | .option("numPartitions", 1) 19 | .option("rampUpTime", 1) 20 | .load() 21 | 22 | import spark.implicits._ 23 | 24 | val rateData = df.as[RateData] 25 | val streamingEmployeeDS = rateData.where("value % 10 != 0") 26 | .withColumn("firstName", concat(lit("firstName"),rateData.col("value"))) 27 | .withColumn("lastName", concat(lit("lastName"),rateData.col("value"))) 28 | .withColumn("departmentId", lit(floor(rateData.col("value")/10))) 29 | // .withColumnRenamed("value", "id") 30 | 31 | val staticDepartmentDS = spark.read.format("csv").option("header","true").load("src/main/resources/department.csv") 32 | 33 | val innerJoinDS = streamingEmployeeDS.join(staticDepartmentDS, $"departmentId" === $"id") 
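// Stream-static inner joins are stateless: the static department CSV is joined against
// every micro-batch of the rate-based employee stream, so no watermark is required here.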
34 | 35 | val innerJoinStream = innerJoinDS.writeStream 36 | .format("console") 37 | .queryName("InnerJoin") 38 | .trigger(Trigger.ProcessingTime("10 seconds")) 39 | .option("checkpointLocation", "sparkCheckPoint\\streamStaticInnerJoin\\cp1") 40 | .start() 41 | 42 | spark.streams.awaitAnyTermination() 43 | } 44 | -------------------------------------------------------------------------------- /SparkStreamingPOC/src/main/scala/streaming/join/streamstatic/streamStaticLeftOuterJoin.scala: -------------------------------------------------------------------------------- 1 | package streaming.join.streamstatic 2 | 3 | import entity.RateData 4 | import org.apache.spark.sql.SparkSession 5 | import org.apache.spark.sql.functions._ 6 | import org.apache.spark.sql.streaming.Trigger 7 | 8 | object streamStaticLeftOuterJoin extends App { 9 | 10 | val spark: SparkSession = SparkSession.builder() 11 | .appName("streamStaticLeftOuterJoin") 12 | .master("local[*]") 13 | .getOrCreate() 14 | 15 | val df = spark.readStream 16 | .format("rate") 17 | .option("rowsPerSecond", 1) 18 | .option("numPartitions", 1) 19 | .option("rampUpTime", 1) 20 | .load() 21 | 22 | import spark.implicits._ 23 | 24 | val rateData = df.as[RateData] 25 | val streamingEmployeeDS = rateData.where("value % 10 != 0") 26 | .withColumn("firstName", concat(lit("firstName"),rateData.col("value"))) 27 | .withColumn("lastName", concat(lit("lastName"),rateData.col("value"))) 28 | .withColumn("departmentId", lit(floor(rateData.col("value")/10))) 29 | // .withColumnRenamed("value", "id") 30 | 31 | val staticDepartmentDS = spark.read.format("csv").option("header","true").load("src/main/resources/department.csv") 32 | 33 | val leftOuterJoinDS = streamingEmployeeDS 34 | .join(staticDepartmentDS, $"departmentId" === $"id", "left_outer") 35 | 36 | val leftOuterJoinStream = leftOuterJoinDS.writeStream 37 | .format("console") 38 | .queryName("LeftOuterJoin") 39 | .trigger(Trigger.ProcessingTime("10 seconds")) 40 | .option("checkpointLocation", "sparkCheckPoint\\streamStaticLeftOuterJoin\\cp2") 41 | .start() 42 | 43 | spark.streams.awaitAnyTermination() 44 | } 45 | -------------------------------------------------------------------------------- /SparkStreamingPOC/src/main/scala/streaming/join/streamstatic/streamStaticRightOuterJoin.scala: -------------------------------------------------------------------------------- 1 | package streaming.join.streamstatic 2 | 3 | import entity.RateData 4 | import org.apache.spark.sql.SparkSession 5 | import org.apache.spark.sql.functions._ 6 | import org.apache.spark.sql.streaming.Trigger 7 | 8 | object streamStaticRightOuterJoin extends App { 9 | 10 | val spark: SparkSession = SparkSession.builder() 11 | .appName("streamStaticRightOuterJoin") 12 | .master("local[*]") 13 | .getOrCreate() 14 | 15 | val df = spark.readStream 16 | .format("rate") 17 | .option("rowsPerSecond", 1) 18 | .option("numPartitions", 1) 19 | .option("rampUpTime", 1) 20 | .load() 21 | 22 | import spark.implicits._ 23 | 24 | val rateData = df.as[RateData] 25 | val streamingEmployeeDS = rateData.where("value % 10 != 0") 26 | .withColumn("firstName", concat(lit("firstName"),rateData.col("value"))) 27 | .withColumn("lastName", concat(lit("lastName"),rateData.col("value"))) 28 | .withColumn("departmentId", lit(floor(rateData.col("value")/10))) 29 | // .withColumnRenamed("value", "id") 30 | 31 | val staticDepartmentDS = spark.read.format("csv").option("header","true").load("src/main/resources/department.csv") 32 | 33 | val rightOuterJoinDS = 
streamingEmployeeDS.join(staticDepartmentDS, $"departmentId" === $"id", "right_outer") 34 | 35 | val rightOuterJoinStream = rightOuterJoinDS.writeStream 36 | .format("console") 37 | .queryName("InnerJoin") 38 | .trigger(Trigger.ProcessingTime("10 seconds")) 39 | .option("checkpointLocation", "sparkCheckPoint\\streamStaticRightOuterJoin\\cp1") 40 | .start() 41 | 42 | spark.streams.awaitAnyTermination() 43 | } 44 | -------------------------------------------------------------------------------- /SparkStreamingPOC/src/main/scala/streaming/join/streamstream/streamStreamFullOuterJoin.scala: -------------------------------------------------------------------------------- 1 | package streaming.join.streamstream 2 | 3 | import entity.RateData 4 | import org.apache.log4j.{Level, LogManager} 5 | import org.apache.spark.sql.SparkSession 6 | import org.apache.spark.sql.functions._ 7 | import org.apache.spark.sql.streaming.Trigger 8 | 9 | object streamStreamFullOuterJoin extends App { 10 | 11 | val spark: SparkSession = SparkSession.builder() 12 | .appName("streamStreamFullOuterJoin") 13 | .master("local[*]") 14 | .getOrCreate() 15 | 16 | spark.conf.set("spark.sql.shuffle.partitions", "1") 17 | 18 | val logger = LogManager.getRootLogger 19 | logger.setLevel(Level.ERROR) 20 | 21 | val df = spark.readStream 22 | .format("rate") 23 | .option("rowsPerSecond", 1) 24 | .option("numPartitions", 1) 25 | .option("rampUpTime", 1) 26 | .load() 27 | 28 | import spark.implicits._ 29 | 30 | val rateData = df.as[RateData] 31 | val employeeStreamDS = rateData.where("value % 10 != 0") 32 | .withColumn("firstName", concat(lit("firstName"),rateData.col("value"))) 33 | .withColumn("lastName", concat(lit("lastName"),rateData.col("value"))) 34 | .withColumn("departmentId", lit(floor(rateData.col("value")/10))) 35 | .withColumnRenamed("timestamp", "empTimestamp") 36 | .withWatermark("empTimestamp", "10 seconds") 37 | // .withColumnRenamed("value", "id") 38 | 39 | val departmentStreamDS = rateData.where("value % 10 == 0") 40 | .withColumn("name", concat(lit("name"),floor(rateData.col("value")/10))) 41 | .withColumn("Id", lit(floor(rateData.col("value")/10))) 42 | .drop("value") 43 | .withColumnRenamed("timestamp", "depTimestamp") 44 | .withWatermark("depTimestamp", "10 seconds") 45 | 46 | val joinedDS = departmentStreamDS 47 | .join(employeeStreamDS, expr(""" 48 | id = departmentId AND 49 | empTimestamp >= depTimestamp - interval 1 minutes AND 50 | empTimestamp <= depTimestamp + interval 1 minutes 51 | """ 52 | ), "full_outer") 53 | 54 | val joinedStream = joinedDS.writeStream 55 | .format("console") 56 | .queryName("joinedTable") 57 | .option("checkpointLocation", "sparkCheckPoint\\streamStreamFullOuterJoin\\joinedTable") 58 | .trigger(Trigger.ProcessingTime("5 seconds")) 59 | .start() 60 | 61 | spark.streams.awaitAnyTermination() 62 | } 63 | -------------------------------------------------------------------------------- /SparkStreamingPOC/src/main/scala/streaming/join/streamstream/streamStreamInnerJoin.scala: -------------------------------------------------------------------------------- 1 | package streaming.join.streamstream 2 | 3 | import entity.RateData 4 | import org.apache.log4j.{Level, LogManager} 5 | import org.apache.spark.sql.SparkSession 6 | import org.apache.spark.sql.functions._ 7 | import org.apache.spark.sql.streaming.Trigger 8 | 9 | object streamStreamInnerJoin extends App { 10 | 11 | val spark: SparkSession = SparkSession.builder() 12 | .appName("streamStreamInnerJoin") 13 | .master("local[*]") 14 | 
.getOrCreate() 15 | 16 | spark.conf.set("spark.sql.shuffle.partitions", "1") 17 | 18 | val logger = LogManager.getRootLogger 19 | logger.setLevel(Level.ERROR) 20 | 21 | val df = spark.readStream 22 | .format("rate") 23 | .option("rowsPerSecond", 1) 24 | .option("numPartitions", 1) 25 | .option("rampUpTime", 1) 26 | .load() 27 | 28 | import spark.implicits._ 29 | 30 | val rateData = df.as[RateData] 31 | val employeeDS = rateData.where("value % 10 != 0") 32 | .withColumn("firstName", concat(lit("firstName"),rateData.col("value"))) 33 | .withColumn("lastName", concat(lit("lastName"),rateData.col("value"))) 34 | .withColumn("departmentId", lit(floor(rateData.col("value")/10))) 35 | .withColumnRenamed("value", "id") 36 | 37 | val departmentDS = rateData.where("value % 10 == 0") 38 | .withColumn("name", concat(lit("name"),floor(rateData.col("value")/10))) 39 | .withColumn("departmentId", lit(floor(rateData.col("value")/10))) 40 | .drop("value") 41 | 42 | val joinedDS = departmentDS.join(employeeDS,"departmentId") 43 | 44 | val joinedStream = joinedDS.writeStream 45 | .format("console") 46 | .queryName("joinedTable") 47 | .option("checkpointLocation", "sparkCheckPoint\\streamStreamInnerJoin\\joinedTable") 48 | .trigger(Trigger.ProcessingTime("5 seconds")) 49 | .start() 50 | 51 | spark.streams.awaitAnyTermination() 52 | } 53 | -------------------------------------------------------------------------------- /SparkStreamingPOC/src/main/scala/streaming/join/streamstream/streamStreamLeftOuterJoin.scala: -------------------------------------------------------------------------------- 1 | package streaming.join.streamstream 2 | 3 | import entity.RateData 4 | import org.apache.log4j.{Level, LogManager} 5 | import org.apache.spark.sql.SparkSession 6 | import org.apache.spark.sql.functions._ 7 | import org.apache.spark.sql.streaming.Trigger 8 | 9 | object streamStreamLeftOuterJoin extends App { 10 | 11 | val spark: SparkSession = SparkSession.builder() 12 | .appName("streamStreamLeftOuterJoin") 13 | .master("local[*]") 14 | .getOrCreate() 15 | 16 | spark.conf.set("spark.sql.shuffle.partitions", "1") 17 | 18 | val logger = LogManager.getRootLogger 19 | logger.setLevel(Level.ERROR) 20 | 21 | val rateSource = spark.readStream 22 | .format("rate") 23 | .option("rowsPerSecond", 10000) 24 | .option("numPartitions", 1) 25 | .option("rampUpTime", 1) 26 | .load() 27 | 28 | import spark.implicits._ 29 | 30 | val rateSourceData = rateSource.as[RateData] 31 | val employeeStreamDS = rateSourceData.where("value % 10 != 0") 32 | .withColumn("firstName", concat(lit("firstName"),rateSourceData.col("value"))) 33 | .withColumn("lastName", concat(lit("lastName"),rateSourceData.col("value"))) 34 | .withColumn("departmentId", lit(floor(rateSourceData.col("value")/10))) 35 | .withColumnRenamed("timestamp", "empTimestamp") 36 | .withWatermark("empTimestamp", "10 seconds") 37 | // .withColumnRenamed("value", "id") 38 | 39 | val departmentStreamDS = rateSourceData.where("value % 10 == 0") 40 | .withColumn("name", concat(lit("name"),floor(rateSourceData.col("value")/10))) 41 | .withColumn("Id", lit(floor(rateSourceData.col("value")/10))) 42 | .drop("value") 43 | .withColumnRenamed("timestamp", "depTimestamp") 44 | // .withWatermark("depTimestamp", "10 seconds") 45 | 46 | val joinedDS = departmentStreamDS 47 | .join(employeeStreamDS, expr(""" 48 | id = departmentId AND 49 | empTimestamp >= depTimestamp - interval 1 minutes AND 50 | empTimestamp <= depTimestamp + interval 1 minutes 51 | """ 52 | ), "left_outer") 53 | 54 | val 
joinedStream = joinedDS.writeStream 55 | .format("console") 56 | .queryName("joinedTable") 57 | .option("checkpointLocation", "sparkCheckPoint\\streamStreamLeftOuterJoin\\joinedTable") 58 | .trigger(Trigger.ProcessingTime("20 seconds")) 59 | .start() 60 | 61 | spark.streams.awaitAnyTermination() 62 | } 63 | -------------------------------------------------------------------------------- /SparkStreamingPOC/src/main/scala/streaming/join/streamstream/streamStreamRightOuterJoin.scala: -------------------------------------------------------------------------------- 1 | package streaming.join.streamstream 2 | 3 | import entity.RateData 4 | import org.apache.log4j.{Level, LogManager} 5 | import org.apache.spark.sql.SparkSession 6 | import org.apache.spark.sql.functions._ 7 | import org.apache.spark.sql.streaming.Trigger 8 | 9 | object streamStreamRightOuterJoin extends App { 10 | 11 | val spark: SparkSession = SparkSession.builder() 12 | .appName("streamStreamRightOuterJoin") 13 | .master("local[*]") 14 | .getOrCreate() 15 | 16 | spark.conf.set("spark.sql.shuffle.partitions", "1") 17 | 18 | val logger = LogManager.getRootLogger 19 | logger.setLevel(Level.ERROR) 20 | 21 | val df = spark.readStream 22 | .format("rate") 23 | .option("rowsPerSecond", 1) 24 | .option("numPartitions", 1) 25 | .option("rampUpTime", 1) 26 | .load() 27 | 28 | import spark.implicits._ 29 | 30 | val rateData = df.as[RateData] 31 | val employeeStreamDS = rateData.where("value % 10 != 0") 32 | .withColumn("firstName", concat(lit("firstName"),rateData.col("value"))) 33 | .withColumn("lastName", concat(lit("lastName"),rateData.col("value"))) 34 | .withColumn("departmentId", lit(floor(rateData.col("value")/10))) 35 | .withColumnRenamed("timestamp", "empTimestamp") 36 | // .withWatermark("empTimestamp", "10 seconds") 37 | // .withColumnRenamed("value", "id") 38 | 39 | val departmentStreamDS = rateData.where("value % 10 == 0") 40 | .withColumn("name", concat(lit("name"),floor(rateData.col("value")/10))) 41 | .withColumn("Id", lit(floor(rateData.col("value")/10))) 42 | .drop("value") 43 | .withColumnRenamed("timestamp", "depTimestamp") 44 | .withWatermark("depTimestamp", "10 seconds") 45 | 46 | val joinedDS = departmentStreamDS 47 | .join(employeeStreamDS, expr(""" 48 | id = departmentId AND 49 | empTimestamp >= depTimestamp - interval 1 minutes AND 50 | empTimestamp <= depTimestamp + interval 1 minutes 51 | """ 52 | ), "right_outer") 53 | 54 | val joinedStream = joinedDS.writeStream 55 | .format("console") 56 | .queryName("joinedTable") 57 | .option("checkpointLocation", "sparkCheckPoint\\streamStreamRightOuterJoin\\joinedTable") 58 | .trigger(Trigger.ProcessingTime("5 seconds")) 59 | .start() 60 | 61 | spark.streams.awaitAnyTermination() 62 | } 63 | -------------------------------------------------------------------------------- /SparkStreamingPOC/src/main/scala/streaming/kafka/Kafka2Kafka.scala: -------------------------------------------------------------------------------- 1 | package streaming.kafka 2 | 3 | import org.apache.log4j.{Level, LogManager} 4 | import org.apache.spark.sql.SparkSession 5 | 6 | object Kafka2Kafka extends App{ 7 | 8 | val spark : SparkSession = SparkSession.builder() 9 | .appName("Kafka2Kafka") 10 | .master("local[*]") 11 | .getOrCreate() 12 | 13 | val logger = LogManager.getRootLogger 14 | logger.setLevel(Level.ERROR) 15 | 16 | val df = spark.readStream.format("kafka") 17 | .option("kafka.bootstrap.servers", "localhost:9092").option("subscribe", "test") 18 | .load() 19 | 20 | val kafkaRawData = 
df.selectExpr("CAST(key AS STRING)", "CAST(value AS string)", "topic", "partition", "offset", "timestamp","timestampType") 21 | val kafkaWriteStream = kafkaRawData.writeStream.format("kafka").option("topic", "test2") 22 | .option("checkpointLocation","sparkCheckPoint\\Kafka2Kafka") 23 | .option("kafka.bootstrap.servers", "localhost:9092") 24 | .start().awaitTermination() 25 | } 26 | -------------------------------------------------------------------------------- /SparkStreamingPOC/src/main/scala/streaming/kafka/Rate2Kafka.scala: -------------------------------------------------------------------------------- 1 | package streaming.kafka 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.streaming.Trigger 5 | 6 | object Rate2Kafka extends App { 7 | 8 | val spark: SparkSession = SparkSession.builder() 9 | .appName("Rate2Kafka") 10 | .master("local[*]") 11 | .getOrCreate() 12 | 13 | val df = spark.readStream 14 | .format("rate") 15 | .option("rowsPerSecond", 2) 16 | .option("numPartitions", 2) 17 | .option("rampUpTime", 1) 18 | .load() 19 | 20 | val rateRawData = df.selectExpr("CAST(timestamp AS STRING)", "CAST(value AS string)") 21 | val kafkaStream = rateRawData.writeStream 22 | .format("kafka") 23 | .queryName("First Kafka Stream") 24 | .option("topic", "test2") 25 | .option("checkpointLocation", "sparkCheckPoint\\Rate2Kafka\\cp1").option("kafka.bootstrap.servers", "localhost:9092") 26 | .trigger(Trigger.ProcessingTime("10 seconds")) 27 | .start() 28 | 29 | kafkaStream.awaitTermination() 30 | } 31 | -------------------------------------------------------------------------------- /SparkStreamingPOC/src/main/scala/streaming/kafka/Rate2KafkaMultiStream.scala: -------------------------------------------------------------------------------- 1 | package streaming.kafka 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.streaming.Trigger 5 | 6 | object Rate2KafkaMultiStream extends App { 7 | 8 | val spark: SparkSession = SparkSession.builder() 9 | .appName("Rate2KafkaMultiStream") 10 | .master("local[*]") 11 | .getOrCreate() 12 | 13 | val df = spark.readStream 14 | .format("rate") 15 | .option("rowsPerSecond", 20000) 16 | .option("numPartitions", 2) 17 | .option("rampUpTime", 1) 18 | .load() 19 | 20 | val rateRawData = df.selectExpr("CAST(timestamp AS STRING)", "CAST(value AS string)") 21 | val kafkaWriteStream1 = rateRawData.writeStream 22 | .queryName("First Kafka Stream") 23 | .format("kafka") 24 | .option("topic", "test2") 25 | .option("checkpointLocation", "sparkCheckPoint\\Rate2KafkaMultiStream\\cp1").option("kafka.bootstrap.servers", "localhost:9092") 26 | .trigger(Trigger.ProcessingTime("10 seconds")) 27 | .start() 28 | 29 | val kafkaWriteStream2 = rateRawData.writeStream 30 | .queryName("Second Kafka Stream") 31 | .format("kafka") 32 | .option("topic", "test2") 33 | .option("checkpointLocation", "sparkCheckPoint\\Rate2KafkaMultiStream\\cp2") 34 | .option("kafka.bootstrap.servers", "localhost:9092") 35 | .trigger(Trigger.ProcessingTime("10 seconds")) 36 | .start() 37 | 38 | spark.streams.awaitAnyTermination() 39 | } 40 | -------------------------------------------------------------------------------- /SparkStreamingPOC/src/main/scala/streaming/kafka/Rate2KafkaSparkListener.scala: -------------------------------------------------------------------------------- 1 | package streaming.kafka 2 | 3 | import org.apache.log4j.{Level, LogManager} 4 | import org.apache.spark.sql.SparkSession 5 | import 
org.apache.spark.sql.streaming.{StreamingQueryListener, Trigger} 6 | 7 | 8 | object Rate2KafkaSparkListener extends App { 9 | 10 | val spark: SparkSession = SparkSession.builder() 11 | .appName("Rate2KafkaSparkListener") 12 | .master("local[*]") 13 | .config("spark.sql.streaming.metricsEnabled", true) 14 | .getOrCreate() 15 | 16 | val logger = LogManager.getRootLogger 17 | logger.setLevel(Level.DEBUG) 18 | 19 | val df = spark.readStream 20 | .format("rate") 21 | // .option("rowsPerSecond", 1) 22 | // .option("numPartitions", 10) 23 | // .option("rampUpTime", 2) 24 | .load() 25 | 26 | val rateRawData = df.selectExpr("CAST(timestamp AS STRING)", "CAST(value AS string)") 27 | val kafkaWriteStream1 = rateRawData.writeStream 28 | .format("kafka") 29 | .queryName("First Kafka Stream") 30 | .option("topic", "test2") 31 | .option("checkpointLocation", "sparkCheckPoint\\Rate2KafkaSparkListener\\cp1").option("kafka.bootstrap.servers", "localhost:9092") 32 | .trigger(Trigger.ProcessingTime("10 seconds")) 33 | .start() 34 | 35 | // val kafkaWriteStream2 = rateRawData.writeStream 36 | // .format("kafka") 37 | // .queryName("Second Kafka Stream") 38 | // .option("topic", "test2") 39 | // .option("checkpointLocation", "sparkCheckPoint\\Rate2KafkaSparkListener\\cp2").option("kafka.bootstrap.servers", "localhost:9092") 40 | // .trigger(Trigger.ProcessingTime("20 seconds")) 41 | // .start() 42 | 43 | val chartListener = new StreamingQueryListener() { 44 | val MaxDataPoints = 100 45 | // a mutable reference to an immutable container to buffer n data points 46 | override def onQueryStarted(event: StreamingQueryListener.QueryStartedEvent): Unit = () 47 | 48 | override def onQueryProgress(event: StreamingQueryListener.QueryProgressEvent): Unit = { 49 | val queryProgress = event.progress 50 | 51 | if(queryProgress.numInputRows > 0) { 52 | val time = queryProgress.timestamp 53 | val inputRowsPerSecond = queryProgress.numInputRows 54 | val name = queryProgress.name 55 | val processedRowsPerSecond = queryProgress.processedRowsPerSecond 56 | 57 | println("Metrics name "+ name+" time "+ time + " inputRows "+ inputRowsPerSecond + " processedRowsPerSecond "+ processedRowsPerSecond) 58 | } 59 | } 60 | 61 | override def onQueryTerminated(event: StreamingQueryListener.QueryTerminatedEvent): Unit = () 62 | } 63 | 64 | spark.streams.addListener(chartListener) 65 | spark.streams.awaitAnyTermination() 66 | 67 | } -------------------------------------------------------------------------------- /SparkStreamingPOC/src/main/scala/streaming/mode/Rate2ConsoleAggregateAppendMode.scala: -------------------------------------------------------------------------------- 1 | package streaming.mode 2 | 3 | import org.apache.log4j.{Level, LogManager} 4 | import org.apache.spark.sql.SparkSession 5 | import org.apache.spark.sql.streaming.Trigger 6 | 7 | object Rate2ConsoleAggregateAppendMode extends App { 8 | 9 | val spark: SparkSession = SparkSession.builder() 10 | .appName("Rate2ConsoleAggregateAppendMode") 11 | .master("local[*]") 12 | .getOrCreate() 13 | 14 | val logger = LogManager.getRootLogger 15 | logger.setLevel(Level.ERROR) 16 | 17 | // spark.conf.set("spark.sql.shuffle.partitions", "1") 18 | 19 | val df = spark.readStream 20 | .format("rate") 21 | .option("rowsPerSecond", 1) 22 | // .option("numPartitions", 1) 23 | .option("rampUpTime", 1) 24 | .load() 25 | 26 | val rateRawData = df.selectExpr("CAST(timestamp AS STRING)", "CAST(value AS string)") 27 | 28 | val transformedData = rateRawData.withColumn("key", 
rateRawData.col("timestamp").substr(15, 2)) 29 | val countData = transformedData.groupBy("key").count() 30 | 31 | val defaultStream = countData.writeStream 32 | .format("console") 33 | .queryName("Append Mode") 34 | .trigger(Trigger.ProcessingTime("10 seconds")) 35 | // .outputMode("complete") 36 | .option("checkpointLocation", "sparkCheckPoint\\Rate2ConsoleAggregateAppendMode\\cp1") 37 | .start() 38 | 39 | spark.streams.awaitAnyTermination() 40 | } 41 | -------------------------------------------------------------------------------- /SparkStreamingPOC/src/main/scala/streaming/mode/Rate2ConsoleAggregateCompleteMode.scala: -------------------------------------------------------------------------------- 1 | package streaming.mode 2 | 3 | import org.apache.log4j.{Level, LogManager} 4 | import org.apache.spark.sql.SparkSession 5 | import org.apache.spark.sql.streaming.Trigger 6 | 7 | object Rate2ConsoleAggregateCompleteMode extends App { 8 | 9 | val spark: SparkSession = SparkSession.builder() 10 | .appName("Rate2ConsoleAggregateCompleteMode") 11 | .master("local[*]") 12 | .getOrCreate() 13 | 14 | val logger = LogManager.getRootLogger 15 | logger.setLevel(Level.ERROR) 16 | 17 | spark.conf.set("spark.sql.shuffle.partitions", "1") 18 | 19 | val df = spark.readStream 20 | .format("rate") 21 | .option("rowsPerSecond", 1) 22 | // .option("numPartitions", 1) 23 | .option("rampUpTime", 1) 24 | .load() 25 | 26 | val rateRawData = df.selectExpr("CAST(timestamp AS STRING)", "CAST(value AS string)") 27 | 28 | val transformedData = rateRawData.withColumn("key", rateRawData.col("timestamp").substr(15, 2)) 29 | val countData = transformedData.groupBy("key").count() 30 | 31 | val defaultStream = countData.writeStream 32 | .format("console") 33 | .queryName("Complete Mode") 34 | .trigger(Trigger.ProcessingTime("10 seconds")) 35 | .outputMode("complete") 36 | .option("checkpointLocation", "sparkCheckPoint\\Rate2ConsoleAggregateCompleteMode\\cp1") 37 | .start() 38 | 39 | spark.streams.awaitAnyTermination() 40 | } 41 | -------------------------------------------------------------------------------- /SparkStreamingPOC/src/main/scala/streaming/mode/Rate2ConsoleAggregateUpdateMode.scala: -------------------------------------------------------------------------------- 1 | package streaming.mode 2 | 3 | import org.apache.log4j.{Level, LogManager} 4 | import org.apache.spark.sql.SparkSession 5 | import org.apache.spark.sql.streaming.Trigger 6 | 7 | object Rate2ConsoleAggregateUpdateMode extends App { 8 | 9 | val spark: SparkSession = SparkSession.builder() 10 | .appName("Rate2ConsoleAggregateUpdateMode") 11 | .master("local[*]") 12 | .getOrCreate() 13 | 14 | val logger = LogManager.getRootLogger 15 | logger.setLevel(Level.ERROR) 16 | 17 | spark.conf.set("spark.sql.shuffle.partitions", "1") 18 | 19 | val df = spark.readStream 20 | .format("rate") 21 | .option("rowsPerSecond", 1) 22 | // .option("numPartitions", 1) 23 | .option("rampUpTime", 1) 24 | .load() 25 | 26 | val rateRawData = df.selectExpr("CAST(timestamp AS STRING)", "CAST(value AS string)") 27 | 28 | val transformedData = rateRawData.withColumn("key", rateRawData.col("timestamp").substr(15, 2)) 29 | val countData = transformedData.groupBy("key").count() 30 | 31 | val defaultStream = countData.writeStream 32 | .format("console") 33 | .queryName("Append Mode") 34 | .trigger(Trigger.ProcessingTime("10 seconds")) 35 | .outputMode("update") 36 | .option("checkpointLocation", "sparkCheckPoint\\Rate2ConsoleAggregateUpdateMode\\cp1") 37 | .start() 38 | 39 | 
spark.streams.awaitAnyTermination() 40 | } 41 | -------------------------------------------------------------------------------- /SparkStreamingPOC/src/main/scala/streaming/mode/Rate2ConsoleCompleteMode.scala: -------------------------------------------------------------------------------- 1 | package streaming.mode 2 | 3 | import org.apache.log4j.{Level, LogManager} 4 | import org.apache.spark.sql.SparkSession 5 | import org.apache.spark.sql.streaming.Trigger 6 | 7 | object Rate2ConsoleCompleteMode extends App { 8 | 9 | val spark: SparkSession = SparkSession.builder() 10 | .appName("Rate2ConsoleCompleteMode") 11 | .master("local[*]") 12 | .getOrCreate() 13 | 14 | val logger = LogManager.getRootLogger 15 | logger.setLevel(Level.ERROR) 16 | 17 | val df = spark.readStream 18 | .format("rate") 19 | .option("rowsPerSecond", 1) 20 | // .option("numPartitions", 1) 21 | .option("rampUpTime", 1) 22 | .load() 23 | 24 | val rateRawData = df.selectExpr("CAST(timestamp AS STRING)", "CAST(value AS string)") 25 | val defaultStream = rateRawData.writeStream 26 | .format("console") 27 | .queryName("Complete Mode") 28 | .trigger(Trigger.ProcessingTime("10 seconds")) 29 | .outputMode("complete") 30 | .option("checkpointLocation", "sparkCheckPoint\\Rate2ConsoleCompleteMode\\cp1") 31 | .start() 32 | 33 | spark.streams.awaitAnyTermination() 34 | } 35 | -------------------------------------------------------------------------------- /SparkStreamingPOC/src/main/scala/streaming/mode/Rate2ConsoleDefaultMode.scala: -------------------------------------------------------------------------------- 1 | package streaming.mode 2 | 3 | import org.apache.log4j.{Level, LogManager} 4 | import org.apache.spark.sql.SparkSession 5 | import org.apache.spark.sql.streaming.Trigger 6 | 7 | object Rate2ConsoleDefaultMode extends App { 8 | 9 | val spark: SparkSession = SparkSession.builder() 10 | .appName("Rate2ConsoleDefaultMode") 11 | .master("local[*]") 12 | .getOrCreate() 13 | 14 | val logger = LogManager.getRootLogger 15 | logger.setLevel(Level.ERROR) 16 | 17 | val df = spark.readStream 18 | .format("rate") 19 | .option("rowsPerSecond", 1) 20 | // .option("numPartitions", 1) 21 | .option("rampUpTime", 1) 22 | .load() 23 | 24 | val rateRawData = df.selectExpr("CAST(timestamp AS STRING)", "CAST(value AS string)") 25 | val defaultStream = rateRawData.writeStream 26 | .format("console") 27 | .queryName("Default Mode") 28 | .trigger(Trigger.ProcessingTime("10 seconds")) 29 | .option("checkpointLocation", "sparkCheckPoint\\Rate2ConsoleDefaultMode\\cp1") 30 | .start() 31 | 32 | spark.streams.awaitAnyTermination() 33 | } 34 | -------------------------------------------------------------------------------- /SparkStreamingPOC/src/main/scala/streaming/mode/Rate2ConsoleUpdateMode.scala: -------------------------------------------------------------------------------- 1 | package streaming.mode 2 | 3 | import org.apache.log4j.{Level, LogManager} 4 | import org.apache.spark.sql.SparkSession 5 | import org.apache.spark.sql.streaming.Trigger 6 | 7 | object Rate2ConsoleUpdateMode extends App { 8 | 9 | val spark: SparkSession = SparkSession.builder() 10 | .appName("Rate2ConsoleUpdateMode") 11 | .master("local[*]") 12 | .getOrCreate() 13 | 14 | val logger = LogManager.getRootLogger 15 | logger.setLevel(Level.ERROR) 16 | 17 | val df = spark.readStream 18 | .format("rate") 19 | .option("rowsPerSecond", 1) 20 | // .option("numPartitions", 1) 21 | .option("rampUpTime", 1) 22 | .load() 23 | 24 | val rateRawData = df.selectExpr("CAST(timestamp AS 
STRING)", "CAST(value AS string)") 25 | val defaultStream = rateRawData.writeStream 26 | .format("console") 27 | .queryName("Update Mode") 28 | .trigger(Trigger.ProcessingTime("10 seconds")) 29 | .outputMode("update") 30 | .option("checkpointLocation", "sparkCheckPoint\\Rate2ConsoleUpdateMode\\cp1") 31 | .start() 32 | 33 | spark.streams.awaitAnyTermination() 34 | } 35 | -------------------------------------------------------------------------------- /SparkStreamingPOC/src/main/scala/streaming/state/Rate2SparkState.scala: -------------------------------------------------------------------------------- 1 | package streaming.state 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.functions._ 5 | import org.apache.spark.sql.streaming.{GroupStateTimeout, Trigger} 6 | 7 | case class InputValue(timestamp: String, value: String, key: String, 8 | data1: String, data2: String, data3: String) 9 | 10 | object Rate2SparkState extends App with manageStateHelper { 11 | 12 | 13 | val spark: SparkSession = SparkSession.builder() 14 | .appName("Rate2SparkState") 15 | .master("local[*]") 16 | .getOrCreate() 17 | 18 | val df = spark.readStream 19 | .format("rate") 20 | .option("rowsPerSecond", 2) 21 | .option("numPartitions", 2) 22 | .option("rampUpTime", 1) 23 | .load() 24 | 25 | val rateRawData = df.selectExpr("CAST(timestamp AS STRING)", "CAST(value AS string)") 26 | 27 | import spark.implicits._ 28 | 29 | val transformedData = rateRawData 30 | .withColumn("key", lit(1)) 31 | .withColumn("data1", lit(constant.sampleJSON)) 32 | .withColumn("data2", lit(constant.sampleJSON)) 33 | .withColumn("data3", lit(constant.sampleJSON)).as[InputValue] 34 | 35 | import spark.implicits._ 36 | 37 | val stateManagement = transformedData 38 | .groupByKey(_.key) 39 | .mapGroupsWithState(GroupStateTimeout.NoTimeout())(manageState) 40 | val processingTimeStream = stateManagement.writeStream 41 | .format("console") 42 | .outputMode("update") 43 | .queryName("Micro Batch") 44 | .trigger(Trigger.ProcessingTime("10 seconds")) 45 | .option("checkpointLocation", "sparkCheckPoint\\Rate2SparkState\\cp1") 46 | .option("truncate", false) 47 | .start() 48 | 49 | spark.streams.awaitAnyTermination() 50 | 51 | } 52 | -------------------------------------------------------------------------------- /SparkStreamingPOC/src/main/scala/streaming/state/manageStateHelper.scala: -------------------------------------------------------------------------------- 1 | package streaming.state 2 | 3 | import org.apache.spark.sql.streaming.GroupState 4 | 5 | 6 | case class StateEvents(key:String=null,value:List[InputValue]=List()) 7 | 8 | trait manageStateHelper { 9 | 10 | var state :StateEvents = _ 11 | 12 | def manageState(key: String, inputEvents: Iterator[InputValue], groupState: GroupState[StateEvents]): List[InputValue] = { 13 | println("Key ::" + key) 14 | println("inputEvents ::" + inputEvents.toList) 15 | //println("groupState ::" + groupState.) 
16 | 17 | val inputEventList = inputEvents.toList 18 | println("inputEvents ::" + inputEventList) 19 | state = groupState.getOption.getOrElse(StateEvents()) 20 | 21 | // fold this micro-batch's events into the accumulated state, then persist it for the next trigger 22 | state = inputEventList.foldLeft(state)(updateState) 23 | groupState.update(state) 24 | 25 | 26 | 27 | if (groupState.exists) groupState.get.value else List() 28 | } 29 | 30 | def updateState(stateData: StateEvents, inputEvent: InputValue): StateEvents = { 31 | val updated = stateData.copy(value = stateData.value :+ inputEvent) 32 | println("State :: " + updated) 33 | updated 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /SparkStreamingPOC/src/main/scala/streaming/trigger/Rate2ConsoleContinuousTrigger.scala: -------------------------------------------------------------------------------- 1 | package streaming.trigger 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.streaming.Trigger 5 | 6 | object Rate2ConsoleContinuousTrigger extends App { 7 | 8 | val spark: SparkSession = SparkSession.builder() 9 | .appName("Rate2ConsoleContinuousTrigger") 10 | .master("local[*]") 11 | .getOrCreate() 12 | 13 | spark.sparkContext.setLogLevel("ERROR") 14 | 15 | val df = spark.readStream 16 | .format("rate") 17 | .option("rowsPerSecond", 1) 18 | .option("numPartitions", 1) 19 | .option("rampUpTime", 1) 20 | .load() 21 | 22 | val rateRawData = df.selectExpr("CAST(timestamp AS STRING)", "CAST(value AS string)") 23 | val firstContinuousStream = rateRawData.writeStream 24 | .format("console") 25 | .queryName("First Continuous Stream") 26 | .trigger(Trigger.Continuous("1 seconds")) 27 | .option("checkpointLocation", "sparkCheckPoint\\Rate2ConsoleContinuousTrigger\\cp1") 28 | .start() 29 | 30 | val secondContinuousStream = rateRawData.writeStream 31 | .format("console") 32 | .queryName("Second Continuous Stream") 33 | .trigger(Trigger.Continuous("1 seconds")) 34 | .option("checkpointLocation", "sparkCheckPoint\\Rate2ConsoleContinuousTrigger\\cp2") 35 | .start() 36 | 37 | spark.streams.awaitAnyTermination() 38 | } 39 | -------------------------------------------------------------------------------- /SparkStreamingPOC/src/main/scala/streaming/trigger/Rate2ConsoleDefaultTrigger.scala: -------------------------------------------------------------------------------- 1 | package streaming.trigger 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | object Rate2ConsoleDefaultTrigger extends App { 6 | 7 | val spark: SparkSession = SparkSession.builder() 8 | .appName("Rate2ConsoleDefaultTrigger") 9 | .master("local[*]") 10 | .getOrCreate() 11 | 12 | val df = spark.readStream 13 | .format("rate") 14 | .option("rowsPerSecond", 90000) 15 | // .option("numPartitions", 1) 16 | .option("rampUpTime", 1) 17 | .load() 18 | 19 | val rateRawData = df.selectExpr("CAST(timestamp AS STRING)", "CAST(value AS string)") 20 | val defaultStream = rateRawData.writeStream 21 | .format("console") 22 | .queryName("Default") 23 | .option("checkpointLocation", "sparkCheckPoint\\Rate2ConsoleDefaultTrigger\\cp1") 24 | .start() 25 | 26 | spark.streams.awaitAnyTermination() 27 | } 28 | -------------------------------------------------------------------------------- /SparkStreamingPOC/src/main/scala/streaming/trigger/Rate2ConsoleOnceTrigger.scala: -------------------------------------------------------------------------------- 1 | package streaming.trigger 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import
org.apache.spark.sql.streaming.Trigger 5 | 6 | object Rate2ConsoleOnceTrigger extends App { 7 | 8 | val spark: SparkSession = SparkSession.builder() 9 | .appName("Rate2ConsoleOnceTrigger") 10 | .master("local[*]") 11 | .getOrCreate() 12 | 13 | val df = spark.readStream 14 | .format("rate") 15 | .option("rowsPerSecond", 1000) 16 | .option("numPartitions", 4) 17 | .option("rampUpTime", 1) 18 | .load() 19 | 20 | val rateRawData = df.selectExpr("CAST(timestamp AS STRING)", "CAST(value AS string)") 21 | val onceStream = rateRawData.writeStream 22 | .format("console") 23 | .queryName("Once") 24 | .trigger(Trigger.Once()) 25 | .option("checkpointLocation", "sparkCheckPoint\\Rate2ConsoleOnceTrigger\\cp1") 26 | .start() 27 | 28 | 29 | spark.streams.awaitAnyTermination(100000) 30 | } 31 | -------------------------------------------------------------------------------- /SparkStreamingPOC/src/main/scala/streaming/trigger/Rate2ConsoleProgressTrigger.scala: -------------------------------------------------------------------------------- 1 | package streaming.trigger 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.streaming.Trigger 5 | 6 | object Rate2ConsoleProgressTrigger extends App { 7 | 8 | val spark: SparkSession = SparkSession.builder() 9 | .appName("Rate2ConsoleProgressTrigger") 10 | .master("local[*]") 11 | .getOrCreate() 12 | 13 | val df = spark.readStream 14 | .format("rate") 15 | .option("rowsPerSecond", 90000) 16 | // .option("numPartitions", 1) 17 | .option("rampUpTime", 1) 18 | .load() 19 | 20 | val rateRawData = df.selectExpr("CAST(timestamp AS STRING)", "CAST(value AS string)") 21 | val processingTimeStream = rateRawData.writeStream 22 | .format("console") 23 | .queryName("Micro Batch") 24 | .trigger(Trigger.ProcessingTime("20 seconds")) 25 | .option("checkpointLocation", "sparkCheckPoint\\Rate2ConsoleProgressTrigger\\cp1") 26 | .start() 27 | 28 | spark.streams.awaitAnyTermination() 29 | } 30 | -------------------------------------------------------------------------------- /SparkStreamingPOC/src/main/scala/streaming/trigger/Rate2ConsoleTriggerOptions.scala: -------------------------------------------------------------------------------- 1 | package streaming.trigger 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.streaming.Trigger 5 | 6 | object Rate2ConsoleTriggerOptions extends App { 7 | 8 | val spark: SparkSession = SparkSession.builder() 9 | .appName("Rate2ConsoleTriggerOptions") 10 | .master("local[*]") 11 | .getOrCreate() 12 | 13 | val df = spark.readStream 14 | .format("rate") 15 | .option("rowsPerSecond", 100000) 16 | .option("numPartitions", 1) 17 | .option("rampUpTime", 1) 18 | .load() 19 | 20 | val rateRawData = df.selectExpr("CAST(timestamp AS STRING)", "CAST(value AS string)") 21 | val defaultStream = rateRawData.writeStream 22 | .format("console") 23 | .queryName("Default") 24 | .option("checkpointLocation", "sparkCheckPoint\\Rate2ConsoleTriggerOptions\\cp1") 25 | .start() 26 | 27 | val onceStream = rateRawData.writeStream 28 | .format("console") 29 | .queryName("Once") 30 | .trigger(Trigger.Once()) 31 | .option("checkpointLocation", "sparkCheckPoint\\Rate2ConsoleTriggerOptions\\cp2") 32 | .start() 33 | 34 | val processingTimeStream = rateRawData.writeStream 35 | .format("console") 36 | .queryName("Micro Batch") 37 | .trigger(Trigger.ProcessingTime("20 seconds")) 38 | .option("checkpointLocation", "sparkCheckPoint\\Rate2ConsoleTriggerOptions\\cp3") 39 | .start() 40 | 41 | val continuousTimeStream =
rateRawData.writeStream 42 | .format("console") 43 | .queryName("Continuous") 44 | // .trigger(Trigger.Continuous("20 seconds")) 45 | .option("checkpointLocation", "sparkCheckPoint\\Rate2ConsoleTriggerOptions\\cp4") 46 | .start() 47 | 48 | defaultStream.awaitTermination() 49 | onceStream.awaitTermination() 50 | processingTimeStream.awaitTermination() 51 | continuousTimeStream.awaitTermination() 52 | // spark.streams.awaitAnyTermination() 53 | } 54 | -------------------------------------------------------------------------------- /SparkStreamingPOC/src/main/scala/streaming/unsupported/Rate2ConsoleMultiStream.scala: -------------------------------------------------------------------------------- 1 | package streaming.unsupported 2 | 3 | import org.apache.log4j.{Level, LogManager} 4 | import org.apache.spark.sql.SparkSession 5 | import org.apache.spark.sql.streaming.Trigger 6 | 7 | object Rate2ConsoleMultiStream extends App { 8 | 9 | val spark: SparkSession = SparkSession.builder() 10 | .appName("Rate2ConsoleMultiStream") 11 | .master("local[*]") 12 | .getOrCreate() 13 | 14 | val logger = LogManager.getRootLogger 15 | logger.setLevel(Level.ERROR) 16 | 17 | val df = spark.readStream 18 | .format("rate") 19 | .option("rowsPerSecond", 1) 20 | // .option("numPartitions", 1) 21 | .option("rampUpTime", 1) 22 | .load() 23 | 24 | val rateRawData = df.selectExpr("CAST(timestamp AS STRING)", "CAST(value AS string)") 25 | 26 | val transformedData = rateRawData.withColumn("key", rateRawData.col("timestamp").substr(15,2)) 27 | val groupedData = transformedData.groupBy("key").count() 28 | groupedData.createTempView("countTable") 29 | val countData = spark.sql("select count(*) from countTable") 30 | 31 | val defaultStream = countData.writeStream 32 | .format("console") 33 | .queryName("Complete Mode") 34 | .trigger(Trigger.ProcessingTime("10 seconds")) 35 | .outputMode("complete") 36 | .option("checkpointLocation", "sparkCheckPoint\\Rate2ConsoleMultiStream\\cp1") 37 | .start() 38 | 39 | spark.streams.awaitAnyTermination() 40 | } 41 | -------------------------------------------------------------------------------- /SparkStreamingPOC/src/main/scala/streaming/unsupported/UnsupportedFeatures.scala: -------------------------------------------------------------------------------- 1 | package streaming.unsupported 2 | 3 | import entity.RateData 4 | import org.apache.spark.sql.SparkSession 5 | import org.apache.spark.sql.functions._ 6 | import org.apache.spark.sql.streaming.Trigger 7 | 8 | object UnsupportedFeatures extends App { 9 | 10 | val spark: SparkSession = SparkSession.builder() 11 | .appName("UnsupportedFeatures") 12 | .master("local[*]") 13 | .getOrCreate() 14 | 15 | val df = spark.readStream 16 | .format("rate") 17 | .option("rowsPerSecond", 1) 18 | .option("numPartitions", 1) 19 | .option("rampUpTime", 1) 20 | .load() 21 | 22 | import spark.implicits._ 23 | 24 | val rateData = df.as[RateData] 25 | val employeeDS = rateData.where("value % 10 != 0") 26 | .withColumn("firstName", concat(lit("firstName"),rateData.col("value"))) 27 | .withColumn("lastName", concat(lit("lastName"),rateData.col("value"))) 28 | .withColumn("departmentId", lit(floor(rateData.col("value")/10))) 29 | // .withColumnRenamed("value", "id") 30 | 31 | val departmentDS = rateData.where("value % 10 == 0") 32 | .withColumn("name", concat(lit("name"),floor(rateData.col("value")/10))) 33 | .withColumn("id", lit(floor(rateData.col("value")/10))) 34 | .drop("value") 35 | 36 | val targetDS = departmentDS.join(employeeDS, $"id" === $"departmentId") 37 | 38 | // val targetDS =
departmentDS.join(employeeDS) 39 | 40 | val employeeStream = employeeDS.writeStream 41 | .format("console") 42 | .queryName("Employee") 43 | .trigger(Trigger.ProcessingTime("10 seconds")) 44 | .option("checkpointLocation", "sparkCheckPoint\\UnsupportedFeatures\\employee") 45 | .start() 46 | 47 | val departmentStream = departmentDS.writeStream 48 | .format("console") 49 | .queryName("Department") 50 | .trigger(Trigger.ProcessingTime("10 seconds")) 51 | .option("checkpointLocation", "sparkCheckPoint\\UnsupportedFeatures\\department") 52 | .start() 53 | 54 | 55 | val targetStream = targetDS.writeStream 56 | .format("console") 57 | .queryName("joinedTable") 58 | .trigger(Trigger.ProcessingTime("15 seconds")) 59 | .option("checkpointLocation", "sparkCheckPoint\\UnsupportedFeatures\\joinedTable") 60 | .start() 61 | 62 | spark.streams.awaitAnyTermination() 63 | } 64 | -------------------------------------------------------------------------------- /SparkStreamingPOC/src/main/scala/streaming/watermark/Rate2ConsoleWatermark.scala: -------------------------------------------------------------------------------- 1 | package streaming.watermark 2 | 3 | import org.apache.log4j.{Level, LogManager} 4 | import org.apache.spark.sql.SparkSession 5 | import org.apache.spark.sql.streaming.Trigger 6 | 7 | object Rate2ConsoleWatermark extends App { 8 | 9 | val spark: SparkSession = SparkSession.builder() 10 | .appName("Rate2ConsoleWatermark") 11 | .master("local[*]") 12 | .getOrCreate() 13 | 14 | val logger = LogManager.getRootLogger 15 | logger.setLevel(Level.ERROR) 16 | 17 | spark.conf.set("spark.sql.shuffle.partitions", "1") 18 | 19 | val df = spark.readStream 20 | .format("rate") 21 | .option("rowsPerSecond", 1) 22 | // .option("numPartitions", 1) 23 | .option("rampUpTime", 1) 24 | .load() 25 | 26 | val rateRawData = df.selectExpr("CAST(timestamp AS STRING)", "CAST(value AS string)") 27 | 28 | val transformedData = rateRawData.withColumn("key", rateRawData.col("timestamp").substr(15, 2)) 29 | val countData = transformedData.groupBy("key").count() 30 | 31 | val defaultStream = countData.writeStream 32 | .format("console") 33 | .queryName("Complete Mode") 34 | .trigger(Trigger.ProcessingTime("10 seconds")) 35 | .outputMode("complete") 36 | .option("checkpointLocation", "sparkCheckPoint\\Rate2ConsoleWatermark\\cp1") 37 | .start() 38 | 39 | spark.streams.awaitAnyTermination() 40 | } 41 | --------------------------------------------------------------------------------
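A note on Rate2ConsoleWatermark.scala above: despite its name, the query never calls withWatermark, so it is effectively the same complete-mode aggregation shown in the mode examples. As a rough sketch only, reusing the df defined in that file (the variable names, the cp2 checkpoint folder, and the 10-second/30-second durations are illustrative and not taken from this repository), a watermarked, windowed count over the rate source could look like this:

import org.apache.spark.sql.functions.{col, window}

// tolerate events arriving up to 10 seconds late; older state can then be dropped
val watermarked = df.withWatermark("timestamp", "10 seconds")

// count rows per 30-second event-time window instead of per timestamp substring
val windowedCounts = watermarked
  .groupBy(window(col("timestamp"), "30 seconds"))
  .count()

// append mode is allowed here because the watermark bounds the aggregation state
val watermarkStream = windowedCounts.writeStream
  .format("console")
  .queryName("Watermarked Window Counts")
  .outputMode("append")
  .trigger(Trigger.ProcessingTime("10 seconds"))
  .option("checkpointLocation", "sparkCheckPoint\\Rate2ConsoleWatermark\\cp2")
  .start()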