├── .gitignore
├── StreamHandler
    ├── build.sbt
    └── src
    │   └── main
    │       └── scala
    │           └── StreamHandler.scala
├── iot_devices.py
└── README.md


/.gitignore:
--------------------------------------------------------------------------------
1 | target/
2 | project/


--------------------------------------------------------------------------------
/StreamHandler/build.sbt:
--------------------------------------------------------------------------------
 1 | name := "Stream Handler"
 2 | 
 3 | version := "1.0"
 4 | 
 5 | scalaVersion := "2.11.12"
 6 | 
 7 | libraryDependencies ++= Seq(
 8 | 	"org.apache.spark" %% "spark-core" % "2.4.5" % "provided",
 9 | 	"org.apache.spark" %% "spark-sql" % "2.4.5" % "provided",
10 | 	"com.datastax.spark" %% "spark-cassandra-connector" % "2.4.3",
11 | 	"com.datastax.cassandra" % "cassandra-driver-core" % "4.0.0"
12 | )


--------------------------------------------------------------------------------
/iot_devices.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python3
 2 | 
 3 | # imports
 4 | from kafka import KafkaProducer # pip install kafka-python
 5 | import numpy as np              # pip install numpy
 6 | from sys import argv, exit
 7 | from time import time, sleep
 8 | 
 9 | # different device "profiles" with different 
10 | # distributions of values to make things interesting
11 | # tuple --> (mean, std.dev)
12 | DEVICE_PROFILES = {
13 | 	"boston": {'temp': (51.3, 17.7), 'humd': (77.4, 18.7), 'pres': (1019.9, 9.5) },
14 | 	"denver": {'temp': (49.5, 19.3), 'humd': (33.0, 13.9), 'pres': (1012.0, 41.3) },
15 | 	"losang": {'temp': (63.9, 11.7), 'humd': (62.8, 21.8), 'pres': (1015.9, 11.3) },
16 | }
17 | 
18 | # check for arguments, exit if wrong
19 | if len(argv) != 2 or argv[1] not in DEVICE_PROFILES.keys():
20 | 	print("please provide a valid device name:")
21 | 	for key in DEVICE_PROFILES.keys():
22 | 		print(f"  * {key}")
23 | 	print(f"\nformat: {argv[0]} DEVICE_NAME")
24 | 	exit(1)
25 | 
26 | profile_name = argv[1]
27 | profile = DEVICE_PROFILES[profile_name]
28 | 
29 | # set up the producer
30 | producer = KafkaProducer(bootstrap_servers='localhost:9092')
31 | 
32 | count = 1
33 | 
34 | # until ^C
35 | while True:
36 | 	# get random values within a normal distribution of the value
37 | 	temp = np.random.normal(profile['temp'][0], profile['temp'][1])
38 | 	humd = max(0, min(np.random.normal(profile['humd'][0], profile['humd'][1]), 100))
39 | 	pres = np.random.normal(profile['pres'][0], profile['pres'][1])
40 | 	
41 | 	# create CSV structure
42 | 	msg = f'{time()},{profile_name},{temp},{humd},{pres}'
43 | 
44 | 	# send to Kafka
45 | 	producer.send('weather', bytes(msg, encoding='utf8'))
46 | 	print(f'sending data to kafka, #{count}')
47 | 
48 | 	count += 1
49 | 	sleep(.5)


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Spark Structured Streaming Example
 2 | 
 3 | ## Walkthrough Video
 4 | 
 5 | [Watch walkthrough video here](https://youtu.be/CGT8v8_9i2g)
 6 | 
 7 | ## Overview
 8 | 
 9 | Mock data pipeline which reads a stream of weather data, aggregates it slightly, then saves it to a database.
10 | 
11 | ## Architecture
12 | 
13 | 	IoT devices --> Kafka --> Spark --> Cassandra  
14 | 
15 | **NOTES**: "IoT Devices" are represented with the Python script `iot_devices.py`. This script allows "weather data" to be captured and transmitted from three differnt weather sensors, which are located in Boston, Denver, and Los Angeles. 
16 | 
17 | ## Qucikstart
18 | 
19 | 1. Start a Kafka server
20 | 	* create a topic called `weather`
21 | 1. Start a Cassandra database
22 | 	* create a keyspace called `stuff` (SimpleStrategy, replication=1)
23 | 		```
24 | 		CREATE KEYSPACE stuff
25 | 		WITH replication = {'class': 'SimpleStrategy, 'replication_factor' : 1};
26 | 		```
27 | 	* create a table called `weather` with the following schema
28 | 		```
29 | 		CREATE TABLE weather (
30 | 			uuid uuid primary key,
31 | 			device text,
32 | 			temp double,
33 | 			humd double,
34 | 			pres double
35 | 		);
36 | 	  ```
37 | 1. Inside the `StreamHandler` directory, package up the Scala file:
38 | 	```
39 | 	sbt package
40 | 	```
41 | 1. Then run
42 | 	``` 
43 | 	spark-submit --class StreamHandler \
44 | 	--master local[*] \
45 | 	--packages org.apache.spark:spark-sql-kafka-0-10_2.11:2.4.5,\
46 | 		com.datastax.cassandra:cassandra-driver-core:4.0.0,\
47 | 		com.datastax.spark:spark-cassandra-connector_2.11:2.4.3 \
48 | 	target/scala-2.11/stream-handler_2.11-1.0.jar
49 | 	```
50 | 1. From root directory, start one or more "IoT devices":
51 | 	```
52 | 	./iot_devices.py boston
53 | 	./iot_devices.py denver
54 | 	./iot_devices.py losang
55 | 	```
56 | 1. `select * from weather` from CQLSH to see if the data is being processed saved correctly!
57 | 
58 | For a complete walkthrough, checkout this [video](https://youtu.be/CGT8v8_9i2g)


--------------------------------------------------------------------------------
/StreamHandler/src/main/scala/StreamHandler.scala:
--------------------------------------------------------------------------------
 1 | import org.apache.spark.sql._
 2 | import org.apache.spark.sql.functions._
 3 | import org.apache.spark.sql.streaming._
 4 | import org.apache.spark.sql.types._
 5 | import org.apache.spark.sql.cassandra._
 6 | 
 7 | import com.datastax.oss.driver.api.core.uuid.Uuids // com.datastax.cassandra:cassandra-driver-core:4.0.0
 8 | import com.datastax.spark.connector._              // com.datastax.spark:spark-cassandra-connector_2.11:2.4.3
 9 | 
10 | case class DeviceData(device: String, temp: Double, humd: Double, pres: Double)
11 | 
12 | object StreamHandler {
13 | 	def main(args: Array[String]) {
14 | 
15 | 		// initialize Spark
16 | 		val spark = SparkSession
17 | 			.builder
18 | 			.appName("Stream Handler")
19 | 			.config("spark.cassandra.connection.host", "localhost")
20 | 			.getOrCreate()
21 | 
22 | 		import spark.implicits._
23 | 
24 | 		// read from Kafka
25 | 		val inputDF = spark
26 | 			.readStream
27 | 			.format("kafka") // org.apache.spark:spark-sql-kafka-0-10_2.11:2.4.5
28 | 			.option("kafka.bootstrap.servers", "localhost:9092")
29 | 			.option("subscribe", "weather")
30 | 			.load()
31 | 
32 | 		// only select 'value' from the table,
33 | 		// convert from bytes to string
34 | 		val rawDF = inputDF.selectExpr("CAST(value AS STRING)").as[String]
35 | 
36 | 		// split each row on comma, load it to the case class
37 | 		val expandedDF = rawDF.map(row => row.split(","))
38 | 			.map(row => DeviceData(
39 | 				row(1),
40 | 				row(2).toDouble,
41 | 				row(3).toDouble,
42 | 				row(4).toDouble
43 | 			))
44 | 
45 | 		// groupby and aggregate
46 | 		val summaryDf = expandedDF
47 | 			.groupBy("device")
48 | 			.agg(avg("temp"), avg("humd"), avg("pres"))
49 | 
50 | 		// create a dataset function that creates UUIDs
51 | 		val makeUUID = udf(() => Uuids.timeBased().toString)
52 | 
53 | 		// add the UUIDs and renamed the columns
54 | 		// this is necessary so that the dataframe matches the 
55 | 		// table schema in cassandra
56 | 		val summaryWithIDs = summaryDf.withColumn("uuid", makeUUID())
57 | 			.withColumnRenamed("avg(temp)", "temp")
58 | 			.withColumnRenamed("avg(humd)", "humd")
59 | 			.withColumnRenamed("avg(pres)", "pres")
60 | 
61 | 		// write dataframe to Cassandra
62 | 		val query = summaryWithIDs
63 | 			.writeStream
64 | 			.trigger(Trigger.ProcessingTime("5 seconds"))
65 | 			.foreachBatch { (batchDF: DataFrame, batchID: Long) =>
66 | 				println(s"Writing to Cassandra $batchID")
67 | 				batchDF.write
68 | 					.cassandraFormat("weather", "stuff") // table, keyspace
69 | 					.mode("append")
70 | 					.save()
71 | 			}
72 | 			.outputMode("update")
73 | 			.start()
74 | 
75 | 		// until ^C
76 | 		query.awaitTermination()
77 | 	}
78 | }


--------------------------------------------------------------------------------