├── .gitignore ├── StreamHandler ├── build.sbt └── src │ └── main │ └── scala │ └── StreamHandler.scala ├── iot_devices.py └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | target/ 2 | project/ -------------------------------------------------------------------------------- /StreamHandler/build.sbt: -------------------------------------------------------------------------------- 1 | name := "Stream Handler" 2 | 3 | version := "1.0" 4 | 5 | scalaVersion := "2.11.12" 6 | 7 | libraryDependencies ++= Seq( 8 | "org.apache.spark" %% "spark-core" % "2.4.5" % "provided", 9 | "org.apache.spark" %% "spark-sql" % "2.4.5" % "provided", 10 | "com.datastax.spark" %% "spark-cassandra-connector" % "2.4.3", 11 | "com.datastax.cassandra" % "cassandra-driver-core" % "4.0.0" 12 | ) -------------------------------------------------------------------------------- /iot_devices.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | 3 | # imports 4 | from kafka import KafkaProducer # pip install kafka-python 5 | import numpy as np # pip install numpy 6 | from sys import argv, exit 7 | from time import time, sleep 8 | 9 | # different device "profiles" with different 10 | # distributions of values to make things interesting 11 | # tuple --> (mean, std.dev) 12 | DEVICE_PROFILES = { 13 | "boston": {'temp': (51.3, 17.7), 'humd': (77.4, 18.7), 'pres': (1019.9, 9.5) }, 14 | "denver": {'temp': (49.5, 19.3), 'humd': (33.0, 13.9), 'pres': (1012.0, 41.3) }, 15 | "losang": {'temp': (63.9, 11.7), 'humd': (62.8, 21.8), 'pres': (1015.9, 11.3) }, 16 | } 17 | 18 | # check for arguments, exit if wrong 19 | if len(argv) != 2 or argv[1] not in DEVICE_PROFILES.keys(): 20 | print("please provide a valid device name:") 21 | for key in DEVICE_PROFILES.keys(): 22 | print(f" * {key}") 23 | print(f"\nformat: {argv[0]} DEVICE_NAME") 24 | exit(1) 25 | 26 | profile_name = argv[1] 27 | profile = DEVICE_PROFILES[profile_name] 28 | 29 | # set up the producer 30 | producer = KafkaProducer(bootstrap_servers='localhost:9092') 31 | 32 | count = 1 33 | 34 | # until ^C 35 | while True: 36 | # get random values within a normal distribution of the value 37 | temp = np.random.normal(profile['temp'][0], profile['temp'][1]) 38 | humd = max(0, min(np.random.normal(profile['humd'][0], profile['humd'][1]), 100)) 39 | pres = np.random.normal(profile['pres'][0], profile['pres'][1]) 40 | 41 | # create CSV structure 42 | msg = f'{time()},{profile_name},{temp},{humd},{pres}' 43 | 44 | # send to Kafka 45 | producer.send('weather', bytes(msg, encoding='utf8')) 46 | print(f'sending data to kafka, #{count}') 47 | 48 | count += 1 49 | sleep(.5) -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Spark Structured Streaming Example 2 | 3 | ## Walkthrough Video 4 | 5 | [Watch walkthrough video here](https://youtu.be/CGT8v8_9i2g) 6 | 7 | ## Overview 8 | 9 | Mock data pipeline which reads a stream of weather data, aggregates it slightly, then saves it to a database. 10 | 11 | ## Architecture 12 | 13 | IoT devices --> Kafka --> Spark --> Cassandra 14 | 15 | **NOTES**: "IoT Devices" are represented with the Python script `iot_devices.py`. This script allows "weather data" to be captured and transmitted from three differnt weather sensors, which are located in Boston, Denver, and Los Angeles. 16 | 17 | ## Qucikstart 18 | 19 | 1. Start a Kafka server 20 | * create a topic called `weather` 21 | 1. Start a Cassandra database 22 | * create a keyspace called `stuff` (SimpleStrategy, replication=1) 23 | ``` 24 | CREATE KEYSPACE stuff 25 | WITH replication = {'class': 'SimpleStrategy, 'replication_factor' : 1}; 26 | ``` 27 | * create a table called `weather` with the following schema 28 | ``` 29 | CREATE TABLE weather ( 30 | uuid uuid primary key, 31 | device text, 32 | temp double, 33 | humd double, 34 | pres double 35 | ); 36 | ``` 37 | 1. Inside the `StreamHandler` directory, package up the Scala file: 38 | ``` 39 | sbt package 40 | ``` 41 | 1. Then run 42 | ``` 43 | spark-submit --class StreamHandler \ 44 | --master local[*] \ 45 | --packages org.apache.spark:spark-sql-kafka-0-10_2.11:2.4.5,\ 46 | com.datastax.cassandra:cassandra-driver-core:4.0.0,\ 47 | com.datastax.spark:spark-cassandra-connector_2.11:2.4.3 \ 48 | target/scala-2.11/stream-handler_2.11-1.0.jar 49 | ``` 50 | 1. From root directory, start one or more "IoT devices": 51 | ``` 52 | ./iot_devices.py boston 53 | ./iot_devices.py denver 54 | ./iot_devices.py losang 55 | ``` 56 | 1. `select * from weather` from CQLSH to see if the data is being processed saved correctly! 57 | 58 | For a complete walkthrough, checkout this [video](https://youtu.be/CGT8v8_9i2g) -------------------------------------------------------------------------------- /StreamHandler/src/main/scala/StreamHandler.scala: -------------------------------------------------------------------------------- 1 | import org.apache.spark.sql._ 2 | import org.apache.spark.sql.functions._ 3 | import org.apache.spark.sql.streaming._ 4 | import org.apache.spark.sql.types._ 5 | import org.apache.spark.sql.cassandra._ 6 | 7 | import com.datastax.oss.driver.api.core.uuid.Uuids // com.datastax.cassandra:cassandra-driver-core:4.0.0 8 | import com.datastax.spark.connector._ // com.datastax.spark:spark-cassandra-connector_2.11:2.4.3 9 | 10 | case class DeviceData(device: String, temp: Double, humd: Double, pres: Double) 11 | 12 | object StreamHandler { 13 | def main(args: Array[String]) { 14 | 15 | // initialize Spark 16 | val spark = SparkSession 17 | .builder 18 | .appName("Stream Handler") 19 | .config("spark.cassandra.connection.host", "localhost") 20 | .getOrCreate() 21 | 22 | import spark.implicits._ 23 | 24 | // read from Kafka 25 | val inputDF = spark 26 | .readStream 27 | .format("kafka") // org.apache.spark:spark-sql-kafka-0-10_2.11:2.4.5 28 | .option("kafka.bootstrap.servers", "localhost:9092") 29 | .option("subscribe", "weather") 30 | .load() 31 | 32 | // only select 'value' from the table, 33 | // convert from bytes to string 34 | val rawDF = inputDF.selectExpr("CAST(value AS STRING)").as[String] 35 | 36 | // split each row on comma, load it to the case class 37 | val expandedDF = rawDF.map(row => row.split(",")) 38 | .map(row => DeviceData( 39 | row(1), 40 | row(2).toDouble, 41 | row(3).toDouble, 42 | row(4).toDouble 43 | )) 44 | 45 | // groupby and aggregate 46 | val summaryDf = expandedDF 47 | .groupBy("device") 48 | .agg(avg("temp"), avg("humd"), avg("pres")) 49 | 50 | // create a dataset function that creates UUIDs 51 | val makeUUID = udf(() => Uuids.timeBased().toString) 52 | 53 | // add the UUIDs and renamed the columns 54 | // this is necessary so that the dataframe matches the 55 | // table schema in cassandra 56 | val summaryWithIDs = summaryDf.withColumn("uuid", makeUUID()) 57 | .withColumnRenamed("avg(temp)", "temp") 58 | .withColumnRenamed("avg(humd)", "humd") 59 | .withColumnRenamed("avg(pres)", "pres") 60 | 61 | // write dataframe to Cassandra 62 | val query = summaryWithIDs 63 | .writeStream 64 | .trigger(Trigger.ProcessingTime("5 seconds")) 65 | .foreachBatch { (batchDF: DataFrame, batchID: Long) => 66 | println(s"Writing to Cassandra $batchID") 67 | batchDF.write 68 | .cassandraFormat("weather", "stuff") // table, keyspace 69 | .mode("append") 70 | .save() 71 | } 72 | .outputMode("update") 73 | .start() 74 | 75 | // until ^C 76 | query.awaitTermination() 77 | } 78 | } --------------------------------------------------------------------------------