├── cassandra.png
├── kafka-demo.png
├── turtlebot3.png
├── data_pipeline.png
├── kafka-demo
│   ├── consumer.py
│   ├── data_generator.py
│   └── producer.py
├── ros
│   ├── readFromKafka.py
│   └── publish2kafka.py
├── spark-demo
│   ├── streamingKafka2Console.py
│   └── streamingKafka2Cassandra.py
└── README.md
/cassandra.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zekeriyyaa/PySpark-Structured-Streaming-ROS-Kafka-ApacheSpark-Cassandra/HEAD/cassandra.png
--------------------------------------------------------------------------------
/kafka-demo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zekeriyyaa/PySpark-Structured-Streaming-ROS-Kafka-ApacheSpark-Cassandra/HEAD/kafka-demo.png
--------------------------------------------------------------------------------
/turtlebot3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zekeriyyaa/PySpark-Structured-Streaming-ROS-Kafka-ApacheSpark-Cassandra/HEAD/turtlebot3.png
--------------------------------------------------------------------------------
/data_pipeline.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zekeriyyaa/PySpark-Structured-Streaming-ROS-Kafka-ApacheSpark-Cassandra/HEAD/data_pipeline.png
--------------------------------------------------------------------------------
/kafka-demo/consumer.py:
--------------------------------------------------------------------------------
1 | import json
2 | from kafka import KafkaConsumer
3 |
4 | if __name__=="__main__":
5 |
6 |     consumer=KafkaConsumer(
7 |         "demo",
8 |         bootstrap_servers="localhost:9092",
9 |         auto_offset_reset="earliest"
10 |     )
11 |
12 |     for msg in consumer:
13 |         print(json.loads(msg.value))
14 |
--------------------------------------------------------------------------------
/ros/readFromKafka.py:
--------------------------------------------------------------------------------
1 | import json
2 | from kafka import KafkaConsumer
3 |
4 | if __name__=="__main__":
5 |
6 |     consumer=KafkaConsumer(
7 |         "odometry",
8 |         bootstrap_servers="localhost:9092",
9 |         auto_offset_reset="earliest"
10 |     )
11 |
12 |     for msg in consumer:
13 |         print(json.loads(msg.value))
14 |
--------------------------------------------------------------------------------
/kafka-demo/data_generator.py:
--------------------------------------------------------------------------------
1 | import random
2 | import string
3 |
4 | user_ids = list(range(1,101))
5 | recipient_ids = list(range(1,101))
6 |
7 | def generate_message()->dict:
8 |     random_user_id = random.choice(user_ids)
9 |
10 |     recipient_ids_copy = recipient_ids.copy()
11 |
12 |     #recipient_ids.remove(random_user_id)
13 |     random_recepient_id = random.choice(recipient_ids_copy)
14 |
15 |     message = "".join(random.choice(string.ascii_letters) for i in range(32))
16 |
17 |     return {
18 |         "user_id":random_user_id,
19 |         "recipient_id":random_recepient_id,
20 |         "message":message
21 |     }
--------------------------------------------------------------------------------
/kafka-demo/producer.py:
--------------------------------------------------------------------------------
1 | import time,json,random
2 | from datetime import datetime
3 | from data_generator import generate_message
4 | from kafka import KafkaProducer
5 |
6 |
7 | def serializer(message):
8 |     return json.dumps(message).encode("utf-8")
9 |
10 |
11 | producer = KafkaProducer(
12 |     bootstrap_servers=["localhost:9092"],
13 |     value_serializer=serializer
14 | )
15 |
16 | if __name__=="__main__":
17 |
18 |     while True:
19 |
20 |         dummy_messages=generate_message()
21 |
22 |         print(f"Producing message {datetime.now()} | Message = {str(dummy_messages)}")
23 |         producer.send("demo",dummy_messages)
24 |
25 |         time.sleep(2)
26 |
27 |
--------------------------------------------------------------------------------
/ros/publish2kafka.py:
--------------------------------------------------------------------------------
1 | import rospy
2 | from nav_msgs.msg import Odometry
3 | import json
4 | from datetime import datetime
5 | from kafka import KafkaProducer
6 |
7 | count = 0
8 | def callback(msg):
9 |     global count
10 |     messages={
11 |         "id":count,
12 |         "posex":float("{0:.5f}".format(msg.pose.pose.position.x)),
13 |         "posey":float("{0:.5f}".format(msg.pose.pose.position.y)),
14 |         "posez":float("{0:.5f}".format(msg.pose.pose.position.z)),
15 |         "orientx":float("{0:.5f}".format(msg.pose.pose.orientation.x)),
16 |         "orienty":float("{0:.5f}".format(msg.pose.pose.orientation.y)),
17 |         "orientz":float("{0:.5f}".format(msg.pose.pose.orientation.z)),
18 |         "orientw":float("{0:.5f}".format(msg.pose.pose.orientation.w))
19 |     }
20 |
21 |     print(f"Producing message {datetime.now()} Message :\n {str(messages)}")
22 |     producer.send("odometry",messages)
23 |     count+=1
24 |
25 | producer = KafkaProducer(
26 |     bootstrap_servers=["localhost:9092"],
27 |     value_serializer=lambda message: json.dumps(message).encode('utf-8')
28 | )
29 |
30 | if __name__=="__main__":
31 |
32 |     rospy.init_node('odomSubscriber', anonymous=True)
33 |     rospy.Subscriber('odom',Odometry,callback)
34 |     rospy.spin()
35 |
36 |
--------------------------------------------------------------------------------
/spark-demo/streamingKafka2Console.py:
--------------------------------------------------------------------------------
1 | from pyspark.sql import SparkSession
2 | from pyspark.sql.types import StructType,StructField,LongType,IntegerType,FloatType,StringType
3 | from pyspark.sql.functions import split,from_json,col
4 |
5 | odometrySchema = StructType([
6 | StructField("id",IntegerType(),False),
7 | StructField("posex",FloatType(),False),
8 | StructField("posey",FloatType(),False),
9 | StructField("posez",FloatType(),False),
10 | StructField("orientx",FloatType(),False),
11 | StructField("orienty",FloatType(),False),
12 | StructField("orientz",FloatType(),False),
13 | StructField("orientw",FloatType(),False)
14 | ])
15 |
16 | spark = SparkSession \
17 |     .builder \
18 |     .appName("SSKafka") \
19 |     .config("spark.jars.packages","org.apache.spark:spark-sql-kafka-0-10_2.12:3.2.0") \
20 |     .getOrCreate()
21 | spark.sparkContext.setLogLevel("ERROR")
22 |
23 |
24 | df = spark \
25 |     .readStream \
26 |     .format("kafka") \
27 |     .option("kafka.bootstrap.servers", "localhost:9092") \
28 |     .option("subscribe", "odometry") \
29 |     .option("delimeter",",") \
30 |     .option("startingOffsets", "latest") \
31 |     .load()
32 |
33 | df.printSchema()
34 |
35 | df1 = df.selectExpr("CAST(value AS STRING)").select(from_json(col("value"),odometrySchema).alias("data")).select("data.*")
36 | df1.printSchema()
37 |
38 | df1.writeStream \
39 |     .outputMode("update") \
40 |     .format("console") \
41 |     .option("truncate", False) \
42 |     .start() \
43 |     .awaitTermination()
44 |
--------------------------------------------------------------------------------
/spark-demo/streamingKafka2Cassandra.py:
--------------------------------------------------------------------------------
1 | from pyspark.sql import SparkSession
2 | from pyspark.sql.types import StructType,StructField,FloatType,IntegerType
3 | from pyspark.sql.functions import from_json,col
4 |
5 | odometrySchema = StructType([
6 | StructField("id",IntegerType(),False),
7 | StructField("posex",FloatType(),False),
8 | StructField("posey",FloatType(),False),
9 | StructField("posez",FloatType(),False),
10 | StructField("orientx",FloatType(),False),
11 | StructField("orienty",FloatType(),False),
12 | StructField("orientz",FloatType(),False),
13 | StructField("orientw",FloatType(),False)
14 | ])
15 |
16 | # .config("spark.jars.packages","com.datastax.spark:spark-cassandra-connector_2.12:3.0.0")\
17 | spark = SparkSession \
18 |     .builder \
19 |     .appName("SSKafka") \
20 |     .config("spark.jars.packages","org.apache.spark:spark-sql-kafka-0-10_2.12:3.2.0,com.datastax.spark:spark-cassandra-connector_2.12:3.0.0") \
21 |     .getOrCreate()
22 | spark.sparkContext.setLogLevel("ERROR")
23 |
24 |
25 | df = spark \
26 |     .readStream \
27 |     .format("kafka") \
28 |     .option("kafka.bootstrap.servers", "localhost:9092") \
29 |     .option("subscribe", "odometry") \
30 |     .option("delimeter",",") \
31 |     .option("startingOffsets", "latest") \
32 |     .load()
33 |
34 | df.printSchema()
35 |
36 | df1 = df.selectExpr("CAST(value AS STRING)").select(from_json(col("value"),odometrySchema).alias("data")).select("data.*")
37 | df1.printSchema()
38 |
39 |
40 | def writeToCassandra(writeDF, epochId):
41 |     writeDF.write \
42 |         .format("org.apache.spark.sql.cassandra")\
43 |         .mode('append')\
44 |         .options(table="odometry", keyspace="ros")\
45 |         .save()
46 |
47 | df1.writeStream \
48 | .option("spark.cassandra.connection.host","localhost:9042")\
49 | .foreachBatch(writeToCassandra) \
50 | .outputMode("update") \
51 | .start()\
52 | .awaitTermination()
53 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ## PySpark-Structured-Streaming-ROS-Kafka-ApacheSpark-Cassandra
2 |
3 | The purpose of this project is to demonstrate a structured streaming pipeline with Apache Spark. The process consists of the following steps:
4 |
5 | 0. Installation Process
6 | 1. Prepare a robotic simulation environment to generate data to feed into Kafka.
7 | 2. Prepare a Kafka and Zookeeper environment to store the incoming data.
8 | 3. Prepare a Cassandra environment to store the analyzed data.
9 | 4. Prepare an Apache Spark structured streaming pipeline and integrate it with Kafka and Cassandra.
10 | 5. Result
11 |
12 |
13 |
14 |
15 |
16 | ### 0. Installation Processes
17 | You can install all required components for this project using the steps below.
18 |
19 | #### Installation of ROS and Turtlebot3
20 | We won't cover the whole installation process of ROS and Turtlebot3; you can find all required information at [ROS & Turtlebot3 Installation](https://emanual.robotis.com/docs/en/platform/turtlebot3/quick-start/#pc-setup).
21 |
22 | After the installation is complete, you can test the robotic simulation environment using the following command:
23 | ```
24 | roslaunch turtlebot3_gazebo turtlebot3_world.launch
25 | ```
26 | You should see a view like the one given below.
27 |
28 |
29 |
30 |
31 | #### Installation of Kafka and Zookeeper
32 | We won't cover the whole installation process of Kafka and Zookeeper; you can find all required information at [Kafka & Zookeeper Installation](https://www.linode.com/docs/guides/how-to-install-apache-kafka-on-ubuntu/).
33 |
34 | After the installation is complete, you can test Kafka using the following commands:
35 | ```
36 | # Change your path to Kafka folder and then run
37 | bin/zookeeper-server-start.sh config/zookeeper.properties
38 |
39 | # Open second terminal and then run
40 | bin/kafka-server-start.sh config/server.properties
41 |
42 | # Create Kafka "demo" topic
43 | bin/kafka-topics.sh --create --topic demo --partitions 1 --replication-factor 1 --bootstrap-server localhost:9092
44 | ```
45 |
46 | Once you have created the "demo" topic, you can run [kafka-demo/producer.py](https://github.com/zekeriyyaa/PySpark-Structured-Streaming-ROS-Kafka-ApacheSpark-Cassandra/blob/main/kafka-demo/producer.py) and [kafka-demo/consumer.py](https://github.com/zekeriyyaa/PySpark-Structured-Streaming-ROS-Kafka-ApacheSpark-Cassandra/blob/main/kafka-demo/consumer.py) to check your setup.
47 | >:exclamation: If you haven't installed [kafka-python](https://kafka-python.readthedocs.io/en/master/), install it with the command below before running these scripts.
48 | ```
49 | pip install kafka-python
50 | ```
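producer.py (shown below) builds its dummy messages with `generate_message()` from [kafka-demo/data_generator.py](https://github.com/zekeriyyaa/PySpark-Structured-Streaming-ROS-Kafka-ApacheSpark-Cassandra/blob/main/kafka-demo/data_generator.py):
- data_generator.py
```python3
import random
import string

user_ids = list(range(1,101))
recipient_ids = list(range(1,101))

def generate_message()->dict:
    # Pick a random sender and recipient and attach a random 32-character payload.
    random_user_id = random.choice(user_ids)
    recipient_ids_copy = recipient_ids.copy()
    random_recepient_id = random.choice(recipient_ids_copy)
    message = "".join(random.choice(string.ascii_letters) for i in range(32))

    return {
        "user_id":random_user_id,
        "recipient_id":random_recepient_id,
        "message":message
    }
```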
51 | - producer.py
52 | ```python3
53 | import time,json,random
54 | from datetime import datetime
55 | from data_generator import generate_message
56 | from kafka import KafkaProducer
57 |
58 | def serializer(message):
59 |     return json.dumps(message).encode("utf-8")
60 |
61 | producer = KafkaProducer(
62 |     bootstrap_servers=["localhost:9092"],
63 |     value_serializer=serializer
64 | )
65 |
66 | if __name__=="__main__":
67 |     while True:
68 |         dummy_messages=generate_message()
69 |         print(f"Producing message {datetime.now()} | Message = {str(dummy_messages)}")
70 |         producer.send("demo",dummy_messages)
71 |         time.sleep(2)
72 | ```
73 | - consumer.py
74 | ```python3
75 | import json
76 | from kafka import KafkaConsumer
77 |
78 | if __name__=="__main__":
79 |     consumer=KafkaConsumer(
80 |         "demo",
81 |         bootstrap_servers="localhost:9092",
82 |         auto_offset_reset="latest" )
83 |
84 |     for msg in consumer:
85 |         print(json.loads(msg.value))
86 | ```
87 | You should see a view like the one given below after running the commands:
88 | ```
89 | python3 producer.py
90 | python3 consumer.py
91 | ```
92 |
93 |
94 |
95 |
96 |
97 | #### Installation of Cassandra
98 | We won't cover the whole installation process of Cassandra; you can find all required information at [Cassandra Installation](https://phoenixnap.com/kb/install-cassandra-on-ubuntu).
99 |
100 | After the installation is complete, you can try Cassandra using *cqlsh*; see this [link](https://www.tutorialspoint.com/cassandra/index.htm) for a quick introduction.
101 |
102 | #### Installation of Apache Spark
103 | We won't cover the whole installation process of Apache Spark; you can find all required information at [Apache Spark Installation](https://phoenixnap.com/kb/install-spark-on-ubuntu).
104 |
105 | After the installation is complete, you can try a quick example like the one [here](https://spark.apache.org/docs/latest/streaming-programming-guide.html).
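If you also install PySpark for Python (`pip install pyspark`), a minimal sketch to confirm the setup is to start a local session and print its version:
```python3
from pyspark.sql import SparkSession

# Start a local Spark session and print its version to confirm the installation.
spark = SparkSession.builder.appName("InstallCheck").master("local[*]").getOrCreate()
print(spark.version)
spark.stop()
```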
106 |
107 |
108 | ### 1. Prepare a robotic simulation environment
109 | [ROS (Robot Operating System)](http://wiki.ros.org/) allows us to design a robotic environment. We will use [Turtlebot3](https://emanual.robotis.com/docs/en/platform/turtlebot3/overview/), a robot in the [Gazebo](http://gazebosim.org/) simulation environment, to generate data for our use case. Turtlebot3 publishes its data on ROS topics, so we will subscribe to the relevant topic and send the data into Kafka.
110 |
111 | #### Run the simulation environment and analyze the data we will use
112 | Turtlebot3 publishes its odometry data on the ROS "odom" topic, so we can inspect the published data with the following commands:
113 | ```
114 | # run the simulation environment
115 | roslaunch turtlebot3_gazebo turtlebot3_world.launch
116 |
117 | # check the topic to see data
118 | rostopic echo /odom
119 | ```
120 | You should see a view like the one given below.
121 | ```
122 | header:
123 | seq: 10954
124 | stamp:
125 | secs: 365
126 | nsecs: 483000000
127 | frame_id: "odom"
128 | child_frame_id: "base_footprint"
129 | pose:
130 | pose:
131 | position:
132 | x: -2.000055643960576
133 | y: -0.4997879642933192
134 | z: -0.0010013932644100873
135 | orientation:
136 | x: -1.3486164084605e-05
137 | y: 0.0038530870521455017
138 | z: 0.0016676819550213058
139 | w: 0.9999911861487526
140 | covariance: [1e-05, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1e-05, 0.0, 0.0, 0.0, 0.0, 0.0,...
141 | twist:
142 | twist:
143 | linear:
144 | x: 5.8050405333644035e-08
145 | y: 7.749200305343809e-07
146 | z: 0.0
147 | angular:
148 | x: 0.0
149 | y: 0.0
150 | z: 1.15143519181447e-05
151 | covariance: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...
152 | ```
153 | In this use case, we are only interested in the following part of the data:
154 | ```
155 | position:
156 | x: -2.000055643960576
157 | y: -0.4997879642933192
158 | z: -0.0010013932644100873
159 | orientation:
160 | x: -1.3486164084605e-05
161 | y: 0.0038530870521455017
162 | z: 0.0016676819550213058
163 | w: 0.9999911861487526
164 | ```
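These fields live under `msg.pose.pose` of the `nav_msgs/Odometry` message. As a minimal sketch (assuming a sourced ROS environment with the simulation running), a subscriber that prints only this part looks like:
```python3
import rospy
from nav_msgs.msg import Odometry

def callback(msg):
    # Print only the position and orientation parts of each odometry message.
    print(msg.pose.pose.position)
    print(msg.pose.pose.orientation)

if __name__ == "__main__":
    # The node name here is just an example.
    rospy.init_node("odomEcho", anonymous=True)
    rospy.Subscriber("odom", Odometry, callback)
    rospy.spin()
```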
165 |
166 | ### 2. Prepare Kafka and Zookeeper environment
167 | The data produced by Turtlebot3 will be stored in Kafka.
168 |
169 | #### Prepare Kafka for Use Case
170 | First, we will create a new Kafka topic named *odometry* for the ROS odom data using the following commands:
171 | ```
172 | # Change your path to Kafka folder and then run
173 | bin/zookeeper-server-start.sh config/zookeeper.properties
174 |
175 | # Open second terminal and then run
176 | bin/kafka-server-start.sh config/server.properties
177 |
178 | # Create Kafka "odometry" topic for ROS odom data
179 | bin/kafka-topics.sh --create --topic odometry --partitions 1 --replication-factor 1 --bootstrap-server localhost:9092
180 | ```
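As a quick sanity check, you can also list the broker's topics from Python with kafka-python (a minimal sketch; the broker address matches the setup above):
```python3
from kafka import KafkaConsumer

# Connect to the local broker and list the topics it knows about; "odometry" should be among them.
consumer = KafkaConsumer(bootstrap_servers="localhost:9092")
print(consumer.topics())
```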
181 | Then we will write a ROS subscriber to listen to the data from Turtlebot3. Since we also need to send this data to Kafka, the same script contains a Kafka producer. We will use [ros/publish2kafka.py](https://github.com/zekeriyyaa/PySpark-Structured-Streaming-ROS-Kafka-ApacheSpark-Cassandra/blob/main/ros/publish2kafka.py) for this. The script subscribes to the odom topic and sends the content of the topic to Kafka.
182 | ```python3
183 | import rospy
184 | from nav_msgs.msg import Odometry
185 | import json
186 | from datetime import datetime
187 | from kafka import KafkaProducer
188 |
189 | count = 0
190 | def callback(msg):
191 |     global count
192 |     messages={
193 |         "id":count,
194 |         "posex":float("{0:.5f}".format(msg.pose.pose.position.x)),
195 |         "posey":float("{0:.5f}".format(msg.pose.pose.position.y)),
196 |         "posez":float("{0:.5f}".format(msg.pose.pose.position.z)),
197 |         "orientx":float("{0:.5f}".format(msg.pose.pose.orientation.x)),
198 |         "orienty":float("{0:.5f}".format(msg.pose.pose.orientation.y)),
199 |         "orientz":float("{0:.5f}".format(msg.pose.pose.orientation.z)),
200 |         "orientw":float("{0:.5f}".format(msg.pose.pose.orientation.w))
201 |     }
202 |
203 |     print(f"Producing message {datetime.now()} Message :\n {str(messages)}")
204 |     producer.send("odometry",messages)
205 |     count+=1
206 |
207 | producer = KafkaProducer(
208 |     bootstrap_servers=["localhost:9092"],
209 |     value_serializer=lambda message: json.dumps(message).encode('utf-8')
210 | )
211 |
212 | if __name__=="__main__":
213 |
214 |     rospy.init_node('odomSubscriber', anonymous=True)
215 |     rospy.Subscriber('odom',Odometry,callback)
216 |     rospy.spin()
217 | ```
218 | You can use [ros/readFromKafka.py](https://github.com/zekeriyyaa/PySpark-Structured-Streaming-ROS-Kafka-ApacheSpark-Cassandra/blob/main/ros/readFromKafka.py) to check that the data really reaches Kafka while ROS and publish2kafka.py are running.
219 | ```python3
220 | import json
221 | from kafka import KafkaConsumer
222 |
223 | if __name__=="__main__":
224 |
225 |     consumer=KafkaConsumer(
226 |         "odometry",
227 |         bootstrap_servers="localhost:9092",
228 |         auto_offset_reset="earliest"
229 |     )
230 |
231 |     for msg in consumer:
232 |         print(json.loads(msg.value))
233 | ```
234 | ### 3. Prepare Cassandra environment
235 |
236 | #### Prepare Cassandra for Use Case
237 | First, we will create a *keyspace* and then a *table* in it using the following commands:
238 | ```
239 | # Open the cqlsh and then run the command to create 'ros' keyspace
240 | cqlsh> CREATE KEYSPACE ros WITH replication = {'class':'SimpleStrategy', 'replication_factor' : 1};
241 |
242 | # Then, run the command to create the 'odometry' table in 'ros'
243 | cqlsh> create table ros.odometry(
244 | id int primary key,
245 | posex float,
246 | posey float,
247 | posez float,
248 | orientx float,
249 | orienty float,
250 | orientz float,
251 | orientw float);
252 |
253 | # Check your setup is correct
254 | cqlsh> DESCRIBE ros
255 |
256 | #and
257 | cqlsh> DESCRIBE ros.odometry
258 | ```
259 | > :warning: **The columns of the table have to match the Spark schema**: Be very careful here!
260 |
261 | ### 4. Prepare Apache Spark structured streaming pipeline
262 | You can write the analysis results either to the console or to Cassandra.
263 | #### (First Way) Prepare an Apache Spark Structured Streaming Pipeline from Kafka to Cassandra
264 | We will write a streaming script that reads the *odometry* topic from Kafka, analyzes it, and then writes the results to Cassandra. We will use [spark-demo/streamingKafka2Cassandra.py](https://github.com/zekeriyyaa/PySpark-Structured-Streaming-ROS-Kafka-ApacheSpark-Cassandra/blob/main/spark-demo/streamingKafka2Cassandra.py) to do it.
265 |
266 | First, we create a schema that matches the table we already defined in Cassandra.
267 | > :warning: **The content of the schema has to be the same as the Cassandra table**: Be very careful here!
268 |
269 | ```python3
270 | odometrySchema = StructType([
271 | StructField("id",IntegerType(),False),
272 | StructField("posex",FloatType(),False),
273 | StructField("posey",FloatType(),False),
274 | StructField("posez",FloatType(),False),
275 | StructField("orientx",FloatType(),False),
276 | StructField("orienty",FloatType(),False),
277 | StructField("orientz",FloatType(),False),
278 | StructField("orientw",FloatType(),False)
279 | ])
280 | ```
281 | Then, we create a SparkSession using two packages:
282 | - **Spark Kafka connector**: org.apache.spark:spark-sql-kafka-0-10_2.12:3.2.0
283 | - **Spark Cassandra connector**: com.datastax.spark:spark-cassandra-connector_2.12:3.0.0
284 | ```python3
285 | spark = SparkSession \
286 |     .builder \
287 |     .appName("SparkStructuredStreaming") \
288 |     .config("spark.jars.packages","org.apache.spark:spark-sql-kafka-0-10_2.12:3.2.0,com.datastax.spark:spark-cassandra-connector_2.12:3.0.0") \
289 |     .getOrCreate()
290 | ```
291 | > :warning: **If you use spark-submit you can specify the packages as:**
292 |
293 | - spark-submit --packages org.apache.spark:spark-sql-kafka-0-10_2.12:3.2.0,com.datastax.spark:spark-cassandra-connector_2.12:3.0.0 spark_cassandra.py
294 |
295 | To read the Kafka stream, we use **readStream()** and specify the Kafka configuration as shown below:
296 | ```python3
297 | df = spark \
298 |     .readStream \
299 |     .format("kafka") \
300 |     .option("kafka.bootstrap.servers", "localhost:9092") \
301 |     .option("subscribe", "odometry") \
302 |     .option("delimeter",",") \
303 |     .option("startingOffsets", "latest") \
304 |     .load()
305 | ```
306 | Since Kafka sends data as binary, we first need to cast the value to a string and parse the JSON using **selectExpr()** and **from_json()** as shown below:
307 | ```python3
308 | df1 = df.selectExpr("CAST(value AS STRING)").select(from_json(col("value"),odometrySchema).alias("data")).select("data.*")
309 | df1.printSchema()
310 | ```
311 | Although Apache Spark cannot yet write stream data directly to Cassandra with **writeStream()**, we can do it with **foreachBatch()** as shown below:
312 | ```python3
313 | def writeToCassandra(writeDF, _):
314 |     writeDF.write \
315 |         .format("org.apache.spark.sql.cassandra")\
316 |         .mode('append')\
317 |         .options(table="odometry", keyspace="ros")\
318 |         .save()
319 |
320 | df1.writeStream \
321 |     .option("spark.cassandra.connection.host","localhost:9042")\
322 |     .foreachBatch(writeToCassandra) \
323 |     .outputMode("update") \
324 |     .start()\
325 |     .awaitTermination()
326 | ```
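For longer runs you would typically also set a checkpoint location so the query can recover its Kafka offsets after a restart. A minimal sketch (the checkpoint path is only an example, not part of the repo):
```python3
df1.writeStream \
    .option("spark.cassandra.connection.host","localhost:9042")\
    .option("checkpointLocation","/tmp/ros-odometry-checkpoint")\
    .foreachBatch(writeToCassandra) \
    .outputMode("update") \
    .start()\
    .awaitTermination()
```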
327 | Putting it all together, we get the script [spark-demo/streamingKafka2Cassandra.py](https://github.com/zekeriyyaa/PySpark-Structured-Streaming-ROS-Kafka-ApacheSpark-Cassandra/blob/main/spark-demo/streamingKafka2Cassandra.py):
328 | ```python3
329 | from pyspark.sql import SparkSession
330 | from pyspark.sql.types import StructType,StructField,FloatType,IntegerType
331 | from pyspark.sql.functions import from_json,col
332 |
333 | odometrySchema = StructType([
334 | StructField("id",IntegerType(),False),
335 | StructField("posex",FloatType(),False),
336 | StructField("posey",FloatType(),False),
337 | StructField("posez",FloatType(),False),
338 | StructField("orientx",FloatType(),False),
339 | StructField("orienty",FloatType(),False),
340 | StructField("orientz",FloatType(),False),
341 | StructField("orientw",FloatType(),False)
342 | ])
343 |
344 | spark = SparkSession \
345 |     .builder \
346 |     .appName("SparkStructuredStreaming") \
347 |     .config("spark.jars.packages","org.apache.spark:spark-sql-kafka-0-10_2.12:3.2.0,com.datastax.spark:spark-cassandra-connector_2.12:3.0.0") \
348 |     .getOrCreate()
349 |
350 | spark.sparkContext.setLogLevel("ERROR")
351 |
352 |
353 | df = spark \
354 |     .readStream \
355 |     .format("kafka") \
356 |     .option("kafka.bootstrap.servers", "localhost:9092") \
357 |     .option("subscribe", "odometry") \
358 |     .option("delimeter",",") \
359 |     .option("startingOffsets", "latest") \
360 |     .load()
361 |
362 | df.printSchema()
363 |
364 | df1 = df.selectExpr("CAST(value AS STRING)").select(from_json(col("value"),odometrySchema).alias("data")).select("data.*")
365 | df1.printSchema()
366 |
367 | # It is possible to analyze the data here using df1
368 |
369 |
370 | def writeToCassandra(writeDF, _):
371 |     writeDF.write \
372 |         .format("org.apache.spark.sql.cassandra")\
373 |         .mode('append')\
374 |         .options(table="odometry", keyspace="ros")\
375 |         .save()
376 |
377 | df1.writeStream \
378 | .option("spark.cassandra.connection.host","localhost:9042")\
379 | .foreachBatch(writeToCassandra) \
380 | .outputMode("update") \
381 | .start()\
382 | .awaitTermination()
383 | ```
384 | #### (Second Way) Prepare an Apache Spark Structured Streaming Pipeline from Kafka to Console
385 | There are a few differences between writing to the console and writing to Cassandra.
386 | First of all, we don't need the Cassandra connector, so we remove it from the packages.
387 | ```python3
388 | spark = SparkSession \
389 |     .builder \
390 |     .appName("SSKafka") \
391 |     .config("spark.jars.packages","org.apache.spark:spark-sql-kafka-0-10_2.12:3.2.0") \
392 |     .getOrCreate()
393 | ```
394 | With **writeStream()** we can write stream data directly to the console.
395 | ```python3
396 | df1.writeStream \
397 |     .outputMode("update") \
398 |     .format("console") \
399 |     .option("truncate", False) \
400 |     .start() \
401 |     .awaitTermination()
402 | ```
403 | The rest of the process is the same as the previous one. Putting it all together, we get the script spark-demo/streamingKafka2Console.py:
404 | ```python3
405 | from pyspark.sql import SparkSession
406 | from pyspark.sql.types import StructType,StructField,LongType,IntegerType,FloatType,StringType
407 | from pyspark.sql.functions import split,from_json,col
408 |
409 | odometrySchema = StructType([
410 | StructField("id",IntegerType(),False),
411 | StructField("posex",FloatType(),False),
412 | StructField("posey",FloatType(),False),
413 | StructField("posez",FloatType(),False),
414 | StructField("orientx",FloatType(),False),
415 | StructField("orienty",FloatType(),False),
416 | StructField("orientz",FloatType(),False),
417 | StructField("orientw",FloatType(),False)
418 | ])
419 |
420 | spark = SparkSession \
421 |     .builder \
422 |     .appName("SSKafka") \
423 |     .config("spark.jars.packages","org.apache.spark:spark-sql-kafka-0-10_2.12:3.2.0") \
424 |     .getOrCreate()
425 | spark.sparkContext.setLogLevel("ERROR")
426 |
427 | df = spark \
428 |     .readStream \
429 |     .format("kafka") \
430 |     .option("kafka.bootstrap.servers", "localhost:9092") \
431 |     .option("subscribe", "odometry") \
432 |     .option("delimeter",",") \
433 |     .option("startingOffsets", "latest") \
434 |     .load()
435 |
436 | df1 = df.selectExpr("CAST(value AS STRING)").select(from_json(col("value"),odometrySchema).alias("data")).select("data.*")
437 | df1.printSchema()
438 |
439 | df1.writeStream \
440 |     .outputMode("update") \
441 |     .format("console") \
442 |     .option("truncate", False) \
443 |     .start() \
444 |     .awaitTermination()
445 | ```
446 | ### 5. Result
447 | After the whole pipeline is running, we get the data in our Cassandra table as shown below:
448 |
449 | You can run the following query to see the content of your table:
450 | ```
451 | # Open the cqlsh
452 | cqlsh
453 | # Then write select query to see content of the table
454 | cqlsh> select * from ros.odometry;
455 | ```
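If you prefer checking the table from Python instead of cqlsh, a minimal sketch using the DataStax driver (`pip install cassandra-driver`, an extra dependency that is not part of this repo) would be:
```python3
from cassandra.cluster import Cluster

# Connect to the local Cassandra node (default port 9042) and read a few rows back
# from the ros.odometry table created above.
cluster = Cluster(["127.0.0.1"])
session = cluster.connect("ros")
for row in session.execute("SELECT * FROM odometry LIMIT 5;"):
    print(row)
cluster.shutdown()
```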
456 |
457 |
458 |
459 |
460 |
--------------------------------------------------------------------------------