├── cassandra.png
├── kafka-demo.png
├── turtlebot3.png
├── data_pipeline.png
├── kafka-demo
│   ├── consumer.py
│   ├── data_generator.py
│   └── producer.py
├── ros
│   ├── readFromKafka.py
│   └── publish2kafka.py
├── spark-demo
│   ├── streamingKafka2Console.py
│   └── streamingKafka2Cassandra.py
└── README.md
/cassandra.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zekeriyyaa/PySpark-Structured-Streaming-ROS-Kafka-ApacheSpark-Cassandra/HEAD/cassandra.png
--------------------------------------------------------------------------------
/kafka-demo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zekeriyyaa/PySpark-Structured-Streaming-ROS-Kafka-ApacheSpark-Cassandra/HEAD/kafka-demo.png
--------------------------------------------------------------------------------
/turtlebot3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zekeriyyaa/PySpark-Structured-Streaming-ROS-Kafka-ApacheSpark-Cassandra/HEAD/turtlebot3.png
--------------------------------------------------------------------------------
/data_pipeline.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zekeriyyaa/PySpark-Structured-Streaming-ROS-Kafka-ApacheSpark-Cassandra/HEAD/data_pipeline.png
--------------------------------------------------------------------------------
/kafka-demo/consumer.py:
--------------------------------------------------------------------------------
1 | import json
2 | from kafka import KafkaConsumer
3 |
4 | if __name__=="__main__":
5 |
6 |     consumer=KafkaConsumer(
7 |         "demo",
8 |         bootstrap_servers="localhost:9092",
9 |         auto_offset_reset="earliest"
10 |     )
11 |
12 |     for msg in consumer:
13 |         print(json.loads(msg.value))
14 |
--------------------------------------------------------------------------------
/ros/readFromKafka.py:
--------------------------------------------------------------------------------
1 | import json
2 | from kafka import KafkaConsumer
3 |
4 | if __name__=="__main__":
5 |
6 |     consumer=KafkaConsumer(
7 |         "odometry",
8 |         bootstrap_servers="localhost:9092",
9 |         auto_offset_reset="earliest"
10 |     )
11 |
12 |     for msg in consumer:
13 |         print(json.loads(msg.value))
14 |
--------------------------------------------------------------------------------
/kafka-demo/data_generator.py:
--------------------------------------------------------------------------------
1 | import random
2 | import string
3 |
4 | user_ids = list(range(1,101))
5 | recipient_ids = list(range(1,101))
6 |
7 | def generate_message()->dict:
8 |     random_user_id = random.choice(user_ids)
9 |
10 |     recipient_ids_copy = recipient_ids.copy()
11 |
12 |     #recipient_ids.remove(random_user_id)
13 |     random_recepient_id = random.choice(recipient_ids_copy)
14 |
15 |     message = "".join(random.choice(string.ascii_letters) for i in range(32))
16 |
17 |     return {
18 |         "user_id":random_user_id,
19 |         "recipient_id":random_recepient_id,
20 |         "message":message
21 |     }
--------------------------------------------------------------------------------
/kafka-demo/producer.py:
--------------------------------------------------------------------------------
1 | import time,json,random
2 | from datetime import datetime
3 | from data_generator import generate_message
4 | from kafka import KafkaProducer
5 |
6 |
7 | def serializer(message):
8 |     return json.dumps(message).encode("utf-8")
9 |
10 |
11 | producer = KafkaProducer(
12 |     bootstrap_servers=["localhost:9092"],
13 |     value_serializer=serializer
14 | )
15 |
16 | if __name__=="__main__":
17 |
18 |     while True:
19 |
20 |         dummy_messages=generate_message()
21 |
22 |         print(f"Producing message {datetime.now()} | Message = {str(dummy_messages)}")
23 |         producer.send("demo",dummy_messages)
24 |
25 |         time.sleep(2)
26 |
27 |
--------------------------------------------------------------------------------
/ros/publish2kafka.py:
--------------------------------------------------------------------------------
1 | import rospy
2 | from nav_msgs.msg import Odometry
3 | import json
4 | from datetime import datetime
5 | from kafka import KafkaProducer
6 |
7 | count = 0
8 | def callback(msg):
9 |     global count
10 |     messages={
11 |         "id":count,
12 |         "posex":float("{0:.5f}".format(msg.pose.pose.position.x)),
13 |         "posey":float("{0:.5f}".format(msg.pose.pose.position.y)),
14 |         "posez":float("{0:.5f}".format(msg.pose.pose.position.z)),
15 |         "orientx":float("{0:.5f}".format(msg.pose.pose.orientation.x)),
16 |         "orienty":float("{0:.5f}".format(msg.pose.pose.orientation.y)),
17 |         "orientz":float("{0:.5f}".format(msg.pose.pose.orientation.z)),
18 |         "orientw":float("{0:.5f}".format(msg.pose.pose.orientation.w))
19 |     }
20 |
21 |     print(f"Producing message {datetime.now()} Message :\n {str(messages)}")
22 |     producer.send("odometry",messages)
23 |     count+=1
24 |
25 | producer = KafkaProducer(
26 |     bootstrap_servers=["localhost:9092"],
27 |     value_serializer=lambda message: json.dumps(message).encode('utf-8')
28 | )
29 |
30 | if __name__=="__main__":
31 |
32 |     rospy.init_node('odomSubscriber', anonymous=True)
33 |     rospy.Subscriber('odom',Odometry,callback)
34 |     rospy.spin()
35 |
36 |
--------------------------------------------------------------------------------
/spark-demo/streamingKafka2Console.py:
--------------------------------------------------------------------------------
1 | from pyspark.sql import SparkSession
2 | from pyspark.sql.types import StructType,StructField,LongType,IntegerType,FloatType,StringType
3 | from pyspark.sql.functions import split,from_json,col
4 |
5 | odometrySchema = StructType([
6 | StructField("id",IntegerType(),False),
7 | StructField("posex",FloatType(),False),
8 | StructField("posey",FloatType(),False),
9 | StructField("posez",FloatType(),False),
10 | StructField("orientx",FloatType(),False),
11 | StructField("orienty",FloatType(),False),
12 | StructField("orientz",FloatType(),False),
13 | StructField("orientw",FloatType(),False)
14 | ])
15 |
16 | spark = SparkSession \
17 |     .builder \
18 |     .appName("SSKafka") \
19 |     .config("spark.jars.packages","org.apache.spark:spark-sql-kafka-0-10_2.12:3.2.0") \
20 |     .getOrCreate()
21 | spark.sparkContext.setLogLevel("ERROR")
22 |
23 |
24 | df = spark \
25 |     .readStream \
26 |     .format("kafka") \
27 |     .option("kafka.bootstrap.servers", "localhost:9092") \
28 |     .option("subscribe", "odometry") \
29 |     .option("delimeter",",") \
30 |     .option("startingOffsets", "latest") \
31 |     .load()
32 |
33 | df.printSchema()
34 |
35 | df1 = df.selectExpr("CAST(value AS STRING)").select(from_json(col("value"),odometrySchema).alias("data")).select("data.*")
36 | df1.printSchema()
37 |
38 | df1.writeStream \
39 |     .outputMode("update") \
40 |     .format("console") \
41 |     .option("truncate", False) \
42 |     .start() \
43 |     .awaitTermination()
44 |
--------------------------------------------------------------------------------
/spark-demo/streamingKafka2Cassandra.py:
--------------------------------------------------------------------------------
1 | from pyspark.sql import SparkSession
2 | from pyspark.sql.types import StructType,StructField,FloatType,IntegerType
3 | from pyspark.sql.functions import from_json,col
4 |
5 | odometrySchema = StructType([
6 | StructField("id",IntegerType(),False),
7 | StructField("posex",FloatType(),False),
8 | StructField("posey",FloatType(),False),
9 | StructField("posez",FloatType(),False),
10 | StructField("orientx",FloatType(),False),
11 | StructField("orienty",FloatType(),False),
12 | StructField("orientz",FloatType(),False),
13 | StructField("orientw",FloatType(),False)
14 | ])
15 |
16 | # .config("spark.jars.packages","com.datastax.spark:spark-cassandra-connector_2.12:3.0.0")\
17 | spark = SparkSession \
18 |     .builder \
19 |     .appName("SSKafka") \
20 |     .config("spark.jars.packages","org.apache.spark:spark-sql-kafka-0-10_2.12:3.2.0,com.datastax.spark:spark-cassandra-connector_2.12:3.0.0") \
21 |     .getOrCreate()
22 | spark.sparkContext.setLogLevel("ERROR")
23 |
24 |
25 | df = spark \
26 |     .readStream \
27 |     .format("kafka") \
28 |     .option("kafka.bootstrap.servers", "localhost:9092") \
29 |     .option("subscribe", "odometry") \
30 |     .option("delimeter",",") \
31 |     .option("startingOffsets", "latest") \
32 |     .load()
33 |
34 | df.printSchema()
35 |
36 | df1 = df.selectExpr("CAST(value AS STRING)").select(from_json(col("value"),odometrySchema).alias("data")).select("data.*")
37 | df1.printSchema()
38 |
39 |
40 | def writeToCassandra(writeDF, epochId):
41 |     writeDF.write \
42 |         .format("org.apache.spark.sql.cassandra")\
43 |         .mode('append')\
44 |         .options(table="odometry", keyspace="ros")\
45 |         .save()
46 |
47 | df1.writeStream \
48 | .option("spark.cassandra.connection.host","localhost:9042")\
49 | .foreachBatch(writeToCassandra) \
50 | .outputMode("update") \
51 | .start()\
52 | .awaitTermination()
53 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ## PySpark-Structured-Streaming-ROS-Kafka-ApacheSpark-Cassandra
2 |
3 | The purpose of this project is to demonstrate a structured streaming pipeline with Apache Spark. The process consists of the following steps:
4 |
5 | 0. Installation Process
6 | 1. Prepare a robotic simulation environment to generate data to feed into Kafka.
7 | 2. Prepare a Kafka and Zookeeper environment to store the incoming data.
8 | 3. Prepare a Cassandra environment to store the analyzed data.
9 | 4. Prepare an Apache Spark structured streaming pipeline and integrate it with Kafka and Cassandra.
10 | 5. Result
11 |
12 |
13 |
14 |
15 |
16 | ### 0. Installation Processes
17 | You can install all required components for this project using the steps below.
18 |
19 | #### Installation of ROS and Turtlebot3
20 | We won't cover the whole installation process of ROS and Turtlebot3; you can find all required information at [ROS & Turtlebot3 Installation](https://emanual.robotis.com/docs/en/platform/turtlebot3/quick-start/#pc-setup).
21 |
22 | After the installation is complete, you can test the robotic simulation environment using the following command:
23 | ```
24 | roslaunch turtlebot3_gazebo turtlebot3_world.launch
25 | ```
26 | You should see a view like the one given below.
27 |
28 |
29 |
30 |
31 | #### Installation of Kafka and Zookeeper
32 | We won't cover the whole installation process of Kafka and Zookeeper; you can find all required information at [Kafka & Zookeeper Installation](https://www.linode.com/docs/guides/how-to-install-apache-kafka-on-ubuntu/).
33 |
34 | After the installation is complete, you can test Kafka using the following commands:
35 | ```
36 | # Change your path to Kafka folder and then run
37 | bin/zookeeper-server-start.sh config/zookeeper.properties
38 |
39 | # Open second terminal and then run
40 | bin/kafka-server-start.sh config/server.properties
41 |
42 | # Create Kafka "demo" topic
43 | bin/kafka-topics.sh --create --topic demo --partitions 1 --replication-factor 1 --bootstrap-server localhost:9092
44 | ```
45 |
46 | Once you have created the "demo" topic, you can run [kafka-demo/producer.py](https://github.com/zekeriyyaa/PySpark-Structured-Streaming-ROS-Kafka-ApacheSpark-Cassandra/blob/main/kafka-demo/producer.py) and [kafka-demo/consumer.py](https://github.com/zekeriyyaa/PySpark-Structured-Streaming-ROS-Kafka-ApacheSpark-Cassandra/blob/main/kafka-demo/consumer.py) to check your setup.
47 | >:exclamation: If you haven't installed [kafka-python](https://kafka-python.readthedocs.io/en/master/), install it with the command below before running these scripts.
48 | ```
49 | pip install kafka-python
50 | ```
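producer.py (shown below) builds its dummy messages with `generate_message()` from [kafka-demo/data_generator.py](https://github.com/zekeriyyaa/PySpark-Structured-Streaming-ROS-Kafka-ApacheSpark-Cassandra/blob/main/kafka-demo/data_generator.py):
- data_generator.py
```python3
import random
import string

user_ids = list(range(1,101))
recipient_ids = list(range(1,101))

def generate_message()->dict:
    # Pick a random sender and recipient and attach a random 32-character payload.
    random_user_id = random.choice(user_ids)
    recipient_ids_copy = recipient_ids.copy()
    random_recepient_id = random.choice(recipient_ids_copy)
    message = "".join(random.choice(string.ascii_letters) for i in range(32))

    return {
        "user_id":random_user_id,
        "recipient_id":random_recepient_id,
        "message":message
    }
```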
51 | - producer.py
52 | ```python3
53 | import time,json,random
54 | from datetime import datetime
55 | from data_generator import generate_message
56 | from kafka import KafkaProducer
57 |
58 | def serializer(message):
59 |     return json.dumps(message).encode("utf-8")
60 |
61 | producer = KafkaProducer(
62 |     bootstrap_servers=["localhost:9092"],
63 |     value_serializer=serializer
64 | )
65 |
66 | if __name__=="__main__":
67 |     while True:
68 |         dummy_messages=generate_message()
69 |         print(f"Producing message {datetime.now()} | Message = {str(dummy_messages)}")
70 |         producer.send("demo",dummy_messages)
71 |         time.sleep(2)
72 | ```
73 | - consumer.py
74 | ```python3
75 | import json
76 | from kafka import KafkaConsumer
77 |
78 | if __name__=="__main__":
79 |     consumer=KafkaConsumer(
80 |         "demo",
81 |         bootstrap_servers="localhost:9092",
82 |         auto_offset_reset="latest" )
83 |
84 |     for msg in consumer:
85 |         print(json.loads(msg.value))
86 | ```
87 | You should see a view like the one given below after running the commands:
88 | ```
89 | python3 producer.py
90 | python3 consumer.py
91 | ```
92 |
93 |
94 |
95 |
96 |
97 | #### Installation of Cassandra
98 | We won't cover the whole installation process of Cassandra; you can find all required information at [Cassandra Installation](https://phoenixnap.com/kb/install-cassandra-on-ubuntu).
99 |
100 | After the installation is complete, you can try Cassandra using *cqlsh*; see this [link](https://www.tutorialspoint.com/cassandra/index.htm) for a quick introduction.
101 |
102 | #### Installation of Apache Spark
103 | We won't cover the whole installation process of Apache Spark; you can find all required information at [Apache Spark Installation](https://phoenixnap.com/kb/install-spark-on-ubuntu).
104 |
105 | After the installation is complete, you can try a quick example like the one [here](https://spark.apache.org/docs/latest/streaming-programming-guide.html).
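If you also install PySpark for Python (`pip install pyspark`), a minimal sketch to confirm the setup is to start a local session and print its version:
```python3
from pyspark.sql import SparkSession

# Start a local Spark session and print its version to confirm the installation.
spark = SparkSession.builder.appName("InstallCheck").master("local[*]").getOrCreate()
print(spark.version)
spark.stop()
```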
106 |
107 |
108 | ### 1. Prepare a robotic simulation environment
109 | [ROS (Robot Operating System)](http://wiki.ros.org/) allows us to design a robotic environment. We will use [Turtlebot3](https://emanual.robotis.com/docs/en/platform/turtlebot3/overview/), a robot in the [Gazebo](http://gazebosim.org/) simulation environment, to generate data for our use case. Turtlebot3 publishes its data on ROS topics, so we will subscribe to the relevant topic and send the data into Kafka.
110 |
111 | #### Run the simulation environment and analyze the data we will use
112 | Turtlebot3 publishes its odometry data on the ROS "odom" topic, so we can inspect the published data with the following commands:
113 | ```
114 | # run the simulation environment
115 | roslaunch turtlebot3_gazebo turtlebot3_world.launch
116 |
117 | # check the topic to see data
118 | rostopic echo /odom
119 | ```
120 | You should see a view like the one given below.
121 | ```
122 | header:
123 | seq: 10954
124 | stamp:
125 | secs: 365
126 | nsecs: 483000000
127 | frame_id: "odom"
128 | child_frame_id: "base_footprint"
129 | pose:
130 | pose:
131 | position:
132 | x: -2.000055643960576
133 | y: -0.4997879642933192
134 | z: -0.0010013932644100873
135 | orientation:
136 | x: -1.3486164084605e-05
137 | y: 0.0038530870521455017
138 | z: 0.0016676819550213058
139 | w: 0.9999911861487526
140 | covariance: [1e-05, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1e-05, 0.0, 0.0, 0.0, 0.0, 0.0,...
141 | twist:
142 | twist:
143 | linear:
144 | x: 5.8050405333644035e-08
145 | y: 7.749200305343809e-07
146 | z: 0.0
147 | angular:
148 | x: 0.0
149 | y: 0.0
150 | z: 1.15143519181447e-05
151 | covariance: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...
152 | ```
153 | In this use case, we are only interested in the following part of the data:
154 | ```
155 | position:
156 | x: -2.000055643960576
157 | y: -0.4997879642933192
158 | z: -0.0010013932644100873
159 | orientation:
160 | x: -1.3486164084605e-05
161 | y: 0.0038530870521455017
162 | z: 0.0016676819550213058
163 | w: 0.9999911861487526
164 | ```
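These fields live under `msg.pose.pose` of the `nav_msgs/Odometry` message. As a minimal sketch (assuming a sourced ROS environment with the simulation running), a subscriber that prints only this part looks like:
```python3
import rospy
from nav_msgs.msg import Odometry

def callback(msg):
    # Print only the position and orientation parts of each odometry message.
    print(msg.pose.pose.position)
    print(msg.pose.pose.orientation)

if __name__ == "__main__":
    # The node name here is just an example.
    rospy.init_node("odomEcho", anonymous=True)
    rospy.Subscriber("odom", Odometry, callback)
    rospy.spin()
```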
165 |
166 | ### 2. Prepare Kafka and Zookeeper environment
167 | The data produced by Turtlebot3 will be stored in Kafka.
168 |
169 | #### Prepare Kafka for Use Case
170 | First, we will create a new Kafka topic named *odometry* for the ROS odom data using the following commands:
171 | ```
172 | # Change your path to Kafka folder and then run
173 | bin/zookeeper-server-start.sh config/zookeeper.properties
174 |
175 | # Open second terminal and then run
176 | bin/kafka-server-start.sh config/server.properties
177 |
178 | # Create Kafka "odometry" topic for ROS odom data
179 | bin/kafka-topics.sh --create --topic odometry --partitions 1 --replication-factor 1 --bootstrap-server localhost:9092
180 | ```
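As a quick sanity check, you can also list the broker's topics from Python with kafka-python (a minimal sketch; the broker address matches the setup above):
```python3
from kafka import KafkaConsumer

# Connect to the local broker and list the topics it knows about; "odometry" should be among them.
consumer = KafkaConsumer(bootstrap_servers="localhost:9092")
print(consumer.topics())
```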
181 | Then we will write a ROS subscriber to listen to the data from Turtlebot3. Since we also need to send this data to Kafka, the same script contains a Kafka producer. We will use [ros/publish2kafka.py](https://github.com/zekeriyyaa/PySpark-Structured-Streaming-ROS-Kafka-ApacheSpark-Cassandra/blob/main/ros/publish2kafka.py) for this. The script subscribes to the odom topic and sends the content of the topic to Kafka.
182 | ```python3
183 | import rospy
184 | from nav_msgs.msg import Odometry
185 | import json
186 | from datetime import datetime
187 | from kafka import KafkaProducer
188 |
189 | count = 0
190 | def callback(msg):
191 |     global count
192 |     messages={
193 |         "id":count,
194 |         "posex":float("{0:.5f}".format(msg.pose.pose.position.x)),
195 |         "posey":float("{0:.5f}".format(msg.pose.pose.position.y)),
196 |         "posez":float("{0:.5f}".format(msg.pose.pose.position.z)),
197 |         "orientx":float("{0:.5f}".format(msg.pose.pose.orientation.x)),
198 |         "orienty":float("{0:.5f}".format(msg.pose.pose.orientation.y)),
199 |         "orientz":float("{0:.5f}".format(msg.pose.pose.orientation.z)),
200 |         "orientw":float("{0:.5f}".format(msg.pose.pose.orientation.w))
201 |     }
202 |
203 |     print(f"Producing message {datetime.now()} Message :\n {str(messages)}")
204 |     producer.send("odometry",messages)
205 |     count+=1
206 |
207 | producer = KafkaProducer(
208 |     bootstrap_servers=["localhost:9092"],
209 |     value_serializer=lambda message: json.dumps(message).encode('utf-8')
210 | )
211 |
212 | if __name__=="__main__":
213 |
214 |     rospy.init_node('odomSubscriber', anonymous=True)
215 |     rospy.Subscriber('odom',Odometry,callback)
216 |     rospy.spin()
217 | ```
218 | You can use [ros/readFromKafka.py](https://github.com/zekeriyyaa/PySpark-Structured-Streaming-ROS-Kafka-ApacheSpark-Cassandra/blob/main/ros/readFromKafka.py) to check that the data really reaches Kafka while ROS and publish2kafka.py are running.
219 | ```python3
220 | import json
221 | from kafka import KafkaConsumer
222 |
223 | if __name__=="__main__":
224 |
225 |     consumer=KafkaConsumer(
226 |         "odometry",
227 |         bootstrap_servers="localhost:9092",
228 |         auto_offset_reset="earliest"
229 |     )
230 |
231 |     for msg in consumer:
232 |         print(json.loads(msg.value))
233 | ```
234 | ### 3. Prepare Cassandra environment
235 |
236 | #### Prepare Cassandra for Use Case
237 | First, we will create a *keyspace* and then a *table* in it using the following commands:
238 | ```
239 | # Open the cqlsh and then run the command to create 'ros' keyspace
240 | cqlsh> CREATE KEYSPACE ros WITH replication = {'class':'SimpleStrategy', 'replication_factor' : 1};
241 |
242 | # Then, run the command to create the 'odometry' table in 'ros'
243 | cqlsh> create table ros.odometry(
244 | id int primary key,
245 | posex float,
246 | posey float,
247 | posez float,
248 | orientx float,
249 | orienty float,
250 | orientz float,
251 | orientw float);
252 |
253 | # Check your setup is correct
254 | cqlsh> DESCRIBE ros
255 |
256 | #and
257 | cqlsh> DESCRIBE ros.odometry
258 | ```
259 | > :warning: **The columns of the table have to match the Spark schema**: Be very careful here!
260 |
261 | ### 4. Prepare Apache Spark structured streaming pipeline
262 | You can write the analysis results either to the console or to Cassandra.
263 | #### (First Way) Prepare an Apache Spark Structured Streaming Pipeline from Kafka to Cassandra
264 | We will write a streaming script that reads the *odometry* topic from Kafka, analyzes it, and then writes the results to Cassandra. We will use [spark-demo/streamingKafka2Cassandra.py](https://github.com/zekeriyyaa/PySpark-Structured-Streaming-ROS-Kafka-ApacheSpark-Cassandra/blob/main/spark-demo/streamingKafka2Cassandra.py) to do it.
265 |
266 | First, we create a schema that matches the table we already defined in Cassandra.
267 | > :warning: **The content of the schema has to be the same as the Cassandra table**: Be very careful here!
268 |
269 | ```python3
270 | odometrySchema = StructType([
271 | StructField("id",IntegerType(),False),
272 | StructField("posex",FloatType(),False),
273 | StructField("posey",FloatType(),False),
274 | StructField("posez",FloatType(),False),
275 | StructField("orientx",FloatType(),False),
276 | StructField("orienty",FloatType(),False),
277 | StructField("orientz",FloatType(),False),
278 | StructField("orientw",FloatType(),False)
279 | ])
280 | ```
281 | Then, we create a SparkSession using two packages:
282 | - **Spark Kafka connector**: org.apache.spark:spark-sql-kafka-0-10_2.12:3.2.0
283 | - **Spark Cassandra connector**: com.datastax.spark:spark-cassandra-connector_2.12:3.0.0
284 | ```python3
285 | spark = SparkSession \
286 |     .builder \
287 |     .appName("SparkStructuredStreaming") \
288 |     .config("spark.jars.packages","org.apache.spark:spark-sql-kafka-0-10_2.12:3.2.0,com.datastax.spark:spark-cassandra-connector_2.12:3.0.0") \
289 |     .getOrCreate()
290 | ```
291 | > :warning: **If you use spark-submit you can specify the packages as:**
292 |
293 | - spark-submit --packages org.apache.spark:spark-sql-kafka-0-10_2.12:3.2.0,com.datastax.spark:spark-cassandra-connector_2.12:3.0.0 spark_cassandra.py
294 |
295 | To read the Kafka stream, we use **readStream()** and specify the Kafka configuration as shown below:
296 | ```python3
297 | df = spark \
298 |     .readStream \
299 |     .format("kafka") \
300 |     .option("kafka.bootstrap.servers", "localhost:9092") \
301 |     .option("subscribe", "odometry") \
302 |     .option("delimeter",",") \
303 |     .option("startingOffsets", "latest") \
304 |     .load()
305 | ```
306 | Since Kafka sends data as binary, we first need to cast the value to a string and parse the JSON using **selectExpr()** and **from_json()** as shown below:
307 | ```python3
308 | df1 = df.selectExpr("CAST(value AS STRING)").select(from_json(col("value"),odometrySchema).alias("data")).select("data.*")
309 | df1.printSchema()
310 | ```
311 | Although Apache Spark cannot yet write stream data directly to Cassandra with **writeStream()**, we can do it with **foreachBatch()** as shown below:
312 | ```python3
313 | def writeToCassandra(writeDF, _):
314 |     writeDF.write \
315 |         .format("org.apache.spark.sql.cassandra")\
316 |         .mode('append')\
317 |         .options(table="odometry", keyspace="ros")\
318 |         .save()
319 |
320 | df1.writeStream \
321 |     .option("spark.cassandra.connection.host","localhost:9042")\
322 |     .foreachBatch(writeToCassandra) \
323 |     .outputMode("update") \
324 |     .start()\
325 |     .awaitTermination()
326 | ```
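For longer runs you would typically also set a checkpoint location so the query can recover its Kafka offsets after a restart. A minimal sketch (the checkpoint path is only an example, not part of the repo):
```python3
df1.writeStream \
    .option("spark.cassandra.connection.host","localhost:9042")\
    .option("checkpointLocation","/tmp/ros-odometry-checkpoint")\
    .foreachBatch(writeToCassandra) \
    .outputMode("update") \
    .start()\
    .awaitTermination()
```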
327 | Putting it all together, we get the script [spark-demo/streamingKafka2Cassandra.py](https://github.com/zekeriyyaa/PySpark-Structured-Streaming-ROS-Kafka-ApacheSpark-Cassandra/blob/main/spark-demo/streamingKafka2Cassandra.py):
328 | ```python3
329 | from pyspark.sql import SparkSession
330 | from pyspark.sql.types import StructType,StructField,FloatType,IntegerType
331 | from pyspark.sql.functions import from_json,col
332 |
333 | odometrySchema = StructType([
334 | StructField("id",IntegerType(),False),
335 | StructField("posex",FloatType(),False),
336 | StructField("posey",FloatType(),False),
337 | StructField("posez",FloatType(),False),
338 | StructField("orientx",FloatType(),False),
339 | StructField("orienty",FloatType(),False),
340 | StructField("orientz",FloatType(),False),
341 | StructField("orientw",FloatType(),False)
342 | ])
343 |
344 | spark = SparkSession \
345 |     .builder \
346 |     .appName("SparkStructuredStreaming") \
347 |     .config("spark.jars.packages","org.apache.spark:spark-sql-kafka-0-10_2.12:3.2.0,com.datastax.spark:spark-cassandra-connector_2.12:3.0.0") \
348 |     .getOrCreate()
349 |
350 | spark.sparkContext.setLogLevel("ERROR")
351 |
352 |
353 | df = spark \
354 |     .readStream \
355 |     .format("kafka") \
356 |     .option("kafka.bootstrap.servers", "localhost:9092") \
357 |     .option("subscribe", "odometry") \
358 |     .option("delimeter",",") \
359 |     .option("startingOffsets", "latest") \
360 |     .load()
361 |
362 | df.printSchema()
363 |
364 | df1 = df.selectExpr("CAST(value AS STRING)").select(from_json(col("value"),odometrySchema).alias("data")).select("data.*")
365 | df1.printSchema()
366 |
367 | # It is possible to analyze the data here using df1
368 |
369 |
370 | def writeToCassandra(writeDF, _):
371 |     writeDF.write \
372 |         .format("org.apache.spark.sql.cassandra")\
373 |         .mode('append')\
374 |         .options(table="odometry", keyspace="ros")\
375 |         .save()
376 |
377 | df1.writeStream \
378 | .option("spark.cassandra.connection.host","localhost:9042")\
379 | .foreachBatch(writeToCassandra) \
380 | .outputMode("update") \
381 | .start()\
382 | .awaitTermination()
383 | ```
384 | #### (Second Way) Prepare an Apache Spark Structured Streaming Pipeline from Kafka to Console
385 | There are a few differences between writing to the console and writing to Cassandra.
386 | First of all, we don't need the Cassandra connector, so we remove it from the packages.
387 | ```python3
388 | spark = SparkSession \
389 |     .builder \
390 |     .appName("SSKafka") \
391 |     .config("spark.jars.packages","org.apache.spark:spark-sql-kafka-0-10_2.12:3.2.0") \
392 |     .getOrCreate()
393 | ```
394 | With **writeStream()** we can write stream data directly to the console.
395 | ```python3
396 | df1.writeStream \
397 |     .outputMode("update") \
398 |     .format("console") \
399 |     .option("truncate", False) \
400 |     .start() \
401 |     .awaitTermination()
402 | ```
403 | The rest of the process is the same as the previous one. Putting it all together, we get the script spark-demo/streamingKafka2Console.py:
404 | ```python3
405 | from pyspark.sql import SparkSession
406 | from pyspark.sql.types import StructType,StructField,LongType,IntegerType,FloatType,StringType
407 | from pyspark.sql.functions import split,from_json,col
408 |
409 | odometrySchema = StructType([
410 | StructField("id",IntegerType(),False),
411 | StructField("posex",FloatType(),False),
412 | StructField("posey",FloatType(),False),
413 | StructField("posez",FloatType(),False),
414 | StructField("orientx",FloatType(),False),
415 | StructField("orienty",FloatType(),False),
416 | StructField("orientz",FloatType(),False),
417 | StructField("orientw",FloatType(),False)
418 | ])
419 |
420 | spark = SparkSession \
421 |     .builder \
422 |     .appName("SSKafka") \
423 |     .config("spark.jars.packages","org.apache.spark:spark-sql-kafka-0-10_2.12:3.2.0") \
424 |     .getOrCreate()
425 | spark.sparkContext.setLogLevel("ERROR")
426 |
427 | df = spark \
428 |     .readStream \
429 |     .format("kafka") \
430 |     .option("kafka.bootstrap.servers", "localhost:9092") \
431 |     .option("subscribe", "odometry") \
432 |     .option("delimeter",",") \
433 |     .option("startingOffsets", "latest") \
434 |     .load()
435 |
436 | df1 = df.selectExpr("CAST(value AS STRING)").select(from_json(col("value"),odometrySchema).alias("data")).select("data.*")
437 | df1.printSchema()
438 |
439 | df1.writeStream \
440 |     .outputMode("update") \
441 |     .format("console") \
442 |     .option("truncate", False) \
443 |     .start() \
444 |     .awaitTermination()
445 | ```
446 | ### 5. Result
447 | After the whole pipeline is running, we get the data in our Cassandra table as shown below:
448 |
449 | You can run the following query to see the content of your table:
450 | ```
451 | # Open the cqlsh
452 | cqlsh
453 | # Then write select query to see content of the table
454 | cqlsh> select * from ros.odometry;
455 | ```
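If you prefer checking the table from Python instead of cqlsh, a minimal sketch using the DataStax driver (`pip install cassandra-driver`, an extra dependency that is not part of this repo) would be:
```python3
from cassandra.cluster import Cluster

# Connect to the local Cassandra node (default port 9042) and read a few rows back
# from the ros.odometry table created above.
cluster = Cluster(["127.0.0.1"])
session = cluster.connect("ros")
for row in session.execute("SELECT * FROM odometry LIMIT 5;"):
    print(row)
cluster.shutdown()
```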
456 |
457 |
458 |
459 |
460 |
--------------------------------------------------------------------------------