├── cassandra.png ├── kafka-demo.png ├── turtlebot3.png ├── data_pipeline.png ├── kafka-demo ├── consumer.py ├── data_generator.py └── producer.py ├── ros ├── readFromKafka.py └── publish2kafka.py ├── spark-demo ├── streamingKafka2Console.py └── streamingKafka2Cassandra.py └── README.md /cassandra.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zekeriyyaa/PySpark-Structured-Streaming-ROS-Kafka-ApacheSpark-Cassandra/HEAD/cassandra.png -------------------------------------------------------------------------------- /kafka-demo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zekeriyyaa/PySpark-Structured-Streaming-ROS-Kafka-ApacheSpark-Cassandra/HEAD/kafka-demo.png -------------------------------------------------------------------------------- /turtlebot3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zekeriyyaa/PySpark-Structured-Streaming-ROS-Kafka-ApacheSpark-Cassandra/HEAD/turtlebot3.png -------------------------------------------------------------------------------- /data_pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zekeriyyaa/PySpark-Structured-Streaming-ROS-Kafka-ApacheSpark-Cassandra/HEAD/data_pipeline.png -------------------------------------------------------------------------------- /kafka-demo/consumer.py: -------------------------------------------------------------------------------- 1 | import json 2 | from kafka import KafkaConsumer 3 | 4 | if __name__=="__main__": 5 | 6 | consumer=KafkaConsumer( 7 | "demo", 8 | bootstrap_servers="localhost:9092", 9 | auto_offset_reset="earliest" 10 | ) 11 | 12 | for msg in consumer: 13 | print(json.loads(msg.value)) 14 | -------------------------------------------------------------------------------- /ros/readFromKafka.py: -------------------------------------------------------------------------------- 1 | import json 2 | from kafka import KafkaConsumer 3 | 4 | if __name__=="__main__": 5 | 6 | consumer=KafkaConsumer( 7 | "odometry", 8 | bootstrap_servers="localhost:9092", 9 | auto_offset_reset="earliest" 10 | ) 11 | 12 | for msg in consumer: 13 | print(json.loads(msg.value)) 14 | -------------------------------------------------------------------------------- /kafka-demo/data_generator.py: -------------------------------------------------------------------------------- 1 | import random 2 | import string 3 | 4 | user_ids = list(range(1,101)) 5 | recipient_ids = list(range(1,101)) 6 | 7 | def generate_message()->dict: 8 | random_user_id = random.choice(user_ids) 9 | 10 | recipient_ids_copy = recipient_ids.copy() 11 | 12 | #recipient_ids.remove(random_user_id) 13 | random_recepient_id = random.choice(recipient_ids_copy) 14 | 15 | message = "".join(random.choice(string.ascii_letters) for i in range(32)) 16 | 17 | return { 18 | "user_id":random_user_id, 19 | "recipient_id":random_recepient_id, 20 | "message":message 21 | } -------------------------------------------------------------------------------- /kafka-demo/producer.py: -------------------------------------------------------------------------------- 1 | import time,json,random 2 | from datetime import datetime 3 | from data_generator import generate_message 4 | from kafka import KafkaProducer 5 | 6 | 7 | def serializer(message): 8 | return json.dumps(message).encode("utf-8") 9 | 10 | 11 | 
producer = KafkaProducer( 12 | bootstrap_servers=["localhost:9092"], 13 | value_serializer=serializer 14 | ) 15 | 16 | if __name__=="__main__": 17 | 18 | while True: 19 | 20 | dummy_messages=generate_message() 21 | 22 | print(f"Producing message {datetime.now()} | Message = {str(dummy_messages)}") 23 | producer.send("demo",dummy_messages) 24 | 25 | time.sleep(2) 26 | 27 | -------------------------------------------------------------------------------- /ros/publish2kafka.py: -------------------------------------------------------------------------------- 1 | import rospy 2 | from nav_msgs.msg import Odometry 3 | import json 4 | from datetime import datetime 5 | from kafka import KafkaProducer 6 | 7 | count = 0 8 | def callback(msg): 9 | global count 10 | messages={ 11 | "id":count, 12 | "posex":float("{0:.5f}".format(msg.pose.pose.position.x)), 13 | "posey":float("{0:.5f}".format(msg.pose.pose.position.y)), 14 | "posez":float("{0:.5f}".format(msg.pose.pose.position.z)), 15 | "orientx":float("{0:.5f}".format(msg.pose.pose.orientation.x)), 16 | "orienty":float("{0:.5f}".format(msg.pose.pose.orientation.y)), 17 | "orientz":float("{0:.5f}".format(msg.pose.pose.orientation.z)), 18 | "orientw":float("{0:.5f}".format(msg.pose.pose.orientation.w)) 19 | } 20 | 21 | print(f"Producing message {datetime.now()} Message :\n {str(messages)}") 22 | producer.send("odometry",messages) 23 | count+=1 24 | 25 | producer = KafkaProducer( 26 | bootstrap_servers=["localhost:9092"], 27 | value_serializer=lambda message: json.dumps(message).encode('utf-8') 28 | ) 29 | 30 | if __name__=="__main__": 31 | 32 | rospy.init_node('odomSubscriber', anonymous=True) 33 | rospy.Subscriber('odom',Odometry,callback) 34 | rospy.spin() 35 | 36 | -------------------------------------------------------------------------------- /spark-demo/streamingKafka2Console.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql import SparkSession 2 | from pyspark.sql.types import StructType,StructField,LongType,IntegerType,FloatType,StringType 3 | from pyspark.sql.functions import split,from_json,col 4 | 5 | odometrySchema = StructType([ 6 | StructField("id",IntegerType(),False), 7 | StructField("posex",FloatType(),False), 8 | StructField("posey",FloatType(),False), 9 | StructField("posez",FloatType(),False), 10 | StructField("orientx",FloatType(),False), 11 | StructField("orienty",FloatType(),False), 12 | StructField("orientz",FloatType(),False), 13 | StructField("orientw",FloatType(),False) 14 | ]) 15 | 16 | spark = SparkSession \ 17 | .builder \ 18 | .appName("SSKafka") \ 19 | .config("spark.jars.packages","org.apache.spark:spark-sql-kafka-0-10_2.12:3.2.0") \ 20 | .getOrCreate() 21 | spark.sparkContext.setLogLevel("ERROR") 22 | 23 | 24 | df = spark \ 25 | .readStream \ 26 | .format("kafka") \ 27 | .option("kafka.bootstrap.servers", "localhost:9092") \ 28 | .option("subscribe", "odometry") \ 29 | .option("delimeter",",") \ 30 | .option("startingOffsets", "latest") \ 31 | .load() 32 | 33 | df.printSchema() 34 | 35 | df1 = df.selectExpr("CAST(value AS STRING)").select(from_json(col("value"),odometrySchema).alias("data")).select("data.*") 36 | df1.printSchema() 37 | 38 | df1.writeStream \ 39 | .outputMode("update") \ 40 | .format("console") \ 41 | .option("truncate", False) \ 42 | .start() \ 43 | .awaitTermination() 44 | -------------------------------------------------------------------------------- /spark-demo/streamingKafka2Cassandra.py: 
-------------------------------------------------------------------------------- 1 | from pyspark.sql import SparkSession 2 | from pyspark.sql.types import StructType,StructField,FloatType,IntegerType 3 | from pyspark.sql.functions import from_json,col 4 | 5 | odometrySchema = StructType([ 6 | StructField("id",IntegerType(),False), 7 | StructField("posex",FloatType(),False), 8 | StructField("posey",FloatType(),False), 9 | StructField("posez",FloatType(),False), 10 | StructField("orientx",FloatType(),False), 11 | StructField("orienty",FloatType(),False), 12 | StructField("orientz",FloatType(),False), 13 | StructField("orientw",FloatType(),False) 14 | ]) 15 | 16 | # .config("spark.jars.packages","com.datastax.spark:spark-cassandra-connector_2.12:3.0.0")\ 17 | spark = SparkSession \ 18 | .builder \ 19 | .appName("SSKafka") \ 20 | .config("spark.jars.packages","org.apache.spark:spark-sql-kafka-0-10_2.12:3.2.0,com.datastax.spark:spark-cassandra-connector_2.12:3.0.0") \ 21 | .getOrCreate() 22 | spark.sparkContext.setLogLevel("ERROR") 23 | 24 | 25 | df = spark \ 26 | .readStream \ 27 | .format("kafka") \ 28 | .option("kafka.bootstrap.servers", "localhost:9092") \ 29 | .option("subscribe", "odometry") \ 30 | .option("delimeter",",") \ 31 | .option("startingOffsets", "latest") \ 32 | .load() 33 | 34 | df.printSchema() 35 | 36 | df1 = df.selectExpr("CAST(value AS STRING)").select(from_json(col("value"),odometrySchema).alias("data")).select("data.*") 37 | df1.printSchema() 38 | 39 | 40 | def writeToCassandra(writeDF, epochId): 41 | writeDF.write \ 42 | .format("org.apache.spark.sql.cassandra")\ 43 | .mode('append')\ 44 | .options(table="odometry", keyspace="ros")\ 45 | .save() 46 | 47 | df1.writeStream \ 48 | .option("spark.cassandra.connection.host","localhost:9042")\ 49 | .foreachBatch(writeToCassandra) \ 50 | .outputMode("update") \ 51 | .start()\ 52 | .awaitTermination() 53 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## PySpark-Structured-Streaming-ROS-Kafka-ApacheSpark-Cassandra 2 | 3 | The purpose of this project is to demonstrate a structured streaming pipeline with Apache Spark. The process consists of given steps: 4 | 5 | 0. Installation Process 6 | 1. Prepare a robotic simulation environment to generate data to feed into the Kafka. 7 | 2. Prepare Kafka and Zookeeper environment to store discrete data. 8 | 3. Prepare Cassandra environment to store analyzed data. 9 | 4. Prepare Apache Spark structured streaming pipeline, integrate with Kafka and Cassandra. 10 | 5. Result 11 | 12 |

13 | 14 |

15 | 16 | ### 0. Installation Process 17 | You can install all of the components required for this project by following the given steps. 18 | 19 | #### Installation of ROS and Turtlebot3 20 | We won't address the whole installation process of ROS and Turtlebot3, but you can access all required info from [ROS & Turtlebot3 Installation](https://emanual.robotis.com/docs/en/platform/turtlebot3/quick-start/#pc-setup). 21 | 22 | After all installations are completed, you can try out the robotic environment using the given command: 23 | ``` 24 | roslaunch turtlebot3_gazebo turtlebot3_world.launch 25 | ``` 26 | You should see a view like the one given below. 27 |

28 | 29 |

30 | 31 | #### Installation of Kafka and Zookeeper 32 | We won't address the whole installation process of Kafka and Zookeeper, but you can access all required info from [Kafka & Zookeeper Installation](https://www.linode.com/docs/guides/how-to-install-apache-kafka-on-ubuntu/). 33 | 34 | After all installations are completed, you can test Kafka using the given commands: 35 | ``` 36 | # Change your path to the Kafka folder and then run 37 | bin/zookeeper-server-start.sh config/zookeeper.properties 38 | 39 | # Open a second terminal and then run 40 | bin/kafka-server-start.sh config/server.properties 41 | 42 | # Create the Kafka "demo" topic 43 | bin/kafka-topics.sh --create --topic demo --partitions 1 --replication-factor 1 --bootstrap-server localhost:9092 44 | ``` 45 | 46 | Once you have created the "demo" topic, you can run [kafka-demo/producer.py](https://github.com/zekeriyyaa/PySpark-Structured-Streaming-ROS-Kafka-ApacheSpark-Cassandra/blob/main/kafka-demo/producer.py) and [kafka-demo/consumer.py](https://github.com/zekeriyyaa/PySpark-Structured-Streaming-ROS-Kafka-ApacheSpark-Cassandra/blob/main/kafka-demo/consumer.py) respectively to check your setup. 47 | >:exclamation: If you haven't installed [kafka-python](https://kafka-python.readthedocs.io/en/master/), install it with the given command and then run the given files. 48 | ``` 49 | pip install kafka-python 50 | ``` 51 | - producer.py 52 | ```python3 53 | import time,json,random 54 | from datetime import datetime 55 | from data_generator import generate_message 56 | from kafka import KafkaProducer 57 | 58 | def serializer(message): 59 | return json.dumps(message).encode("utf-8") 60 | 61 | producer = KafkaProducer( 62 | bootstrap_servers=["localhost:9092"], 63 | value_serializer=serializer 64 | ) 65 | 66 | if __name__=="__main__": 67 | while True: 68 | dummy_messages=generate_message() 69 | print(f"Producing message {datetime.now()} | Message = {str(dummy_messages)}") 70 | producer.send("demo",dummy_messages) 71 | time.sleep(2) 72 | ``` 73 | - consumer.py 74 | ```python3 75 | import json 76 | from kafka import KafkaConsumer 77 | 78 | if __name__=="__main__": 79 | consumer=KafkaConsumer( 80 | "demo", 81 | bootstrap_servers="localhost:9092", 82 | auto_offset_reset="earliest" ) 83 | 84 | for msg in consumer: 85 | print(json.loads(msg.value)) 86 | ``` 87 | You should see a view like the one given below after running the commands: 88 | ``` 89 | python3 producer.py 90 | python3 consumer.py 91 | ``` 92 | 93 |

94 | 95 |

96 | 97 | #### Installation of Cassandra 98 | We won't address the whole installation process of Cassandra, but you can access all required info from [Cassandra Installation](https://phoenixnap.com/kb/install-cassandra-on-ubuntu). 99 | 100 | After all installations are completed, you can try Cassandra out using *cqlsh*. You can check this [link](https://www.tutorialspoint.com/cassandra/index.htm). 101 | 102 | #### Installation of Apache Spark 103 | We won't address the whole installation process of Apache Spark, but you can access all required info from [Apache Spark Installation](https://phoenixnap.com/kb/install-spark-on-ubuntu). 104 | 105 | After all installations are completed, you can work through a quick example like the one given [here](https://spark.apache.org/docs/latest/streaming-programming-guide.html). 106 | 107 | 108 | ### 1. Prepare a robotic simulation environment 109 | [ROS (Robot Operating System)](http://wiki.ros.org/) allows us to design a robotic environment. We will use [Turtlebot3](https://emanual.robotis.com/docs/en/platform/turtlebot3/overview/), a robot in the [Gazebo](http://gazebosim.org/) simulation environment, to generate data for our use case. Turtlebot3 publishes its data on ROS topics. Therefore, we will subscribe to the relevant topic and send its data into Kafka. 110 | 111 | #### Run the simulation environment and analyze the data we will use 112 | Turtlebot3 publishes its odometry data on the ROS "odom" topic. So, we can see the published data with the given commands: 113 | ``` 114 | # run the simulation environment 115 | roslaunch turtlebot3_gazebo turtlebot3_world.launch 116 | 117 | # check the topic to see data 118 | rostopic echo /odom 119 | ``` 120 | You should see a view like the one given below. 121 | ``` 122 | header: 123 | seq: 10954 124 | stamp: 125 | secs: 365 126 | nsecs: 483000000 127 | frame_id: "odom" 128 | child_frame_id: "base_footprint" 129 | pose: 130 | pose: 131 | position: 132 | x: -2.000055643960576 133 | y: -0.4997879642933192 134 | z: -0.0010013932644100873 135 | orientation: 136 | x: -1.3486164084605e-05 137 | y: 0.0038530870521455017 138 | z: 0.0016676819550213058 139 | w: 0.9999911861487526 140 | covariance: [1e-05, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1e-05, 0.0, 0.0, 0.0, 0.0, 0.0,... 141 | twist: 142 | twist: 143 | linear: 144 | x: 5.8050405333644035e-08 145 | y: 7.749200305343809e-07 146 | z: 0.0 147 | angular: 148 | x: 0.0 149 | y: 0.0 150 | z: 1.15143519181447e-05 151 | covariance: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,... 152 | ``` 153 | In this use case, we are only interested in the given part of the data: 154 | ``` 155 | position: 156 | x: -2.000055643960576 157 | y: -0.4997879642933192 158 | z: -0.0010013932644100873 159 | orientation: 160 | x: -1.3486164084605e-05 161 | y: 0.0038530870521455017 162 | z: 0.0016676819550213058 163 | w: 0.9999911861487526 164 | ``` 165 | 166 | ### 2. Prepare Kafka and Zookeeper environment 167 | The data produced by Turtlebot3 will be stored in a Kafka cluster.
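Before wiring the odom topic into Kafka, you can confirm from Python that the pose fields above are reachable. The snippet below is a minimal one-shot probe and is not part of the original project files (the node name `odomProbe` is our own choice); it assumes the Gazebo simulation from step 1 is running.

```python3
import rospy
from nav_msgs.msg import Odometry

# One-shot check: wait for a single Odometry message and print only the
# pose fields that will later be sent to Kafka.
rospy.init_node("odomProbe", anonymous=True)
msg = rospy.wait_for_message("odom", Odometry, timeout=10)
pos, ori = msg.pose.pose.position, msg.pose.pose.orientation
print(f"position:    x={pos.x:.5f} y={pos.y:.5f} z={pos.z:.5f}")
print(f"orientation: x={ori.x:.5f} y={ori.y:.5f} z={ori.z:.5f} w={ori.w:.5f}")
```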
168 | 169 | #### Prepare Kafka for Use Case 170 | First of all, we will create a new Kafka topic named *odometry* for the ROS odom data using the given commands: 171 | ``` 172 | # Change your path to the Kafka folder and then run 173 | bin/zookeeper-server-start.sh config/zookeeper.properties 174 | 175 | # Open a second terminal and then run 176 | bin/kafka-server-start.sh config/server.properties 177 | 178 | # Create the Kafka "odometry" topic for the ROS odom data 179 | bin/kafka-topics.sh --create --topic odometry --partitions 1 --replication-factor 1 --bootstrap-server localhost:9092 180 | ``` 181 | Then we will write a ROS subscriber to listen to the data from Turtlebot3. Since we also need to send that data to Kafka, a Kafka producer is embedded in the same script. We will use [ros/publish2kafka.py](https://github.com/zekeriyyaa/PySpark-Structured-Streaming-ROS-Kafka-ApacheSpark-Cassandra/blob/main/ros/publish2kafka.py) to do it. This script subscribes to the odom topic and sends the content of the topic to Kafka. 182 | ```python3 183 | import rospy 184 | from nav_msgs.msg import Odometry 185 | import json 186 | from datetime import datetime 187 | from kafka import KafkaProducer 188 | 189 | count = 0 190 | def callback(msg): 191 | global count 192 | messages={ 193 | "id":count, 194 | "posex":float("{0:.5f}".format(msg.pose.pose.position.x)), 195 | "posey":float("{0:.5f}".format(msg.pose.pose.position.y)), 196 | "posez":float("{0:.5f}".format(msg.pose.pose.position.z)), 197 | "orientx":float("{0:.5f}".format(msg.pose.pose.orientation.x)), 198 | "orienty":float("{0:.5f}".format(msg.pose.pose.orientation.y)), 199 | "orientz":float("{0:.5f}".format(msg.pose.pose.orientation.z)), 200 | "orientw":float("{0:.5f}".format(msg.pose.pose.orientation.w)) 201 | } 202 | 203 | print(f"Producing message {datetime.now()} Message :\n {str(messages)}") 204 | producer.send("odometry",messages) 205 | count+=1 206 | 207 | producer = KafkaProducer( 208 | bootstrap_servers=["localhost:9092"], 209 | value_serializer=lambda message: json.dumps(message).encode('utf-8') 210 | ) 211 | 212 | if __name__=="__main__": 213 | 214 | rospy.init_node('odomSubscriber', anonymous=True) 215 | rospy.Subscriber('odom',Odometry,callback) 216 | rospy.spin() 217 | ``` 218 | You can use [ros/readFromKafka.py](https://github.com/zekeriyyaa/PySpark-Structured-Streaming-ROS-Kafka-ApacheSpark-Cassandra/blob/main/ros/readFromKafka.py) to check that the data really reaches Kafka while ROS and publish2kafka.py are running. 219 | ```python3 220 | import json 221 | from kafka import KafkaConsumer 222 | 223 | if __name__=="__main__": 224 | 225 | consumer=KafkaConsumer( 226 | "odometry", 227 | bootstrap_servers="localhost:9092", 228 | auto_offset_reset="earliest" 229 | ) 230 | 231 | for msg in consumer: 232 | print(json.loads(msg.value)) 233 | ```
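If you would rather have a check that stops on its own instead of looping forever, the variation below is a small sketch using the same kafka-python API; `consumer_timeout_ms` makes the iterator exit once no new message has arrived for a few seconds, and the message count is our own illustrative addition.

```python3
import json
from kafka import KafkaConsumer

# Bounded sanity check: consume "odometry" from the beginning and stop
# automatically after 5 seconds without new messages.
consumer = KafkaConsumer(
    "odometry",
    bootstrap_servers="localhost:9092",
    auto_offset_reset="earliest",
    consumer_timeout_ms=5000,
    value_deserializer=lambda v: json.loads(v)
)

count = 0
for msg in consumer:
    count += 1
print(f"received {count} odometry messages")
```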
234 | ### 3. Prepare Cassandra environment 235 | 236 | #### Prepare Cassandra for Use Case 237 | Initially, we will create a *keyspace* and then a *table* in it using the given commands: 238 | ``` 239 | # Open cqlsh and then run the command to create the 'ros' keyspace 240 | cqlsh> CREATE KEYSPACE ros WITH replication = {'class':'SimpleStrategy', 'replication_factor' : 1}; 241 | 242 | # Then, run the command to create the 'odometry' table in 'ros' 243 | cqlsh> create table ros.odometry( 244 | id int primary key, 245 | posex float, 246 | posey float, 247 | posez float, 248 | orientx float, 249 | orienty float, 250 | orientz float, 251 | orientw float); 252 | 253 | # Check your setup is correct 254 | cqlsh> DESCRIBE ros 255 | 256 | #and 257 | cqlsh> DESCRIBE ros.odometry 258 | ``` 259 | > :warning: **The columns of the table have to match the Spark schema**: Be very careful here! 260 | 261 | ### 4. Prepare Apache Spark structured streaming pipeline 262 | You can write the analysis results either to the console or to Cassandra. 263 | #### (First Way) Prepare Apache Spark Structured Streaming Pipeline Kafka to Cassandra 264 | We will write a streaming script that reads the *odometry* topic from Kafka, analyzes it and then writes the results to Cassandra. We will use [spark-demo/streamingKafka2Cassandra.py](https://github.com/zekeriyyaa/PySpark-Structured-Streaming-ROS-Kafka-ApacheSpark-Cassandra/blob/main/spark-demo/streamingKafka2Cassandra.py) to do it. 265 | 266 | First of all, we create a schema that matches the table we already defined in Cassandra. 267 | > :warning: **The schema has to be the same as the Cassandra table**: Be very careful here! 268 | 269 | ```python3 270 | odometrySchema = StructType([ 271 | StructField("id",IntegerType(),False), 272 | StructField("posex",FloatType(),False), 273 | StructField("posey",FloatType(),False), 274 | StructField("posez",FloatType(),False), 275 | StructField("orientx",FloatType(),False), 276 | StructField("orienty",FloatType(),False), 277 | StructField("orientz",FloatType(),False), 278 | StructField("orientw",FloatType(),False) 279 | ]) 280 | ``` 281 | Then, we create a Spark session using two packages: 282 | - **for the spark kafka connector** : org.apache.spark:spark-sql-kafka-0-10_2.12:3.2.0 283 | - **for the spark cassandra connector** : com.datastax.spark:spark-cassandra-connector_2.12:3.0.0 284 | ```python3 285 | spark = SparkSession \ 286 | .builder \ 287 | .appName("SparkStructuredStreaming") \ 288 | .config("spark.jars.packages","org.apache.spark:spark-sql-kafka-0-10_2.12:3.2.0,com.datastax.spark:spark-cassandra-connector_2.12:3.0.0") \ 289 | .getOrCreate() 290 | ``` 291 | > :warning: **If you use spark-submit, you can specify the packages as:** 292 | 293 | - spark-submit --packages org.apache.spark:spark-sql-kafka-0-10_2.12:3.2.0,com.datastax.spark:spark-cassandra-connector_2.12:3.0.0 streamingKafka2Cassandra.py 294 | 295 | In order to read the Kafka stream, we use **readStream()** and specify the Kafka configuration as given below: 296 | ```python3 297 | df = spark \ 298 | .readStream \ 299 | .format("kafka") \ 300 | .option("kafka.bootstrap.servers", "localhost:9092") \ 301 | .option("subscribe", "odometry") \ 302 | .option("delimeter",",") \ 303 | .option("startingOffsets", "latest") \ 304 | .load() 305 | ``` 306 | Since Kafka sends data as binary, we first need to cast the binary value to String and then unpack it against the schema using **selectExpr()** and **from_json()** as given below: 307 | ```python3 308 | df1 = df.selectExpr("CAST(value AS STRING)").select(from_json(col("value"),odometrySchema).alias("data")).select("data.*") 309 | df1.printSchema() 310 | ```
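At this point df1 already has the flat schema defined above, so any Structured Streaming transformation can be applied before the data reaches the sink. The snippet below is purely illustrative and not part of the original pipeline (the derived column name `dist` is our own addition); it computes the robot's planar distance from the origin.

```python3
from pyspark.sql.functions import sqrt, col

# Illustrative only: add a derived column with the planar distance from the origin.
# df2 could be written out in place of df1, provided the Cassandra table
# also had a matching "dist" column.
df2 = df1.withColumn("dist", sqrt(col("posex") ** 2 + col("posey") ** 2))
```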
311 | Although Apache Spark isn't yet capable of writing stream data directly to Cassandra with **writeStream()**, we can do it using **foreachBatch()** as given below: 312 | ```python3 313 | def writeToCassandra(writeDF, _): 314 | writeDF.write \ 315 | .format("org.apache.spark.sql.cassandra")\ 316 | .mode('append')\ 317 | .options(table="odometry", keyspace="ros")\ 318 | .save() 319 | 320 | df1.writeStream \ 321 | .option("spark.cassandra.connection.host","localhost:9042")\ 322 | .foreachBatch(writeToCassandra) \ 323 | .outputMode("update") \ 324 | .start()\ 325 | .awaitTermination() 326 | ``` 327 | Putting it all together, we get the given script [spark-demo/streamingKafka2Cassandra.py](https://github.com/zekeriyyaa/PySpark-Structured-Streaming-ROS-Kafka-ApacheSpark-Cassandra/blob/main/spark-demo/streamingKafka2Cassandra.py): 328 | ```python3 329 | from pyspark.sql import SparkSession 330 | from pyspark.sql.types import StructType,StructField,FloatType,IntegerType 331 | from pyspark.sql.functions import from_json,col 332 | 333 | odometrySchema = StructType([ 334 | StructField("id",IntegerType(),False), 335 | StructField("posex",FloatType(),False), 336 | StructField("posey",FloatType(),False), 337 | StructField("posez",FloatType(),False), 338 | StructField("orientx",FloatType(),False), 339 | StructField("orienty",FloatType(),False), 340 | StructField("orientz",FloatType(),False), 341 | StructField("orientw",FloatType(),False) 342 | ]) 343 | 344 | spark = SparkSession \ 345 | .builder \ 346 | .appName("SparkStructuredStreaming") \ 347 | .config("spark.jars.packages","org.apache.spark:spark-sql-kafka-0-10_2.12:3.2.0,com.datastax.spark:spark-cassandra-connector_2.12:3.0.0") \ 348 | .getOrCreate() 349 | 350 | spark.sparkContext.setLogLevel("ERROR") 351 | 352 | 353 | df = spark \ 354 | .readStream \ 355 | .format("kafka") \ 356 | .option("kafka.bootstrap.servers", "localhost:9092") \ 357 | .option("subscribe", "odometry") \ 358 | .option("delimeter",",") \ 359 | .option("startingOffsets", "latest") \ 360 | .load() 361 | 362 | df.printSchema() 363 | 364 | df1 = df.selectExpr("CAST(value AS STRING)").select(from_json(col("value"),odometrySchema).alias("data")).select("data.*") 365 | df1.printSchema() 366 | 367 | # It is possible to analyze the data here using df1 368 | 369 | 370 | def writeToCassandra(writeDF, _): 371 | writeDF.write \ 372 | .format("org.apache.spark.sql.cassandra")\ 373 | .mode('append')\ 374 | .options(table="odometry", keyspace="ros")\ 375 | .save() 376 | 377 | df1.writeStream \ 378 | .option("spark.cassandra.connection.host","localhost:9042")\ 379 | .foreachBatch(writeToCassandra) \ 380 | .outputMode("update") \ 381 | .start()\ 382 | .awaitTermination() 383 | ``` 384 | #### (Second Way) Prepare Apache Spark Structured Streaming Pipeline Kafka to Console 385 | There are a few differences between writing to the console and writing to Cassandra. 386 | First of all, we don't need the Cassandra connector, so we remove it from the packages. 387 | ```python3 388 | spark = SparkSession \ 389 | .builder \ 390 | .appName("SSKafka") \ 391 | .config("spark.jars.packages","org.apache.spark:spark-sql-kafka-0-10_2.12:3.2.0") \ 392 | .getOrCreate() 393 | ```
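> :warning: Likewise, if you run this version with spark-submit, only the Kafka package needs to be passed:

- spark-submit --packages org.apache.spark:spark-sql-kafka-0-10_2.12:3.2.0 streamingKafka2Console.py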
394 | With **writeStream()** we can write the stream data directly to the console. 395 | ```python3 396 | df1.writeStream \ 397 | .outputMode("update") \ 398 | .format("console") \ 399 | .option("truncate", False) \ 400 | .start() \ 401 | .awaitTermination() 402 | ``` 403 | The rest of the process is the same as in the previous way. Putting it all together, we get the given script spark-demo/streamingKafka2Console.py: 404 | ```python3 405 | from pyspark.sql import SparkSession 406 | from pyspark.sql.types import StructType,StructField,LongType,IntegerType,FloatType,StringType 407 | from pyspark.sql.functions import split,from_json,col 408 | 409 | odometrySchema = StructType([ 410 | StructField("id",IntegerType(),False), 411 | StructField("posex",FloatType(),False), 412 | StructField("posey",FloatType(),False), 413 | StructField("posez",FloatType(),False), 414 | StructField("orientx",FloatType(),False), 415 | StructField("orienty",FloatType(),False), 416 | StructField("orientz",FloatType(),False), 417 | StructField("orientw",FloatType(),False) 418 | ]) 419 | 420 | spark = SparkSession \ 421 | .builder \ 422 | .appName("SSKafka") \ 423 | .config("spark.jars.packages","org.apache.spark:spark-sql-kafka-0-10_2.12:3.2.0") \ 424 | .getOrCreate() 425 | spark.sparkContext.setLogLevel("ERROR") 426 | 427 | df = spark \ 428 | .readStream \ 429 | .format("kafka") \ 430 | .option("kafka.bootstrap.servers", "localhost:9092") \ 431 | .option("subscribe", "odometry") \ 432 | .option("delimeter",",") \ 433 | .option("startingOffsets", "latest") \ 434 | .load() 435 | 436 | df1 = df.selectExpr("CAST(value AS STRING)").select(from_json(col("value"),odometrySchema).alias("data")).select("data.*") 437 | df1.printSchema() 438 | 439 | df1.writeStream \ 440 | .outputMode("update") \ 441 | .format("console") \ 442 | .option("truncate", False) \ 443 | .start() \ 444 | .awaitTermination() 445 | ``` 446 | ### 5. Result 447 | After the whole process is done, we have the data in our Cassandra table as given below: 448 | 449 | You can run the given commands to see your table: 450 | ``` 451 | # Open the cqlsh 452 | cqlsh 453 | # Then run a select query to see the content of the table 454 | cqlsh> select * from ros.odometry; 455 | ``` 456 |
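As an optional cross-check, you can also read the table back through Spark and count the stored rows. This is a sketch only; it assumes a Spark session created with the same spark-cassandra-connector package as in the first way.

```python3
# Optional cross-check: read the Cassandra table back as a static DataFrame.
rows = spark.read \
    .format("org.apache.spark.sql.cassandra") \
    .option("spark.cassandra.connection.host", "localhost:9042") \
    .options(table="odometry", keyspace="ros") \
    .load()

print(rows.count())
rows.orderBy("id", ascending=False).show(5)   # the five most recent odometry records
```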

457 | 458 |

459 | 460 | --------------------------------------------------------------------------------