├── .gitignore
├── install.sh
├── cdsw_iot.py
├── particlespark.conf
├── total_data_count.py
├── total_event_count.py
├── iot_demo.py
├── data_count.py
├── event_count.py
├── particlespark.py
└── README.md

/.gitignore:
--------------------------------------------------------------------------------
particle_spark.conf
--------------------------------------------------------------------------------
/install.sh:
--------------------------------------------------------------------------------
#Environment prep - the first argument is the ZooKeeper host
ZK="$1:2181"

#Install pip
sudo wget https://bootstrap.pypa.io/get-pip.py
sudo python get-pip.py

#Install Python dependencies
sudo pip install sseclient
sudo pip install kafka-python

#Kafka set up (uses the host:port ZooKeeper string built above)
kafka-topics --create --zookeeper $ZK --replication-factor 1 --partitions 1 --topic particle
--------------------------------------------------------------------------------
/cdsw_iot.py:
--------------------------------------------------------------------------------
#Set up packages needed in workbench
!pip install kafka-python --quiet
!pip install ConfigParser --quiet
!pip install thrift==0.9.3
!conda install -y ibis-framework -c conda-forge
!pip install thrift_sasl
!pip install requests-kerberos


import ConfigParser, json, os

#Set up configuration
Config = ConfigParser.ConfigParser()
Config.read('particlespark.conf')
kafka_broker = Config.get('Kafka','KafkaBrokers')
kudu_master = Config.get('Kudu','KuduMaster')
zookeeper = Config.get('Zookeeper','Zookeeper')
principal = Config.get('Kerberos','Principal')
keytab = Config.get('Kerberos','Keytab')
impala_host = Config.get('Impala','Daemon').split(':')[0]
#The Impala daemon port is read as a string, so cast it to an int for ibis
impala_port = int(Config.get('Impala','Daemon').split(':')[1])


#Creating tables
import ibis

con = ibis.impala.connect(host=impala_host, port=impala_port,
                          database='default', auth_mechanism='GSSAPI',
                          use_ssl=False)
--------------------------------------------------------------------------------
/particlespark.conf:
--------------------------------------------------------------------------------
#Defines options relating to Particle
[Particle]
ParticleUri=https://api.particle.io/v1/events ; The API endpoint. This default is for all public events.
ApiKey=yourkeyhere ; This is an API key for accessing the Particle API.

#Defines options relating to Kafka
[Kafka]
KafkaBrokers=ip-10-0-0-243.ec2.internal:9092 ; Location of a Kafka broker

#Defines options relating to Kudu
[Kudu]
KuduMaster=ip-10-0-0-243.ec2.internal:7051 ; Location of a Kudu master

#Defines options relating to Zookeeper
[Zookeeper]
Zookeeper=ip-10-0-0-243.ec2.internal:2181

#Defines options relating to Impala
[Impala]
Daemon=cdh-0-160.cloudera.internal:21050

#Defines Kerberos-related options
[Kerberos]
Principal=navi@CLOUDERA.INTERNAL ; Kerberos principal
Keytab=/home/cdsw/navi.keytab ; Fully qualified path to keytab

#Defines script-related options
[Options]
PrintEvents=enabled ; Enables printing events to the console as they come in. Valid values are enabled and disabled.
BatchSize=1000 ; Number of events between pauses. In low-resource Hadoop environments, high values can cause memory exceptions.
BatchPause=0 ; Number of seconds between event batches.
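#For reference (descriptive only, not parsed by the scripts): which sections each script reads
# particlespark.py                            -> Particle, Kafka, Options
# iot_demo.py, event_count.py, data_count.py  -> Kafka, Kudu
# total_event_count.py, total_data_count.py   -> Kudu
# cdsw_iot.py                                 -> Kafka, Kudu, Zookeeper, Impala, Kerberos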
--------------------------------------------------------------------------------
/total_data_count.py:
--------------------------------------------------------------------------------
import json, sys, ConfigParser
from operator import add
from pyspark import SparkContext
from pyspark import SparkConf
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
from pyspark.storagelevel import StorageLevel
from pyspark.sql import SQLContext

conf = (SparkConf().setMaster("yarn-client").setAppName("Total Data Count"))
sc = SparkContext(conf = conf)
sqc = SQLContext(sc)

Config = ConfigParser.ConfigParser()
Config.read('particlespark.conf')
kudu_master = Config.get('Kudu','KuduMaster')
kudu_all_data_table = "particle_test"
kudu_counts_table = "particle_counts_total_data"

#Read from Kudu table particle_test
kudu_events_df = sqc.read.format('org.apache.kudu.spark.kudu').option('kudu.master',kudu_master).option('kudu.table',kudu_all_data_table).load()

#Grab only the data column, split it by white space and count up each unique key
kudu_events_count = kudu_events_df.map(lambda x: x.data).flatMap(lambda x: x.split(' ')).map(lambda x: (x, 1)).reduceByKey(add)

#Convert event counts to DataFrame from RDD
kudu_events_count_df = kudu_events_count.toDF(['data_word','count'])

kudu_events_count_df.show()

#Write the event word counts to Kudu particle_counts_total_data table
kudu_events_count_df.write.format('org.apache.kudu.spark.kudu').option('kudu.master',kudu_master).option('kudu.table',kudu_counts_table).mode("append").save()
--------------------------------------------------------------------------------
/total_event_count.py:
--------------------------------------------------------------------------------
import json, sys, ConfigParser
from operator import add
from pyspark import SparkContext
from pyspark import SparkConf
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
from pyspark.storagelevel import StorageLevel
from pyspark.sql import SQLContext

conf = (SparkConf().setMaster("yarn-client").setAppName("Total Event Count"))
sc = SparkContext(conf = conf)
sqc = SQLContext(sc)

Config = ConfigParser.ConfigParser()
Config.read('particlespark.conf')
kudu_master = Config.get('Kudu','KuduMaster')

kudu_all_events_table = "particle_test"
kudu_counts_table = "particle_counts_total"

#Read from Kudu table particle_test
kudu_events_df = sqc.read.format('org.apache.kudu.spark.kudu').option('kudu.master',kudu_master).option('kudu.table',kudu_all_events_table).load()

#Grab only the event column, split it by white space and count up each unique key
kudu_events_count = kudu_events_df.map(lambda x: x.event).flatMap(lambda x: x.split(' ')).map(lambda x: (x, 1)).reduceByKey(add)

#Convert event counts to DataFrame from RDD
kudu_events_count_df = kudu_events_count.toDF(['event_word','count'])

kudu_events_count_df.show()

#Write the event word counts to Kudu particle_counts_total table
kudu_events_count_df.write.format('org.apache.kudu.spark.kudu').option('kudu.master',kudu_master).option('kudu.table',kudu_counts_table).mode("append").save()
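
#Example submission (mirrors the spark-submit commands in the README; the jar
#and package versions shown there are the ones this repo was built against):
#  spark-submit --master yarn --jars kudu-spark_2.10-0.10.0.jar \
#    --packages org.apache.spark:spark-streaming-kafka_2.10:1.6.0 total_event_count.py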
--------------------------------------------------------------------------------
/iot_demo.py:
--------------------------------------------------------------------------------
import json, ConfigParser
from pyspark import SparkContext
from pyspark import SparkConf
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
from pyspark.storagelevel import StorageLevel
from pyspark.sql import SQLContext

#conf = (SparkConf().setMaster("yarn-cluster").setAppName("Particle Stream to Kudu"))
conf = (SparkConf().setMaster("yarn-client").setAppName("Particle Stream to Kudu"))
sc = SparkContext(conf = conf)
ssc = StreamingContext(sc,5)

#Set up for Kafka and Kudu
Config = ConfigParser.ConfigParser()
Config.read('particlespark.conf')
kafka_broker = Config.get('Kafka','KafkaBrokers')
kudu_master = Config.get('Kudu','KuduMaster')
kudu_table = "particle_test"
topic = "particle"

#Lazily instantiate a singleton SQLContext
def getSqlContextInstance(sparkContext):
    if ('sqlContextSingletonInstance' not in globals()):
        globals()['sqlContextSingletonInstance'] = SQLContext(sc)
    return globals()['sqlContextSingletonInstance']

#Insert data into Kudu
def insert_into_kudu(time,rdd):
    sqc = getSqlContextInstance(rdd.context)
    kudu_df = sqc.jsonRDD(rdd)
    kudu_df.show()
    kudu_df.write.format('org.apache.kudu.spark.kudu').option('kudu.master',kudu_master).option('kudu.table',kudu_table).mode("append").save()

#Create a Kafka DStream by reading from our topic
kafkaStream = KafkaUtils.createDirectStream(ssc, [topic], {"metadata.broker.list": kafka_broker})

#Get rid of the key, which is null anyway
json_kafka_stream = kafkaStream.map(lambda x: x[1])

#For each RDD in the DStream, insert it into the Kudu table
json_kafka_stream.foreachRDD(insert_into_kudu)

ssc.start()
ssc.awaitTermination()
--------------------------------------------------------------------------------
/data_count.py:
--------------------------------------------------------------------------------
import json, sys, ConfigParser
from operator import add
from pyspark import SparkContext
from pyspark import SparkConf
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
from pyspark.storagelevel import StorageLevel
from pyspark.sql import SQLContext

conf = (SparkConf().setMaster("yarn-client").setAppName("Count Data Word 20 Sec"))
sc = SparkContext(conf = conf)
ssc = StreamingContext(sc,20)

Config = ConfigParser.ConfigParser()
Config.read('particlespark.conf')
kafka_broker = Config.get('Kafka','KafkaBrokers')
kudu_master = Config.get('Kudu','KuduMaster')
topic = "particle"
kudu_table = "particle_counts_last_20_data"

#Lazily instantiate a singleton SQLContext
def getSqlContextInstance(sparkContext):
    if ('sqlContextSingletonInstance' not in globals()):
        globals()['sqlContextSingletonInstance'] = SQLContext(sc)
    return globals()['sqlContextSingletonInstance']

#Insert the per-batch counts into Kudu
def insert_into_kudu(time,rdd):
    sqc = getSqlContextInstance(rdd.context)
    kudu_df = rdd.toDF(['data_word','count'])
    kudu_df.show()
    kudu_df.write.format('org.apache.kudu.spark.kudu').option('kudu.master',kudu_master).option('kudu.table',kudu_table).mode("append").save()
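
#Each Kafka message is the JSON payload built by particlespark.py; the values
#below are illustrative only, but the field names match what the producer sends:
#  {"coreid": "...", "published_at": "2017-01-01 00:00:00.000000",
#   "event": "temperature", "data": "72", "ttl": 60}
#get_data() below pulls out the "data" field and tolerates malformed records.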
def get_data(payload):
    if 'data' not in payload:
        print str(payload)
        #Return an empty string so the downstream split() does not fail on None
        return ''
    else:
        return payload["data"]


kafkaStream = KafkaUtils.createDirectStream(ssc, [topic], {"metadata.broker.list": kafka_broker})

events_kafka_stream = kafkaStream.map(lambda x: get_data(json.loads(x[1])))

counts = events_kafka_stream.flatMap(lambda x: x.split(' ')).map(lambda x: (x,1)).reduceByKey(add)

counts.foreachRDD(insert_into_kudu)

ssc.start()
ssc.awaitTermination()
--------------------------------------------------------------------------------
/event_count.py:
--------------------------------------------------------------------------------
import json, sys, ConfigParser
from operator import add
from pyspark import SparkContext
from pyspark import SparkConf
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
from pyspark.storagelevel import StorageLevel
from pyspark.sql import SQLContext

conf = (SparkConf().setMaster("yarn-client").setAppName("Count Event Word 20 Sec"))
sc = SparkContext(conf = conf)
ssc = StreamingContext(sc,20)

Config = ConfigParser.ConfigParser()
Config.read('particlespark.conf')
kafka_broker = Config.get('Kafka','KafkaBrokers')
kudu_master = Config.get('Kudu','KuduMaster')
kudu_table = "particle_counts_last_20"
topic = "particle"

#Lazily instantiate a singleton SQLContext
def getSqlContextInstance(sparkContext):
    if ('sqlContextSingletonInstance' not in globals()):
        globals()['sqlContextSingletonInstance'] = SQLContext(sc)
    return globals()['sqlContextSingletonInstance']

#Insert the per-batch counts into Kudu
def insert_into_kudu(time,rdd):
    sqc = getSqlContextInstance(rdd.context)
    kudu_df = rdd.toDF(['event_word','count'])
    kudu_df.show()
    kudu_df.write.format('org.apache.kudu.spark.kudu').option('kudu.master',kudu_master).option('kudu.table',kudu_table).mode("append").save()

#Pull the event field out of a payload; tolerate malformed records
def get_event(payload):
    if 'event' not in payload:
        print str(payload)
        #Return an empty string so the downstream split() does not fail on None
        return ''
    else:
        return payload["event"]


kafkaStream = KafkaUtils.createDirectStream(ssc, [topic], {"metadata.broker.list": kafka_broker})

events_kafka_stream = kafkaStream.map(lambda x: get_event(json.loads(x[1])))

counts = events_kafka_stream.flatMap(lambda x: x.split(' ')).map(lambda x: (x,1)).reduceByKey(add)

counts.foreachRDD(insert_into_kudu)

ssc.start()
ssc.awaitTermination()
--------------------------------------------------------------------------------
/particlespark.py:
--------------------------------------------------------------------------------
import json, StringIO, requests, time, ConfigParser, sys
from sseclient import SSEClient
from datetime import datetime
from kafka import KafkaClient, KafkaProducer

#Configuration
Config = ConfigParser.ConfigParser()
Config.read('particlespark.conf')
kafka_broker = Config.get('Kafka','KafkaBrokers')
kafka_topic = "particle"
api_key = Config.get('Particle','ApiKey')
print_events = Config.get('Options','PrintEvents')
batch_size = int(Config.get('Options','BatchSize'))
batch_pause = int(Config.get('Options','BatchPause'))
particle_uri = Config.get('Particle','ParticleUri')
uri = particle_uri + '?access_token=' + api_key
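#The resulting SSE URI has the form
#  https://api.particle.io/v1/events?access_token=<ApiKey>
#with ParticleUri and ApiKey taken from particlespark.conf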
count = 0


print kafka_broker
#Instantiate Kafka producer
producer = KafkaProducer(bootstrap_servers=str(kafka_broker),api_version=(0,9))

#These headers may not be necessary, but are left in place
headers = {"Accept-Content":"application/json; charset=UTF-8"}
messages = SSEClient(uri)
for msg in messages:
    event = '"'+msg.event+'"'
    data = msg.data
    payload = {}
    if(data):
        json_out = '{"event":' + event + "," + '"data":' + data + '}'

        #try/except because some events are malformed and cause exceptions
        try:
            obj = json.loads(json_out)
            event = str(obj["event"])
            data = str(obj["data"]["data"].replace(",",""))
            published_at = obj["data"]["published_at"]
            ttl = obj["data"]["ttl"]
            coreid = str(obj["data"]["coreid"])
            parsed_time = datetime.strptime(published_at, "%Y-%m-%dT%H:%M:%S.%fZ")
            print(parsed_time)
            formatted_time = parsed_time.strftime("%Y-%m-%d %H:%M:%S.%f")
            payload["coreid"] = coreid
            payload["published_at"] = formatted_time
            payload["event"] = event
            payload["data"] = data
            payload["ttl"] = int(ttl)
        except:
            continue

        message = json.dumps(payload)

        #If event printing is enabled, send to console
        if(print_events == 'enabled'):
            print(message)
        #Send to Kafka
        producer.send(kafka_topic,value=message)

        count += 1

        #Once the configured batch size is reached, wait for the configured pause
        if count >= batch_size:
            print('Waiting for ' + str(batch_pause) + ' seconds')
            time.sleep(batch_pause)
            count = 0
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
### IOT Demo 2

#### Spark Streaming + Kafka + Kudu
This is a demonstration of how to use Spark and Spark Streaming to read from Kafka and insert data into Kudu - all in Python. The streaming data comes from the Particle.io event stream and requires an API key to consume. I believe this may be the first demonstration of reading from and writing to Kudu from Spark Streaming using Python.

##### Content Description
- particlespark.py: SSEClient reading from the Particle.io event stream and sending events to a Kafka topic
- iot_demo.py: Spark Streaming application reading from the Kafka topic and inserting into a Kudu table
- event_count.py: Spark Streaming application reading from the Kafka topic, counting the unique event words of the last 20 seconds and upserting them into a Kudu table every 20 seconds (shows update capabilities)
- data_count.py: Same as above, but counting data words instead of event words
- total_event_count.py: Spark batch job that reads from the master event table (particle_test), counts the total occurrence of each event word for all time and upserts the result (shows update capabilities)
- total_data_count.py: Same as above, but counting data words instead of event words

##### Versions
- CDH 5.8
- Kafka 2.0.2-1.2.0.2.p0.5
- Kudu 0.10.0-1.kudu0.10.0.p0.7
- Impala_Kudu 2.6.0-1.cdh5.8.0.p0.17

##### Python Dependencies
```
sudo pip install sseclient
sudo pip install kafka-python
```
##### Configuration
This setup is still being streamlined. First, install the Python dependencies above, then run the commands below, substituting your own parameters where necessary. particlespark.conf contains the parameters needed for all of the .py files (the Python producer and the Spark jobs) to run; define the Kudu master location and Kafka broker there once so they do not have to be repeated in the individual files.
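All of the scripts load the shared settings with the same `ConfigParser` pattern (Python 2, as used throughout this repo); a minimal sketch:
```
import ConfigParser  #Python 2 module, matching the rest of this repo

Config = ConfigParser.ConfigParser()
Config.read('particlespark.conf')
kafka_broker = Config.get('Kafka', 'KafkaBrokers')  #host:port of a Kafka broker
kudu_master = Config.get('Kudu', 'KuduMaster')      #host:port of the Kudu master
print kafka_broker, kudu_master
```
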
##### Impala create table:
```
CREATE TABLE `particle_test` (
  `coreid` STRING,
  `published_at` STRING,
  `data` STRING,
  `event` STRING,
  `ttl` BIGINT
)
DISTRIBUTE BY HASH (coreid) INTO 16 BUCKETS
TBLPROPERTIES(
  'storage_handler' = 'com.cloudera.kudu.hive.KuduStorageHandler',
  'kudu.table_name' = 'particle_test',
  'kudu.master_addresses' = 'ip-10-0-0-224.ec2.internal:7051',
  'kudu.key_columns' = 'coreid,published_at',
  'kudu.num_tablet_replicas' = '3'
);
```
```
CREATE TABLE `particle_counts_last_20_data` (
  `data_word` STRING,
  `count` BIGINT
)
DISTRIBUTE BY HASH (data_word) INTO 16 BUCKETS
TBLPROPERTIES(
  'storage_handler' = 'com.cloudera.kudu.hive.KuduStorageHandler',
  'kudu.table_name' = 'particle_counts_last_20_data',
  'kudu.master_addresses' = 'ip-10-0-0-224.ec2.internal:7051',
  'kudu.key_columns' = 'data_word',
  'kudu.num_tablet_replicas' = '3'
);
```
```
CREATE TABLE `particle_counts_last_20` (
  `event_word` STRING,
  `count` BIGINT
)
DISTRIBUTE BY HASH (event_word) INTO 16 BUCKETS
TBLPROPERTIES(
  'storage_handler' = 'com.cloudera.kudu.hive.KuduStorageHandler',
  'kudu.table_name' = 'particle_counts_last_20',
  'kudu.master_addresses' = 'ip-10-0-0-224.ec2.internal:7051',
  'kudu.key_columns' = 'event_word',
  'kudu.num_tablet_replicas' = '3'
);
```
```
CREATE TABLE `particle_counts_total` (
  `event_word` STRING,
  `count` BIGINT
)
DISTRIBUTE BY HASH (event_word) INTO 16 BUCKETS
TBLPROPERTIES(
  'storage_handler' = 'com.cloudera.kudu.hive.KuduStorageHandler',
  'kudu.table_name' = 'particle_counts_total',
  'kudu.master_addresses' = 'ip-10-0-0-224.ec2.internal:7051',
  'kudu.key_columns' = 'event_word',
  'kudu.num_tablet_replicas' = '3'
);
```
```
CREATE TABLE `particle_counts_total_data` (
  `data_word` STRING,
  `count` BIGINT
)
DISTRIBUTE BY HASH (data_word) INTO 16 BUCKETS
TBLPROPERTIES(
  'storage_handler' = 'com.cloudera.kudu.hive.KuduStorageHandler',
  'kudu.table_name' = 'particle_counts_total_data',
  'kudu.master_addresses' = 'ip-10-0-0-224.ec2.internal:7051',
  'kudu.key_columns' = 'data_word',
  'kudu.num_tablet_replicas' = '3'
);
```

Kafka create topic:
```
kafka-topics --create --zookeeper ip-10-0-0-224.ec2.internal:2181 --replication-factor 1 --partitions 1 --topic particle
```

spark-submit:
```
spark-submit --master yarn --jars kudu-spark_2.10-0.10.0.jar --packages org.apache.spark:spark-streaming-kafka_2.10:1.6.0 iot_demo.py
spark-submit --master yarn --jars kudu-spark_2.10-0.10.0.jar --packages org.apache.spark:spark-streaming-kafka_2.10:1.6.0 event_count.py
spark-submit --master yarn --jars kudu-spark_2.10-0.10.0.jar --packages org.apache.spark:spark-streaming-kafka_2.10:1.6.0 data_count.py
spark-submit --master yarn --jars kudu-spark_2.10-0.10.0.jar --packages org.apache.spark:spark-streaming-kafka_2.10:1.6.0 total_event_count.py
spark-submit --master yarn --jars kudu-spark_2.10-0.10.0.jar --packages org.apache.spark:spark-streaming-kafka_2.10:1.6.0 total_data_count.py
```
python producer:
```
python particlespark.py
```
--------------------------------------------------------------------------------