├── .gitignore
├── install.sh
├── cdsw_iot.py
├── particlespark.conf
├── total_data_count.py
├── total_event_count.py
├── iot_demo.py
├── data_count.py
├── event_count.py
├── particlespark.py
└── README.md

/.gitignore:
--------------------------------------------------------------------------------
particle_spark.conf
--------------------------------------------------------------------------------
/install.sh:
--------------------------------------------------------------------------------
#Environment prep - the first argument is the ZooKeeper host
ZK="$1:2181"

#Install pip
sudo wget https://bootstrap.pypa.io/get-pip.py
sudo python get-pip.py

#Install Python dependencies
sudo pip install sseclient
sudo pip install kafka-python

#Kafka set up (uses the host:port ZooKeeper string built above)
kafka-topics --create --zookeeper $ZK --replication-factor 1 --partitions 1 --topic particle
--------------------------------------------------------------------------------
/cdsw_iot.py:
--------------------------------------------------------------------------------
#Set up packages needed in workbench
!pip install kafka-python --quiet
!pip install ConfigParser --quiet
!pip install thrift==0.9.3
!conda install -y ibis-framework -c conda-forge
!pip install thrift_sasl
!pip install requests-kerberos


import ConfigParser, json, os

#Set up configuration
Config = ConfigParser.ConfigParser()
Config.read('particlespark.conf')
kafka_broker = Config.get('Kafka','KafkaBrokers')
kudu_master = Config.get('Kudu','KuduMaster')
zookeeper = Config.get('Zookeeper','Zookeeper')
principal = Config.get('Kerberos','Principal')
keytab = Config.get('Kerberos','Keytab')
impala_host = Config.get('Impala','Daemon').split(':')[0]
#The Impala daemon port is read as a string, so cast it to an int for ibis
impala_port = int(Config.get('Impala','Daemon').split(':')[1])


#Creating tables
import ibis

con = ibis.impala.connect(host=impala_host, port=impala_port,
                          database='default', auth_mechanism='GSSAPI',
                          use_ssl=False)
--------------------------------------------------------------------------------
/particlespark.conf:
--------------------------------------------------------------------------------
#Defines options relating to Particle
[Particle]
ParticleUri=https://api.particle.io/v1/events ; The API endpoint. This default is for all public events.
ApiKey=yourkeyhere ; This is an API key for accessing the Particle API.

#Defines options relating to Kafka
[Kafka]
KafkaBrokers=ip-10-0-0-243.ec2.internal:9092 ; Location of a Kafka broker

#Defines options relating to Kudu
[Kudu]
KuduMaster=ip-10-0-0-243.ec2.internal:7051 ; Location of a Kudu master

#Defines options relating to Zookeeper
[Zookeeper]
Zookeeper=ip-10-0-0-243.ec2.internal:2181

#Defines options relating to Impala
[Impala]
Daemon=cdh-0-160.cloudera.internal:21050

#Defines Kerberos-related options
[Kerberos]
Principal=navi@CLOUDERA.INTERNAL ; Kerberos principal
Keytab=/home/cdsw/navi.keytab ; Fully qualified path to keytab

#Defines script-related options
[Options]
PrintEvents=enabled ; Enables printing events to the console as they come in. Valid values are enabled and disabled.
BatchSize=1000 ; Number of events between pauses. In low-resource Hadoop environments, high values can cause memory exceptions.
BatchPause=0 ; Number of seconds between event batches.
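#For reference (descriptive only, not parsed by the scripts): which sections each script reads
# particlespark.py                            -> Particle, Kafka, Options
# iot_demo.py, event_count.py, data_count.py  -> Kafka, Kudu
# total_event_count.py, total_data_count.py   -> Kudu
# cdsw_iot.py                                 -> Kafka, Kudu, Zookeeper, Impala, Kerberos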
--------------------------------------------------------------------------------
/total_data_count.py:
--------------------------------------------------------------------------------
import json, sys, ConfigParser
from operator import add
from pyspark import SparkContext
from pyspark import SparkConf
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
from pyspark.storagelevel import StorageLevel
from pyspark.sql import SQLContext

conf = (SparkConf().setMaster("yarn-client").setAppName("Total Data Count"))
sc = SparkContext(conf = conf)
sqc = SQLContext(sc)

Config = ConfigParser.ConfigParser()
Config.read('particlespark.conf')
kudu_master = Config.get('Kudu','KuduMaster')
kudu_all_data_table = "particle_test"
kudu_counts_table = "particle_counts_total_data"

#Read from Kudu table particle_test
kudu_events_df = sqc.read.format('org.apache.kudu.spark.kudu').option('kudu.master',kudu_master).option('kudu.table',kudu_all_data_table).load()

#Grab only the data column, split it by white space and count up each unique key
kudu_events_count = kudu_events_df.map(lambda x: x.data).flatMap(lambda x: x.split(' ')).map(lambda x: (x, 1)).reduceByKey(add)

#Convert event counts to DataFrame from RDD
kudu_events_count_df = kudu_events_count.toDF(['data_word','count'])

kudu_events_count_df.show()

#Write the event word counts to Kudu particle_counts_total_data table
kudu_events_count_df.write.format('org.apache.kudu.spark.kudu').option('kudu.master',kudu_master).option('kudu.table',kudu_counts_table).mode("append").save()
--------------------------------------------------------------------------------
/total_event_count.py:
--------------------------------------------------------------------------------
import json, sys, ConfigParser
from operator import add
from pyspark import SparkContext
from pyspark import SparkConf
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
from pyspark.storagelevel import StorageLevel
from pyspark.sql import SQLContext

conf = (SparkConf().setMaster("yarn-client").setAppName("Total Event Count"))
sc = SparkContext(conf = conf)
sqc = SQLContext(sc)

Config = ConfigParser.ConfigParser()
Config.read('particlespark.conf')
kudu_master = Config.get('Kudu','KuduMaster')

kudu_all_events_table = "particle_test"
kudu_counts_table = "particle_counts_total"

#Read from Kudu table particle_test
kudu_events_df = sqc.read.format('org.apache.kudu.spark.kudu').option('kudu.master',kudu_master).option('kudu.table',kudu_all_events_table).load()

#Grab only the event column, split it by white space and count up each unique key
kudu_events_count = kudu_events_df.map(lambda x: x.event).flatMap(lambda x: x.split(' ')).map(lambda x: (x, 1)).reduceByKey(add)

#Convert event counts to DataFrame from RDD
kudu_events_count_df = kudu_events_count.toDF(['event_word','count'])

kudu_events_count_df.show()

#Write the event word counts to Kudu particle_counts_total table
kudu_events_count_df.write.format('org.apache.kudu.spark.kudu').option('kudu.master',kudu_master).option('kudu.table',kudu_counts_table).mode("append").save()
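
#Example submission (mirrors the spark-submit commands in the README; the jar
#and package versions shown there are the ones this repo was built against):
#  spark-submit --master yarn --jars kudu-spark_2.10-0.10.0.jar \
#    --packages org.apache.spark:spark-streaming-kafka_2.10:1.6.0 total_event_count.py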
--------------------------------------------------------------------------------
/iot_demo.py:
--------------------------------------------------------------------------------
import json, ConfigParser
from pyspark import SparkContext
from pyspark import SparkConf
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
from pyspark.storagelevel import StorageLevel
from pyspark.sql import SQLContext

#conf = (SparkConf().setMaster("yarn-cluster").setAppName("Particle Stream to Kudu"))
conf = (SparkConf().setMaster("yarn-client").setAppName("Particle Stream to Kudu"))
sc = SparkContext(conf = conf)
ssc = StreamingContext(sc,5)

#Set up for Kafka and Kudu
Config = ConfigParser.ConfigParser()
Config.read('particlespark.conf')
kafka_broker = Config.get('Kafka','KafkaBrokers')
kudu_master = Config.get('Kudu','KuduMaster')
kudu_table = "particle_test"
topic = "particle"

#Lazily instantiate a singleton SQLContext
def getSqlContextInstance(sparkContext):
    if ('sqlContextSingletonInstance' not in globals()):
        globals()['sqlContextSingletonInstance'] = SQLContext(sc)
    return globals()['sqlContextSingletonInstance']

#Insert data into Kudu
def insert_into_kudu(time,rdd):
    sqc = getSqlContextInstance(rdd.context)
    kudu_df = sqc.jsonRDD(rdd)
    kudu_df.show()
    kudu_df.write.format('org.apache.kudu.spark.kudu').option('kudu.master',kudu_master).option('kudu.table',kudu_table).mode("append").save()

#Create a Kafka DStream by reading from our topic
kafkaStream = KafkaUtils.createDirectStream(ssc, [topic], {"metadata.broker.list": kafka_broker})

#Get rid of the key, which is null anyway
json_kafka_stream = kafkaStream.map(lambda x: x[1])

#For each RDD in the DStream, insert it into the Kudu table
json_kafka_stream.foreachRDD(insert_into_kudu)

ssc.start()
ssc.awaitTermination()
--------------------------------------------------------------------------------
/data_count.py:
--------------------------------------------------------------------------------
import json, sys, ConfigParser
from operator import add
from pyspark import SparkContext
from pyspark import SparkConf
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
from pyspark.storagelevel import StorageLevel
from pyspark.sql import SQLContext

conf = (SparkConf().setMaster("yarn-client").setAppName("Count Data Word 20 Sec"))
sc = SparkContext(conf = conf)
ssc = StreamingContext(sc,20)

Config = ConfigParser.ConfigParser()
Config.read('particlespark.conf')
kafka_broker = Config.get('Kafka','KafkaBrokers')
kudu_master = Config.get('Kudu','KuduMaster')
topic = "particle"
kudu_table = "particle_counts_last_20_data"

#Lazily instantiate a singleton SQLContext
def getSqlContextInstance(sparkContext):
    if ('sqlContextSingletonInstance' not in globals()):
        globals()['sqlContextSingletonInstance'] = SQLContext(sc)
    return globals()['sqlContextSingletonInstance']

#Insert the per-batch counts into Kudu
def insert_into_kudu(time,rdd):
    sqc = getSqlContextInstance(rdd.context)
    kudu_df = rdd.toDF(['data_word','count'])
    kudu_df.show()
    kudu_df.write.format('org.apache.kudu.spark.kudu').option('kudu.master',kudu_master).option('kudu.table',kudu_table).mode("append").save()
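
#Each Kafka message is the JSON payload built by particlespark.py; the values
#below are illustrative only, but the field names match what the producer sends:
#  {"coreid": "...", "published_at": "2017-01-01 00:00:00.000000",
#   "event": "temperature", "data": "72", "ttl": 60}
#get_data() below pulls out the "data" field and tolerates malformed records.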
def get_data(payload):
    if 'data' not in payload:
        print str(payload)
        #Return an empty string so the downstream split() does not fail on None
        return ''
    else:
        return payload["data"]


kafkaStream = KafkaUtils.createDirectStream(ssc, [topic], {"metadata.broker.list": kafka_broker})

events_kafka_stream = kafkaStream.map(lambda x: get_data(json.loads(x[1])))

counts = events_kafka_stream.flatMap(lambda x: x.split(' ')).map(lambda x: (x,1)).reduceByKey(add)

counts.foreachRDD(insert_into_kudu)

ssc.start()
ssc.awaitTermination()
--------------------------------------------------------------------------------
/event_count.py:
--------------------------------------------------------------------------------
import json, sys, ConfigParser
from operator import add
from pyspark import SparkContext
from pyspark import SparkConf
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
from pyspark.storagelevel import StorageLevel
from pyspark.sql import SQLContext

conf = (SparkConf().setMaster("yarn-client").setAppName("Count Event Word 20 Sec"))
sc = SparkContext(conf = conf)
ssc = StreamingContext(sc,20)

Config = ConfigParser.ConfigParser()
Config.read('particlespark.conf')
kafka_broker = Config.get('Kafka','KafkaBrokers')
kudu_master = Config.get('Kudu','KuduMaster')
kudu_table = "particle_counts_last_20"
topic = "particle"

#Lazily instantiate a singleton SQLContext
def getSqlContextInstance(sparkContext):
    if ('sqlContextSingletonInstance' not in globals()):
        globals()['sqlContextSingletonInstance'] = SQLContext(sc)
    return globals()['sqlContextSingletonInstance']

#Insert the per-batch counts into Kudu
def insert_into_kudu(time,rdd):
    sqc = getSqlContextInstance(rdd.context)
    kudu_df = rdd.toDF(['event_word','count'])
    kudu_df.show()
    kudu_df.write.format('org.apache.kudu.spark.kudu').option('kudu.master',kudu_master).option('kudu.table',kudu_table).mode("append").save()

#Pull the event field out of a payload; tolerate malformed records
def get_event(payload):
    if 'event' not in payload:
        print str(payload)
        #Return an empty string so the downstream split() does not fail on None
        return ''
    else:
        return payload["event"]


kafkaStream = KafkaUtils.createDirectStream(ssc, [topic], {"metadata.broker.list": kafka_broker})

events_kafka_stream = kafkaStream.map(lambda x: get_event(json.loads(x[1])))

counts = events_kafka_stream.flatMap(lambda x: x.split(' ')).map(lambda x: (x,1)).reduceByKey(add)

counts.foreachRDD(insert_into_kudu)

ssc.start()
ssc.awaitTermination()
--------------------------------------------------------------------------------
/particlespark.py:
--------------------------------------------------------------------------------
import json, StringIO, requests, time, ConfigParser, sys
from sseclient import SSEClient
from datetime import datetime
from kafka import KafkaClient, KafkaProducer

#Configuration
Config = ConfigParser.ConfigParser()
Config.read('particlespark.conf')
kafka_broker = Config.get('Kafka','KafkaBrokers')
kafka_topic = "particle"
api_key = Config.get('Particle','ApiKey')
print_events = Config.get('Options','PrintEvents')
batch_size = int(Config.get('Options','BatchSize'))
batch_pause = int(Config.get('Options','BatchPause'))
particle_uri = Config.get('Particle','ParticleUri')
uri = particle_uri + '?access_token=' + api_key
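#The resulting SSE URI has the form
#  https://api.particle.io/v1/events?access_token=<ApiKey>
#with ParticleUri and ApiKey taken from particlespark.conf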
count = 0


print kafka_broker
#Instantiate Kafka producer
producer = KafkaProducer(bootstrap_servers=str(kafka_broker),api_version=(0,9))

#These headers may not be necessary, but are left in place
headers = {"Accept-Content":"application/json; charset=UTF-8"}
messages = SSEClient(uri)
for msg in messages:
    event = '"'+msg.event+'"'
    data = msg.data
    payload = {}
    if(data):
        json_out = '{"event":' + event + "," + '"data":' + data + '}'

        #try/except because some events are malformed and cause exceptions
        try:
            obj = json.loads(json_out)
            event = str(obj["event"])
            data = str(obj["data"]["data"].replace(",",""))
            published_at = obj["data"]["published_at"]
            ttl = obj["data"]["ttl"]
            coreid = str(obj["data"]["coreid"])
            parsed_time = datetime.strptime(published_at, "%Y-%m-%dT%H:%M:%S.%fZ")
            print(parsed_time)
            formatted_time = parsed_time.strftime("%Y-%m-%d %H:%M:%S.%f")
            payload["coreid"] = coreid
            payload["published_at"] = formatted_time
            payload["event"] = event
            payload["data"] = data
            payload["ttl"] = int(ttl)
        except:
            continue

        message = json.dumps(payload)

        #If event printing is enabled, send to console
        if(print_events == 'enabled'):
            print(message)
        #Send to Kafka
        producer.send(kafka_topic,value=message)

        count += 1

        #Once the configured batch size is reached, wait for the configured pause
        if count >= batch_size:
            print('Waiting for ' + str(batch_pause) + ' seconds')
            time.sleep(batch_pause)
            count = 0
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
### IOT Demo 2

#### Spark Streaming + Kafka + Kudu
This is a demonstration of how to use Spark and Spark Streaming to read from Kafka and insert data into Kudu - all in Python. The streaming data comes from the Particle.io event stream and requires an API key to consume. I believe this may be the first demonstration of reading from and writing to Kudu from Spark Streaming using Python.

##### Content Description
- particlespark.py: SSEClient reading from the Particle.io event stream and sending events to a Kafka topic
- iot_demo.py: Spark Streaming application reading from the Kafka topic and inserting into a Kudu table
- event_count.py: Spark Streaming application reading from the Kafka topic, counting the unique event words of the last 20 seconds and upserting them into a Kudu table every 20 seconds (shows update capabilities)
- data_count.py: Same as above, but counting data words instead of event words
- total_event_count.py: Spark batch job that reads from the master event table (particle_test), counts the total occurrence of each event word for all time and upserts the result (shows update capabilities)
- total_data_count.py: Same as above, but counting data words instead of event words

##### Versions
- CDH 5.8
- Kafka 2.0.2-1.2.0.2.p0.5
- Kudu 0.10.0-1.kudu0.10.0.p0.7
- Impala_Kudu 2.6.0-1.cdh5.8.0.p0.17

##### Python Dependencies
```
sudo pip install sseclient
sudo pip install kafka-python
```
##### Configuration
This setup is still being streamlined. First, install the Python dependencies above, then run the commands below, substituting your own parameters where necessary. particlespark.conf contains the parameters needed for all of the .py files (the Python producer and the Spark jobs) to run; define the Kudu master location and Kafka broker there once so they do not have to be repeated in the individual files.
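All of the scripts load the shared settings with the same `ConfigParser` pattern (Python 2, as used throughout this repo); a minimal sketch:
```
import ConfigParser  #Python 2 module, matching the rest of this repo

Config = ConfigParser.ConfigParser()
Config.read('particlespark.conf')
kafka_broker = Config.get('Kafka', 'KafkaBrokers')  #host:port of a Kafka broker
kudu_master = Config.get('Kudu', 'KuduMaster')      #host:port of the Kudu master
print kafka_broker, kudu_master
```
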
##### Impala create table:
```
CREATE TABLE `particle_test` (
  `coreid` STRING,
  `published_at` STRING,
  `data` STRING,
  `event` STRING,
  `ttl` BIGINT
)
DISTRIBUTE BY HASH (coreid) INTO 16 BUCKETS
TBLPROPERTIES(
  'storage_handler' = 'com.cloudera.kudu.hive.KuduStorageHandler',
  'kudu.table_name' = 'particle_test',
  'kudu.master_addresses' = 'ip-10-0-0-224.ec2.internal:7051',
  'kudu.key_columns' = 'coreid,published_at',
  'kudu.num_tablet_replicas' = '3'
);
```
```
CREATE TABLE `particle_counts_last_20_data` (
  `data_word` STRING,
  `count` BIGINT
)
DISTRIBUTE BY HASH (data_word) INTO 16 BUCKETS
TBLPROPERTIES(
  'storage_handler' = 'com.cloudera.kudu.hive.KuduStorageHandler',
  'kudu.table_name' = 'particle_counts_last_20_data',
  'kudu.master_addresses' = 'ip-10-0-0-224.ec2.internal:7051',
  'kudu.key_columns' = 'data_word',
  'kudu.num_tablet_replicas' = '3'
);
```
```
CREATE TABLE `particle_counts_last_20` (
  `event_word` STRING,
  `count` BIGINT
)
DISTRIBUTE BY HASH (event_word) INTO 16 BUCKETS
TBLPROPERTIES(
  'storage_handler' = 'com.cloudera.kudu.hive.KuduStorageHandler',
  'kudu.table_name' = 'particle_counts_last_20',
  'kudu.master_addresses' = 'ip-10-0-0-224.ec2.internal:7051',
  'kudu.key_columns' = 'event_word',
  'kudu.num_tablet_replicas' = '3'
);
```
```
CREATE TABLE `particle_counts_total` (
  `event_word` STRING,
  `count` BIGINT
)
DISTRIBUTE BY HASH (event_word) INTO 16 BUCKETS
TBLPROPERTIES(
  'storage_handler' = 'com.cloudera.kudu.hive.KuduStorageHandler',
  'kudu.table_name' = 'particle_counts_total',
  'kudu.master_addresses' = 'ip-10-0-0-224.ec2.internal:7051',
  'kudu.key_columns' = 'event_word',
  'kudu.num_tablet_replicas' = '3'
);
```
```
CREATE TABLE `particle_counts_total_data` (
  `data_word` STRING,
  `count` BIGINT
)
DISTRIBUTE BY HASH (data_word) INTO 16 BUCKETS
TBLPROPERTIES(
  'storage_handler' = 'com.cloudera.kudu.hive.KuduStorageHandler',
  'kudu.table_name' = 'particle_counts_total_data',
  'kudu.master_addresses' = 'ip-10-0-0-224.ec2.internal:7051',
  'kudu.key_columns' = 'data_word',
  'kudu.num_tablet_replicas' = '3'
);
```

Kafka create topic:
```
kafka-topics --create --zookeeper ip-10-0-0-224.ec2.internal:2181 --replication-factor 1 --partitions 1 --topic particle
```

spark-submit:
```
spark-submit --master yarn --jars kudu-spark_2.10-0.10.0.jar --packages org.apache.spark:spark-streaming-kafka_2.10:1.6.0 iot_demo.py
spark-submit --master yarn --jars kudu-spark_2.10-0.10.0.jar --packages org.apache.spark:spark-streaming-kafka_2.10:1.6.0 event_count.py
spark-submit --master yarn --jars kudu-spark_2.10-0.10.0.jar --packages org.apache.spark:spark-streaming-kafka_2.10:1.6.0 data_count.py
spark-submit --master yarn --jars kudu-spark_2.10-0.10.0.jar --packages org.apache.spark:spark-streaming-kafka_2.10:1.6.0 total_event_count.py
spark-submit --master yarn --jars kudu-spark_2.10-0.10.0.jar --packages org.apache.spark:spark-streaming-kafka_2.10:1.6.0 total_data_count.py
```
python producer:
```
python particlespark.py
```
--------------------------------------------------------------------------------