├── tweets-per-user-counter.properties ├── .gitignore ├── src └── main │ ├── resources │ └── log4j.properties │ ├── avro │ └── com │ │ └── github │ │ └── rollulus │ │ └── myprocessor │ │ └── TwitterStatus.avsc │ └── scala │ └── com │ └── github │ └── rollulus │ └── myprocessor │ ├── PropertiesHelpers.scala │ ├── SpecificAvroDeserializer.scala │ ├── TweetsPerMinuteProcessingTime.scala │ ├── TweetsPerMinuteEventTime.scala │ └── TweetsPerUserCounter.scala ├── README.md └── pom.xml /tweets-per-user-counter.properties: -------------------------------------------------------------------------------- 1 | job.id=tweets-per-user-counter 2 | bootstrap.servers=localhost:9092 3 | zookeeper.connect=localhost:2181 4 | schema.registry.url=http://localhost:8081 5 | source.topic=tweets 6 | sink.topic=tweets-count 7 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.class 2 | *.log 3 | 4 | # sbt specific 5 | .cache 6 | .history 7 | .lib/ 8 | dist/* 9 | target/ 10 | lib_managed/ 11 | src_managed/ 12 | project/boot/ 13 | project/plugins/project/ 14 | 15 | # Scala-IDE specific 16 | .scala_dependencies 17 | .worksheet 18 | 19 | *.iml 20 | *.ipr 21 | *.iws 22 | .idea 23 | 24 | -------------------------------------------------------------------------------- /src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # suppress inspection "UnusedProperty" for whole file 2 | log4j.rootLogger=INFO,stdout 3 | 4 | #stdout 5 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 6 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 7 | log4j.appender.stdout.layout.conversionPattern=%d{ISO8601} %-5p [%t] [%c] [%M:%L] %m%n 8 | -------------------------------------------------------------------------------- /src/main/avro/com/github/rollulus/myprocessor/TwitterStatus.avsc: 
-------------------------------------------------------------------------------- 1 | {"type":"record","name":"TwitterStatus","namespace":"com.eneco.trading.kafka.connect.twitter","fields":[{"name":"id","type":"long"},{"name":"createdAt","type":"string"},{"name":"favoriteCount","type":"int"},{"name":"text","type":"string"},{"name":"user","type":{"type":"record","name":"TwitterUser","namespace":"com.github.rollulus.myprocessor","fields":[{"name":"id","type":"long"},{"name":"name","type":"string"},{"name":"screenName","type":"string"}],"connect.name":"TwitterUser"}}],"connect.name":"com.github.rollulus.myprocessor.TwitterStatus"} 2 | -------------------------------------------------------------------------------- /src/main/scala/com/github/rollulus/myprocessor/PropertiesHelpers.scala: -------------------------------------------------------------------------------- 1 | package com.github.rollulus.myprocessor 2 | 3 | import collection.JavaConversions._ 4 | import java.io.FileInputStream 5 | 6 | object Properties { 7 | def create(m: Map[String, _ <: AnyRef]) = { 8 | val ps = new java.util.Properties 9 | ps.putAll(m) 10 | ps 11 | } 12 | def union(a: java.util.Properties, b:java.util.Properties) = { // on duplicate keys the value from a wins, because a is applied last 13 | val ps = new java.util.Properties 14 | ps.putAll(b) 15 | ps.putAll(a) 16 | ps 17 | } 18 | def fromFile(filename: String) = { // loads a .properties file; the stream is closed even when load() throws 19 | val ps = new java.util.Properties() 20 | val in = new FileInputStream(filename); try ps.load(in) finally in.close() 21 | ps 22 | } 23 | 24 | implicit class MyProperties(p: java.util.Properties) { 25 | def union(q: java.util.Properties) = Properties.union(p, q) 26 | } 27 | } 28 | 29 | -------------------------------------------------------------------------------- /src/main/scala/com/github/rollulus/myprocessor/SpecificAvroDeserializer.scala: -------------------------------------------------------------------------------- 1 | package com.github.rollulus.myprocessor 2 | 3 | import io.confluent.kafka.schemaregistry.client.SchemaRegistryClient 4 | import
io.confluent.kafka.serializers.KafkaAvroDeserializer 5 | import org.apache.kafka.common.Configurable 6 | import org.apache.kafka.common.serialization.Deserializer 7 | 8 | class SpecificAvroDeserializer[T <: org.apache.avro.specific.SpecificRecord] extends Deserializer[T] with Configurable { 9 | private[myprocessor] var inner: KafkaAvroDeserializer = null 10 | 11 | def this(client: SchemaRegistryClient) { 12 | this() 13 | inner = new KafkaAvroDeserializer(client) 14 | } 15 | 16 | def this(client: SchemaRegistryClient, props: java.util.Map[String, _]) { 17 | this() 18 | inner = new KafkaAvroDeserializer(client, props) 19 | } 20 | 21 | def configure(configs: java.util.Map[String, _], isKey: Boolean): Unit = { 22 | inner = new KafkaAvroDeserializer 23 | inner.configure(configs, isKey) 24 | } 25 | 26 | def configure(configs: java.util.Map[String, _]): Unit = { // no isKey given: the delegate is configured with isKey = false 27 | inner = new KafkaAvroDeserializer 28 | inner.configure(configs, false) 29 | } 30 | 31 | @SuppressWarnings(Array("unchecked")) def deserialize(s: String, bytes: Array[Byte]): T = { 32 | inner.deserialize(s, bytes).asInstanceOf[T] 33 | } 34 | 35 | def close(): Unit = { 36 | if (inner != null) inner.close() // inner stays null until a client is supplied or configure() runs 37 | } 38 | } -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Kafka Streams Playground 2 | ======================== 3 | 4 | Here are a few examples I made while experimenting with Kafka Streams. 5 | At the moment of writing (April 2016), Confluent provides a tech preview version of Kafka Streams in their 2.1.0-alpha version of their platform. 6 | I hope that my examples help others getting started. 7 | 8 | My examples source from the [Twitter Kafka Connector](https://github.com/Eneco/kafka-connect-twitter). 9 | AVRO serialized `TwitterStatus` are used as input.
10 | 11 | Usage 12 | ===== 13 | 14 | mvn package 15 | java -cp target/kafka-streams-playground-0.1-jar-with-dependencies.jar com.github.rollulus.myprocessor.TweetsPerMinuteEventTime tweets-per-user-counter.properties 16 | 17 | This runs the `TweetsPerMinuteEventTime` example. Despite the name, `tweets-per-user-counter.properties` can be used for all examples. 18 | 19 | Example Stream Processors 20 | ========================= 21 | 22 | TweetsPerUserCounter 23 | -------------------- 24 | 25 | Gives an unbounded count of tweets per user. 26 | The key is the username, the value is a long. 27 | 28 | TweetsPerMinuteProcessingTime 29 | ----------------------------- 30 | 31 | Counts tweets per minute at processing time, using `HoppingWindows`. 32 | 33 | TweetsPerMinuteEventTime 34 | ------------------------ 35 | 36 | Counts tweets per minute at event time (i.e. the timestamp assigned by Twitter). 37 | 38 | -------------------------------------------------------------------------------- /src/main/scala/com/github/rollulus/myprocessor/TweetsPerMinuteProcessingTime.scala: -------------------------------------------------------------------------------- 1 | package com.github.rollulus.myprocessor 2 | 3 | import com.eneco.trading.kafka.connect.twitter.TwitterStatus 4 | import org.apache.kafka.common.serialization.{LongDeserializer, StringDeserializer, LongSerializer, StringSerializer} 5 | import org.apache.kafka.streams.KafkaStreams 6 | import org.apache.kafka.streams.kstream.internals.WindowedSerializer 7 | import org.apache.kafka.streams.kstream.{HoppingWindows, KStreamBuilder} 8 | import KeyValueImplicits._ 9 | import Properties._ 10 | 11 | object TweetsPerMinuteProcessingTime { 12 | lazy val SOURCE_TOPIC_CONFIG = "source.topic" 13 | lazy val SINK_TOPIC_CONFIG = "sink.topic" 14 | 15 | def propertiesFromFiles(files: Array[String]) = files.map(Properties.fromFile).foldLeft(new java.util.Properties)(Properties.union) 16 | 17 | def main(args: Array[String]): Unit = { 18 | // configure
19 | require(args.length > 0, "at least one .properties file should be given as program argument") 20 | val builder = new KStreamBuilder 21 | val cfg = propertiesFromFiles(args).union(fixedProperties()) 22 | val sourceTopic = cfg.getProperty(SOURCE_TOPIC_CONFIG) 23 | val sinkTopic = cfg.getProperty(SINK_TOPIC_CONFIG) 24 | 25 | // source 26 | val tweets = builder.stream[String, TwitterStatus](sourceTopic) 27 | 28 | // transformation 29 | // per/minute is at processing time, update every second with overlapping windows 30 | val tweetcount = tweets.map[String, TwitterStatus]((k, v) => { 31 | ("somekey", v) 32 | }).countByKey(HoppingWindows.of("x").`with`(60*1000).every(1*1000),new StringSerializer, new LongSerializer, new StringDeserializer, new LongDeserializer) 33 | 34 | // sink 35 | tweetcount.to(sinkTopic, new WindowedSerializer[String](new StringSerializer), new LongSerializer) 36 | 37 | // run 38 | new KafkaStreams(builder, cfg).start() 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /src/main/scala/com/github/rollulus/myprocessor/TweetsPerMinuteEventTime.scala: -------------------------------------------------------------------------------- 1 | package com.github.rollulus.myprocessor 2 | 3 | import java.text.SimpleDateFormat 4 | 5 | import com.eneco.trading.kafka.connect.twitter.TwitterStatus 6 | import org.apache.kafka.common.serialization.{LongDeserializer, StringDeserializer, LongSerializer, StringSerializer} 7 | import org.apache.kafka.streams.KafkaStreams 8 | import org.apache.kafka.streams.kstream.{KStreamBuilder} 9 | import KeyValueImplicits._ 10 | import Properties._ 11 | 12 | object TweetsPerMinuteEventTime { 13 | lazy val SOURCE_TOPIC_CONFIG = "source.topic" 14 | lazy val SINK_TOPIC_CONFIG = "sink.topic" 15 | 16 | def propertiesFromFiles(files: Array[String]) = files.map(Properties.fromFile).foldLeft(new java.util.Properties)(Properties.union) 17 | 18 | def main(args: Array[String]): Unit = { 19 | // 
configure 20 | require(args.length > 0, "at least one .properties file should be given as program argument") 21 | val builder = new KStreamBuilder 22 | val cfg = propertiesFromFiles(args).union(fixedProperties()) 23 | val sourceTopic = cfg.getProperty(SOURCE_TOPIC_CONFIG) 24 | val sinkTopic = cfg.getProperty(SINK_TOPIC_CONFIG) 25 | 26 | // source 27 | val tweets = builder.stream[String, TwitterStatus](sourceTopic) 28 | 29 | // transformation 30 | // per/minute is at event time, updated every minute with non-overlapping windows 31 | val tweetcount = tweets.map[String, TwitterStatus]((k, v) => { 32 | val dt = new SimpleDateFormat("EEE MMM dd HH:mm:ss Z yyyy", java.util.Locale.ENGLISH).parse(v.getCreatedAt) // fixed locale: EEE/MMM names must not vary with the JVM default locale 33 | (new SimpleDateFormat("EEE MMM dd HH:mm:00 Z yyyy", java.util.Locale.ENGLISH).format(dt), v) // seconds are written as literal 00, so the key identifies the minute 34 | }).countByKey(new StringSerializer, new LongSerializer, new StringDeserializer, new LongDeserializer, "Count") 35 | 36 | // sink 37 | tweetcount.to(sinkTopic, new StringSerializer, new LongSerializer) 38 | 39 | // run 40 | new KafkaStreams(builder, cfg).start() 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /src/main/scala/com/github/rollulus/myprocessor/TweetsPerUserCounter.scala: -------------------------------------------------------------------------------- 1 | package com.github.rollulus.myprocessor 2 | 3 | import com.eneco.trading.kafka.connect.twitter.TwitterStatus 4 | import io.confluent.kafka.serializers.KafkaAvroDeserializerConfig 5 | import org.apache.kafka.common.serialization._ 6 | import org.apache.kafka.streams.kstream.{KStreamBuilder} 7 | import org.apache.kafka.streams._ 8 | import KeyValueImplicits._ 9 | import Properties._ 10 | 11 | object TweetsPerUserCounter { 12 | lazy val SOURCE_TOPIC_CONFIG = "source.topic" 13 | lazy val SINK_TOPIC_CONFIG = "sink.topic" 14 | 15 | def propertiesFromFiles(files: Array[String]) = files.map(Properties.fromFile).foldLeft(new java.util.Properties)(Properties.union) 16 | 17 | def main(args: Array[String]): Unit
= { 18 | // configure 19 | require(args.length > 0, "at least one .properties file should be given as program argument") 20 | val builder = new KStreamBuilder 21 | val cfg = propertiesFromFiles(args).union(fixedProperties()) 22 | val sourceTopic = cfg.getProperty(SOURCE_TOPIC_CONFIG) 23 | val sinkTopic = cfg.getProperty(SINK_TOPIC_CONFIG) 24 | 25 | // source 26 | val tweets = builder.stream[String, TwitterStatus](sourceTopic) 27 | 28 | // transformation 29 | val tweetcount = tweets.map[String, TwitterStatus]((k, v) => { 30 | (v.getUser().getScreenName, v) 31 | }).countByKey(new StringSerializer, new LongSerializer, new StringDeserializer, new LongDeserializer, "Count") 32 | 33 | // sink 34 | tweetcount.to(sinkTopic, new StringSerializer, new LongSerializer) 35 | 36 | // run 37 | new KafkaStreams(builder, cfg).start() 38 | } 39 | } 40 | 41 | object fixedProperties { 42 | def apply() = Properties.create(Map( 43 | StreamsConfig.KEY_SERIALIZER_CLASS_CONFIG -> classOf[StringSerializer], 44 | StreamsConfig.KEY_DESERIALIZER_CLASS_CONFIG -> classOf[StringDeserializer], 45 | StreamsConfig.VALUE_SERIALIZER_CLASS_CONFIG -> classOf[StringSerializer], 46 | StreamsConfig.VALUE_DESERIALIZER_CLASS_CONFIG -> classOf[SpecificAvroDeserializer[TwitterStatus]], 47 | KafkaAvroDeserializerConfig.SPECIFIC_AVRO_READER_CONFIG -> "true")) 48 | } 49 | 50 | object KeyValueImplicits { 51 | implicit def Tuple2ToKeyValue[K, V](tuple: (K, V)): KeyValue[K, V] = new KeyValue(tuple._1, tuple._2) 52 | } 53 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4.0.0 4 | kafka-streams-playground 5 | kafka-streams-playground 6 | jar 7 | 0.1 8 | kafka-streams-playground 9 | 10 | 11 | confluent 12 | http://packages.confluent.io/maven/ 13 | 14 | 15 | 16 | confluent-staging 17 | http://staging-confluent-packages-maven-2.1.0.s3.amazonaws.com/maven/ 18 | 19 | 20 | 21 | 1.8 22 | 
0.9.1.0-cp1 23 | 2.11 24 | ${kafka.scala.version}.7 25 | 2.1.0-alpha1 26 | 2.2.6 27 | 1.7.7 28 | UTF-8 29 | 30 | 31 | 32 | io.confluent 33 | kafka-avro-serializer 34 | ${confluent.version} 35 | 36 | 37 | org.apache.kafka 38 | kafka-clients 39 | ${kafka.version} 40 | 41 | 42 | org.apache.kafka 43 | kafka-streams 44 | ${kafka.version} 45 | 46 | 47 | org.apache.avro 48 | avro 49 | ${avro.version} 50 | 51 | 52 | org.apache.avro 53 | avro-maven-plugin 54 | ${avro.version} 55 | 56 | 57 | org.scala-lang 58 | scala-library 59 | 60 | ${scala.version} 61 | 62 | 63 | 66 | com.101tec 67 | zkclient 68 | 0.7 69 | 70 | 71 | 72 | 73 | junit 74 | junit 75 | 4.12 76 | test 77 | 78 | 79 | org.assertj 80 | assertj-core 81 | 3.3.0 82 | test 83 | 84 | 85 | org.apache.kafka 86 | kafka_${kafka.scala.version} 87 | ${kafka.version} 88 | test 89 | 90 | 91 | org.apache.curator 92 | curator-test 93 | 2.9.0 94 | test 95 | 96 | 97 | 100 | 101 | 103 | org.scalactic 104 | scalactic_${kafka.scala.version} 105 | ${scalatest.version} 106 | 107 | 108 | org.scalatest 109 | scalatest_${kafka.scala.version} 110 | ${scalatest.version} 111 | test 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | net.alchim31.maven 120 | scala-maven-plugin 121 | 3.2.1 122 | 123 | 124 | 125 | 126 | 127 | 128 | 132 | 133 | org.codehaus.mojo 134 | build-helper-maven-plugin 135 | 1.10 136 | 137 | 138 | add-source 139 | generate-sources 140 | 141 | add-source 142 | 143 | 144 | 145 | src/main/scala 146 | 147 | 148 | 149 | 150 | add-test-source 151 | generate-test-sources 152 | 153 | add-test-source 154 | 155 | 156 | 157 | src/test/scala 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | net.alchim31.maven 167 | scala-maven-plugin 168 | 3.2.1 169 | 170 | 171 | 176 | -Xexperimental 177 | 178 | 179 | 180 | 181 | 182 | compile 183 | testCompile 184 | 185 | 186 | 187 | 188 | 189 | 190 | org.apache.maven.plugins 191 | maven-compiler-plugin 192 | 3.3 193 | true 194 | 195 | ${java.version} 196 | ${java.version} 197 | 198 | 199 | 200 | 
201 | org.apache.avro 202 | avro-maven-plugin 203 | 1.7.7 204 | 205 | 206 | generate-sources 207 | 208 | schema 209 | 210 | 211 | ${project.basedir}/src/main/avro/com/github/rollulus/myprocessor 212 | ${project.build.directory}/generated-sources 213 | String 214 | 215 | 216 | 217 | 218 | 219 | 220 | maven-assembly-plugin 221 | 2.4 222 | 223 | 224 | jar-with-dependencies 225 | 226 | 227 | 228 | 229 | make-assembly 230 | package 231 | 232 | single 233 | 234 | 235 | 236 | 237 | 238 | 263 | 264 | 265 | 269 | org.jasig.maven 270 | maven-notice-plugin 271 | 1.0.6.1 272 | 273 | 274 | ../license-mappings.xml 275 | 276 | 277 | 278 | 279 | 280 | 281 | 282 | 283 | 284 | --------------------------------------------------------------------------------