├── LICENSE.md
├── project
│   └── assembly.sbt
├── .gitignore
├── docker
│   ├── ingest
│   │   └── Dockerfile
│   └── aggregation
│       └── Dockerfile
├── application.conf.template
├── docker-compose.yml
├── twitterstream
│   └── src
│       └── main
│           └── scala
│               ├── Logging.scala
│               ├── IngestApp.scala
│               ├── TweetSource.scala
│               ├── AggregationApp.scala
│               ├── Settings.scala
│               └── Serdes.scala
└── README.md

--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
Whatever you want.

--------------------------------------------------------------------------------
/project/assembly.sbt:
--------------------------------------------------------------------------------
addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.3")

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
application.conf
twitterstream/target/**
target
target/**
project/target/**
.idea/**

--------------------------------------------------------------------------------
/docker/ingest/Dockerfile:
--------------------------------------------------------------------------------
FROM anapsix/alpine-java:8

ADD application.conf /application.conf
ADD twitterstream.jar /twitterstream.jar

CMD ["java","-cp","twitterstream.jar","mwt.twitterstream.IngestApp"]

--------------------------------------------------------------------------------
/docker/aggregation/Dockerfile:
--------------------------------------------------------------------------------
FROM anapsix/alpine-java:8

ADD application.conf /application.conf
ADD twitterstream.jar /twitterstream.jar

CMD ["java","-cp","twitterstream.jar","mwt.twitterstream.AggregationApp"]

--------------------------------------------------------------------------------
/application.conf.template:
--------------------------------------------------------------------------------
kafka {
  brokers = "kafka:9092"
  zookeepers = "zookeeper:2181"
  raw_topic = "tweets"
  aggregation_topic = "aggregation"
  state_dir = "/tmp"
  partition = 0
}

twitter {
  consumer_key = ""
  consumer_secret = ""
  token = ""
  token_secret = ""
  terms = ["hashtag", "#hashtag"]
}

--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
version: '2'
services:
  ingest:
    container_name: twitterstream_ingest
    build: docker/ingest
    hostname: ingest
    links:
      - zookeeper
      - kafka

  aggregation:
    container_name: twitterstream_aggregation
    build: docker/aggregation
    hostname: aggregation
    links:
      - zookeeper
      - kafka

  zookeeper:
    container_name: twitterstream_zookeeper
    image: wurstmeister/zookeeper:3.4.6
    ports:
      - "2181:2181"

  kafka:
    container_name: twitterstream_kafka
    image: wurstmeister/kafka:0.10.0.0
    ports:
      - "9092:9092"
    environment:
      KAFKA_CREATE_TOPICS: "tweets:1:1,aggregation:1:1"
      KAFKA_ADVERTISED_HOST_NAME: kafka
      KAFKA_PORT: 9092
      KAFKA_ADVERTISED_PORT: 9092
      KAFKA_ZOOKEEPER_CONNECT: zookeeper:2181

--------------------------------------------------------------------------------
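Before a filled-in `application.conf` is baked into the images, it can be handy to check that it parses and contains the expected keys. A minimal sketch using Typesafe Config, the same library `Settings.scala` uses below (the `CheckConfig` object is illustrative, not part of the repository):

```scala
import java.io.File
import com.typesafe.config.ConfigFactory
import scala.collection.JavaConverters._

object CheckConfig extends App {
  val config = ConfigFactory.parseFile(new File("application.conf"))

  // Fail fast if a required key is missing or has the wrong type.
  val kafka = config.getConfig("kafka")
  println(s"brokers = ${kafka.getString("brokers")}")
  println(s"raw topic = ${kafka.getString("raw_topic")}")

  val twitter = config.getConfig("twitter")
  require(twitter.getString("consumer_key").nonEmpty, "consumer_key is empty")
  println(s"terms = ${twitter.getStringList("terms").asScala.mkString(", ")}")
}
```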
/twitterstream/src/main/scala/Logging.scala:
--------------------------------------------------------------------------------
package mwt.twitterstream

import com.typesafe.scalalogging.Logger
import org.apache.log4j.{BasicConfigurator, Level, Logger => UnderlyingLogger}
import org.slf4j.{ILoggerFactory, LoggerFactory}

import scala.sys.SystemProperties

object Logging {

  BasicConfigurator.configure()
  UnderlyingLogger.getRootLogger().setLevel(Level.INFO)

  def initWithConfigAt(path: String): Unit = {
    (new SystemProperties).getOrElseUpdate("logback.configurationFile", path)
    ()
  }

  private lazy val loggerFactory: ILoggerFactory =
    LoggerFactory.getILoggerFactory

  def logger(name: String): Logger = Logger(loggerFactory.getLogger(name))
}

trait Logging {

  import Logging._

  lazy val log: Logger =
    Logger(loggerFactory.getLogger(getClass.getName))
}

--------------------------------------------------------------------------------
/twitterstream/src/main/scala/IngestApp.scala:
--------------------------------------------------------------------------------
package mwt.twitterstream

import java.nio.charset.StandardCharsets

import sys.addShutdownHook
import org.apache.kafka.clients.producer.ProducerRecord

object IngestApp extends App with Logging {

  // Written from the shutdown-hook thread, read from the main loop.
  @volatile var closing = false

  log.info(Settings.config.toString)

  val source = Settings.tweetSource
  val producer = Settings.kafkaProducer
  val topic = Settings.rawTopic
  val partition = Settings.partition

  // close gracefully
  addShutdownHook {
    closing = true
    producer.close()
    source.hosebirdClient.stop()
  }

  while (!source.hosebirdClient.isDone && !closing) {
    source.take() match {
      case Some(json) => send(json)
      case None => // queue was empty or the client shut down; loop and re-check
    }
  }

  def send(msg: String): Unit = {
    val ts = System.currentTimeMillis()
    val key = TweetKey(Settings.filterTerms)
    val keyPayload = Json.ByteArray.encode(key)
    // getBytes(UTF_8) instead of the lossy msg.map(_.toByte): tweets are rarely pure ASCII
    val payload = msg.getBytes(StandardCharsets.UTF_8)
    val record = new ProducerRecord[Array[Byte], Array[Byte]](topic, partition, ts, keyPayload, payload)
    log.info(s"Sending to Kafka $record")
    producer.send(record)
  }
}

--------------------------------------------------------------------------------
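To verify that raw tweets actually arrive on the 'tweets' topic, a throwaway consumer can tail it. A minimal sketch against the 0.10.0.0 consumer API (the `TailTweets` name and group id are illustrative):

```scala
import java.util.{Collections, Properties}

import scala.collection.JavaConverters._
import org.apache.kafka.clients.consumer.{ConsumerConfig, KafkaConsumer}

object TailTweets extends App {
  val props = new Properties()
  props.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, "kafka:9092")
  props.put(ConsumerConfig.GROUP_ID_CONFIG, "tail-tweets")
  props.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest")
  val deserializer = "org.apache.kafka.common.serialization.ByteArrayDeserializer"
  props.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, deserializer)
  props.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, deserializer)

  val consumer = new KafkaConsumer[Array[Byte], Array[Byte]](props)
  consumer.subscribe(Collections.singletonList("tweets"))

  while (true) {
    // poll(timeoutMs) is the 0.10-era signature
    val records = consumer.poll(1000)
    records.asScala.foreach { record =>
      println(new String(record.value(), "UTF-8"))
    }
  }
}
```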
/twitterstream/src/main/scala/TweetSource.scala:
--------------------------------------------------------------------------------
package mwt.twitterstream

import java.util.concurrent.{LinkedBlockingQueue, TimeUnit}

import scala.collection.JavaConverters._
import com.twitter.hbc.ClientBuilder
import com.twitter.hbc.core.{Constants, HttpHosts}
import com.twitter.hbc.core.endpoint.StatusesFilterEndpoint
import com.twitter.hbc.core.processor.StringDelimitedProcessor
import com.twitter.hbc.httpclient.auth.OAuth1
import com.fasterxml.jackson.annotation.JsonIgnoreProperties

@JsonIgnoreProperties(ignoreUnknown = true)
case class Tweet(text: String)

case class TweetKey(filterTerms: Seq[String])

class TweetSource(oAuth1: OAuth1, terms: Seq[String]) extends Logging {
  val msgQueue = new LinkedBlockingQueue[String](1000)

  val hosebirdEndpoint = new StatusesFilterEndpoint()
  hosebirdEndpoint.trackTerms(terms.asJava)

  val builder = new ClientBuilder()
    .hosts(new HttpHosts(Constants.STREAM_HOST))
    .authentication(oAuth1)
    .endpoint(hosebirdEndpoint)
    .processor(new StringDelimitedProcessor(msgQueue))

  val hosebirdClient = builder.build()
  hosebirdClient.connect()

  /** Bounded wait instead of a blocking take(), so callers can re-check shutdown flags. */
  def take(): Option[String] = {
    if (hosebirdClient.isDone)
      None
    else
      Option(msgQueue.poll(1, TimeUnit.SECONDS))
  }
}

--------------------------------------------------------------------------------
/twitterstream/src/main/scala/AggregationApp.scala:
--------------------------------------------------------------------------------
package mwt.twitterstream

import com.twitter.conversions.time._
import org.apache.kafka.streams.{KafkaStreams, KeyValue}
import org.apache.kafka.streams.kstream._

case class WindowedWordHistogram(start: Long, end: Long, histogram: Map[String, Int])

object AggregationApp extends App {

  class WordHistogramInitializer extends Initializer[Map[String, Int]] {
    override def apply(): Map[String, Int] = Map()
  }

  class WordHistogramAggregator extends Aggregator[TweetKey, Array[String], Map[String, Int]] {
    override def apply(aggKey: TweetKey, value: Array[String], aggregate: Map[String, Int]) = {
      // within-tweet word frequencies
      val frequencies = value
        .groupBy(identity)
        .map { case (word, instances) => word -> instances.length }

      // merge into the running histogram: a word new to the window starts
      // from its within-tweet count, not from 1
      val updates = frequencies.map { case (word, count) =>
        word -> (aggregate.getOrElse(word, 0) + count)
      }

      aggregate ++ updates
    }
  }

  val (builder, properties) = Settings.kafkaStreamSource

  val out = builder.stream(new JSONSerde[TweetKey], new JSONSerde[Tweet], Settings.rawTopic)
    .mapValues(new ValueMapper[Tweet, Array[String]] {
      override def apply(value: Tweet): Array[String] =
        value.text
          .toLowerCase
          .split("\\s+")
          .filter(_.nonEmpty)
    })
    .aggregateByKey(
      new WordHistogramInitializer(),
      new WordHistogramAggregator(),
      TimeWindows.of("WORD_HISTOGRAM", (10.minutes).inMillis),
      new JSONSerde[TweetKey],
      new JSONSerde[Map[String, Int]])
    .toStream
    .map {
      new KeyValueMapper[Windowed[TweetKey], Map[String, Int], KeyValue[TweetKey, WindowedWordHistogram]] {
        override def apply(key: Windowed[TweetKey], value: Map[String, Int]) = {
          new KeyValue(key.key(), WindowedWordHistogram(key.window.start(), key.window.end(), value))
        }
      }
    }

  out.print()
  out.to(new JSONSerde[TweetKey], new JSONSerde[WindowedWordHistogram], Settings.aggregationTopic)

  val stream: KafkaStreams = new KafkaStreams(builder, properties)
  stream.start()

  // stop the topology cleanly on SIGTERM
  sys.addShutdownHook(stream.close())
}

--------------------------------------------------------------------------------
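The merge in `WordHistogramAggregator` is plain map arithmetic and can be checked in isolation. A small worked example (the object name and sample values are made up):

```scala
object HistogramMergeExample extends App {
  // same logic as WordHistogramAggregator.apply, minus the Kafka types
  def merge(aggregate: Map[String, Int], words: Array[String]): Map[String, Int] = {
    val frequencies = words.groupBy(identity).map { case (w, xs) => w -> xs.length }
    aggregate ++ frequencies.map { case (w, c) => w -> (aggregate.getOrElse(w, 0) + c) }
  }

  val window = merge(Map("kafka" -> 2), Array("kafka", "streams", "streams"))
  println(window) // Map(kafka -> 3, streams -> 2): counts accumulate across tweets
}
```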
/twitterstream/src/main/scala/Settings.scala:
--------------------------------------------------------------------------------
package mwt.twitterstream

import java.io.File
import java.util.{Properties, UUID}

import scala.collection.JavaConverters._
import com.twitter.hbc.httpclient.auth.OAuth1
import com.typesafe.config.ConfigFactory
import org.apache.kafka.clients.consumer.ConsumerConfig
import org.apache.kafka.clients.producer.{KafkaProducer, ProducerConfig}
import org.apache.kafka.streams.StreamsConfig
import org.apache.kafka.streams.kstream.KStreamBuilder

object Settings {

  val config = ConfigFactory.parseFile(new File("application.conf"))
  val kConfig = config.getConfig("kafka")
  val tConfig = config.getConfig("twitter")

  def filterTerms: Seq[String] = tConfig.getStringList("terms").asScala

  def zookeepers = kConfig.getString("zookeepers")

  def brokers = kConfig.getString("brokers")

  def rawTopic = kConfig.getString("raw_topic")

  def aggregationTopic = kConfig.getString("aggregation_topic")

  def partition = kConfig.getInt("partition")

  def stateDir = kConfig.getString("state_dir")

  def kafkaProducer = {
    val props = new Properties()
    val serde = "org.apache.kafka.common.serialization.ByteArraySerializer"
    props.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, brokers)
    props.put(ProducerConfig.ACKS_CONFIG, "all")
    props.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, serde)
    props.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, serde)
    new KafkaProducer[Array[Byte], Array[Byte]](props)
  }

  def kafkaStreamSource = {
    val builder: KStreamBuilder = new KStreamBuilder

    val streamingConfig = {
      val settings = new Properties
      settings.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest")
      // A random application id means every run starts as a fresh consumer
      // group and re-reads the topic from the beginning -- convenient for a
      // demo, but use a stable id in production.
      settings.put(StreamsConfig.APPLICATION_ID_CONFIG, UUID.randomUUID().toString)
      settings.put(StreamsConfig.BOOTSTRAP_SERVERS_CONFIG, brokers)
      settings.put(StreamsConfig.ZOOKEEPER_CONNECT_CONFIG, zookeepers)
      settings.put(StreamsConfig.STATE_DIR_CONFIG, stateDir)
      settings
    }

    (builder, streamingConfig)
  }

  def tweetSource = {
    val oAuth1 = new OAuth1(
      tConfig.getString("consumer_key"),
      tConfig.getString("consumer_secret"),
      tConfig.getString("token"),
      tConfig.getString("token_secret"))

    new TweetSource(oAuth1, filterTerms)
  }
}

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Processing Tweets with Kafka Streams in Scala

The example application consists of two services written in Scala: an ingestion service ([code](https://github.com/jpzk/twitterstream/blob/master/twitterstream/src/main/scala/IngestApp.scala)) and an aggregation service ([code](https://github.com/jpzk/twitterstream/blob/master/twitterstream/src/main/scala/AggregationApp.scala)). The ingestion service subscribes to the [Twitter Streaming API](https://dev.twitter.com/streaming/overview) and receives fresh tweets filtered by a list of terms. Each raw tweet is sent as JSON to the Kafka topic 'tweets'. The aggregation service consumes the raw tweets, parses them, and aggregates word counts in tumbling time windows. Kafka Streams uses an embedded [RocksDB](http://rocksdb.org/) to maintain local state. Every change to the aggregate is propagated to the topic 'aggregation'.
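Downstream consumers can decode the aggregation topic with the same Jackson setup the services use. A minimal sketch (assumes the project's `Json` helper and `WindowedWordHistogram` are on the classpath; the `decode` helper is illustrative):

```scala
import mwt.twitterstream.{Json, WindowedWordHistogram}

// decode the value bytes of one record from the 'aggregation' topic
def decode(value: Array[Byte]): WindowedWordHistogram = {
  val windowed = Json.ByteArray.decode[WindowedWordHistogram](value)
  // e.g. print the top five words of the window
  windowed.histogram.toSeq.sortBy(-_._2).take(5).foreach {
    case (word, count) => println(s"[${windowed.start}, ${windowed.end}) $word: $count")
  }
  windowed
}
```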
Both services share the same [SBT](http://www.scala-sbt.org/index.html) project and end up in the same fat JAR together with all their dependencies, which makes it easy to share code in this small example project. Both applications read [application.conf](https://github.com/jpzk/twitterstream/blob/master/application.conf.template) at runtime via the Settings object ([code](https://github.com/jpzk/twitterstream/blob/master/twitterstream/src/main/scala/Settings.scala)). I wrote a small [build script](https://github.com/jpzk/twitterstream/blob/master/build-run-containers.sh) that compiles the services, builds the Docker images, and runs the containers. A sketch of the assembly configuration follows below.
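The project's build.sbt is not shown in this listing; a minimal sketch of the assembly settings it presumably contains, matching the `twitterstream.jar` name the Dockerfiles expect (the version numbers and merge strategy are assumptions):

```scala
// build.sbt (sketch)
name := "twitterstream"

scalaVersion := "2.11.8"

// produce the artifact name the Dockerfiles ADD into the images
assemblyJarName in assembly := "twitterstream.jar"

// both apps live in one jar and are started via `java -cp`,
// so assembly needs no mainClass setting
assemblyMergeStrategy in assembly := {
  case PathList("META-INF", _ @ _*) => MergeStrategy.discard
  case _                            => MergeStrategy.first
}
```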
Read the [full article](https://www.madewithtea.com/processing-tweets-with-kafka-streams.html).

## Twitter Hosebird Client: References

* https://dev.twitter.com/streaming/overview
* https://github.com/twitter/hbc

## Kafka Streams: References

### Official Documentation

* http://www.confluent.io/blog/introducing-kafka-streams-stream-processing-made-simple
* https://kafka.apache.org/documentation.html
* http://docs.confluent.io/3.0.0/streams/javadocs/index.html
* http://docs.confluent.io/3.0.0/streams/developer-guide.html#kafka-streams-dsl

### Other Code Examples

* https://github.com/bbejeck/kafka-streams
* https://github.com/confluentinc/examples/blob/kafka-0.10.0.0-cp-3.0.0/kafka-streams/src/main/scala/io/confluent/examples/streams/MapFunctionScalaExample.scala

### Articles

* http://codingjunkie.net/kafka-processor-part1/
* http://codingjunkie.net/kafka-streams-part2/
* http://codingjunkie.net/kafka-streams-machine-learning/
* https://dzone.com/articles/machine-learning-with-kafka-streams

--------------------------------------------------------------------------------
/twitterstream/src/main/scala/Serdes.scala:
--------------------------------------------------------------------------------
package mwt.twitterstream

import java.lang.reflect.{ParameterizedType, Type}
import java.util

import com.fasterxml.jackson.annotation.JsonInclude
import com.fasterxml.jackson.core.JsonParseException
import com.fasterxml.jackson.core.`type`.TypeReference
import com.fasterxml.jackson.databind.ObjectMapper
import com.fasterxml.jackson.databind.exc.{UnrecognizedPropertyException => UPE}
import com.fasterxml.jackson.module.scala.DefaultScalaModule
import org.apache.kafka.common.serialization.{Deserializer, Serde, Serializer}

object Json {

  type ParseException = JsonParseException
  type UnrecognizedPropertyException = UPE

  private val mapper = new ObjectMapper()
  mapper.registerModule(DefaultScalaModule)
  mapper.setSerializationInclusion(JsonInclude.Include.NON_NULL)

  private def typeReference[T: Manifest] = new TypeReference[T] {
    override def getType = typeFromManifest(manifest[T])
  }

  // Rebuild a Jackson Type from a Scala Manifest so generic types
  // (e.g. Map[String, Int]) survive erasure.
  private def typeFromManifest(m: Manifest[_]): Type = {
    if (m.typeArguments.isEmpty) {
      m.runtimeClass
    }
    else new ParameterizedType {
      def getRawType = m.runtimeClass

      def getActualTypeArguments = m.typeArguments.map(typeFromManifest).toArray

      def getOwnerType = null
    }
  }

  object ByteArray {
    def encode(value: Any): Array[Byte] = mapper.writeValueAsBytes(value)

    def decode[T: Manifest](value: Array[Byte]): T =
      mapper.readValue(value, typeReference[T])
  }
}

/**
 * JSON serializer for the JSON serde
 *
 * @tparam T the type to serialize
 */
class JSONSerializer[T] extends Serializer[T] {
  override def configure(configs: util.Map[String, _], isKey: Boolean): Unit = ()

  override def serialize(topic: String, data: T): Array[Byte] =
    Json.ByteArray.encode(data)

  override def close(): Unit = ()
}

/**
 * JSON deserializer for the JSON serde
 *
 * @tparam T the type to deserialize; must admit null for empty payloads
 */
class JSONDeserializer[T >: Null <: Any : Manifest] extends Deserializer[T] {
  override def configure(configs: util.Map[String, _], isKey: Boolean): Unit = ()

  override def close(): Unit = ()

  override def deserialize(topic: String, data: Array[Byte]): T =
    if (data == null) null
    else Json.ByteArray.decode[T](data)
}

/**
 * JSON serde for local state serialization
 *
 * @tparam T the type carried through Kafka and the state stores
 */
class JSONSerde[T >: Null <: Any : Manifest] extends Serde[T] {
  override def deserializer(): Deserializer[T] = new JSONDeserializer[T]

  override def configure(configs: util.Map[String, _], isKey: Boolean): Unit = ()

  override def close(): Unit = ()

  override def serializer(): Serializer[T] = new JSONSerializer[T]
}

--------------------------------------------------------------------------------
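A quick round-trip through the JSON serde shows how keys and aggregates move through Kafka. A minimal sketch using only types defined in this repository (the `SerdeRoundTrip` object is illustrative):

```scala
import mwt.twitterstream.{JSONSerde, Json, Tweet, TweetKey}

object SerdeRoundTrip extends App {
  val serde = new JSONSerde[Tweet]

  // serialize and deserialize exactly as Kafka Streams would
  val bytes = serde.serializer().serialize("tweets", Tweet("hello #hashtag"))
  val back  = serde.deserializer().deserialize("tweets", bytes)
  println(back) // Tweet(hello #hashtag)

  // the Json helper works the same way outside the serde
  val key = Json.ByteArray.decode[TweetKey](Json.ByteArray.encode(TweetKey(Seq("#hashtag"))))
  println(key) // TweetKey(List(#hashtag))
}
```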