├── lib
├── project
│ ├── build.properties
│ ├── Versions.scala
│ └── Dependencies.scala
├── src
│ └── main
│ │ └── scala
│ │ └── com
│ │ └── lightbend
│ │ └── kafka
│ │ └── scala
│ │ ├── iq
│ │ ├── serializers
│ │ │ ├── ModelSerializer.scala
│ │ │ ├── SpecificAvroSerDeserializer.scala
│ │ │ ├── SpecificAvroSerde.scala
│ │ │ └── Serializers.scala
│ │ ├── http
│ │ │ ├── HttpRequester.scala
│ │ │ ├── InteractiveQueryHttpService.scala
│ │ │ └── KeyValueFetcher.scala
│ │ └── services
│ │ │ ├── MetadataService.scala
│ │ │ └── LocalStateStoreQuery.scala
│ │ └── package.scala
├── build.sbt
└── README.md
├── examples
├── project
│ ├── build.properties
│ ├── plugins.sbt
│ ├── Versions.scala
│ ├── Dependencies.scala
│ └── Common.scala
├── kafka-local-server
│ └── src
│ │ └── main
│ │ └── scala
│ │ └── com
│ │ └── lightbend
│ │ └── kafka
│ │ └── scala
│ │ └── server
│ │ ├── RecordProcessorTrait.scala
│ │ ├── Utils.scala
│ │ ├── MessageSender.scala
│ │ ├── MessageListener.scala
│ │ └── KafkaLocalServer.scala
├── example-dsl
│ ├── src
│ │ └── main
│ │ │ ├── scala
│ │ │ └── com
│ │ │ │ └── lightbend
│ │ │ │ └── kafka
│ │ │ │ └── scala
│ │ │ │ └── iq
│ │ │ │ ├── example
│ │ │ │ ├── models
│ │ │ │ │ ├── LogRecord.scala
│ │ │ │ │ └── LogParseUtil.scala
│ │ │ │ ├── serializers
│ │ │ │ │ ├── Tuple2Serializer.scala
│ │ │ │ │ ├── SpecificAvroSerdeWithSchemaRegistry.scala
│ │ │ │ │ ├── SpecificAvroDeserializerWithSchemaRegistry.scala
│ │ │ │ │ ├── SpecificAvroSerializerWithSchemaRegistry.scala
│ │ │ │ │ └── AppSerializers.scala
│ │ │ │ ├── http
│ │ │ │ │ ├── SummaryInfoFetcher.scala
│ │ │ │ │ └── WeblogDSLHttpService.scala
│ │ │ │ ├── ingestion
│ │ │ │ │ └── DataIngestion.scala
│ │ │ │ ├── config
│ │ │ │ │ └── KStreamConfig.scala
│ │ │ │ ├── WeblogWorkflow.scala
│ │ │ │ ├── WeblogProcessing.scala
│ │ │ │ └── package.scala
│ │ │ └── resources
│ │ │ ├── log4j.properties
│ │ │ ├── com
│ │ │ └── lightbend
│ │ │ │ └── kafka
│ │ │ │ └── scala
│ │ │ │ └── iq
│ │ │ │ └── example
│ │ │ │ └── LogRecord.avsc
│ │ │ ├── logback-dsl.xml
│ │ │ └── application-dsl.conf.template
│ └── README.md
├── example-proc
│ ├── src
│ │ └── main
│ │ │ ├── scala
│ │ │ └── com
│ │ │ │ └── lightbend
│ │ │ │ └── kafka
│ │ │ │ └── scala
│ │ │ │ └── iq
│ │ │ │ ├── example
│ │ │ │ ├── models
│ │ │ │ │ ├── LogRecord.scala
│ │ │ │ │ └── LogParseUtil.scala
│ │ │ │ ├── serializers
│ │ │ │ │ ├── AppSerializers.scala
│ │ │ │ │ └── Tuple2Serializer.scala
│ │ │ │ ├── services
│ │ │ │ │ └── LocalStateStoreQuery.scala
│ │ │ │ ├── processor
│ │ │ │ │ ├── BFStoreSupplier.scala
│ │ │ │ │ ├── BFStoreType.scala
│ │ │ │ │ ├── BFStoreBuilder.scala
│ │ │ │ │ ├── BFStoreChangeLogger.scala
│ │ │ │ │ ├── WeblogProcessor.scala
│ │ │ │ │ ├── BFSerde.scala
│ │ │ │ │ └── BFStore.scala
│ │ │ │ ├── http
│ │ │ │ │ ├── WeblogProcHttpService.scala
│ │ │ │ │ └── BFValueFetcher.scala
│ │ │ │ ├── ingestion
│ │ │ │ │ └── DataIngestion.scala
│ │ │ │ ├── config
│ │ │ │ │ └── KStreamConfig.scala
│ │ │ │ ├── WeblogDriver.scala
│ │ │ │ └── WeblogWorkflow.scala
│ │ │ │ └── package.scala
│ │ │ └── resources
│ │ │ ├── log4j.properties
│ │ │ ├── logback-proc.xml
│ │ │ └── application-proc.conf.template
│ └── README.md
└── build.sbt
├── .travis.yml
├── .gitignore
├── README.md
└── LICENSE
/lib/project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version=1.1.0 2 | -------------------------------------------------------------------------------- /examples/project/build.properties:
-------------------------------------------------------------------------------- 1 | sbt.version=1.1.0 2 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: scala 2 | sudo: false 3 | jdk: oraclejdk8 4 | cache: 5 | directories: 6 | - "$HOME/.ivy2/cache" 7 | - "$HOME/.sbt/launchers" 8 | before_cache: 9 | - find $HOME/.sbt -name "*.lock" | xargs rm 10 | - find $HOME/.ivy2 -name "ivydata-*.properties" | xargs rm 11 | matrix: 12 | include: 13 | - env: PROJECT="lib" 14 | script: 15 | - cd "${PROJECT}" 16 | - sbt +test 17 | -------------------------------------------------------------------------------- /examples/kafka-local-server/src/main/scala/com/lightbend/kafka/scala/server/RecordProcessorTrait.scala: -------------------------------------------------------------------------------- 1 | package com.lightbend.kafka.scala.server 2 | 3 | import org.apache.kafka.clients.consumer.ConsumerRecord 4 | 5 | // A trait, that should be implemented by any listener implementation 6 | 7 | trait RecordProcessorTrait[K, V] { 8 | def processRecord(record: ConsumerRecord[K, V]): Unit 9 | } 10 | -------------------------------------------------------------------------------- /examples/project/plugins.sbt: -------------------------------------------------------------------------------- 1 | resolvers += "Bintray Repository" at "https://dl.bintray.com/shmishleniy/" 2 | 3 | resolvers += "JAnalyse Repository" at "http://www.janalyse.fr/repository/" 4 | 5 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.5") 6 | addSbtPlugin("net.virtual-void" % "sbt-dependency-graph" % "0.9.0") 7 | addSbtPlugin("com.typesafe.sbt" % "sbt-native-packager" % "1.3.2") 8 | 9 | addSbtPlugin("com.cavorite" % "sbt-avro-1-8" % "1.1.3") 10 | 11 | -------------------------------------------------------------------------------- /examples/example-dsl/src/main/scala/com/lightbend/kafka/scala/iq/example/models/LogRecord.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (C) 2018 Lightbend Inc. 3 | */ 4 | 5 | package com.lightbend.kafka.scala.iq.example 6 | package models 7 | 8 | import java.time.OffsetDateTime 9 | 10 | case class LogRecord( 11 | host: String, 12 | clientId: String, 13 | user: String, 14 | timestamp: OffsetDateTime, 15 | method: String, 16 | endpoint: String, 17 | protocol: String, 18 | httpReplyCode: Int, 19 | payloadSize: Long 20 | ) 21 | -------------------------------------------------------------------------------- /examples/example-proc/src/main/scala/com/lightbend/kafka/scala/iq/example/models/LogRecord.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (C) 2018 Lightbend Inc. 
3 | */ 4 | 5 | package com.lightbend.kafka.scala.iq.example 6 | package models 7 | 8 | import java.time.OffsetDateTime 9 | 10 | case class LogRecord( 11 | host: String, 12 | clientId: String, 13 | user: String, 14 | timestamp: OffsetDateTime, 15 | method: String, 16 | endpoint: String, 17 | protocol: String, 18 | httpReplyCode: Int, 19 | payloadSize: Long 20 | ) 21 | -------------------------------------------------------------------------------- /lib/project/Versions.scala: -------------------------------------------------------------------------------- 1 | object Versions { 2 | val algebirdVersion = "0.13.0" 3 | val chillVersion = "0.9.2" 4 | val logbackVersion = "1.2.3" 5 | val kafkaVersion = "1.0.0" 6 | val scalaLoggingVersion = "3.5.0" 7 | val curatorVersion = "4.0.0" 8 | val minitestVersion = "2.0.0" 9 | val JDKVersion = "1.8" 10 | val scalaVersion = "2.12.4" 11 | val crossScalaVersions = Seq(scalaVersion, "2.11.11") 12 | val circeVersion = "0.8.0" 13 | val akkaVersion = "2.5.3" 14 | val akkaHttpVersion = "10.0.11" 15 | val akkaHttpCirceVersion = "1.17.0" 16 | val bijectionVersion = "0.9.5" 17 | } 18 | -------------------------------------------------------------------------------- /examples/example-dsl/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # Set root logger level to DEBUG and its only appender to A1. 2 | log4j.rootLogger=ERROR, R 3 | 4 | # A1 is set to be a ConsoleAppender. 5 | log4j.appender.A1=org.apache.log4j.ConsoleAppender 6 | 7 | log4j.appender.R=org.apache.log4j.RollingFileAppender 8 | log4j.appender.R.File=logs/kafka-server.log 9 | 10 | log4j.appender.R.MaxFileSize=100KB 11 | # Keep one backup file 12 | log4j.appender.R.MaxBackupIndex=1 13 | 14 | # A1 uses PatternLayout. 15 | log4j.appender.R.layout=org.apache.log4j.PatternLayout 16 | log4j.appender.R.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n 17 | -------------------------------------------------------------------------------- /examples/example-proc/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # Set root logger level to DEBUG and its only appender to A1. 2 | log4j.rootLogger=ERROR, R 3 | 4 | # A1 is set to be a ConsoleAppender. 5 | log4j.appender.A1=org.apache.log4j.ConsoleAppender 6 | 7 | log4j.appender.R=org.apache.log4j.RollingFileAppender 8 | log4j.appender.R.File=logs/kafka-server.log 9 | 10 | log4j.appender.R.MaxFileSize=100KB 11 | # Keep one backup file 12 | log4j.appender.R.MaxBackupIndex=1 13 | 14 | # A1 uses PatternLayout. 15 | log4j.appender.R.layout=org.apache.log4j.PatternLayout 16 | log4j.appender.R.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n 17 | -------------------------------------------------------------------------------- /examples/example-proc/src/main/scala/com/lightbend/kafka/scala/iq/example/serializers/AppSerializers.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (C) 2018 Lightbend Inc. 
3 | */ 4 | 5 | package com.lightbend.kafka.scala.iq.example 6 | package serializers 7 | 8 | import models.LogRecord 9 | import org.apache.kafka.common.serialization.Serdes 10 | import com.lightbend.kafka.scala.iq.serializers._ 11 | 12 | trait AppSerializers extends Serializers { 13 | final val ts = new Tuple2Serializer[String, String]() 14 | final val ms = new ModelSerializer[LogRecord]() 15 | final val logRecordSerde = Serdes.serdeFrom(ms, ms) 16 | final val tuple2StringSerde = Serdes.serdeFrom(ts, ts) 17 | } 18 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.class 2 | *.log 3 | .cache 4 | .history 5 | .DS_Store 6 | .lib/ 7 | app/* 8 | dist/* 9 | target/ 10 | tmp/ 11 | logs/ 12 | build/ 13 | lib_managed/ 14 | src_managed/ 15 | project/boot/ 16 | project/target/ 17 | project/project/ 18 | project/plugins/project/ 19 | #idea 20 | .idea 21 | *.iml 22 | .idea_modules 23 | *.json 24 | *.json-- 25 | tmp/ 26 | local_state_data/ 27 | *.swp 28 | .scala_dependencies 29 | .worksheet 30 | release/staging/ 31 | 32 | lib/project/build.properties 33 | examples/project/build.properties 34 | examples/kafka-local-server/project/build.properties 35 | 36 | examples/example-proc/src/main/resources/application-proc.conf 37 | examples/example-dsl/src/main/resources/application-dsl.conf 38 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # This library is not maintained anymore. 2 | 3 | 4 | ### HTTP Endpoints for Interactive Queries for Kafka Streams 5 | 6 | [![Build Status](https://secure.travis-ci.org/lightbend/kafka-streams-query.png)](http://travis-ci.org/lightbend/kafka-streams-query) 7 | 8 | Library offering http based query on top of Kafka Streams Interactive Queries. The project has 2 parts: 9 | 10 | 1. The core library, as described [here](lib/README.md) 11 | 2. A couple of example implementations, as described [here](examples/example-dsl/README.md) and [here](examples/example-proc/README.md) 12 | 13 | Please go through the above links to learn more about the library. 14 | 15 | > **NOTE:** This functionality may now exist in Kafka Streams itself. Check its current capabilities before introducing this library to your projects. 
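To give a concrete feel for "http based query": the sketch below is not part of the repository; it assumes a running example-dsl instance, and the object name, host, port and host key are placeholders. The path mirrors the requery paths used in SummaryInfoFetcher further down, so check the example READMEs and WeblogDSLHttpService for the exact routes and port your instance exposes.

```scala
import akka.actor.ActorSystem
import akka.http.scaladsl.Http
import akka.http.scaladsl.model.HttpRequest
import akka.http.scaladsl.unmarshalling.Unmarshal
import akka.stream.ActorMaterializer
import scala.concurrent.Future

object InteractiveQueryClientSketch extends App {
  implicit val system: ActorSystem = ActorSystem("iq-client")
  implicit val mat: ActorMaterializer = ActorMaterializer()
  import system.dispatcher

  // fetch the access count the DSL example keeps per host key
  // (localhost:7070 and the key are placeholders)
  val accessCount: Future[String] =
    Http()
      .singleRequest(HttpRequest(uri = "http://localhost:7070/weblog/access/world.std.com"))
      .flatMap(resp => Unmarshal(resp.entity).to[String])

  accessCount.foreach(println)
}
```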
16 | -------------------------------------------------------------------------------- /examples/project/Versions.scala: -------------------------------------------------------------------------------- 1 | object Versions { 2 | val ksVersion = "0.1.2" 3 | val kqVersion = "0.1.1" 4 | val scala2_12Version = "2.12.4" 5 | val scala2_11Version = "2.11.11" 6 | val scalaVersion = scala2_12Version 7 | val crossScalaVersions = Seq(scala2_12Version, scala2_11Version) 8 | val algebirdVersion = "0.13.0" 9 | val chillVersion = "0.9.2" 10 | val bijectionVersion = "0.9.5" 11 | val alpakkaFileVersion = "0.16" 12 | val reactiveKafkaVersion = "0.18" 13 | val confluentPlatformVersion = "3.3.0" 14 | val akkaVersion = "2.5.3" 15 | val akkaHttpVersion = "10.0.11" 16 | val akkaHttpCirceVersion = "1.17.0" 17 | val circeVersion = "0.8.0" 18 | val scalaLoggingVersion = "3.5.0" 19 | val logbackVersion = "1.2.3" 20 | val curatorVersion = "4.0.0" 21 | val kafkaVersion = "1.0.0" 22 | } 23 | 24 | -------------------------------------------------------------------------------- /lib/src/main/scala/com/lightbend/kafka/scala/iq/serializers/ModelSerializer.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (C) 2018 Lightbend Inc. 3 | */ 4 | 5 | package com.lightbend.kafka.scala.iq 6 | package serializers 7 | 8 | import java.util.Map 9 | 10 | import io.circe._, io.circe.parser._, io.circe.syntax._ 11 | 12 | 13 | class ModelSerializer[T : Encoder : Decoder] extends SerDeserializer[T] { 14 | 15 | override def configure(configs: Map[String, _], isKey: Boolean): Unit = () 16 | 17 | override def serialize(topic: String, t: T): Array[Byte] = 18 | t.asJson.noSpaces.getBytes(CHARSET) 19 | 20 | override def deserialize(topic: String, bytes: Array[Byte]): T = 21 | decode[T](new String(bytes, CHARSET)) match { 22 | case Right(t) => t 23 | case Left(err) => throw new IllegalArgumentException(err.toString) 24 | } 25 | 26 | override def close(): Unit = () 27 | } 28 | -------------------------------------------------------------------------------- /lib/src/main/scala/com/lightbend/kafka/scala/iq/serializers/SpecificAvroSerDeserializer.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (C) 2018 Lightbend Inc. 3 | */ 4 | 5 | package com.lightbend.kafka.scala.iq 6 | package serializers 7 | 8 | import com.twitter.bijection.Injection 9 | import org.apache.avro.specific.SpecificRecordBase 10 | import java.util.{Map => JMap} 11 | 12 | import scala.util.Try 13 | 14 | class SpecificAvroSerDeserializer[T <: SpecificRecordBase](injection: Injection[T, Array[Byte]]) extends SerDeserializer[T] { 15 | val inverted: Array[Byte] => Try[T] = injection.invert _ 16 | 17 | override def configure(configs: JMap[String, _], isKey: Boolean): Unit = () 18 | 19 | override def serialize(topic: String, record: T): Array[Byte] = injection(record) 20 | 21 | override def deserialize(s: String, bytes: Array[Byte]): T = inverted(bytes).get 22 | 23 | override def close(): Unit = () 24 | } 25 | -------------------------------------------------------------------------------- /examples/example-proc/src/main/scala/com/lightbend/kafka/scala/iq/example/services/LocalStateStoreQuery.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (C) 2018 Lightbend Inc. 
3 | */ 4 | 5 | package com.lightbend.kafka.scala.iq.example 6 | package services 7 | 8 | import org.apache.kafka.streams.KafkaStreams 9 | 10 | import scala.concurrent.{Future, ExecutionContext} 11 | import akka.actor.ActorSystem 12 | 13 | import processor.BFStoreType 14 | import com.twitter.algebird.Hash128 15 | 16 | import com.lightbend.kafka.scala.iq.services.LocalStateStoreQuery 17 | 18 | class AppStateStoreQuery[K, V] extends LocalStateStoreQuery[K, V] { 19 | 20 | def queryBFStore(streams: KafkaStreams, store: String, value: K) 21 | (implicit ex: ExecutionContext, mk: Hash128[K], as: ActorSystem): Future[Boolean] = { 22 | 23 | val q = new BFStoreType[K]()(mk) 24 | retry(streams.store(store, q), DelayBetweenRetries, MaxRetryCount)(ex, as.scheduler).map(_.read(value)) 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /examples/example-dsl/src/main/resources/com/lightbend/kafka/scala/iq/example/LogRecord.avsc: -------------------------------------------------------------------------------- 1 | { 2 | "namespace": "com.lightbend.kafka.scala.iq.example", 3 | "type": "record", 4 | "name": "LogRecordAvro", 5 | "fields":[ 6 | { 7 | "name": "host", "type": "string" 8 | }, 9 | { 10 | "name": "clientId", "type": "string" 11 | }, 12 | { 13 | "name": "user", "type": "string" 14 | }, 15 | { 16 | "name": "timestamp", "type": "string" 17 | }, 18 | { 19 | "name": "method", "type": "string" 20 | }, 21 | { 22 | "name": "endpoint", "type": "string" 23 | }, 24 | { 25 | "name": "protocol", "type": "string" 26 | }, 27 | { 28 | "name": "httpReplyCode", "type": "int" 29 | }, 30 | { 31 | "name": "payloadSize", "type": "long" 32 | } 33 | ] 34 | } 35 | 36 | -------------------------------------------------------------------------------- /lib/src/main/scala/com/lightbend/kafka/scala/iq/serializers/SpecificAvroSerde.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (C) 2018 Lightbend Inc. 3 | */ 4 | 5 | package com.lightbend.kafka.scala.iq 6 | package serializers 7 | 8 | import org.apache.kafka.common.serialization.{ Deserializer, Serde, Serializer } 9 | 10 | import org.apache.avro.Schema 11 | 12 | import com.twitter.bijection.Injection 13 | import com.twitter.bijection.avro.SpecificAvroCodecs 14 | 15 | import java.util.Map 16 | 17 | class SpecificAvroSerde[T <: org.apache.avro.specific.SpecificRecordBase](schema: Schema) extends Serde[T] { 18 | 19 | val recordInjection: Injection[T, Array[Byte]] = SpecificAvroCodecs.toBinary(schema) 20 | val avroSerde = new SpecificAvroSerDeserializer(recordInjection) 21 | 22 | override def serializer(): Serializer[T] = avroSerde 23 | 24 | override def deserializer(): Deserializer[T] = avroSerde 25 | 26 | override def configure(configs: Map[String, _], isKey: Boolean): Unit = () 27 | 28 | override def close(): Unit = () 29 | } 30 | -------------------------------------------------------------------------------- /examples/example-dsl/src/main/scala/com/lightbend/kafka/scala/iq/example/serializers/Tuple2Serializer.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (C) 2018 Lightbend Inc. 
3 | */ 4 | 5 | package com.lightbend.kafka.scala.iq.example 6 | package serializers 7 | 8 | import org.apache.kafka.common.serialization.{ Deserializer, Serializer } 9 | 10 | import io.circe._, io.circe.generic.auto._, io.circe.parser._, io.circe.syntax._ 11 | 12 | class Tuple2Serializer[T : Encoder : Decoder, 13 | U : Encoder : Decoder] extends Serializer[(T, U)] with Deserializer[(T, U)] { 14 | 15 | override def configure(configs: java.util.Map[String, _], isKey: Boolean) = {} 16 | 17 | override def serialize(topic: String, data: (T, U)) = 18 | data.asJson.noSpaces.getBytes(CHARSET) 19 | 20 | override def deserialize(topic: String, bytes: Array[Byte]) = { 21 | decode[(T, U)](new String(bytes, CHARSET)) match { 22 | case Right(t) => t 23 | case Left(err) => throw new IllegalArgumentException(err.toString) 24 | } 25 | } 26 | 27 | override def close() = {} 28 | } 29 | 30 | -------------------------------------------------------------------------------- /examples/example-proc/src/main/scala/com/lightbend/kafka/scala/iq/example/serializers/Tuple2Serializer.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (C) 2018 Lightbend Inc. 3 | */ 4 | 5 | package com.lightbend.kafka.scala.iq.example 6 | package serializers 7 | 8 | import org.apache.kafka.common.serialization.{ Deserializer, Serializer } 9 | 10 | import io.circe._, io.circe.generic.auto._, io.circe.parser._, io.circe.syntax._ 11 | 12 | class Tuple2Serializer[T : Encoder : Decoder, 13 | U : Encoder : Decoder] extends Serializer[(T, U)] with Deserializer[(T, U)] { 14 | 15 | override def configure(configs: java.util.Map[String, _], isKey: Boolean) = {} 16 | 17 | override def serialize(topic: String, data: (T, U)) = 18 | data.asJson.noSpaces.getBytes(CHARSET) 19 | 20 | override def deserialize(topic: String, bytes: Array[Byte]) = { 21 | decode[(T, U)](new String(bytes, CHARSET)) match { 22 | case Right(t) => t 23 | case Left(err) => throw new IllegalArgumentException(err.toString) 24 | } 25 | } 26 | 27 | override def close() = {} 28 | } 29 | 30 | -------------------------------------------------------------------------------- /examples/kafka-local-server/src/main/scala/com/lightbend/kafka/scala/server/Utils.scala: -------------------------------------------------------------------------------- 1 | package com.lightbend.kafka.scala.server 2 | 3 | 4 | import java.io.File 5 | import java.nio.file.{ FileVisitOption, Files, Paths } 6 | import java.util.Comparator 7 | 8 | import scala.util.{ Try, Success, Failure } 9 | import scala.collection.JavaConverters._ 10 | 11 | object Utils { 12 | def deleteDirectory(directory: File): Try[Unit] = Try { 13 | if (directory.exists()) { 14 | val rootPath = Paths.get(directory.getAbsolutePath) 15 | 16 | val files = Files.walk(rootPath, FileVisitOption.FOLLOW_LINKS).sorted(Comparator.reverseOrder()).iterator().asScala 17 | files.foreach(Files.delete) 18 | } 19 | } 20 | 21 | def dataDirectory(baseDir: String, directoryName: String): Try[File] = Try { 22 | 23 | val dataDirectory = new File(baseDir + directoryName) 24 | 25 | if (dataDirectory.exists() && !dataDirectory.isDirectory()) 26 | throw new IllegalArgumentException( 27 | s"Cannot use $directoryName as a directory name because a file with that name already exists in $dataDirectory.") 28 | 29 | dataDirectory 30 | } 31 | } 32 | -------------------------------------------------------------------------------- 
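As a side note on the Tuple2Serializer shown above: the sketch below is not part of the repository (the object name, topic string and values are made up); it round-trips a pair through the serializer the same way AppSerializers builds its serdes with `Serdes.serdeFrom(ts, ts)`.

```scala
import org.apache.kafka.common.serialization.Serdes
import com.lightbend.kafka.scala.iq.example.serializers.Tuple2Serializer

object Tuple2SerdeRoundTrip extends App {
  // circe supplies Encoder/Decoder instances for String and Long,
  // from which the tuple Encoder/Decoder is derived
  val ts = new Tuple2Serializer[String, Long]()

  // the same instance acts as both Serializer and Deserializer
  val tupleSerde = Serdes.serdeFrom(ts, ts)

  val bytes   = tupleSerde.serializer().serialize("demo-topic", ("world.std.com", 42L))
  val decoded = tupleSerde.deserializer().deserialize("demo-topic", bytes)

  assert(decoded == ("world.std.com", 42L)) // JSON round trip preserves the pair
}
```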
/examples/example-dsl/src/main/scala/com/lightbend/kafka/scala/iq/example/serializers/SpecificAvroSerdeWithSchemaRegistry.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (C) 2018 Lightbend Inc. 3 | */ 4 | 5 | package com.lightbend.kafka.scala.iq.example 6 | package serializers 7 | 8 | import org.apache.kafka.common.serialization.{ Deserializer, Serde, Serdes, Serializer } 9 | 10 | import java.util.Map 11 | 12 | class SpecificAvroSerdeWithSchemaRegistry[T <: org.apache.avro.specific.SpecificRecord] extends Serde[T] { 13 | 14 | val inner: Serde[T] = Serdes.serdeFrom(new SpecificAvroSerializerWithSchemaRegistry[T](), new SpecificAvroDeserializerWithSchemaRegistry[T]()) 15 | 16 | override def serializer(): Serializer[T] = inner.serializer() 17 | 18 | override def deserializer(): Deserializer[T] = inner.deserializer() 19 | 20 | override def configure(configs: Map[String, _], isKey: Boolean): Unit = { 21 | inner.serializer().configure(configs, isKey) 22 | inner.deserializer().configure(configs, isKey) 23 | } 24 | 25 | override def close(): Unit = { 26 | inner.serializer().close() 27 | inner.deserializer().close() 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /examples/example-proc/src/main/scala/com/lightbend/kafka/scala/iq/example/processor/BFStoreSupplier.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (C) 2018 Lightbend Inc. 3 | */ 4 | 5 | package com.lightbend.kafka.scala.iq.example 6 | package processor 7 | 8 | import org.apache.kafka.common.serialization.Serde 9 | import org.apache.kafka.streams.state.StoreSupplier 10 | import com.twitter.algebird.Hash128 11 | 12 | class BFStoreSupplier[T: Hash128](val name: String, 13 | val serde: Serde[T], 14 | val loggingEnabled: Boolean, 15 | val logConfig: java.util.Map[String, String]) extends StoreSupplier[BFStore[T]] { 16 | 17 | def this(name: String, serde: Serde[T]) { 18 | this(name, serde, true, new java.util.HashMap[String, String]) 19 | } 20 | 21 | def this(name: String, serde: Serde[T], loggingEnabled: Boolean) { 22 | this(name, serde, loggingEnabled, new java.util.HashMap[String, String]) 23 | } 24 | 25 | override def get(): BFStore[T] = new BFStore[T](name, width = 1048576) 26 | 27 | override def metricsScope(): String = "" 28 | 29 | } 30 | 31 | -------------------------------------------------------------------------------- /examples/example-proc/src/main/scala/com/lightbend/kafka/scala/iq/example/http/WeblogProcHttpService.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (C) 2018 Lightbend Inc. 
3 | */ 4 | 5 | package com.lightbend.kafka.scala.iq.example 6 | package http 7 | 8 | import akka.actor.ActorSystem 9 | 10 | import akka.stream.ActorMaterializer 11 | 12 | import io.circe.syntax._ 13 | 14 | import org.apache.kafka.streams.state.HostInfo 15 | 16 | import scala.concurrent.ExecutionContext 17 | import com.lightbend.kafka.scala.iq.http.InteractiveQueryHttpService 18 | 19 | 20 | class WeblogProcHttpService( 21 | hostInfo: HostInfo, 22 | bfValueFetcher: BFValueFetcher, 23 | actorSystem: ActorSystem, 24 | actorMaterializer: ActorMaterializer, 25 | ec: ExecutionContext 26 | ) extends InteractiveQueryHttpService(hostInfo, actorSystem, actorMaterializer, ec) { 27 | 28 | // define the routes 29 | val routes = handleExceptions(myExceptionHandler) { 30 | pathPrefix("weblog") { 31 | (get & pathPrefix("access" / "check") & path(Segment)) { hostKey => 32 | complete { 33 | bfValueFetcher.checkIfPresent(hostKey).map(_.asJson) 34 | } 35 | } 36 | } 37 | } 38 | } 39 | 40 | -------------------------------------------------------------------------------- /examples/example-dsl/src/main/scala/com/lightbend/kafka/scala/iq/example/serializers/SpecificAvroDeserializerWithSchemaRegistry.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (C) 2018 Lightbend Inc. 3 | */ 4 | 5 | package com.lightbend.kafka.scala.iq.example 6 | package serializers 7 | 8 | import org.apache.kafka.common.serialization.Deserializer 9 | 10 | import scala.collection.JavaConverters._ 11 | 12 | import java.util.{ Map => JMap } 13 | 14 | import io.confluent.kafka.serializers.KafkaAvroDeserializer 15 | 16 | import io.confluent.kafka.serializers.KafkaAvroDeserializerConfig.SPECIFIC_AVRO_READER_CONFIG 17 | 18 | class SpecificAvroDeserializerWithSchemaRegistry[T <: org.apache.avro.specific.SpecificRecord] extends Deserializer[T] { 19 | 20 | val inner: KafkaAvroDeserializer = new KafkaAvroDeserializer() 21 | 22 | override def configure(configs: JMap[String, _], isKey: Boolean): Unit = { 23 | val effectiveConfigs = Map(SPECIFIC_AVRO_READER_CONFIG -> true) ++ configs.asScala 24 | inner.configure(effectiveConfigs.asJava, isKey) 25 | } 26 | 27 | override def deserialize(s: String, bytes: Array[Byte]): T = inner.deserialize(s, bytes).asInstanceOf[T] 28 | 29 | override def close(): Unit = inner.close() 30 | } 31 | -------------------------------------------------------------------------------- /examples/example-dsl/src/main/resources/logback-dsl.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | logs/example-dsl.log 4 | true 5 | 6 | %d{HH:mm:ss.SSS} TKD [%thread] %-5level %logger{36} - %msg%n 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | -------------------------------------------------------------------------------- /examples/example-proc/src/main/resources/logback-proc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | logs/example-proc.log 4 | true 5 | 6 | %d{HH:mm:ss.SSS} TKD [%thread] %-5level %logger{36} - %msg%n 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | -------------------------------------------------------------------------------- /examples/example-dsl/src/main/scala/com/lightbend/kafka/scala/iq/example/serializers/SpecificAvroSerializerWithSchemaRegistry.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (C) 2018 Lightbend Inc. 
3 | */ 4 | 5 | package com.lightbend.kafka.scala.iq.example 6 | package serializers 7 | 8 | import org.apache.kafka.common.serialization.Serializer 9 | 10 | import scala.collection.JavaConverters._ 11 | import scala.collection.immutable.Map 12 | 13 | import java.util.{ Map => JMap } 14 | 15 | import io.confluent.kafka.serializers.KafkaAvroSerializer 16 | 17 | import io.confluent.kafka.serializers.KafkaAvroDeserializerConfig.SPECIFIC_AVRO_READER_CONFIG 18 | 19 | class SpecificAvroSerializerWithSchemaRegistry[T <: org.apache.avro.specific.SpecificRecord] extends Serializer[T] { 20 | 21 | val inner: KafkaAvroSerializer = new KafkaAvroSerializer() 22 | 23 | override def configure(configs: JMap[String, _], isKey: Boolean): Unit = { 24 | val effectiveConfigs = Map(SPECIFIC_AVRO_READER_CONFIG -> true) ++ configs.asScala 25 | inner.configure(effectiveConfigs.asJava, isKey) 26 | } 27 | 28 | override def serialize(topic: String, record: T): Array[Byte] = 29 | inner.serialize(topic, record) 30 | 31 | override def close(): Unit = inner.close() 32 | } 33 | -------------------------------------------------------------------------------- /examples/example-dsl/src/main/scala/com/lightbend/kafka/scala/iq/example/models/LogParseUtil.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (C) 2018 Lightbend Inc. 3 | */ 4 | 5 | package com.lightbend.kafka.scala.iq.example 6 | package models 7 | 8 | import java.time.OffsetDateTime 9 | import java.time.format.DateTimeFormatter 10 | 11 | import scala.util.Try 12 | 13 | object LogParseUtil { 14 | final val logRegex = """^(\S+) (\S+) (\S+) \[([\w:/]+\s[+\-]\d{4})\] "(\S+) (\S+)\s*(\S*)" (\d{3}) (\S+)""".r 15 | 16 | def parseLine(line: String): Try[LogRecord] = Try { 17 | logRegex.findFirstIn(line) match { 18 | case Some(logRegex(host, clientId, user, timestamp, method, endpoint, protocol, httpReplyCode, bytes)) => 19 | LogRecord(host, clientId, user, parseTimestamp(timestamp), method, endpoint, protocol, httpReplyCode.toInt, toSafeInt(bytes)) 20 | case _ => throw new IllegalArgumentException(s"Cannot parse line $line") 21 | } 22 | } 23 | 24 | private def parseTimestamp(s: String): OffsetDateTime = { 25 | val f = DateTimeFormatter.ofPattern("dd/MMM/yyyy:HH:mm:ss Z") 26 | OffsetDateTime.from(f.parse(s)) 27 | } 28 | 29 | private def toSafeInt(s: String): Int = try { 30 | s.toInt 31 | } catch { 32 | case _: Exception => 0 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /examples/example-proc/src/main/scala/com/lightbend/kafka/scala/iq/example/models/LogParseUtil.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (C) 2018 Lightbend Inc. 
3 | */ 4 | 5 | package com.lightbend.kafka.scala.iq.example 6 | package models 7 | 8 | import java.time.OffsetDateTime 9 | import java.time.format.DateTimeFormatter 10 | 11 | import scala.util.Try 12 | 13 | object LogParseUtil { 14 | final val logRegex = """^(\S+) (\S+) (\S+) \[([\w:/]+\s[+\-]\d{4})\] "(\S+) (\S+)\s*(\S*)" (\d{3}) (\S+)""".r 15 | 16 | def parseLine(line: String): Try[LogRecord] = Try { 17 | logRegex.findFirstIn(line) match { 18 | case Some(logRegex(host, clientId, user, timestamp, method, endpoint, protocol, httpReplyCode, bytes)) => 19 | LogRecord(host, clientId, user, parseTimestamp(timestamp), method, endpoint, protocol, httpReplyCode.toInt, toSafeInt(bytes)) 20 | case _ => throw new IllegalArgumentException(s"Cannot parse line $line") 21 | } 22 | } 23 | 24 | private def parseTimestamp(s: String): OffsetDateTime = { 25 | val f = DateTimeFormatter.ofPattern("dd/MMM/yyyy:HH:mm:ss Z") 26 | OffsetDateTime.from(f.parse(s)) 27 | } 28 | 29 | private def toSafeInt(s: String): Int = try { 30 | s.toInt 31 | } catch { 32 | case _: Exception => 0 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /examples/example-proc/src/main/scala/com/lightbend/kafka/scala/iq/example/processor/BFStoreType.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (C) 2018 Lightbend Inc. 3 | */ 4 | 5 | package com.lightbend.kafka.scala.iq.example 6 | package processor 7 | 8 | import org.apache.kafka.streams.state.QueryableStoreType 9 | import org.apache.kafka.streams.processor.StateStore 10 | import org.apache.kafka.streams.state.internals.StateStoreProvider 11 | 12 | import com.twitter.algebird.Hash128 13 | 14 | import scala.collection.JavaConverters._ 15 | 16 | class BFStoreType[T: Hash128] extends QueryableStoreType[ReadableBFStore[T]] { 17 | def accepts(stateStore: StateStore) = stateStore.isInstanceOf[BFStore[T]] 18 | 19 | def create(storeProvider: StateStoreProvider, storeName: String): BFStoreTypeWrapper[T] = 20 | new BFStoreTypeWrapper[T](storeProvider, storeName, this) 21 | } 22 | 23 | class BFStoreTypeWrapper[T: Hash128](val provider: StateStoreProvider, val storeName: String, 24 | val bfStoreType: QueryableStoreType[ReadableBFStore[T]]) extends ReadableBFStore[T] { 25 | 26 | def read(value: T): Boolean = { 27 | val stores: List[ReadableBFStore[T]] = provider.stores(storeName, bfStoreType).asScala.toList 28 | stores.map(store => store.read(value)).exists(_ == true) 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /examples/example-proc/src/main/scala/com/lightbend/kafka/scala/iq/example/processor/BFStoreBuilder.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (C) 2018 Lightbend Inc. 
3 | */ 4 | 5 | package com.lightbend.kafka.scala.iq.example 6 | package processor 7 | 8 | import org.apache.kafka.streams.state.StoreBuilder 9 | import com.twitter.algebird.Hash128 10 | 11 | class BFStoreBuilder[T: Hash128](val storeSupplier: BFStoreSupplier[T]) extends StoreBuilder[BFStore[T]] { 12 | 13 | override def name(): String = storeSupplier.name 14 | 15 | override def build(): BFStore[T] = storeSupplier.get() 16 | 17 | override def logConfig: java.util.Map[String, String] = storeSupplier.logConfig 18 | 19 | override def loggingEnabled(): Boolean = storeSupplier.loggingEnabled 20 | 21 | override def withCachingEnabled(): BFStoreBuilder[T] = this 22 | 23 | override def withLoggingDisabled(): BFStoreBuilder[T] = { 24 | storeSupplier.logConfig.clear() 25 | this 26 | } 27 | 28 | override def withLoggingEnabled(config: java.util.Map[String, String]): BFStoreBuilder[T] = { 29 | new BFStoreBuilder[T]( 30 | new BFStoreSupplier( 31 | storeSupplier.name, 32 | storeSupplier.serde, 33 | storeSupplier.loggingEnabled, 34 | config 35 | ) 36 | ) 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /lib/src/main/scala/com/lightbend/kafka/scala/package.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (C) 2018 Lightbend Inc. 3 | */ 4 | 5 | package com.lightbend.kafka.scala 6 | 7 | import java.nio.charset.Charset 8 | 9 | import scala.concurrent.duration._ 10 | import scala.concurrent.ExecutionContext 11 | import scala.concurrent.Future 12 | import akka.pattern.after 13 | import akka.actor.Scheduler 14 | 15 | package object iq { 16 | final val CHARSET = Charset.forName("UTF-8") 17 | 18 | def translateHostInterface(host: String) = { 19 | if (host == "0.0.0.0") { 20 | java.net.InetAddress.getLocalHost.getHostAddress 21 | } else { 22 | host 23 | } 24 | } 25 | 26 | /** 27 | * Given an operation that produces a T, returns a Future containing the result of T, unless an exception is thrown, 28 | * in which case the operation will be retried after _delay_ time, if there are more possible retries, which is configured through 29 | * the _retries_ parameter. If the operation does not succeed and there is no retries left, the resulting Future will 30 | * contain the last failure. 31 | **/ 32 | // https://gist.github.com/viktorklang/9414163 33 | def retry[T](op: => T, delay: FiniteDuration, retries: Int)(implicit ec: ExecutionContext, s: Scheduler): Future[T] = 34 | Future(op) recoverWith { case _ if retries > 0 => after(delay, s)(retry(op, delay, retries - 1)) } 35 | } 36 | -------------------------------------------------------------------------------- /examples/example-proc/src/main/scala/com/lightbend/kafka/scala/iq/example/processor/BFStoreChangeLogger.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (C) 2018 Lightbend Inc. 
3 | */ 4 | 5 | package com.lightbend.kafka.scala.iq.example 6 | package processor 7 | 8 | import org.apache.kafka.streams.processor.ProcessorContext 9 | import org.apache.kafka.streams.processor.internals.{ProcessorStateManager, RecordCollector} 10 | import org.apache.kafka.streams.state.StateSerdes 11 | 12 | class BFStoreChangeLogger[K, V](val storeName: String, 13 | val context: ProcessorContext, 14 | val partition: Int, 15 | val serialization: StateSerdes[K, V]) { 16 | 17 | private val topic = ProcessorStateManager.storeChangelogTopic(context.applicationId, storeName) 18 | private val collector = context.asInstanceOf[RecordCollector.Supplier].recordCollector 19 | 20 | def this(storeName: String, context: ProcessorContext, serialization: StateSerdes[K, V]) { 21 | this(storeName, context, context.taskId.partition, serialization) 22 | } 23 | 24 | def logChange(key: K, value: V): Unit = { 25 | if (collector != null) { 26 | val keySerializer = serialization.keySerializer 27 | val valueSerializer = serialization.valueSerializer 28 | collector.send(this.topic, key, value, this.partition, context.timestamp, keySerializer, valueSerializer) 29 | } 30 | } 31 | } 32 | 33 | -------------------------------------------------------------------------------- /examples/example-dsl/src/main/scala/com/lightbend/kafka/scala/iq/example/serializers/AppSerializers.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (C) 2018 Lightbend Inc. 3 | */ 4 | 5 | package com.lightbend.kafka.scala.iq.example 6 | package serializers 7 | 8 | import models.LogRecord 9 | import org.apache.kafka.common.serialization.Serdes 10 | import io.confluent.kafka.serializers.AbstractKafkaAvroSerDeConfig 11 | import com.lightbend.kafka.scala.iq.serializers._ 12 | 13 | trait AppSerializers extends Serializers { 14 | final val ts = new Tuple2Serializer[String, String]() 15 | final val ms = new ModelSerializer[LogRecord]() 16 | final val logRecordSerde = Serdes.serdeFrom(ms, ms) 17 | final val tuple2StringSerde = Serdes.serdeFrom(ts, ts) 18 | 19 | /** 20 | * The Serde instance varies depending on whether we are using Schema Registry. If we are using 21 | * schema registry, we use the serde provided by Confluent, else we use Avro serialization backed by 22 | * Twitter's bijection library 23 | */ 24 | def logRecordAvroSerde(maybeSchemaRegistryUrl: Option[String]) = maybeSchemaRegistryUrl.map { url => 25 | val serde = new SpecificAvroSerdeWithSchemaRegistry[LogRecordAvro]() 26 | serde.configure( 27 | java.util.Collections.singletonMap(AbstractKafkaAvroSerDeConfig.SCHEMA_REGISTRY_URL_CONFIG, url), 28 | false) 29 | serde 30 | }.getOrElse { 31 | new SpecificAvroSerde[LogRecordAvro](LogRecordAvro.SCHEMA$) 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /examples/example-proc/src/main/scala/com/lightbend/kafka/scala/iq/example/processor/WeblogProcessor.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (C) 2018 Lightbend Inc. 
3 | */ 4 | 5 | package com.lightbend.kafka.scala.iq.example 6 | package processor 7 | 8 | import scala.util.{ Success, Failure } 9 | import org.apache.kafka.streams.processor.{ AbstractProcessor, ProcessorContext, PunctuationType, Punctuator } 10 | import models.LogParseUtil 11 | import com.typesafe.scalalogging.LazyLogging 12 | 13 | class WeblogProcessor extends AbstractProcessor[String, String] with LazyLogging { 14 | private var bfStore: BFStore[String] = _ 15 | 16 | override def init(context: ProcessorContext): Unit = { 17 | super.init(context) 18 | this.context.schedule( 19 | 1000, 20 | PunctuationType.WALL_CLOCK_TIME, 21 | new Punctuator() { 22 | override def punctuate(timestamp: Long): Unit = () 23 | } 24 | ) 25 | bfStore = this.context.getStateStore(WeblogDriver.LOG_COUNT_STATE_STORE).asInstanceOf[BFStore[String]] 26 | } 27 | 28 | override def process(dummy: String, record: String): Unit = LogParseUtil.parseLine(record) match { 29 | case Success(r) => { 30 | bfStore + r.host 31 | bfStore.changeLogger.logChange(bfStore.changelogKey, bfStore.bf) 32 | } 33 | case Failure(ex) => { 34 | logger.warn(s"Error processing record $record .. skipping", ex) 35 | } 36 | } 37 | 38 | override def punctuate(timestamp: Long): Unit = super.punctuate(timestamp) 39 | override def close(): Unit = {} 40 | } 41 | -------------------------------------------------------------------------------- /examples/example-proc/src/main/scala/com/lightbend/kafka/scala/iq/example/processor/BFSerde.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (C) 2018 Lightbend Inc. 3 | */ 4 | 5 | package com.lightbend.kafka.scala.iq.example 6 | package processor 7 | 8 | import java.util 9 | 10 | import com.twitter.algebird.BF 11 | import com.twitter.chill.ScalaKryoInstantiator 12 | import org.apache.kafka.common.errors.SerializationException 13 | import org.apache.kafka.common.serialization._ 14 | 15 | class BFSerializer[T] extends Serializer[BF[T]] { 16 | 17 | override def configure(configs: util.Map[String, _], isKey: Boolean): Unit = { 18 | // nothing to do 19 | } 20 | 21 | override def serialize(topic: String, bf: BF[T]): Array[Byte] = 22 | if (bf == null) null 23 | else ScalaKryoInstantiator.defaultPool.toBytesWithClass(bf) 24 | 25 | override def close(): Unit = { 26 | // nothing to do 27 | } 28 | 29 | } 30 | 31 | class BFDeserializer[T] extends Deserializer[BF[T]] { 32 | 33 | override def configure(configs: util.Map[String, _], isKey: Boolean): Unit = { 34 | // nothing to do 35 | } 36 | 37 | override def deserialize(topic: String, bytes: Array[Byte]): BF[T] = 38 | if (bytes == null) null 39 | else if (bytes.isEmpty) throw new SerializationException("byte array must not be empty") 40 | else ScalaKryoInstantiator.defaultPool.fromBytes(bytes).asInstanceOf[BF[T]] 41 | 42 | override def close(): Unit = { 43 | // nothing to do 44 | } 45 | 46 | } 47 | 48 | object BFSerde { 49 | 50 | def apply[T]: Serde[BF[T]] = Serdes.serdeFrom(new BFSerializer[T], new BFDeserializer[T]) 51 | 52 | } 53 | 54 | -------------------------------------------------------------------------------- /examples/example-proc/src/main/resources/application-proc.conf.template: -------------------------------------------------------------------------------- 1 | akka { 2 | loglevel = INFO 3 | log-config-on-start = on 4 | loggers = ["akka.event.slf4j.Slf4jLogger"] 5 | logging-filter = "akka.event.slf4j.Slf4jLoggingFilter" 6 | event-handlers = ["akka.event.slf4j.Slf4jEventHandler"] 7 | } 8 | 9 | kafka { 10 
| # true if use local kafka server 11 | # false otherwise 12 | # if true, then setting of brokers below is ignored and set to that of KafkaLocalServer 13 | localserver = true 14 | 15 | ## bootstrap servers for Kafka 16 | brokers = "localhost:9092" 17 | brokers = ${?KAFKA_BROKERS} 18 | 19 | ## consumer group 20 | group = "group-proc" 21 | group = ${?KAFKA_GROUP_PROC} 22 | 23 | ## the source topic - processing starts with 24 | ## data in this topic (to be loaded by ingestion) 25 | fromtopic = "server-log-proc" 26 | fromtopic = ${?KAFKA_FROM_TOPIC_PROC} 27 | 28 | ## error topic for the initial processing 29 | errortopic = "logerr-proc" 30 | errortopic = ${?KAFKA_ERROR_TOPIC_PROC} 31 | 32 | ## folder where state stores are created by Kafka Streams 33 | statestoredir = "/tmp/kafka-streams" 34 | statestoredir = ${?STATESTOREDIR} 35 | 36 | ## settings for data ingestion 37 | loader { 38 | sourcetopic = ${kafka.fromtopic} 39 | sourcetopic = ${?KAFKA_FROM_TOPIC_PROC} 40 | 41 | directorytowatch = "/Users/myhome/ClarkNet-HTTP" 42 | directorytowatch = ${?DIRECTORY_TO_WATCH} 43 | 44 | pollinterval = 1 second 45 | } 46 | } 47 | 48 | # http endpoints of the weblog microservice 49 | http { 50 | # The port the dashboard listens on 51 | port = 7071 52 | port = ${?PORT0} 53 | 54 | # The interface the dashboard listens on 55 | interface = "localhost" 56 | interface = ${?INTERFACE_PROC} 57 | } 58 | 59 | -------------------------------------------------------------------------------- /lib/src/main/scala/com/lightbend/kafka/scala/iq/serializers/Serializers.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (C) 2018 Lightbend Inc. 3 | */ 4 | 5 | package com.lightbend.kafka.scala.iq 6 | package serializers 7 | 8 | import org.apache.kafka.streams.kstream.Windowed 9 | import org.apache.kafka.common.serialization._ 10 | import org.apache.kafka.streams.kstream.internals.{WindowedDeserializer, WindowedSerializer} 11 | 12 | trait SerDeserializer[T] extends Serializer[T] with Deserializer[T] 13 | 14 | trait Serializers { 15 | final val stringSerializer = new StringSerializer() 16 | final val stringDeserializer = new StringDeserializer() 17 | final val byteArraySerializer = new ByteArraySerializer() 18 | final val byteArrayDeserializer = new ByteArrayDeserializer() 19 | 20 | final val windowedStringSerializer: WindowedSerializer[String] = new WindowedSerializer[String](stringSerializer) 21 | final val windowedStringDeserializer: WindowedDeserializer[String] = new WindowedDeserializer[String](stringDeserializer) 22 | final val windowedStringSerde: Serde[Windowed[String]] = Serdes.serdeFrom(windowedStringSerializer, windowedStringDeserializer) 23 | 24 | final val windowedByteArraySerializer: WindowedSerializer[Array[Byte]] = new WindowedSerializer[Array[Byte]](byteArraySerializer) 25 | final val windowedByteArrayDeserializer: WindowedDeserializer[Array[Byte]] = new WindowedDeserializer[Array[Byte]](byteArrayDeserializer) 26 | final val windowedByteArraySerde: Serde[Windowed[Array[Byte]]] = Serdes.serdeFrom(windowedByteArraySerializer, windowedByteArrayDeserializer) 27 | 28 | final val stringSerde = Serdes.String() 29 | final val longSerde: Serde[Long] = Serdes.Long().asInstanceOf[Serde[Long]] 30 | final val byteArraySerde = Serdes.ByteArray() 31 | } 32 | -------------------------------------------------------------------------------- /lib/build.sbt: -------------------------------------------------------------------------------- 1 | import Dependencies._ 2 | 3 | 
name := "kafka-streams-query" 4 | 5 | organization := "com.lightbend" 6 | 7 | version := "0.1.1" 8 | 9 | scalaVersion := Versions.scalaVersion 10 | 11 | crossScalaVersions := Versions.crossScalaVersions 12 | 13 | scalacOptions := Seq("-Xexperimental", "-unchecked", "-deprecation", "-Ywarn-unused-import") 14 | 15 | parallelExecution in Test := false 16 | 17 | libraryDependencies ++= Seq( 18 | kafkaStreams excludeAll(ExclusionRule("org.slf4j", "slf4j-log4j12"), ExclusionRule("org.apache.zookeeper", "zookeeper")), 19 | scalaLogging, 20 | circeCore, 21 | circeGeneric, 22 | circeParser, 23 | akkaHttp, 24 | akkaStreams, 25 | akkaHttpCirce, 26 | akkaSlf4j, 27 | bijection 28 | ) 29 | 30 | licenses := Seq("Apache 2" -> new URL("http://www.apache.org/licenses/LICENSE-2.0.txt")) 31 | 32 | developers := List( 33 | Developer("debasishg", "Debasish Ghosh", "@debasishg", url("https://github.com/debasishg")), 34 | Developer("blublinsky", "Boris Lublinsky", "@blublinsky", url("https://github.com/blublinsky")), 35 | Developer("maasg", "Gerard Maas", "@maasg", url("https://github.com/maasg")), 36 | Developer("seglo", "Sean Glover", "@seglo", url("https://github.com/seglo")) 37 | ) 38 | 39 | organizationName := "lightbend" 40 | 41 | organizationHomepage := Some(url("http://lightbend.com/")) 42 | 43 | homepage := scmInfo.value map (_.browseUrl) 44 | 45 | scmInfo := Some(ScmInfo(url("https://github.com/lightbend/kafka-streams-query"), "git@github.com:lightbend/kafka-streams-query.git")) 46 | 47 | credentials += Credentials(Path.userHome / ".ivy2" / ".credentials") 48 | 49 | publishTo := { 50 | val nexus = "https://oss.sonatype.org/" 51 | if (isSnapshot.value) Some("snapshots" at nexus + "content/repositories/snapshots") 52 | else Some("releases" at nexus + "service/local/staging/deploy/maven2") 53 | } 54 | 55 | publishMavenStyle := true 56 | 57 | publishArtifact in Test := false 58 | -------------------------------------------------------------------------------- /lib/project/Dependencies.scala: -------------------------------------------------------------------------------- 1 | import sbt._ 2 | import Versions._ 3 | 4 | object Dependencies { 5 | 6 | implicit class Exclude(module: ModuleID) { 7 | def log4jExclude: ModuleID = 8 | module excludeAll(ExclusionRule("log4j")) 9 | 10 | def driverExclusions: ModuleID = 11 | module.log4jExclude.exclude("com.google.guava", "guava") 12 | .excludeAll(ExclusionRule("org.slf4j")) 13 | } 14 | 15 | val kafkaStreams = "org.apache.kafka" % "kafka-streams" % kafkaVersion 16 | val scalaLogging = "com.typesafe.scala-logging" %% "scala-logging" % scalaLoggingVersion 17 | val logback = "ch.qos.logback" % "logback-classic" % logbackVersion 18 | val kafka = "org.apache.kafka" %% "kafka" % kafkaVersion 19 | val curator = "org.apache.curator" % "curator-test" % curatorVersion 20 | val minitest = "io.monix" %% "minitest" % minitestVersion 21 | val minitestLaws = "io.monix" %% "minitest-laws" % minitestVersion 22 | val algebird = "com.twitter" %% "algebird-core" % algebirdVersion 23 | val chill = "com.twitter" %% "chill" % chillVersion 24 | val circeCore = "io.circe" %% "circe-core" % circeVersion 25 | val circeGeneric = "io.circe" %% "circe-generic" % circeVersion 26 | val circeParser = "io.circe" %% "circe-parser" % circeVersion 27 | val akkaSlf4j = "com.typesafe.akka" %% "akka-slf4j" % akkaVersion 28 | val akkaStreams = "com.typesafe.akka" %% "akka-stream" % akkaVersion 29 | val akkaHttp = "com.typesafe.akka" %% "akka-http" % akkaHttpVersion 30 | val akkaHttpCirce = "de.heikoseeberger" 
%% "akka-http-circe" % akkaHttpCirceVersion 31 | val bijection = "com.twitter" %% "bijection-avro" % bijectionVersion 32 | } 33 | 34 | 35 | 36 | -------------------------------------------------------------------------------- /examples/example-dsl/src/main/scala/com/lightbend/kafka/scala/iq/package.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (C) 2018 Lightbend Inc. 3 | */ 4 | 5 | package com.lightbend.kafka.scala.iq 6 | 7 | import java.nio.charset.Charset 8 | import cats.syntax.either._ 9 | import java.time.OffsetDateTime 10 | import io.circe._, io.circe.generic.semiauto._ 11 | import example.models.LogRecord 12 | 13 | import scala.concurrent.duration._ 14 | import scala.concurrent.ExecutionContext 15 | import scala.concurrent.Future 16 | import akka.pattern.after 17 | import akka.actor.Scheduler 18 | 19 | package object example { 20 | final val CHARSET = Charset.forName("UTF-8") 21 | 22 | implicit val encodeOffsetDateTime: Encoder[OffsetDateTime] = Encoder.encodeString.contramap[OffsetDateTime](_.toString) 23 | 24 | implicit val decodeInstant: Decoder[OffsetDateTime] = Decoder.decodeString.emap { str => 25 | Either.catchNonFatal(OffsetDateTime.parse(str)).leftMap(t => "OffsetDateTime") 26 | } 27 | 28 | implicit val logRecordDecoder: Decoder[LogRecord] = deriveDecoder[LogRecord] 29 | implicit val logRecordEncoder: Encoder[LogRecord] = deriveEncoder[LogRecord] 30 | 31 | implicit def asFiniteDuration(d: java.time.Duration) = 32 | scala.concurrent.duration.Duration.fromNanos(d.toNanos) 33 | 34 | def translateHostInterface(host: String) = host match { 35 | case "0.0.0.0" => java.net.InetAddress.getLocalHost.getHostAddress 36 | case x => x 37 | } 38 | 39 | /** 40 | * Given an operation that produces a T, returns a Future containing the result of T, unless an exception is thrown, 41 | * in which case the operation will be retried after _delay_ time, if there are more possible retries, which is configured through 42 | * the _retries_ parameter. If the operation does not succeed and there is no retries left, the resulting Future will 43 | * contain the last failure. 44 | **/ 45 | // https://gist.github.com/viktorklang/9414163 46 | def retry[T](op: => T, delay: FiniteDuration, retries: Int)(implicit ec: ExecutionContext, s: Scheduler): Future[T] = 47 | Future(op) recoverWith { case _ if retries > 0 => after(delay, s)(retry(op, delay, retries - 1)) } 48 | } 49 | -------------------------------------------------------------------------------- /examples/example-proc/src/main/scala/com/lightbend/kafka/scala/iq/package.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (C) 2018 Lightbend Inc. 
3 | */ 4 | 5 | package com.lightbend.kafka.scala.iq 6 | 7 | import java.nio.charset.Charset 8 | import cats.syntax.either._ 9 | import java.time.OffsetDateTime 10 | import io.circe._, io.circe.generic.semiauto._ 11 | import example.models.LogRecord 12 | 13 | import scala.concurrent.duration._ 14 | import scala.concurrent.ExecutionContext 15 | import scala.concurrent.Future 16 | import akka.pattern.after 17 | import akka.actor.Scheduler 18 | 19 | package object example { 20 | final val CHARSET = Charset.forName("UTF-8") 21 | 22 | implicit val encodeOffsetDateTime: Encoder[OffsetDateTime] = Encoder.encodeString.contramap[OffsetDateTime](_.toString) 23 | 24 | implicit val decodeInstant: Decoder[OffsetDateTime] = Decoder.decodeString.emap { str => 25 | Either.catchNonFatal(OffsetDateTime.parse(str)).leftMap(t => "OffsetDateTime") 26 | } 27 | 28 | implicit val logRecordDecoder: Decoder[LogRecord] = deriveDecoder[LogRecord] 29 | implicit val logRecordEncoder: Encoder[LogRecord] = deriveEncoder[LogRecord] 30 | 31 | implicit def asFiniteDuration(d: java.time.Duration) = 32 | scala.concurrent.duration.Duration.fromNanos(d.toNanos) 33 | 34 | def translateHostInterface(host: String) = host match { 35 | case "0.0.0.0" => java.net.InetAddress.getLocalHost.getHostAddress 36 | case x => x 37 | } 38 | 39 | /** 40 | * Given an operation that produces a T, returns a Future containing the result of T, unless an exception is thrown, 41 | * in which case the operation will be retried after _delay_ time, if there are more possible retries, which is configured through 42 | * the _retries_ parameter. If the operation does not succeed and there is no retries left, the resulting Future will 43 | * contain the last failure. 44 | **/ 45 | // https://gist.github.com/viktorklang/9414163 46 | def retry[T](op: => T, delay: FiniteDuration, retries: Int)(implicit ec: ExecutionContext, s: Scheduler): Future[T] = 47 | Future(op) recoverWith { case _ if retries > 0 => after(delay, s)(retry(op, delay, retries - 1)) } 48 | } 49 | -------------------------------------------------------------------------------- /lib/src/main/scala/com/lightbend/kafka/scala/iq/http/HttpRequester.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (C) 2018 Lightbend Inc. 3 | */ 4 | 5 | package com.lightbend.kafka.scala.iq 6 | package http 7 | 8 | import akka.actor.ActorSystem 9 | import akka.http.scaladsl.Http 10 | 11 | import akka.http.scaladsl.model.{ HttpResponse, HttpRequest, ResponseEntity } 12 | import akka.http.scaladsl.model.StatusCodes._ 13 | import akka.http.scaladsl.unmarshalling.{ Unmarshal, Unmarshaller } 14 | 15 | import akka.stream.ActorMaterializer 16 | 17 | import scala.concurrent.{ Future, ExecutionContext} 18 | 19 | import com.typesafe.scalalogging.LazyLogging 20 | import services.HostStoreInfo 21 | import java.io.IOException 22 | 23 | /** 24 | * Provides a generic API over HTTP to query from a host and a store. The result is 25 | * returned as a Future. 
26 | */ 27 | class HttpRequester(val actorSystem: ActorSystem, val mat: ActorMaterializer, 28 | val executionContext: ExecutionContext) extends LazyLogging { 29 | 30 | private implicit val as: ActorSystem = actorSystem 31 | private implicit val mt: ActorMaterializer = mat 32 | private implicit val ec: ExecutionContext = executionContext 33 | 34 | private def apiRequest(path: String, host: HostStoreInfo): Future[HttpResponse] = 35 | Http().singleRequest(HttpRequest(uri = s"http://${host.host}:${host.port}$path")) 36 | 37 | def queryFromHost[V](host: HostStoreInfo, 38 | path: String)(implicit u: Unmarshaller[ResponseEntity, V]): Future[V] = { 39 | apiRequest(path, host).flatMap { response => 40 | response.status match { 41 | case OK => Unmarshal(response.entity).to[V] 42 | 43 | case BadRequest => { 44 | logger.error(s"$path: incorrect path") 45 | Future.failed(new IOException(s"$path: incorrect path")) 46 | } 47 | 48 | case otherStatus => Unmarshal(response.entity).to[String].flatMap { entity => 49 | val error = s"state fetch request failed with status code ${otherStatus} and entity $entity" 50 | logger.error(error) 51 | Future.failed(new IOException(error)) 52 | } 53 | } 54 | } 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /examples/example-proc/src/main/scala/com/lightbend/kafka/scala/iq/example/http/BFValueFetcher.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (C) 2018 Lightbend Inc. 3 | */ 4 | 5 | package com.lightbend.kafka.scala.iq.example 6 | package http 7 | 8 | import akka.actor.ActorSystem 9 | 10 | import org.apache.kafka.streams.{ KafkaStreams } 11 | import org.apache.kafka.streams.state.HostInfo 12 | 13 | import scala.concurrent.{ Future, ExecutionContext} 14 | import scala.util.{ Success, Failure } 15 | 16 | import com.typesafe.scalalogging.LazyLogging 17 | import com.lightbend.kafka.scala.iq.services.{ MetadataService, HostStoreInfo } 18 | import services.AppStateStoreQuery 19 | import com.lightbend.kafka.scala.iq.http.HttpRequester 20 | import de.heikoseeberger.akkahttpcirce.FailFastCirceSupport 21 | import serializers.AppSerializers 22 | 23 | class BFValueFetcher( 24 | metadataService: MetadataService, 25 | localStateStoreQuery: AppStateStoreQuery[String, Long], 26 | httpRequester: HttpRequester, 27 | streams: KafkaStreams, 28 | executionContext: ExecutionContext, 29 | hostInfo: HostInfo)(implicit actorSystem: ActorSystem) extends LazyLogging with FailFastCirceSupport with AppSerializers { 30 | 31 | private implicit val ec: ExecutionContext = executionContext 32 | 33 | def checkIfPresent(hostKey: String): Future[Boolean] = { 34 | 35 | val store = WeblogDriver.LOG_COUNT_STATE_STORE 36 | val path = s"/weblog/access/check/$hostKey" 37 | 38 | metadataService.streamsMetadataForStoreAndKey(store, hostKey, stringSerializer) match { 39 | case Success(host) => { 40 | // hostKey is on another instance. call the other instance to fetch the data. 
41 | if (!thisHost(host)) { 42 | logger.warn(s"Key $hostKey is on another instance not on ${translateHostInterface(hostInfo.host)}:${hostInfo.port} - requerying ..") 43 | httpRequester.queryFromHost[Boolean](host, path) 44 | } else { 45 | // hostKey is on this instance 46 | localStateStoreQuery.queryBFStore(streams, store, hostKey) 47 | } 48 | } 49 | case Failure(ex) => Future.failed(ex) 50 | } 51 | } 52 | 53 | private def thisHost(host: HostStoreInfo): Boolean = 54 | host.host.equals(translateHostInterface(hostInfo.host)) && host.port == hostInfo.port 55 | } 56 | 57 | 58 | -------------------------------------------------------------------------------- /examples/example-dsl/src/main/scala/com/lightbend/kafka/scala/iq/example/http/SummaryInfoFetcher.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (C) 2018 Lightbend Inc. 3 | */ 4 | 5 | package com.lightbend.kafka.scala.iq.example 6 | package http 7 | 8 | import com.lightbend.kafka.scala.iq.http.KeyValueFetcher 9 | import scala.concurrent.Future 10 | 11 | class SummaryInfoFetcher(kvf: KeyValueFetcher[String, Long]) { 12 | def fetchAccessCountSummary(hostKey: String): Future[Long] = 13 | kvf.fetch(hostKey, WeblogProcessing.ACCESS_COUNT_PER_HOST_STORE, "/weblog/access/" + hostKey) 14 | 15 | def fetchPayloadSizeSummary(hostKey: String): Future[Long] = 16 | kvf.fetch(hostKey, WeblogProcessing.PAYLOAD_SIZE_PER_HOST_STORE, "/weblog/bytes/" + hostKey) 17 | 18 | def fetchRangeAccessCountSummary(fromKey: String, toKey: String): Future[List[(String, Long)]] = 19 | kvf.fetchRange(fromKey, toKey, WeblogProcessing.ACCESS_COUNT_PER_HOST_STORE, "/weblog/access/range/") 20 | 21 | def fetchRangePayloadSizeSummary(fromKey: String, toKey: String): Future[List[(String, Long)]] = 22 | kvf.fetchRange(fromKey, toKey, WeblogProcessing.PAYLOAD_SIZE_PER_HOST_STORE, "/weblog/bytes/range/") 23 | 24 | def fetchAllAccessCountSummary: Future[List[(String, Long)]] = 25 | kvf.fetchAll(WeblogProcessing.ACCESS_COUNT_PER_HOST_STORE, "/weblog/access/ALL") 26 | 27 | def fetchAllPayloadSizeSummary: Future[List[(String, Long)]] = 28 | kvf.fetchAll(WeblogProcessing.PAYLOAD_SIZE_PER_HOST_STORE, "/weblog/bytes/ALL") 29 | 30 | def fetchApproxAccessCountNumEntries: Future[Long] = 31 | kvf.fetchApproxNumEntries(WeblogProcessing.ACCESS_COUNT_PER_HOST_STORE, "/weblog/access/COUNT") 32 | 33 | def fetchApproxPayloadNumEntries: Future[Long] = 34 | kvf.fetchApproxNumEntries(WeblogProcessing.PAYLOAD_SIZE_PER_HOST_STORE, "/weblog/bytes/COUNT") 35 | 36 | def fetchWindowedAccessCountSummary(hostKey: String, fromTime: Long, toTime: Long): Future[List[(Long, Long)]] = 37 | kvf.fetchWindowed(hostKey, WeblogProcessing.WINDOWED_ACCESS_COUNT_PER_HOST_STORE, "/weblog/access/win/", fromTime, toTime) 38 | 39 | def fetchWindowedPayloadSizeSummary(hostKey: String, fromTime: Long, toTime: Long): Future[List[(Long, Long)]] = 40 | kvf.fetchWindowed(hostKey, WeblogProcessing.WINDOWED_PAYLOAD_SIZE_PER_HOST_STORE, "/weblog/bytes/win/", fromTime, toTime) 41 | 42 | } 43 | -------------------------------------------------------------------------------- /examples/example-dsl/src/main/scala/com/lightbend/kafka/scala/iq/example/ingestion/DataIngestion.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (C) 2018 Lightbend Inc. 
3 | */ 4 | 5 | package com.lightbend.kafka.scala.iq.example 6 | package ingestion 7 | 8 | import java.nio.file.{ Path, FileSystems } 9 | 10 | import akka.{ NotUsed, Done } 11 | import akka.util.ByteString 12 | import akka.actor.ActorSystem 13 | 14 | import akka.stream.ActorMaterializer 15 | import akka.stream.scaladsl.{ Framing, Source } 16 | import akka.stream.alpakka.file.DirectoryChange._ 17 | import akka.stream.alpakka.file.scaladsl._ 18 | 19 | import akka.kafka.ProducerSettings 20 | import akka.kafka.scaladsl.Producer 21 | 22 | import org.apache.kafka.clients.producer.ProducerRecord 23 | 24 | import scala.concurrent.duration._ 25 | import scala.concurrent.Future 26 | 27 | import config.KStreamConfig._ 28 | import serializers.AppSerializers 29 | import com.typesafe.scalalogging.LazyLogging 30 | 31 | object DataIngestion extends LazyLogging with AppSerializers { 32 | def registerForIngestion(config: ConfigData) 33 | (implicit system: ActorSystem, materializer: ActorMaterializer): Future[Done] = { 34 | 35 | val fs = FileSystems.getDefault 36 | 37 | config.directoryToWatch.map { dir => 38 | DirectoryChangesSource(fs.getPath(dir), 39 | config.pollInterval, 40 | maxBufferSize = 1024).runForeach { 41 | 42 | case (path, _@(Creation | Modification)) => { 43 | val _ = produce(path, config) 44 | () 45 | } 46 | case (_, Deletion) => () 47 | } 48 | }.getOrElse(Future.failed(new IllegalArgumentException("No directoryToWatch set in data ingestion module"))) 49 | } 50 | 51 | private def produce(path: Path, config: ConfigData) 52 | (implicit system: ActorSystem, materializer: ActorMaterializer): NotUsed = { 53 | 54 | val MAX_CHUNK_SIZE = 25000 55 | val POLLING_INTERVAL = 250 millis 56 | 57 | val producerSettings = ProducerSettings(system, byteArraySerde.serializer, stringSerializer).withBootstrapServers(config.brokers) 58 | 59 | val logLines: Source[String, NotUsed] = 60 | FileTailSource(path, MAX_CHUNK_SIZE, 0, POLLING_INTERVAL) 61 | .via(Framing.delimiter(ByteString.fromString("\n"), MAX_CHUNK_SIZE)) 62 | .map(_.utf8String) 63 | 64 | logLines 65 | .map(new ProducerRecord[Array[Byte], String](config.sourceTopic, _)) 66 | .to(Producer.plainSink(producerSettings)) 67 | .run() 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /examples/example-proc/src/main/scala/com/lightbend/kafka/scala/iq/example/ingestion/DataIngestion.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (C) 2018 Lightbend Inc. 
3 | */ 4 | 5 | package com.lightbend.kafka.scala.iq.example 6 | package ingestion 7 | 8 | import java.nio.file.{ Path, FileSystems } 9 | 10 | import akka.{ NotUsed, Done } 11 | import akka.util.ByteString 12 | import akka.actor.ActorSystem 13 | 14 | import akka.stream.ActorMaterializer 15 | import akka.stream.scaladsl.{ Framing, Source } 16 | import akka.stream.alpakka.file.DirectoryChange._ 17 | import akka.stream.alpakka.file.scaladsl._ 18 | 19 | import akka.kafka.ProducerSettings 20 | import akka.kafka.scaladsl.Producer 21 | 22 | import org.apache.kafka.clients.producer.ProducerRecord 23 | 24 | import scala.concurrent.duration._ 25 | import scala.concurrent.Future 26 | 27 | import config.KStreamConfig._ 28 | import serializers.AppSerializers 29 | import com.typesafe.scalalogging.LazyLogging 30 | 31 | object DataIngestion extends LazyLogging with AppSerializers { 32 | def registerForIngestion(config: ConfigData) 33 | (implicit system: ActorSystem, materializer: ActorMaterializer): Future[Done] = { 34 | 35 | val fs = FileSystems.getDefault 36 | 37 | config.directoryToWatch.map { dir => 38 | DirectoryChangesSource(fs.getPath(dir), 39 | config.pollInterval, 40 | maxBufferSize = 1024).runForeach { 41 | 42 | case (path, _@(Creation | Modification)) => { 43 | val _ = produce(path, config) 44 | () 45 | } 46 | case (_, Deletion) => () 47 | } 48 | }.getOrElse(Future.failed(new IllegalArgumentException("No directoryToWatch set in data ingestion module"))) 49 | } 50 | 51 | private def produce(path: Path, config: ConfigData) 52 | (implicit system: ActorSystem, materializer: ActorMaterializer): NotUsed = { 53 | 54 | val MAX_CHUNK_SIZE = 25000 55 | val POLLING_INTERVAL = 250 millis 56 | 57 | val producerSettings = ProducerSettings(system, byteArraySerde.serializer, stringSerializer) 58 | .withBootstrapServers(config.brokers) 59 | 60 | val logLines: Source[String, NotUsed] = 61 | FileTailSource(path, MAX_CHUNK_SIZE, 0, POLLING_INTERVAL) 62 | .via(Framing.delimiter(ByteString.fromString("\n"), MAX_CHUNK_SIZE)) 63 | .map(_.utf8String) 64 | 65 | logLines 66 | .map(new ProducerRecord[Array[Byte], String](config.sourceTopic, _)) 67 | .to(Producer.plainSink(producerSettings)) 68 | .run() 69 | } 70 | } 71 | -------------------------------------------------------------------------------- /lib/src/main/scala/com/lightbend/kafka/scala/iq/http/InteractiveQueryHttpService.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (C) 2018 Lightbend Inc. 3 | */ 4 | 5 | package com.lightbend.kafka.scala.iq 6 | package http 7 | 8 | import akka.actor.ActorSystem 9 | 10 | import akka.http.scaladsl.server.Directives 11 | import akka.http.scaladsl.Http 12 | 13 | import akka.http.scaladsl.model.{ HttpRequest, HttpResponse } 14 | import akka.http.scaladsl.model.StatusCodes._ 15 | import akka.http.scaladsl.server.ExceptionHandler 16 | import de.heikoseeberger.akkahttpcirce.FailFastCirceSupport 17 | 18 | import akka.stream.ActorMaterializer 19 | import akka.stream.scaladsl.Flow 20 | 21 | import org.apache.kafka.streams.state.HostInfo 22 | 23 | import scala.concurrent.{ Future, ExecutionContext} 24 | import scala.util.{ Success, Failure } 25 | 26 | import com.typesafe.scalalogging.LazyLogging 27 | 28 | 29 | /** 30 | * The interactive http query service. Offers APIs to start and stop the service. 
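 * A concrete subclass supplies the `routes`; a typical lifecycle, sketched here with the
 * DSL example's `WeblogDSLHttpService` (variable names are illustrative), is:
 * {{{
 *   val service = new WeblogDSLHttpService(hostInfo, summaryInfoFetcher, system, materializer, executionContext)
 *   service.start()
 *   sys.addShutdownHook(service.stop())
 * }}}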
31 | */ 32 | abstract class InteractiveQueryHttpService(hostInfo: HostInfo, 33 | actorSystem: ActorSystem, 34 | actorMaterializer: ActorMaterializer, 35 | ec: ExecutionContext) 36 | extends Directives with FailFastCirceSupport with LazyLogging { 37 | 38 | implicit val _actorSystem = actorSystem 39 | implicit val _actorMaterializer = actorMaterializer 40 | implicit val _ec = ec 41 | 42 | val myExceptionHandler = ExceptionHandler { 43 | case ex: Exception => 44 | extractUri { uri => 45 | logger.error(s"Request to $uri could not be handled normally", ex) 46 | complete(HttpResponse(InternalServerError, entity = "Request Failed!")) 47 | } 48 | } 49 | 50 | // define the routes 51 | val routes: Flow[HttpRequest, HttpResponse, Any] 52 | var bindingFuture: Future[Http.ServerBinding] = _ 53 | 54 | 55 | // start the http server 56 | def start(): Unit = { 57 | bindingFuture = Http().bindAndHandle(routes, hostInfo.host, hostInfo.port) 58 | 59 | bindingFuture.onComplete { 60 | case Success(serverBinding) => 61 | logger.info(s"Server bound to ${serverBinding.localAddress} ") 62 | 63 | case Failure(ex) => 64 | logger.error(s"Failed to bind to ${hostInfo.host}:${hostInfo.port}!", ex) 65 | actorSystem.terminate() 66 | } 67 | } 68 | 69 | 70 | // stop the http server 71 | def stop(): Unit = { 72 | logger.info("Stopping the http server") 73 | bindingFuture 74 | .flatMap(_.unbind()) 75 | .onComplete(_ => actorSystem.terminate()) 76 | } 77 | } 78 | 79 | -------------------------------------------------------------------------------- /examples/example-dsl/src/main/resources/application-dsl.conf.template: -------------------------------------------------------------------------------- 1 | akka { 2 | loglevel = INFO 3 | log-config-on-start = on 4 | loggers = ["akka.event.slf4j.Slf4jLogger"] 5 | logging-filter = "akka.event.slf4j.Slf4jLoggingFilter" 6 | event-handlers = ["akka.event.slf4j.Slf4jEventHandler"] 7 | } 8 | 9 | kafka { 10 | # true if use local kafka server 11 | # false otherwise 12 | # if true, then setting of brokers below is ignored and set to that of KafkaLocalServer 13 | localserver = true 14 | 15 | ## bootstrap servers for Kafka 16 | brokers = "localhost:9092" 17 | brokers = ${?KAFKA_BROKERS} 18 | 19 | ## consumer group 20 | group = "group-dsl" 21 | group = ${?KAFKA_GROUP_DSL} 22 | 23 | ## the source topic - processing starts with 24 | ## data in this topic (to be loaded by ingestion) 25 | fromtopic = "server-log-dsl" 26 | fromtopic = ${?KAFKA_FROM_TOPIC_DSL} 27 | 28 | ## processed records goes here in json of LogRecord 29 | totopic = "processed-log" 30 | totopic = ${?KAFKA_TO_TOPIC_DSL} 31 | 32 | ## this gets the avro serialized data from totopic for processing by Kafka Connect 33 | ## HDFS sink connector 34 | avrotopic = "avro-topic" 35 | avrotopic = ${?KAFKA_AVRO_TOPIC_DSL} 36 | 37 | ## summary access information gets pushed here 38 | summaryaccesstopic = "summary-access-log" 39 | summaryaccesstopic = ${?KAFKA_SUMMARY_ACCESS_TOPIC_DSL} 40 | 41 | ## windowed summary access information gets pushed here 42 | windowedsummaryaccesstopic = "windowed-summary-access-log" 43 | windowedsummaryaccesstopic = ${?KAFKA_WINDOWED_SUMMARY_ACCESS_TOPIC_DSL} 44 | 45 | ## summary payload information gets pushed here 46 | summarypayloadtopic = "summary-payload-log" 47 | summarypayloadtopic = ${?KAFKA_SUMMARY_PAYLOAD_TOPIC_DSL} 48 | 49 | ## windowed summary payload information gets pushed here 50 | windowedsummarypayloadtopic = "windowed-summary-payload-log" 51 | windowedsummarypayloadtopic = 
${?KAFKA_WINDOWED_SUMMARY_PAYLOAD_TOPIC_DSL} 52 | 53 | ## error topic for the initial processing 54 | errortopic = "logerr-dsl" 55 | errortopic = ${?KAFKA_ERROR_TOPIC_DSL} 56 | 57 | # schemaregistryurl = "http://localhost:8081" 58 | # schemaregistryurl = ${?SCHEMA_REGISTRY_URL} 59 | 60 | ## folder where state stores are created by Kafka Streams 61 | statestoredir = "/tmp/kafka-streams" 62 | statestoredir = ${?STATESTOREDIR} 63 | 64 | ## settings for data ingestion 65 | loader { 66 | sourcetopic = ${kafka.fromtopic} 67 | sourcetopic = ${?KAFKA_FROM_TOPIC_DSL} 68 | 69 | directorytowatch = "/Users/myhome/ClarkNet-HTTP" 70 | directorytowatch = ${?DIRECTORY_TO_WATCH} 71 | 72 | pollinterval = 1 second 73 | } 74 | } 75 | 76 | # http endpoints of the weblog microservice 77 | http { 78 | # The port the dashboard listens on 79 | port = 7070 80 | port = ${?PORT0} 81 | 82 | # The interface the dashboard listens on 83 | interface = "localhost" 84 | interface = ${?INTERFACE_DSL} 85 | } 86 | 87 | -------------------------------------------------------------------------------- /examples/kafka-local-server/src/main/scala/com/lightbend/kafka/scala/server/MessageSender.scala: -------------------------------------------------------------------------------- 1 | package com.lightbend.kafka.scala.server 2 | 3 | import org.apache.kafka.clients.producer.{ KafkaProducer, ProducerConfig, ProducerRecord, RecordMetadata } 4 | import java.util.Properties 5 | 6 | object MessageSender { 7 | private val ACKS_CONFIG = "all" // Blocking on the full commit of the record 8 | private val RETRIES_CONFIG = "1" // Number of retries on put 9 | private val BATCH_SIZE_CONFIG = "1024" // Buffers for unsent records for each partition - controls batching 10 | private val LINGER_MS_CONFIG = "1" // Timeout for more records to arrive - controls batching 11 | 12 | private val BUFFER_MEMORY_CONFIG = "1024000" // Controls the total amount of memory available to the producer for buffering. 13 | // If records are sent faster than they can be transmitted to the server then this 14 | // buffer space will be exhausted. When the buffer space is exhausted additional 15 | // send calls will block. The threshold for time to block is determined by max.block.ms 16 | // after which it throws a TimeoutException.
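  // Usage sketch (assumes a broker at localhost:9092 and Kafka's standard StringSerializer
  // for both key and value; the topic name and message are illustrative):
  //
  //   val sender = MessageSender[String, String](
  //     "localhost:9092",
  //     "org.apache.kafka.common.serialization.StringSerializer",
  //     "org.apache.kafka.common.serialization.StringSerializer")
  //   sender.writeValue("some-topic", "some log line")
  //   sender.close()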
17 | 18 | def providerProperties(brokers: String, keySerializer: String, valueSerializer: String): Properties = { 19 | val props = new Properties 20 | props.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, brokers) 21 | props.put(ProducerConfig.ACKS_CONFIG, ACKS_CONFIG) 22 | props.put(ProducerConfig.RETRIES_CONFIG, RETRIES_CONFIG) 23 | props.put(ProducerConfig.BATCH_SIZE_CONFIG, BATCH_SIZE_CONFIG) 24 | props.put(ProducerConfig.LINGER_MS_CONFIG, LINGER_MS_CONFIG) 25 | props.put(ProducerConfig.BUFFER_MEMORY_CONFIG, BUFFER_MEMORY_CONFIG) 26 | props.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, keySerializer) 27 | props.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, valueSerializer) 28 | props 29 | } 30 | 31 | def apply[K, V](brokers: String, keySerializer: String, valueSerializer: String): MessageSender[K, V] = 32 | new MessageSender[K, V](brokers, keySerializer, valueSerializer) 33 | } 34 | 35 | class MessageSender[K, V](val brokers: String, val keySerializer: String, val valueSerializer: String) { 36 | 37 | import MessageSender._ 38 | val producer = new KafkaProducer[K, V](providerProperties(brokers, keySerializer, valueSerializer)) 39 | 40 | def writeKeyValue(topic: String, key: K, value: V): Unit = { 41 | producer.send(new ProducerRecord[K, V](topic, key, value)).get 42 | producer.flush() 43 | } 44 | 45 | def writeValue(topic: String, value: V): Unit = { 46 | producer.send(new ProducerRecord[K, V](topic, null.asInstanceOf[K], value)).get 47 | producer.flush() 48 | } 49 | 50 | def batchWriteValue(topic: String, batch: Seq[V]): Seq[RecordMetadata] = { 51 | val result = batch.map(value => { 52 | producer.send(new ProducerRecord[K, V](topic, null.asInstanceOf[K], value)).get}) 53 | producer.flush() 54 | result 55 | } 56 | 57 | def close(): Unit = { 58 | producer.close() 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /examples/example-dsl/src/main/scala/com/lightbend/kafka/scala/iq/example/http/WeblogDSLHttpService.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (C) 2018 Lightbend Inc. 
3 | */ 4 | 5 | package com.lightbend.kafka.scala.iq.example 6 | package http 7 | 8 | import akka.actor.ActorSystem 9 | 10 | import akka.stream.ActorMaterializer 11 | 12 | import io.circe.generic.auto._ 13 | import io.circe.syntax._ 14 | 15 | import org.apache.kafka.streams.state.HostInfo 16 | 17 | import scala.concurrent.ExecutionContext 18 | import com.lightbend.kafka.scala.iq.http.InteractiveQueryHttpService 19 | 20 | 21 | class WeblogDSLHttpService( 22 | hostInfo: HostInfo, 23 | summaryInfoFetcher: SummaryInfoFetcher, 24 | actorSystem: ActorSystem, 25 | actorMaterializer: ActorMaterializer, 26 | ec: ExecutionContext 27 | ) extends InteractiveQueryHttpService(hostInfo, actorSystem, actorMaterializer, ec) { 28 | 29 | 30 | // define the routes 31 | val routes = handleExceptions(myExceptionHandler) { 32 | pathPrefix("weblog") { 33 | (get & pathPrefix("access" / "win") & path(Segment)) { hostKey => 34 | complete { 35 | summaryInfoFetcher.fetchWindowedAccessCountSummary(hostKey, 0, System.currentTimeMillis).map(_.asJson) 36 | } 37 | } ~ 38 | (get & pathPrefix("bytes" / "win") & path(Segment)) { hostKey => 39 | complete { 40 | summaryInfoFetcher.fetchWindowedPayloadSizeSummary(hostKey, 0, System.currentTimeMillis).map(_.asJson) 41 | } 42 | } ~ 43 | (get & pathPrefix("access" / "win" / Segment / LongNumber / LongNumber)) { (hostKey, fromTime, toTime) => 44 | complete { 45 | summaryInfoFetcher.fetchWindowedAccessCountSummary(hostKey, fromTime, toTime).map(_.asJson) 46 | } 47 | } ~ 48 | (get & pathPrefix("bytes" / "win" / Segment / LongNumber / LongNumber)) { (hostKey, fromTime, toTime) => 49 | complete { 50 | summaryInfoFetcher.fetchWindowedPayloadSizeSummary(hostKey, fromTime, toTime).map(_.asJson) 51 | } 52 | } ~ 53 | (get & pathPrefix("access" / "range" / Segment / Segment)) { (fromKey, toKey) => 54 | complete { 55 | summaryInfoFetcher.fetchRangeAccessCountSummary(fromKey, toKey).map(_.asJson) 56 | } 57 | } ~ 58 | (get & pathPrefix("bytes" / "range" / Segment / Segment)) { (fromKey, toKey) => 59 | complete { 60 | summaryInfoFetcher.fetchRangePayloadSizeSummary(fromKey, toKey).map(_.asJson) 61 | } 62 | } ~ 63 | (get & pathPrefix("access") & path(Segment)) { hostKey => 64 | complete { 65 | if (hostKey == "ALL") summaryInfoFetcher.fetchAllAccessCountSummary.map(_.asJson) 66 | else if (hostKey == "COUNT") summaryInfoFetcher.fetchApproxAccessCountNumEntries.map(_.asJson) 67 | else summaryInfoFetcher.fetchAccessCountSummary(hostKey).map(_.asJson) 68 | } 69 | } ~ 70 | (get & pathPrefix("bytes") & path(Segment)) { hostKey => 71 | complete { 72 | if (hostKey == "ALL") summaryInfoFetcher.fetchAllPayloadSizeSummary.map(_.asJson) 73 | else if (hostKey == "COUNT") summaryInfoFetcher.fetchApproxPayloadNumEntries.map(_.asJson) 74 | else summaryInfoFetcher.fetchPayloadSizeSummary(hostKey).map(_.asJson) 75 | } 76 | } 77 | } 78 | } 79 | } 80 | -------------------------------------------------------------------------------- /lib/src/main/scala/com/lightbend/kafka/scala/iq/services/MetadataService.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (C) 2018 Lightbend Inc. 3 | */ 4 | 5 | /* 6 | * Copyright Confluent Inc. 7 | * 8 | * Licensed under the Apache License, Version 2.0 (the "License"); 9 | * you may not use this file except in compliance with the License. 
10 | * You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 19 | */ 20 | 21 | package com.lightbend.kafka.scala.iq 22 | package services 23 | 24 | import org.apache.kafka.common.serialization.Serializer 25 | import org.apache.kafka.streams.KafkaStreams 26 | import org.apache.kafka.streams.state.StreamsMetadata 27 | 28 | import scala.collection.JavaConverters._ 29 | import scala.util.{Failure, Success, Try} 30 | import com.typesafe.scalalogging.LazyLogging 31 | 32 | case class HostStoreInfo(host: String, port: Int, storeNames: Set[String]) 33 | 34 | /** 35 | * Looks up StreamsMetadata from KafkaStreams 36 | * Adapted from https://github.com/confluentinc/kafka-streams-examples/blob/4.0.0-post/src/main/java/io/confluent/examples/streams/interactivequeries/MetadataService.java 37 | */ 38 | class MetadataService(val streams: KafkaStreams) extends LazyLogging { 39 | 40 | /** 41 | * Get the metadata for all of the instances of this Kafka Streams application 42 | * @return List of {@link HostStoreInfo} 43 | */ 44 | def streamsMetadata(): List[HostStoreInfo] = { 45 | // Get metadata for all of the instances of this Kafka Streams application 46 | streams.allMetadata().asScala.toList.map(streamsMetadataToHostStoreInfo) 47 | } 48 | 49 | /** 50 | * Get the metadata for all instances of this Kafka Streams application that currently 51 | * has the provided store. 52 | * @param store The store to locate 53 | * @return List of {@link HostStoreInfo} 54 | */ 55 | def streamsMetadataForStore(store: String): List[HostStoreInfo] = { 56 | // Get metadata for all of the instances of this Kafka Streams application hosting the store 57 | streams.allMetadataForStore(store).asScala.toList.map(streamsMetadataToHostStoreInfo) 58 | } 59 | 60 | /** 61 | * Find the metadata for the instance of this Kafka Streams Application that has the given 62 | * store and would have the given key if it exists. 
63 | * @param store Store to find 64 | * @param key The key to find 65 | * @return {@link HostStoreInfo} 66 | */ 67 | def streamsMetadataForStoreAndKey[K](store: String, key: K, serializer: Serializer[K]): Try[HostStoreInfo] = { 68 | // Get metadata for the instances of this Kafka Streams application hosting the store and 69 | // potentially the value for key 70 | logger.info(s"Finding streams metadata for $store, $key, $serializer") 71 | streams.metadataForKey(store, key, serializer) match { 72 | case null => Failure(new IllegalArgumentException(s"Metadata for key $key not found in $store")) 73 | case metadata => Success(new HostStoreInfo(metadata.host, metadata.port, metadata.stateStoreNames.asScala.toSet)) 74 | } 75 | } 76 | 77 | private[services] val streamsMetadataToHostStoreInfo: StreamsMetadata => HostStoreInfo = metadata => { 78 | HostStoreInfo(metadata.host(), metadata.port(), metadata.stateStoreNames().asScala.toSet) 79 | } 80 | } 81 | -------------------------------------------------------------------------------- /examples/project/Dependencies.scala: -------------------------------------------------------------------------------- 1 | import sbt._ 2 | import Keys._ 3 | import Versions._ 4 | 5 | object Dependencies { 6 | 7 | object Common { 8 | 9 | val ks = "com.lightbend" %% "kafka-streams-scala" % ksVersion exclude("org.slf4j", "slf4j-log4j12") 10 | val kq = "com.lightbend" %% "kafka-streams-query" % kqVersion exclude("org.slf4j", "slf4j-log4j12") 11 | val alpakka = "com.lightbend.akka" %% "akka-stream-alpakka-file" % alpakkaFileVersion 12 | val reactiveKafka = "com.typesafe.akka" %% "akka-stream-kafka" % reactiveKafkaVersion 13 | val akkaSlf4j = "com.typesafe.akka" %% "akka-slf4j" % akkaVersion 14 | val akkaStreams = "com.typesafe.akka" %% "akka-stream" % akkaVersion 15 | val akkaHttp = "com.typesafe.akka" %% "akka-http" % akkaHttpVersion 16 | val akkaHttpCirce = "de.heikoseeberger" %% "akka-http-circe" % akkaHttpCirceVersion 17 | val circeCore = "io.circe" %% "circe-core" % circeVersion 18 | val circeGeneric = "io.circe" %% "circe-generic" % circeVersion 19 | val circeParser = "io.circe" %% "circe-parser" % circeVersion 20 | val logback = "ch.qos.logback" % "logback-classic" % logbackVersion 21 | val scalaLogging = "com.typesafe.scala-logging" %% "scala-logging" % scalaLoggingVersion 22 | } 23 | 24 | object Dsl { 25 | 26 | val bijection = "com.twitter" %% "bijection-avro" % bijectionVersion 27 | val confluentAvro = "io.confluent" % "kafka-avro-serializer" % confluentPlatformVersion exclude("org.slf4j", "slf4j-log4j12") 28 | val kafka = "org.apache.kafka" %% "kafka" % kafkaVersion excludeAll(ExclusionRule("org.slf4j", "slf4j-log4j12"), ExclusionRule("org.apache.zookeeper", "zookeeper")) 29 | } 30 | 31 | object Proc { 32 | val algebird = "com.twitter" %% "algebird-core" % algebirdVersion 33 | val chill = "com.twitter" %% "chill" % chillVersion 34 | } 35 | 36 | object Server { 37 | val scalaLogging = "com.typesafe.scala-logging" %% "scala-logging" % scalaLoggingVersion 38 | val curator = "org.apache.curator" % "curator-test" % curatorVersion 39 | val kafkaStreams = "org.apache.kafka" % "kafka-streams" % kafkaVersion 40 | val kafka = "org.apache.kafka" %% "kafka" % kafkaVersion excludeAll(ExclusionRule("org.slf4j", "slf4j-log4j12"), ExclusionRule("org.apache.zookeeper", "zookeeper")) 41 | } 42 | 43 | val commonDependencies: Seq[ModuleID] = Seq(Common.ks, 44 | Common.kq, 45 | Common.alpakka, 46 | Common.reactiveKafka, 47 | Common.akkaSlf4j, 48 | Common.akkaStreams, 49 | 
Common.akkaHttp, 50 | Common.akkaHttpCirce, 51 | Common.circeCore, 52 | Common.circeGeneric, 53 | Common.circeParser, 54 | Common.logback, 55 | Common.scalaLogging 56 | ) 57 | 58 | val dslDependencies: Seq[ModuleID] = commonDependencies ++ Seq(Dsl.bijection, 59 | Dsl.confluentAvro, 60 | Dsl.kafka 61 | ) 62 | 63 | val procDependencies: Seq[ModuleID] = commonDependencies ++ Seq(Proc.algebird, Proc.chill) 64 | val serverDependencies: Seq[ModuleID] = Seq(Server.scalaLogging, Server.curator, Server.kafkaStreams, Server.kafka) 65 | } 66 | -------------------------------------------------------------------------------- /examples/example-proc/src/main/scala/com/lightbend/kafka/scala/iq/example/processor/BFStore.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (C) 2018 Lightbend Inc. 3 | */ 4 | 5 | package com.lightbend.kafka.scala.iq.example 6 | package processor 7 | 8 | import com.twitter.algebird.{BloomFilterMonoid, BF, Hash128, Approximate} 9 | import org.apache.kafka.common.serialization.Serdes 10 | import org.apache.kafka.streams.processor.{ProcessorContext, StateStore} 11 | import org.apache.kafka.streams.state.StateSerdes 12 | 13 | /** 14 | * Bloom Filter as a StateStore. The only query it supports is membership. 15 | */ 16 | class BFStore[T: Hash128](override val name: String, 17 | val loggingEnabled: Boolean = true, 18 | val numHashes: Int = 6, 19 | val width: Int = 32, 20 | val seed: Int = 1) extends WriteableBFStore[T] with StateStore { 21 | 22 | private val bfMonoid = new BloomFilterMonoid[T](numHashes, width) 23 | 24 | /** 25 | * The "storage backend" of this store. 26 | * 27 | * Needs proper initializing in case the store's changelog is empty. 28 | */ 29 | private[processor] var bf: BF[T] = bfMonoid.zero 30 | 31 | private[processor] var changeLogger: BFStoreChangeLogger[Integer, BF[T]] = _ 32 | 33 | private[processor] val changelogKey = 42 34 | private final val ACCEPTABLE_PROBABILITY = 0.75 35 | 36 | private[processor] def bfFrom(items: Seq[T]): BF[T] = bfMonoid.create(items:_*) 37 | 38 | private[processor] def bfFrom(item: T): BF[T] = bfMonoid.create(item) 39 | 40 | @volatile private var open: Boolean = false 41 | 42 | /** 43 | * Initializes this store, including restoring the store's state from its changelog. 44 | */ 45 | override def init(context: ProcessorContext, root: StateStore): Unit = { 46 | val serdes = new StateSerdes[Integer, BF[T]]( 47 | name, 48 | Serdes.Integer(), 49 | BFSerde[T]) 50 | 51 | changeLogger = new BFStoreChangeLogger[Integer, BF[T]](name, context, serdes) 52 | 53 | // Note: We must manually guard with `loggingEnabled` here because `context.register()` ignores 54 | // that parameter. 55 | if (root != null && loggingEnabled) { 56 | context.register(root, loggingEnabled, (_, value) => { 57 | if (value == null) { 58 | bf = bfMonoid.zero 59 | } 60 | else { 61 | bf = serdes.valueFrom(value) 62 | } 63 | }) 64 | } 65 | 66 | open = true 67 | } 68 | 69 | def +(item: T): Unit = bf = bf + item 70 | 71 | def contains(item: T): Boolean = { 72 | val v = bf.contains(item) 73 | v.isTrue && v.withProb > ACCEPTABLE_PROBABILITY 74 | } 75 | 76 | def maybeContains(item: T): Boolean = bf.maybeContains(item) 77 | def size: Approximate[Long] = bf.size 78 | 79 | 80 | override val persistent: Boolean = false 81 | 82 | override def isOpen: Boolean = open 83 | 84 | /** 85 | * Periodically saves the latest BF state to Kafka. 
86 | * 87 | * =Implementation detail= 88 | * 89 | * The changelog records have the form: (hardcodedKey, BF). That is, we are backing up the 90 | * underlying CMS data structure in its entirety to Kafka. 91 | */ 92 | override def flush(): Unit = { 93 | // if (loggingEnabled) { 94 | // changeLogger.logChange(changelogKey, bf) 95 | // } 96 | } 97 | 98 | override def close(): Unit = { 99 | open = false 100 | } 101 | 102 | override def read(value: T): Boolean = contains(value) 103 | 104 | override def write(value: T): Unit = this + value 105 | 106 | } 107 | 108 | abstract class ReadableBFStore[T: Hash128] { 109 | def read(value: T): Boolean 110 | } 111 | 112 | abstract class WriteableBFStore[T: Hash128] extends ReadableBFStore[T] { 113 | def write(value: T): Unit 114 | } 115 | -------------------------------------------------------------------------------- /examples/kafka-local-server/src/main/scala/com/lightbend/kafka/scala/server/MessageListener.scala: -------------------------------------------------------------------------------- 1 | package com.lightbend.kafka.scala.server 2 | 3 | import org.apache.kafka.clients.consumer.{ ConsumerConfig, KafkaConsumer } 4 | import org.apache.kafka.streams.KeyValue 5 | import scala.collection.JavaConverters._ 6 | import scala.collection.mutable.ListBuffer 7 | 8 | 9 | object MessageListener { 10 | private val AUTO_COMMIT_INTERVAL_MS_CONFIG = "1000" // Frequency of offset commits 11 | private val SESSION_TIMEOUT_MS_CONFIG = "30000" // The timeout used to detect failures - should be greater then processing time 12 | private val MAX_POLL_RECORDS_CONFIG = "50" // Max number of records consumed in a single poll 13 | 14 | def consumerProperties(brokers: String, group: String, keyDeserializer: String, valueDeserializer: String): Map[String, AnyRef] = { 15 | Map[String, AnyRef]( 16 | ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG -> brokers, 17 | ConsumerConfig.GROUP_ID_CONFIG -> group, 18 | ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG -> "true", 19 | ConsumerConfig.AUTO_COMMIT_INTERVAL_MS_CONFIG -> AUTO_COMMIT_INTERVAL_MS_CONFIG, 20 | ConsumerConfig.SESSION_TIMEOUT_MS_CONFIG -> SESSION_TIMEOUT_MS_CONFIG, 21 | ConsumerConfig.MAX_POLL_RECORDS_CONFIG -> MAX_POLL_RECORDS_CONFIG, 22 | ConsumerConfig.AUTO_OFFSET_RESET_CONFIG -> "latest", 23 | ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG -> keyDeserializer, 24 | ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG -> valueDeserializer 25 | ) 26 | } 27 | 28 | def apply[K, V](brokers: String, topic: String, group: String, keyDeserializer: String, valueDeserializer: String, 29 | processor: RecordProcessorTrait[K, V]): MessageListener[K, V] = 30 | new MessageListener[K, V](brokers, topic, group, keyDeserializer, valueDeserializer, processor) 31 | } 32 | 33 | class MessageListener[K, V]( 34 | brokers: String, 35 | topic: String, 36 | group: String, 37 | keyDeserializer: String, 38 | valueDeserializer: String, 39 | processor: RecordProcessorTrait[K, V]) { 40 | 41 | import MessageListener._ 42 | 43 | def readKeyValues(maxMessages: Int): List[KeyValue[K, V]] = { 44 | val pollIntervalMs = 100 45 | val maxTotalPollTimeMs = 2000 46 | var totalPollTimeMs = 0 47 | 48 | val consumer = new KafkaConsumer[K, V](consumerProperties(brokers, group, keyDeserializer, valueDeserializer).asJava) 49 | consumer.subscribe(Seq(topic).asJava) 50 | 51 | val consumedValues = ListBuffer.empty[KeyValue[K, V]] 52 | 53 | while (totalPollTimeMs < maxTotalPollTimeMs && continueConsuming(consumedValues.size, maxMessages)) { 54 | totalPollTimeMs = totalPollTimeMs + 
pollIntervalMs 55 | val records = consumer.poll(pollIntervalMs) 56 | records.asScala.foreach { record => 57 | processor.processRecord(record) 58 | consumedValues += new KeyValue(record.key, record.value) 59 | } 60 | } 61 | consumer.close() 62 | consumedValues.toList 63 | } 64 | 65 | def continueConsuming(messagesConsumed: Int, maxMessages: Int): Boolean = { 66 | maxMessages <= 0 || messagesConsumed < maxMessages 67 | } 68 | 69 | def waitUntilMinKeyValueRecordsReceived(expectedNumRecords: Int, waitTime: Long, 70 | startTime: Long = System.currentTimeMillis(), 71 | accumData: ListBuffer[KeyValue[K, V]] = ListBuffer.empty[KeyValue[K, V]]): List[KeyValue[K, V]] = { 72 | 73 | val readData = readKeyValues(-1) 74 | accumData ++= readData 75 | 76 | if (accumData.size >= expectedNumRecords) accumData.toList 77 | else if (System.currentTimeMillis() > startTime + waitTime) { 78 | throw new AssertionError( 79 | s"Expected $expectedNumRecords but received only ${accumData.size} records before timeout $waitTime ms") 80 | } else { 81 | Thread.sleep(Math.min(waitTime, 1000L)) 82 | waitUntilMinKeyValueRecordsReceived(expectedNumRecords, waitTime, startTime, accumData) 83 | } 84 | } 85 | } 86 | -------------------------------------------------------------------------------- /examples/example-proc/src/main/scala/com/lightbend/kafka/scala/iq/example/config/KStreamConfig.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (C) 2018 Lightbend Inc. 3 | */ 4 | 5 | package com.lightbend.kafka.scala.iq.example 6 | package config 7 | 8 | import cats.data._ 9 | import cats.instances.all._ 10 | 11 | import scala.util.Try 12 | import com.typesafe.config.Config 13 | import scala.concurrent.duration._ 14 | import com.lightbend.kafka.scala.server._ 15 | 16 | 17 | /** 18 | * This object wraps the native Java config APIs into a monadic 19 | * interpreter 20 | */ 21 | object KStreamConfig { 22 | 23 | private[KStreamConfig] case class KafkaSettings( 24 | serverSettings: ServerSettings, 25 | topicSettings: TopicSettings 26 | ) 27 | 28 | private[KStreamConfig] case class ServerSettings( 29 | localServer: Boolean, 30 | brokers: String, 31 | stateStoreDir: String 32 | ) 33 | 34 | private[KStreamConfig] case class TopicSettings( 35 | fromTopic: String, 36 | errorTopic: String 37 | ) 38 | 39 | private[KStreamConfig] case class HttpSettings( 40 | interface: String, 41 | port: Int 42 | ) 43 | 44 | private[KStreamConfig] case class DataLoaderSettings( 45 | sourceTopic: String, 46 | directoryToWatch: Option[String], 47 | pollInterval: FiniteDuration 48 | ) 49 | 50 | case class ConfigData(ks: KafkaSettings, hs: HttpSettings, dls: DataLoaderSettings) { 51 | def localServer = ks.serverSettings.localServer 52 | def brokers = ks.serverSettings.brokers 53 | def fromTopic = ks.topicSettings.fromTopic 54 | def errorTopic = ks.topicSettings.errorTopic 55 | def stateStoreDir = ks.serverSettings.stateStoreDir 56 | def httpInterface = hs.interface 57 | def httpPort = hs.port 58 | def sourceTopic = dls.sourceTopic 59 | def directoryToWatch = dls.directoryToWatch 60 | def pollInterval = dls.pollInterval 61 | } 62 | 63 | type ConfigReader[A] = ReaderT[Try, Config, A] 64 | 65 | private def getStringMaybe(config: Config, key: String): Option[String] = try { 66 | val str = config.getString(key) 67 | if (str.trim.isEmpty) None else Some(str) 68 | } catch { 69 | case _: Exception => None 70 | } 71 | 72 | private def fromKafkaConfig: ConfigReader[KafkaSettings] = Kleisli { (config: Config) => 73 | 
Try { 74 | val local = config.getBoolean("kafka.localserver") 75 | val serverSettings = 76 | if (local) { 77 | ServerSettings( 78 | local, 79 | s"localhost:${KafkaLocalServer.DefaultPort}", 80 | config.getString("kafka.statestoredir") 81 | ) 82 | } else { 83 | ServerSettings( 84 | local, 85 | config.getString("kafka.brokers"), 86 | config.getString("kafka.statestoredir") 87 | ) 88 | } 89 | KafkaSettings( 90 | serverSettings, 91 | TopicSettings( 92 | config.getString("kafka.fromtopic"), 93 | config.getString("kafka.errortopic") 94 | ) 95 | ) 96 | } 97 | } 98 | 99 | private def fromHttpConfig: ConfigReader[HttpSettings] = Kleisli { (config: Config) => 100 | Try { 101 | HttpSettings( 102 | config.getString("http.interface"), 103 | config.getInt("http.port") 104 | ) 105 | } 106 | } 107 | 108 | private def fromDataLoaderConfig: ConfigReader[DataLoaderSettings] = Kleisli { (config: Config) => 109 | Try { 110 | DataLoaderSettings( 111 | config.getString("kafka.loader.sourcetopic"), 112 | getStringMaybe(config, "kafka.loader.directorytowatch"), 113 | config.getDuration("kafka.loader.pollinterval") 114 | ) 115 | } 116 | } 117 | 118 | def fromConfig: ConfigReader[ConfigData] = for { 119 | k <- fromKafkaConfig 120 | h <- fromHttpConfig 121 | d <- fromDataLoaderConfig 122 | } yield ConfigData(k, h, d) 123 | } 124 | 125 | -------------------------------------------------------------------------------- /lib/src/main/scala/com/lightbend/kafka/scala/iq/services/LocalStateStoreQuery.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (C) 2018 Lightbend Inc. 3 | */ 4 | 5 | package com.lightbend.kafka.scala.iq 6 | package services 7 | 8 | import org.apache.kafka.streams.KafkaStreams 9 | import org.apache.kafka.streams.state.{QueryableStoreType, QueryableStoreTypes, ReadOnlyKeyValueStore, ReadOnlyWindowStore} 10 | 11 | import scala.collection.JavaConverters._ 12 | import scala.concurrent.{ExecutionContext, Future} 13 | import scala.concurrent.duration._ 14 | import com.typesafe.scalalogging.LazyLogging 15 | import akka.actor.ActorSystem 16 | 17 | /** 18 | * Abstraction that supports query from a local state store. The query supports retry semantics if 19 | * invoked during Kafka Streams' rebalancing act when states may migrate across stores. 20 | */ 21 | class LocalStateStoreQuery[K, V] extends LazyLogging { 22 | 23 | final val MaxRetryCount = 10 24 | final val DelayBetweenRetries = 1.second 25 | 26 | /** 27 | * For all the following query methods, we need to implement a retry semantics when we invoke 28 | * `streams.store()`. This is because if the application is run in a distributed mode (multiple 29 | * instances), this function call can throw `InvalidStateStoreException` if state stores are being 30 | * migrated when the call is made. And migration is done when new instances of the application come up 31 | * or Kafka Streams does a rebalancing. 32 | * 33 | * In such cases we need to retry till the rebalancing is complete or we run out of retry count. 
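 * For example, a point query from an application service looks like this (sketch; the store
 * name and key are illustrative, and an implicit `ExecutionContext` and `ActorSystem` must be
 * in scope):
 * {{{
 *   val count: Future[Long] =
 *     localStateStoreQuery.queryStateStore(streams, "access-count-per-host", "world.std.com")
 * }}}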
34 | */ 35 | private def _retry[T](op: => T )(implicit ec: ExecutionContext, as: ActorSystem): Future[T] = { 36 | retry(op, DelayBetweenRetries, MaxRetryCount)(ec, as.scheduler) 37 | } 38 | 39 | /** 40 | * Query for a key 41 | */ 42 | def queryStateStore(streams: KafkaStreams, store: String, key: K) 43 | (implicit ex: ExecutionContext, as: ActorSystem): Future[V] = { 44 | 45 | val q: QueryableStoreType[ReadOnlyKeyValueStore[K, V]] = QueryableStoreTypes.keyValueStore() 46 | _retry(streams.store(store, q)).map(_.get(key)) 47 | } 48 | 49 | /** 50 | * Query all 51 | */ 52 | def queryStateStoreForAll(streams: KafkaStreams, store: String) 53 | (implicit ex: ExecutionContext, as: ActorSystem): Future[List[(K, V)]] = { 54 | 55 | def fetchNClose(rs: ReadOnlyKeyValueStore[K, V]) = { 56 | val kvi = rs.all 57 | val kvs = kvi.asScala.toList.map(kv => (kv.key, kv.value)) 58 | kvi.close() 59 | kvs 60 | } 61 | 62 | val q: QueryableStoreType[ReadOnlyKeyValueStore[K, V]] = QueryableStoreTypes.keyValueStore() 63 | _retry(streams.store(store, q)).map(fetchNClose) 64 | } 65 | 66 | /** 67 | * Query for a range of keys 68 | */ 69 | def queryStateStoreForRange(streams: KafkaStreams, store: String, fromKey: K, toKey: K) 70 | (implicit ex: ExecutionContext, as: ActorSystem): Future[List[(K, V)]] = { 71 | 72 | def fetchNClose(rs: ReadOnlyKeyValueStore[K, V]) = { 73 | val kvi = rs.range(fromKey, toKey) 74 | val kvs = kvi.asScala.toList.map(kv => (kv.key, kv.value)) 75 | kvi.close() 76 | kvs 77 | } 78 | 79 | val q: QueryableStoreType[ReadOnlyKeyValueStore[K, V]] = QueryableStoreTypes.keyValueStore() 80 | _retry(streams.store(store, q)).map(fetchNClose) 81 | } 82 | 83 | /** 84 | * Query approximate num entries 85 | */ 86 | def queryStateStoreForApproxNumEntries(streams: KafkaStreams, store: String) 87 | (implicit ex: ExecutionContext, as: ActorSystem): Future[Long] = { 88 | 89 | val q: QueryableStoreType[ReadOnlyKeyValueStore[K, V]] = QueryableStoreTypes.keyValueStore() 90 | _retry(streams.store(store, q)).map(_.approximateNumEntries) 91 | } 92 | 93 | /** 94 | * Query for a window 95 | */ 96 | def queryWindowedStateStore(streams: KafkaStreams, store: String, key: K, fromTime: Long, toTime: Long) 97 | (implicit ex: ExecutionContext, as: ActorSystem): Future[List[(Long, V)]] = { 98 | 99 | val q: QueryableStoreType[ReadOnlyWindowStore[K, V]] = QueryableStoreTypes.windowStore() 100 | 101 | _retry(streams.store(store, q)).map( 102 | _.fetch(key, fromTime, toTime) 103 | .asScala 104 | .toList 105 | .map(kv => (Long2long(kv.key), kv.value))) 106 | } 107 | } 108 | -------------------------------------------------------------------------------- /examples/project/Common.scala: -------------------------------------------------------------------------------- 1 | import sbt._ 2 | import Keys._ 3 | 4 | object Common { 5 | 6 | val settings: Seq[Def.Setting[_]] = Seq( 7 | resolvers += "confluent" at "http://packages.confluent.io/maven/", 8 | scalaVersion := Versions.scalaVersion, 9 | scalacOptions ++= Seq( 10 | "-deprecation", // Emit warning and location for usages of deprecated APIs. 11 | "-encoding", "utf-8", // Specify character encoding used by source files. 12 | "-explaintypes", // Explain type errors in more detail. 13 | "-feature", // Emit warning and location for usages of features that should be imported explicitly. 
14 | "-language:existentials", // Existential types (besides wildcard types) can be written and inferred 15 | "-language:experimental.macros", // Allow macro definition (besides implementation and application) 16 | "-language:higherKinds", // Allow higher-kinded types 17 | "-language:implicitConversions", // Allow definition of implicit functions called views 18 | "-language:postfixOps", // Allow postfix operator 19 | "-unchecked", // Enable additional warnings where generated code depends on assumptions. 20 | "-Xcheckinit", // Wrap field accessors to throw an exception on uninitialized access. 21 | "-Xfatal-warnings", // Fail the compilation if there are any warnings. 22 | "-Xfuture", // Turn on future language features. 23 | "-Xlint:adapted-args", // Warn if an argument list is modified to match the receiver. 24 | "-Xlint:by-name-right-associative", // By-name parameter of right associative operator. 25 | "-Xlint:constant", // Evaluation of a constant arithmetic expression results in an error. 26 | "-Xlint:delayedinit-select", // Selecting member of DelayedInit. 27 | "-Xlint:doc-detached", // A Scaladoc comment appears to be detached from its element. 28 | "-Xlint:inaccessible", // Warn about inaccessible types in method signatures. 29 | "-Xlint:infer-any", // Warn when a type argument is inferred to be `Any`. 30 | "-Xlint:missing-interpolator", // A string literal appears to be missing an interpolator id. 31 | "-Xlint:nullary-override", // Warn when non-nullary `def f()' overrides nullary `def f'. 32 | "-Xlint:nullary-unit", // Warn when nullary methods return Unit. 33 | "-Xlint:option-implicit", // Option.apply used implicit view. 34 | "-Xlint:package-object-classes", // Class or object defined in package object. 35 | "-Xlint:poly-implicit-overload", // Parameterized overloaded implicit methods are not visible as view bounds. 36 | "-Xlint:private-shadow", // A private field (or class parameter) shadows a superclass field. 37 | "-Xlint:stars-align", // Pattern sequence wildcard must align with sequence component. 38 | "-Xlint:type-parameter-shadow", // A local type parameter shadows a type already in scope. 39 | "-Xlint:unsound-match", // Pattern match may not be typesafe. 40 | "-Yno-adapted-args", // Do not adapt an argument list (either by inserting () or creating a tuple) to match the receiver. 41 | "-Ypartial-unification", // Enable partial unification in type constructor inference 42 | "-Ywarn-dead-code", // Warn when dead code is identified. 43 | "-Ywarn-extra-implicit", // Warn when more than one implicit parameter section is defined. 44 | "-Ywarn-inaccessible", // Warn about inaccessible types in method signatures. 45 | "-Ywarn-infer-any", // Warn when a type argument is inferred to be `Any`. 46 | "-Ywarn-nullary-override", // Warn when non-nullary `def f()' overrides nullary `def f'. 47 | "-Ywarn-nullary-unit", // Warn when nullary methods return Unit. 48 | "-Ywarn-unused:implicits", // Warn if an implicit parameter is unused. 49 | "-Ywarn-unused:locals", // Warn if a local definition is unused. 50 | "-Ywarn-unused:params", // Warn if a value parameter is unused. 51 | "-Ywarn-unused:patvars", // Warn if a variable bound in a pattern is unused. 52 | "-Ywarn-unused:privates", // Warn if a private member is unused. 53 | "-Ywarn-value-discard" // Warn when non-Unit expression results are unused. 
54 | ) 55 | ) 56 | } 57 | -------------------------------------------------------------------------------- /examples/example-proc/src/main/scala/com/lightbend/kafka/scala/iq/example/WeblogDriver.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (C) 2018 Lightbend Inc. 3 | */ 4 | 5 | package com.lightbend.kafka.scala.iq.example 6 | 7 | import java.util.Properties 8 | import java.util.concurrent.Executors 9 | 10 | import scala.concurrent.ExecutionContext 11 | 12 | import akka.actor.ActorSystem 13 | import akka.stream.ActorMaterializer 14 | 15 | import org.apache.kafka.streams.Topology 16 | import org.apache.kafka.streams.state.HostInfo 17 | import org.apache.kafka.streams.{ StreamsConfig, KafkaStreams } 18 | import org.apache.kafka.common.serialization.Serdes 19 | import org.apache.kafka.clients.consumer.ConsumerConfig; 20 | 21 | import config.KStreamConfig._ 22 | import http.{ WeblogProcHttpService, BFValueFetcher } 23 | import services.AppStateStoreQuery 24 | import processor.{ BFStoreSupplier, BFStoreBuilder, WeblogProcessor } 25 | 26 | import com.lightbend.kafka.scala.iq.services.MetadataService 27 | import com.lightbend.kafka.scala.iq.http.HttpRequester 28 | 29 | object WeblogDriver extends WeblogWorkflow { 30 | 31 | final val LOG_COUNT_STATE_STORE = "log-counts" 32 | 33 | def main(args: Array[String]): Unit = workflow() 34 | 35 | override def startRestProxy(streams: KafkaStreams, hostInfo: HostInfo, 36 | actorSystem: ActorSystem, materializer: ActorMaterializer): WeblogProcHttpService = { 37 | 38 | implicit val system = actorSystem 39 | 40 | lazy val defaultParallelism: Int = { 41 | val rt = Runtime.getRuntime() 42 | rt.availableProcessors() * 4 43 | } 44 | 45 | def defaultExecutionContext(parallelism: Int = defaultParallelism): ExecutionContext = 46 | ExecutionContext.fromExecutor(Executors.newFixedThreadPool(parallelism)) 47 | 48 | val executionContext = defaultExecutionContext() 49 | 50 | // service for fetching metadata information 51 | val metadataService = new MetadataService(streams) 52 | 53 | // service for fetching from local state store 54 | val localStateStoreQuery = new AppStateStoreQuery[String, Long] 55 | 56 | // http service for request handling 57 | val httpRequester = new HttpRequester(system, materializer, executionContext) 58 | 59 | val restService = new WeblogProcHttpService( 60 | hostInfo, 61 | new BFValueFetcher(metadataService, localStateStoreQuery, httpRequester, streams, executionContext, hostInfo), 62 | system, materializer, executionContext 63 | ) 64 | restService.start() 65 | restService 66 | } 67 | 68 | override def createStreams(config: ConfigData): KafkaStreams = { 69 | val changelogConfig = { 70 | val cfg = new java.util.HashMap[String, String] 71 | val segmentSizeBytes = (20 * 1024 * 1024).toString 72 | cfg.put("segment.bytes", segmentSizeBytes) 73 | cfg 74 | } 75 | 76 | // Kafka stream configuration 77 | val streamingConfig = { 78 | val settings = new Properties 79 | settings.put(StreamsConfig.APPLICATION_ID_CONFIG, "kstream-log-count") 80 | settings.put(StreamsConfig.BOOTSTRAP_SERVERS_CONFIG, config.brokers) 81 | settings.put(StreamsConfig.DEFAULT_KEY_SERDE_CLASS_CONFIG, Serdes.String.getClass.getName) 82 | settings.put(StreamsConfig.DEFAULT_VALUE_SERDE_CLASS_CONFIG, Serdes.String.getClass.getName) 83 | 84 | // setting offset reset to earliest so that we can re-run the demo code with the same pre-loaded data 85 | // Note: To re-run the demo, you need to use the offset reset tool: 86 | // 
https://cwiki.apache.org/confluence/display/KAFKA/Kafka+Streams+Application+Reset+Tool 87 | settings.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest") 88 | 89 | // need this for query service 90 | val endpointHostName = translateHostInterface(config.httpInterface) 91 | logger.info(s"Endpoint host name $endpointHostName") 92 | 93 | settings.put(StreamsConfig.APPLICATION_SERVER_CONFIG, s"$endpointHostName:${config.httpPort}") 94 | 95 | // default is /tmp/kafka-streams 96 | settings.put(StreamsConfig.STATE_DIR_CONFIG, config.stateStoreDir) 97 | 98 | // Set the commit interval to 500ms so that any changes are flushed frequently and the summary 99 | // data are updated with low latency. 100 | settings.put(StreamsConfig.COMMIT_INTERVAL_MS_CONFIG, "500"); 101 | 102 | settings 103 | } 104 | 105 | val topology: Topology = new Topology() 106 | topology.addSource("Source", config.fromTopic) 107 | .addProcessor("Process", WeblogProcessorSupplier, "Source") 108 | .addStateStore( 109 | new BFStoreBuilder[String](new BFStoreSupplier[String](LOG_COUNT_STATE_STORE, stringSerde, true, changelogConfig)), 110 | "Process" 111 | ) 112 | 113 | new KafkaStreams(topology, streamingConfig) 114 | } 115 | } 116 | 117 | import org.apache.kafka.streams.processor.ProcessorSupplier 118 | object WeblogProcessorSupplier extends ProcessorSupplier[String, String] { 119 | override def get(): WeblogProcessor = new WeblogProcessor() 120 | } 121 | -------------------------------------------------------------------------------- /lib/README.md: -------------------------------------------------------------------------------- 1 | # HTTP Layer for Interactive Queries in Kafka Streams 2 | 3 | Kafka Streams' stateful streaming creates and uses local state information in the node where the application is running. If the application runs in a distributed mode on multiple nodes, then each node contains the respective state information. Kafka Streams does not publish any unifying API that allows you to query across all the nodes for the state information. However it has a set of infrastructure components that can be used to implement a query service based on your favorite end points. 4 | 5 | Interactive Queries were introduced on version `0.10.1` and the main goal is stated as follows: 6 | 7 | > This feature allows you to treat the stream processing layer as a lightweight embedded database and, more concretely, to directly query the latest state of your stream processing application, without needing to materialize that state to external databases or external storage first. 8 | 9 | However Kafka Streams documentation also makes it clear that the query layer for the global state of your application does not come out of the box. 10 | 11 | > Kafka Streams provides all the required functionality for interactively querying your application’s state out of the box, with but one exception: if you want to expose your application’s full state via interactive queries, then – for reasons we explain further down below – it is your responsibility to add an appropriate RPC layer to your application that allows application instances to communicate over the network. If, however, you only need to let your application instances access their own local state, then you do not need to add such an RPC layer at all. 12 | 13 | The goal of this small library is to offer such a query layer based on [akka-http](https://doc.akka.io/docs/akka-http/current/scala/http/). 
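For context, querying an application instance's *local* state with the raw Kafka Streams API looks roughly like the sketch below (the store name and the key/value types are made up for illustration); the `services` and `http` layers of this library wrap calls like this and add the metadata lookup, HTTP endpoints and retry machinery on top:

```scala
import org.apache.kafka.streams.KafkaStreams
import org.apache.kafka.streams.state.{ QueryableStoreTypes, ReadOnlyKeyValueStore }

// Query this instance's local slice of a key-value state store.
// `streams` is an already-started KafkaStreams instance; `get` returns null
// if the key is not hosted on this instance.
def localAccessCount(streams: KafkaStreams, hostKey: String): java.lang.Long = {
  val store: ReadOnlyKeyValueStore[String, java.lang.Long] =
    streams.store("access-count-per-host", QueryableStoreTypes.keyValueStore[String, java.lang.Long]())
  store.get(hostKey)
}
```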
14 | 15 | ## Quick Start 16 | 17 | `kafka-streams-query` is published and cross-built for Scala `2.11` and `2.12`, so you can just add the following to your build: 18 | 19 | ```scala 20 | val kafka_streams_query_version = "0.1.1" 21 | 22 | libraryDependencies ++= Seq("com.lightbend" %% 23 | "kafka-streams-query" % kafka_streams_query_version) 24 | ``` 25 | 26 | > Note: `kafka-streams-query` supports Kafka Streams `1.0.0`. 27 | 28 | The API docs for `kafka-streams-query` are available [here](https://developer.lightbend.com/docs/api/kafka-streams-query/0.1.1/com/lightbend/kafka/scala/iq) for Scala 2.12 and [here](https://developer.lightbend.com/docs/api/kafka-streams-query_2.11/0.1.1/#package) for Scala 2.11. 29 | 30 | ## The Library 31 | 32 | The library is organized around 3 main packages: 33 | 34 | 1. `http`: The main end point implementations, including the class `InteractiveQueryHttpService`, which provides methods for starting and stopping the HTTP service. The other classes provided are `HttpRequester`, which handles the request, does some validations and forwards the request to `KeyValueFetcher`, which invokes the actual service for fetching the state information. 35 | 2. `services`: This layer interacts with the underlying Kafka Streams APIs to fetch data from the local state. The 2 classes in this layer are (a) `MetadataService`, which uses the Kafka Streams API to fetch the metadata for the state, and (b) `LocalStateStoreQuery`, which does the actual query for the state. 36 | 3. `serializers`: A set of serializers useful for application development that help you serialize your model structures. 37 | 38 | ## Distributed Query 39 | 40 | If the application is run in a distributed mode across multiple physical nodes, local state information is spread across all the nodes. The `http` services that the library offers can handle this and provide a unified view of the global application state. 41 | 42 | Consider the following scenario: 43 | 44 | 1. The application is deployed on 3 nodes with IPs `ip1`, `ip2` and `ip3`. Assuming the application uses this library, the HTTP services run on port `7070` on each of the nodes. 45 | 2. The user queries for some information (say, the value for a specific key) from the HTTP endpoint at `http://ip1:7070`. 46 | 47 | It may so happen that the key she is looking for does not reside on host `ip1`. The query service handles this situation by interacting with the `MetadataService` as follows: 48 | 49 | 1. User queries from host `ip1` 50 | 2. Check `MetadataService` to get information about the `key` that the user is looking for 51 | 3. If the metadata for the key indicates that the data is part of the local state in `ip1`, then we are done. Return the query result 52 | 4. Otherwise, get the host information from the metadata where this state resides 53 | 5. Query the appropriate node by reissuing the HTTP request to get the state information 54 | 55 | ## Handling Rebalancing of Partitions 56 | 57 | It may so happen that when the user issues the query, Kafka Streams is in the middle of a partition rebalancing, during which state may migrate from one store (node) to another. In such a situation Kafka Streams throws `InvalidStateStoreException`. 58 | 59 | Migration is typically done when new instances of the application come up or Kafka Streams does a rebalancing. The library handles such situations through retry semantics. The query API will continue to retry until the rebalancing is complete or the retry count is exhausted.
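Putting the pieces together, an application-side fetcher that implements the five steps above typically composes `MetadataService`, `LocalStateStoreQuery` and `HttpRequester` roughly as in the sketch below. This is a minimal illustration rather than library code: the class name `CountFetcher`, the `String`-key/`Long`-value store layout and the URL scheme are assumptions (the bundled examples implement the same flow in `KeyValueFetcher`/`SummaryInfoFetcher` and `BFValueFetcher`):

```scala
import akka.actor.ActorSystem
import de.heikoseeberger.akkahttpcirce.FailFastCirceSupport._
import org.apache.kafka.common.serialization.Serdes
import org.apache.kafka.streams.KafkaStreams
import org.apache.kafka.streams.state.HostInfo

import scala.concurrent.{ ExecutionContext, Future }
import scala.util.{ Failure, Success }

import com.lightbend.kafka.scala.iq.http.HttpRequester
import com.lightbend.kafka.scala.iq.services.{ HostStoreInfo, LocalStateStoreQuery, MetadataService }

class CountFetcher(
  metadataService: MetadataService,
  localQuery: LocalStateStoreQuery[String, Long],
  httpRequester: HttpRequester,
  streams: KafkaStreams,
  hostInfo: HostInfo)(implicit system: ActorSystem, ec: ExecutionContext) {

  private val stringSerializer = Serdes.String().serializer()

  private def isThisHost(host: HostStoreInfo): Boolean =
    host.host == hostInfo.host && host.port == hostInfo.port

  // Steps 2-5: locate the instance owning the key via the streams metadata, then
  // either query the local state store or re-issue the request to the owning instance.
  def fetchCount(store: String, key: String): Future[Long] =
    metadataService.streamsMetadataForStoreAndKey(store, key, stringSerializer) match {
      case Success(host) if isThisHost(host) => localQuery.queryStateStore(streams, store, key)
      case Success(host)                     => httpRequester.queryFromHost[Long](host, s"/count/$store/$key")
      case Failure(ex)                       => Future.failed(ex)
    }
}
```

Note that the bundled examples additionally translate a `0.0.0.0` bind address to the actual host address (see `translateHostInterface`) before comparing host info, which a production fetcher should do as well.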
60 | -------------------------------------------------------------------------------- /examples/example-dsl/src/main/scala/com/lightbend/kafka/scala/iq/example/config/KStreamConfig.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (C) 2018 Lightbend Inc. 3 | */ 4 | 5 | package com.lightbend.kafka.scala.iq.example 6 | package config 7 | 8 | import cats.data._ 9 | import cats.instances.all._ 10 | 11 | import scala.util.Try 12 | import com.typesafe.config.Config 13 | import scala.concurrent.duration._ 14 | import com.lightbend.kafka.scala.server._ 15 | 16 | 17 | /** 18 | * This object wraps the native Java config APIs into a monadic 19 | * interpreter 20 | */ 21 | object KStreamConfig { 22 | 23 | private[KStreamConfig] case class KafkaSettings( 24 | serverSettings: ServerSettings, 25 | topicSettings: TopicSettings 26 | ) 27 | 28 | private[KStreamConfig] case class ServerSettings( 29 | localServer: Boolean, 30 | brokers: String, 31 | schemaRegistryUrl: Option[String], 32 | stateStoreDir: String 33 | ) 34 | 35 | private[KStreamConfig] case class TopicSettings( 36 | fromTopic: String, 37 | errorTopic: String, 38 | toTopic: String, 39 | avroTopic: String, 40 | summaryAccessTopic: String, 41 | windowedSummaryAccessTopic: String, 42 | summaryPayloadTopic: String, 43 | windowedSummaryPayloadTopic: String 44 | ) 45 | 46 | private[KStreamConfig] case class HttpSettings( 47 | interface: String, 48 | port: Int 49 | ) 50 | 51 | private[KStreamConfig] case class DataLoaderSettings( 52 | sourceTopic: String, 53 | directoryToWatch: Option[String], 54 | pollInterval: FiniteDuration 55 | ) 56 | 57 | case class ConfigData(ks: KafkaSettings, hs: HttpSettings, dls: DataLoaderSettings) { 58 | def localServer = ks.serverSettings.localServer 59 | def brokers = ks.serverSettings.brokers 60 | def schemaRegistryUrl = ks.serverSettings.schemaRegistryUrl 61 | def fromTopic = ks.topicSettings.fromTopic 62 | def toTopic = ks.topicSettings.toTopic 63 | def avroTopic = ks.topicSettings.avroTopic 64 | def summaryAccessTopic = ks.topicSettings.summaryAccessTopic 65 | def windowedSummaryAccessTopic = ks.topicSettings.windowedSummaryAccessTopic 66 | def summaryPayloadTopic = ks.topicSettings.summaryPayloadTopic 67 | def windowedSummaryPayloadTopic = ks.topicSettings.windowedSummaryPayloadTopic 68 | def errorTopic = ks.topicSettings.errorTopic 69 | def stateStoreDir = ks.serverSettings.stateStoreDir 70 | def httpInterface = hs.interface 71 | def httpPort = hs.port 72 | def sourceTopic = dls.sourceTopic 73 | def directoryToWatch = dls.directoryToWatch 74 | def pollInterval = dls.pollInterval 75 | } 76 | 77 | type ConfigReader[A] = ReaderT[Try, Config, A] 78 | 79 | private def getStringMaybe(config: Config, key: String): Option[String] = try { 80 | val str = config.getString(key) 81 | if (str.trim.isEmpty) None else Some(str) 82 | } catch { 83 | case _: Exception => None 84 | } 85 | 86 | private def fromKafkaConfig: ConfigReader[KafkaSettings] = Kleisli { (config: Config) => 87 | Try { 88 | val local = config.getBoolean("kafka.localserver") 89 | val serverSettings = 90 | if (local) { 91 | ServerSettings( 92 | local, 93 | s"localhost:${KafkaLocalServer.DefaultPort}", 94 | getStringMaybe(config, "kafka.schemaregistryurl"), 95 | config.getString("kafka.statestoredir") 96 | ) 97 | } else { 98 | ServerSettings( 99 | local, 100 | config.getString("kafka.brokers"), 101 | getStringMaybe(config, "kafka.schemaregistryurl"), 102 | config.getString("kafka.statestoredir") 103 | ) 
104 | } 105 | 106 | KafkaSettings( 107 | serverSettings, 108 | TopicSettings( 109 | config.getString("kafka.fromtopic"), 110 | config.getString("kafka.errortopic"), 111 | config.getString("kafka.totopic"), 112 | config.getString("kafka.avrotopic"), 113 | config.getString("kafka.summaryaccesstopic"), 114 | config.getString("kafka.windowedsummaryaccesstopic"), 115 | config.getString("kafka.summarypayloadtopic"), 116 | config.getString("kafka.windowedsummarypayloadtopic") 117 | ) 118 | ) 119 | } 120 | } 121 | 122 | private def fromHttpConfig: ConfigReader[HttpSettings] = Kleisli { (config: Config) => 123 | Try { 124 | HttpSettings( 125 | config.getString("http.interface"), 126 | config.getInt("http.port") 127 | ) 128 | } 129 | } 130 | 131 | private def fromDataLoaderConfig: ConfigReader[DataLoaderSettings] = Kleisli { (config: Config) => 132 | Try { 133 | DataLoaderSettings( 134 | config.getString("kafka.loader.sourcetopic"), 135 | getStringMaybe(config, "kafka.loader.directorytowatch"), 136 | config.getDuration("kafka.loader.pollinterval") 137 | ) 138 | } 139 | } 140 | 141 | def fromConfig: ConfigReader[ConfigData] = for { 142 | k <- fromKafkaConfig 143 | h <- fromHttpConfig 144 | d <- fromDataLoaderConfig 145 | } yield ConfigData(k, h, d) 146 | } 147 | 148 | -------------------------------------------------------------------------------- /examples/build.sbt: -------------------------------------------------------------------------------- 1 | import sbtassembly.MergeStrategy 2 | import NativePackagerHelper._ 3 | 4 | name := "QueryExampleProject-root" 5 | 6 | version in ThisBuild := "0.1.1" 7 | 8 | scalaVersion := Versions.scalaVersion 9 | 10 | def appProject(id: String)(base:String = id) = Project(id, base = file(base)) 11 | .enablePlugins(JavaAppPackaging) 12 | 13 | // standalone run of the dsl example application 14 | lazy val dslRun = (project in file("./example-dsl")) 15 | .settings(Common.settings: _*) 16 | .settings(libraryDependencies ++= Dependencies.dslDependencies) 17 | .settings ( 18 | fork in run := true, 19 | mainClass in Compile := Some("com.lightbend.kafka.scala.iq.example.WeblogProcessing"), 20 | scalacOptions := Seq("-Xexperimental", "-unchecked", "-deprecation", "-Ywarn-unused-import"), 21 | javaOptions in run ++= Seq( 22 | "-Dconfig.file=" + (resourceDirectory in Compile).value / "application-dsl.conf", 23 | "-Dlogback.configurationFile=" + (resourceDirectory in Compile).value / "logback-dsl.xml", 24 | "-Dlog4j.configurationFile=" + (resourceDirectory in Compile).value / "log4j.properties"), 25 | (sourceDirectory in AvroConfig) := baseDirectory.value / "src/main/resources/com/lightbend/kafka/scala/iq/example", 26 | (stringType in AvroConfig) := "String", 27 | addCommandAlias("dsl", "dslRun/run") 28 | ) 29 | .dependsOn(server) 30 | 31 | // packaged run of the dsl example application 32 | lazy val dslPackage = appProject("dslPackage")("build/dsl") 33 | .settings( 34 | scalaVersion := Versions.scalaVersion, 35 | resourceDirectory in Compile := (resourceDirectory in (dslRun, Compile)).value, 36 | mappings in Universal ++= { 37 | Seq(((resourceDirectory in Compile).value / "application-dsl.conf") -> "conf/application.conf") ++ 38 | Seq(((resourceDirectory in Compile).value / "logback-dsl.xml") -> "conf/logback.xml") ++ 39 | Seq(((resourceDirectory in Compile).value / "log4j.properties") -> "conf/log4j.properties") 40 | }, 41 | assemblyMergeStrategy in assembly := { 42 | case PathList("application-dsl.conf") => MergeStrategy.discard 43 | case PathList("logback-dsl.xml") => 
MergeStrategy.discard 44 | case PathList("META-INF", "MANIFEST.MF") => MergeStrategy.discard 45 | case PathList("META-INF", xs @ _*) => MergeStrategy.last 46 | case PathList("META-INF", "io.netty.versions.properties") => MergeStrategy.last 47 | case x => 48 | val oldStrategy = (assemblyMergeStrategy in assembly).value 49 | oldStrategy(x) 50 | }, 51 | scriptClasspath := Seq("../conf/") ++ scriptClasspath.value, 52 | mainClass in Compile := Some("com.lightbend.kafka.scala.iq.example.WeblogProcessing") 53 | ) 54 | .dependsOn(server, dslRun) 55 | 56 | // standalone run of the proc example application 57 | lazy val procRun = (project in file("./example-proc")) 58 | .settings(Common.settings: _*) 59 | .settings(libraryDependencies ++= Dependencies.procDependencies) 60 | .settings ( 61 | fork in run := true, 62 | mainClass in Compile := Some("com.lightbend.kafka.scala.iq.example.WeblogDriver"), 63 | scalacOptions := Seq("-Xexperimental", "-unchecked", "-deprecation", "-Ywarn-unused-import"), 64 | javaOptions in run ++= Seq( 65 | "-Dconfig.file=" + (resourceDirectory in Compile).value / "application-proc.conf", 66 | "-Dlogback.configurationFile=" + (resourceDirectory in Compile).value / "logback-proc.xml", 67 | "-Dlog4j.configurationFile=" + (resourceDirectory in Compile).value / "log4j.properties"), 68 | addCommandAlias("proc", "procRun/run") 69 | ) 70 | .dependsOn(server) 71 | 72 | // packaged run of the proc example application 73 | lazy val procPackage = appProject("procPackage")("build/proc") 74 | .settings( 75 | scalaVersion := Versions.scalaVersion, 76 | resourceDirectory in Compile := (resourceDirectory in (procRun, Compile)).value, 77 | mappings in Universal ++= { 78 | Seq(((resourceDirectory in Compile).value / "application-proc.conf") -> "conf/application.conf") ++ 79 | Seq(((resourceDirectory in Compile).value / "logback-proc.xml") -> "conf/logback.xml") ++ 80 | Seq(((resourceDirectory in Compile).value / "log4j.properties") -> "conf/log4j.properties") 81 | }, 82 | assemblyMergeStrategy in assembly := { 83 | case PathList("application-proc.conf") => MergeStrategy.discard 84 | case PathList("logback-proc.xml") => MergeStrategy.discard 85 | case PathList("META-INF", "MANIFEST.MF") => MergeStrategy.discard 86 | case PathList("META-INF", xs @ _*) => MergeStrategy.last 87 | case PathList("META-INF", "io.netty.versions.properties") => MergeStrategy.last 88 | case x => 89 | val oldStrategy = (assemblyMergeStrategy in assembly).value 90 | oldStrategy(x) 91 | }, 92 | scriptClasspath := Seq("../conf/") ++ scriptClasspath.value, 93 | mainClass in Compile := Some("com.lightbend.kafka.scala.iq.example.WeblogDriver") 94 | ) 95 | .dependsOn(server, procRun) 96 | 97 | lazy val server = (project in file("./kafka-local-server")). 98 | settings(Common.settings: _*). 99 | settings(libraryDependencies ++= Dependencies.serverDependencies) 100 | 101 | lazy val root = (project in file(".")). 102 | aggregate(dslRun, dslPackage, procRun, procPackage, server) 103 | -------------------------------------------------------------------------------- /examples/example-proc/src/main/scala/com/lightbend/kafka/scala/iq/example/WeblogWorkflow.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (C) 2018 Lightbend Inc. 
3 | */ 4 | 5 | package com.lightbend.kafka.scala.iq.example 6 | 7 | import com.typesafe.config.ConfigFactory 8 | 9 | import java.util.concurrent.TimeUnit 10 | 11 | import akka.actor.ActorSystem 12 | import akka.stream.ActorMaterializer 13 | 14 | import org.apache.kafka.streams.KafkaStreams 15 | import org.apache.kafka.streams.state.HostInfo 16 | 17 | import scala.util.{ Success, Failure } 18 | import scala.concurrent.duration._ 19 | import sys.process._ 20 | import com.typesafe.scalalogging.LazyLogging 21 | 22 | import config.KStreamConfig._ 23 | import serializers._ 24 | 25 | import com.lightbend.kafka.scala.iq.http.InteractiveQueryHttpService 26 | 27 | import ingestion.DataIngestion 28 | import com.lightbend.kafka.scala.server._ 29 | 30 | trait WeblogWorkflow extends LazyLogging with AppSerializers { 31 | 32 | def workflow(): Unit = { 33 | 34 | // get config info 35 | val config: ConfigData = fromConfig(ConfigFactory.load()) match { 36 | case Success(c) => c 37 | case Failure(ex) => throw ex 38 | } 39 | 40 | logger.info(s"config = $config") 41 | val maybeServer = startLocalServerIfSetInConfig(config) 42 | 43 | // setup REST endpoints 44 | val restEndpointPort = config.httpPort 45 | val restEndpointHostName = config.httpInterface 46 | val restEndpoint = new HostInfo(restEndpointHostName, restEndpointPort) 47 | 48 | logger.info("Connecting to Kafka cluster via bootstrap servers " + config.brokers) 49 | logger.warn("REST endpoint at http://" + restEndpointHostName + ":" + restEndpointPort) 50 | println("Connecting to Kafka cluster via bootstrap servers " + config.brokers) 51 | println("REST endpoint at http://" + restEndpointHostName + ":" + restEndpointPort) 52 | 53 | implicit val system = ActorSystem() 54 | implicit val materializer = ActorMaterializer() 55 | 56 | import system.dispatcher 57 | 58 | // register for data ingestion 59 | // whenever we find new / changed files in the configured location, we run data loading 60 | // However `directoryToWatch` may not be set if we are trying to run the application in 61 | // distributed mode with multiple instances. In that case only one instance will do the ingestion 62 | // and for subsequent instances of the application, we don't need to do the ingestion. 63 | // Ingestion can be done only from one instance 64 | config.directoryToWatch.foreach { d => 65 | DataIngestion.registerForIngestion(config) 66 | 67 | // schedule a run by touching the data folder 68 | system.scheduler.scheduleOnce(1 minute) { 69 | Seq("/bin/sh", "-c", s"touch $d/*").! 70 | () 71 | } 72 | } 73 | 74 | // set up the topology 75 | val streams: KafkaStreams = createStreams(config) 76 | 77 | // Need to be done for running the application after resetting the state store 78 | // should not be done in production 79 | streams.cleanUp() 80 | 81 | // Start the Restful proxy for servicing remote access to state stores 82 | val restService = startRestProxy(streams, restEndpoint, system, materializer) 83 | 84 | // need to exit for any stream exception 85 | // mesos will restart the application 86 | streams.setUncaughtExceptionHandler(new Thread.UncaughtExceptionHandler() { 87 | override def uncaughtException(t: Thread, e: Throwable): Unit = try { 88 | logger.error(s"Stream terminated because of uncaught exception .. 
Shutting down app", e) 89 | restService.stop() 90 | logger.error(s"Stopping streams service ..") 91 | val closed = streams.close(1, TimeUnit.MINUTES) 92 | logger.error(s"Exiting application after streams close ($closed)") 93 | } catch { 94 | case x: Exception => x.printStackTrace 95 | } finally { 96 | logger.error("Exiting application ..") 97 | logger.error(s"Stopping kafka server ..") 98 | maybeServer.foreach(_.stop()) 99 | System.exit(-1) 100 | } 101 | }) 102 | 103 | // Now that we have finished the definition of the processing topology we can actually run 104 | // it via `start()`. The Streams application as a whole can be launched just like any 105 | // normal Java application that has a `main()` method. 106 | streams.start() 107 | 108 | // Add shutdown hook to respond to SIGTERM and gracefully close Kafka Streams 109 | Runtime.getRuntime().addShutdownHook(new Thread(() => try { 110 | restService.stop() 111 | val closed = streams.close(1, TimeUnit.MINUTES) 112 | logger.error(s"Exiting application after streams close ($closed)") 113 | maybeServer.foreach(_.stop()) 114 | } catch { 115 | case _: Exception => // ignored 116 | })) 117 | } 118 | 119 | private def createTopics(config: ConfigData, server: KafkaLocalServer) = { 120 | import config._ 121 | List(fromTopic, errorTopic).foreach(server.createTopic(_)) 122 | } 123 | 124 | private def startLocalServerIfSetInConfig(config: ConfigData): Option[KafkaLocalServer] = if (config.localServer) { 125 | val s = KafkaLocalServer(true, Some(config.stateStoreDir)) 126 | s.start() 127 | createTopics(config, s) 128 | Some(s) 129 | } else None 130 | 131 | def createStreams(config: ConfigData): KafkaStreams 132 | def startRestProxy(streams: KafkaStreams, hostInfo: HostInfo, 133 | actorSystem: ActorSystem, materializer: ActorMaterializer): InteractiveQueryHttpService 134 | } 135 | -------------------------------------------------------------------------------- /examples/example-dsl/src/main/scala/com/lightbend/kafka/scala/iq/example/WeblogWorkflow.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (C) 2018 Lightbend Inc. 
3 | */ 4 | 5 | package com.lightbend.kafka.scala.iq.example 6 | 7 | import com.typesafe.config.ConfigFactory 8 | 9 | import java.util.concurrent.TimeUnit 10 | 11 | import akka.actor.ActorSystem 12 | import akka.stream.ActorMaterializer 13 | 14 | import org.apache.kafka.streams.KafkaStreams 15 | import org.apache.kafka.streams.state.HostInfo 16 | 17 | import scala.util.{ Success, Failure } 18 | import scala.concurrent.duration._ 19 | import sys.process._ 20 | import com.typesafe.scalalogging.LazyLogging 21 | 22 | import config.KStreamConfig._ 23 | import serializers._ 24 | 25 | import com.lightbend.kafka.scala.iq.http.InteractiveQueryHttpService 26 | import com.lightbend.kafka.scala.server._ 27 | 28 | import ingestion.DataIngestion 29 | 30 | trait WeblogWorkflow extends LazyLogging with AppSerializers { 31 | 32 | def workflow(): Unit = { 33 | 34 | // get config info 35 | val config: ConfigData = fromConfig(ConfigFactory.load()) match { 36 | case Success(c) => c 37 | case Failure(ex) => throw ex 38 | } 39 | 40 | logger.info(s"config = $config") 41 | config.schemaRegistryUrl.foreach { url => 42 | logger.info(s"Schema Registry will be used - please ensure schema registry service is up and running at $url") 43 | } 44 | 45 | val maybeServer = startLocalServerIfSetInConfig(config) 46 | 47 | // setup REST endpoints 48 | val restEndpointPort = config.httpPort 49 | val restEndpointHostName = config.httpInterface 50 | val restEndpoint = new HostInfo(restEndpointHostName, restEndpointPort) 51 | 52 | logger.info("Connecting to Kafka cluster via bootstrap servers " + config.brokers) 53 | logger.warn("REST endpoint at http://" + restEndpointHostName + ":" + restEndpointPort) 54 | println("REST endpoint at http://" + restEndpointHostName + ":" + restEndpointPort) 55 | 56 | implicit val system = ActorSystem() 57 | implicit val materializer = ActorMaterializer() 58 | 59 | import system.dispatcher 60 | 61 | // register for data ingestion 62 | // whenever we find new / changed files in the configured location, we run data loading 63 | // However `directoryToWatch` may not be set if we are trying to run the application in 64 | // distributed mode with multiple instances. In that case only one instance will do the ingestion 65 | // and for subsequent instances of the application, we don't need to do the ingestion. 66 | // Ingestion can be done only from one instance 67 | config.directoryToWatch.foreach { d => 68 | DataIngestion.registerForIngestion(config) 69 | 70 | // schedule a run by touching the data folder 71 | system.scheduler.scheduleOnce(1 minute) { 72 | Seq("/bin/sh", "-c", s"touch $d/*").! 73 | () 74 | } 75 | } 76 | 77 | // set up the topology 78 | val streams: KafkaStreams = createStreams(config) 79 | 80 | // Need to be done for running the application after resetting the state store 81 | // should not be done in production 82 | streams.cleanUp() 83 | 84 | // Start the Restful proxy for servicing remote access to state stores 85 | val restService = startRestProxy(streams, restEndpoint, system, materializer) 86 | 87 | // need to exit for any stream exception 88 | // mesos will restart the application 89 | streams.setUncaughtExceptionHandler(new Thread.UncaughtExceptionHandler() { 90 | override def uncaughtException(t: Thread, e: Throwable): Unit = try { 91 | logger.error(s"Stream terminated because of uncaught exception .. 
Shutting down app", e) 92 | restService.stop() 93 | logger.error(s"Stopping streams service ..") 94 | val closed = streams.close(1, TimeUnit.MINUTES) 95 | logger.error(s"Exiting application after streams close ($closed)") 96 | } catch { 97 | case x: Exception => x.printStackTrace 98 | } finally { 99 | logger.error("Exiting application ..") 100 | logger.error(s"Stopping kafka server ..") 101 | maybeServer.foreach(_.stop()) 102 | System.exit(-1) 103 | } 104 | }) 105 | 106 | // Now that we have finished the definition of the processing topology we can actually run 107 | // it via `start()`. The Streams application as a whole can be launched just like any 108 | // normal Java application that has a `main()` method. 109 | streams.start() 110 | 111 | // Add shutdown hook to respond to SIGTERM and gracefully close Kafka Streams 112 | Runtime.getRuntime().addShutdownHook(new Thread(() => try { 113 | restService.stop() 114 | val closed = streams.close(1, TimeUnit.MINUTES) 115 | logger.error(s"Exiting application after streams close ($closed)") 116 | maybeServer.foreach(_.stop()) 117 | } catch { 118 | case _: Exception => // ignored 119 | })) 120 | } 121 | 122 | private def createTopics(config: ConfigData, server: KafkaLocalServer) = { 123 | import config._ 124 | List(fromTopic, 125 | errorTopic, 126 | toTopic, 127 | avroTopic, 128 | summaryAccessTopic, 129 | windowedSummaryAccessTopic, 130 | summaryPayloadTopic, 131 | windowedSummaryPayloadTopic).foreach(server.createTopic(_)) 132 | } 133 | 134 | private def startLocalServerIfSetInConfig(config: ConfigData): Option[KafkaLocalServer] = if (config.localServer) { 135 | val s = KafkaLocalServer(true, Some(config.stateStoreDir)) 136 | s.start() 137 | createTopics(config, s) 138 | Some(s) 139 | } else None 140 | 141 | def createStreams(config: ConfigData): KafkaStreams 142 | def startRestProxy(streams: KafkaStreams, hostInfo: HostInfo, 143 | actorSystem: ActorSystem, materializer: ActorMaterializer): InteractiveQueryHttpService 144 | } 145 | -------------------------------------------------------------------------------- /lib/src/main/scala/com/lightbend/kafka/scala/iq/http/KeyValueFetcher.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (C) 2018 Lightbend Inc. 3 | */ 4 | 5 | package com.lightbend.kafka.scala.iq 6 | package http 7 | 8 | import akka.actor.ActorSystem 9 | 10 | import org.apache.kafka.streams.{ KafkaStreams } 11 | import org.apache.kafka.streams.state.HostInfo 12 | import org.apache.kafka.common.serialization.Serializer 13 | 14 | import scala.concurrent.{ Future, ExecutionContext} 15 | import scala.util.{ Success, Failure } 16 | 17 | import com.typesafe.scalalogging.LazyLogging 18 | import services.{ MetadataService, HostStoreInfo, LocalStateStoreQuery } 19 | import de.heikoseeberger.akkahttpcirce.FailFastCirceSupport 20 | import akka.http.scaladsl.model.ResponseEntity 21 | import akka.http.scaladsl.unmarshalling.Unmarshaller 22 | import serializers.Serializers 23 | import io.circe.Decoder 24 | 25 | /** 26 | * Abstraction for fetching information from a key/value state store based on the 27 | * key and the store name passed in the API. 28 | * 29 | * Supports basic fetch as well as fetch over a time window. 30 | * 31 | * The fetch APIs support retry semantics in case the key is not available in the local state store. It 32 | * then fetches the store information from the MetadataService and then requeries that store 33 | * to get the information. 
34 | */ 35 | class KeyValueFetcher[K: Decoder, V: Decoder]( 36 | metadataService: MetadataService, 37 | localStateStoreQuery: LocalStateStoreQuery[K, V], 38 | httpRequester: HttpRequester, 39 | streams: KafkaStreams, 40 | executionContext: ExecutionContext, 41 | hostInfo: HostInfo)(implicit actorSystem: ActorSystem, keySerializer: Serializer[K], u: Unmarshaller[ResponseEntity, V]) 42 | 43 | extends LazyLogging 44 | with FailFastCirceSupport with Serializers { 45 | 46 | private implicit val ec: ExecutionContext = executionContext 47 | 48 | /** 49 | * Query for a key 50 | */ 51 | def fetch(key: K, store: String, path: String): Future[V] = { 52 | 53 | metadataService.streamsMetadataForStoreAndKey(store, key, keySerializer) match { 54 | case Success(host) => { 55 | // key is on another instance. call the other instance to fetch the data. 56 | if (!thisHost(host)) { 57 | logger.warn(s"Key $key is on another instance not on $host - requerying ..") 58 | httpRequester.queryFromHost[V](host, path) 59 | } else { 60 | // key is on this instance 61 | localStateStoreQuery.queryStateStore(streams, store, key) 62 | } 63 | } 64 | case Failure(ex) => Future.failed(ex) 65 | } 66 | } 67 | 68 | /** 69 | * Query all: Warning - this may be large depending on the data set 70 | */ 71 | def fetchAll(store: String, path: String): Future[List[(K, V)]] = { 72 | 73 | def fetchAllKVs(host: HostStoreInfo): Future[List[(K, V)]] = { 74 | if (!thisHost(host)) { 75 | 76 | // host is remote - need to requery 77 | httpRequester.queryFromHost[List[(K, V)]](host, path) 78 | } else { 79 | 80 | // fetch all kvs for this local store 81 | localStateStoreQuery.queryStateStoreForAll(streams, store) 82 | } 83 | } 84 | 85 | fetchKVs(store, fetchAllKVs) 86 | } 87 | 88 | /** 89 | * Query for a range of keys 90 | */ 91 | def fetchRange(fromKey: K, toKey: K, store: String, path: String): Future[List[(K, V)]] = { 92 | 93 | def fetchKVsInRange(host: HostStoreInfo): Future[List[(K, V)]] = { 94 | if (!thisHost(host)) { 95 | 96 | // host is remote - need to requery 97 | httpRequester.queryFromHost[List[(K, V)]](host, path) 98 | } else { 99 | 100 | // fetch all kvs in range for this local store 101 | localStateStoreQuery.queryStateStoreForRange(streams, store, fromKey, toKey) 102 | } 103 | } 104 | 105 | fetchKVs(store, fetchKVsInRange) 106 | } 107 | 108 | private def fetchKVs(store: String, fn: HostStoreInfo => Future[List[(K, V)]]): Future[List[(K, V)]] = 109 | metadataService.streamsMetadataForStore(store) match { 110 | 111 | // metadata could not be found for this store 112 | case Nil => Future.failed(new Exception(s"No metadata found for $store")) 113 | 114 | // all hosts that have this store with the same application id 115 | case hosts => Future.traverse(hosts)(fn).map(_.flatten) 116 | } 117 | 118 | /** 119 | * Query all hosts to find the sum of approximate number of entries 120 | */ 121 | def fetchApproxNumEntries(store: String, path: String): Future[Long] = { 122 | 123 | def fetchApproxNumEntries(host: HostStoreInfo): Future[Long] = { 124 | if (!thisHost(host)) { 125 | 126 | // host is remote - need to requery 127 | httpRequester.queryFromHost[Long](host, path) 128 | } else { 129 | 130 | // fetch approx num entries for this local store 131 | localStateStoreQuery.queryStateStoreForApproxNumEntries(streams, store) 132 | } 133 | } 134 | 135 | metadataService.streamsMetadataForStore(store) match { 136 | 137 | // metadata could not be found for this store 138 | case Nil => Future.failed(new Exception(s"No metadata found for $store")) 139 | 
140 | // all hosts that have this store with the same application id 141 | case hosts => Future.traverse(hosts)(fetchApproxNumEntries).map(_.sum) 142 | } 143 | } 144 | 145 | /** 146 | * Query for a window 147 | */ 148 | def fetchWindowed(key: K, store: String, path: String, 149 | fromTime: Long, toTime: Long): Future[List[(Long, V)]] = 150 | 151 | metadataService.streamsMetadataForStoreAndKey(store, key, keySerializer) match { 152 | case Success(host) => { 153 | // key is on another instance. call the other instance to fetch the data. 154 | if (!thisHost(host)) { 155 | logger.warn(s"Key $key is on another instance not on $host - requerying ..") 156 | httpRequester.queryFromHost[List[(Long, V)]](host, path) 157 | } else { 158 | // key is on this instance 159 | localStateStoreQuery.queryWindowedStateStore(streams, store, key, fromTime, toTime) 160 | } 161 | } 162 | case Failure(ex) => Future.failed(ex) 163 | } 164 | 165 | private def thisHost(host: HostStoreInfo): Boolean = 166 | host.host.equals(translateHostInterface(hostInfo.host)) && host.port == hostInfo.port 167 | } 168 | 169 | -------------------------------------------------------------------------------- /examples/kafka-local-server/src/main/scala/com/lightbend/kafka/scala/server/KafkaLocalServer.scala: -------------------------------------------------------------------------------- 1 | package com.lightbend.kafka.scala.server 2 | 3 | // Loosely based on Lagom implementation at 4 | // https://github.com/lagom/lagom/blob/master/dev/kafka-server/src/main/scala/com/lightbend/lagom/internal/kafka/KafkaLocalServer.scala 5 | 6 | import java.io.{ IOException, File } 7 | import java.nio.file.{ FileVisitOption, Files, Paths } 8 | import java.util.Properties 9 | 10 | import org.apache.curator.test.TestingServer 11 | import com.typesafe.scalalogging.LazyLogging 12 | 13 | import kafka.server.{KafkaConfig, KafkaServerStartable} 14 | 15 | import scala.collection.JavaConverters._ 16 | import scala.util.{ Try, Success, Failure } 17 | import java.util.Comparator 18 | 19 | import kafka.admin.{AdminUtils, RackAwareMode} 20 | import kafka.utils.ZkUtils 21 | 22 | class KafkaLocalServer private (kafkaProperties: Properties, zooKeeperServer: ZooKeeperLocalServer) 23 | extends LazyLogging { 24 | 25 | import KafkaLocalServer._ 26 | 27 | private var broker = null.asInstanceOf[KafkaServerStartable] 28 | private val zkUtils : ZkUtils = 29 | ZkUtils.apply(s"localhost:${zooKeeperServer.getPort()}", DEFAULT_ZK_SESSION_TIMEOUT_MS, DEFAULT_ZK_CONNECTION_TIMEOUT_MS, false) 30 | 31 | def start(): Unit = { 32 | 33 | broker = KafkaServerStartable.fromProps(kafkaProperties) 34 | broker.startup() 35 | } 36 | 37 | def stop(): Unit = { 38 | if (broker != null) { 39 | broker.shutdown() 40 | zooKeeperServer.stop() 41 | broker = null.asInstanceOf[KafkaServerStartable] 42 | } 43 | } 44 | 45 | /** 46 | * Create a Kafka topic with 1 partition and a replication factor of 1. 47 | * 48 | * @param topic The name of the topic. 49 | */ 50 | def createTopic(topic: String): Unit = { 51 | createTopic(topic, 1, 1, new Properties) 52 | } 53 | 54 | /** 55 | * Create a Kafka topic with the given parameters. 56 | * 57 | * @param topic The name of the topic. 58 | * @param partitions The number of partitions for this topic. 59 | * @param replication The replication factor for (the partitions of) this topic. 
60 | */ 61 | def createTopic(topic: String, partitions: Int, replication: Int): Unit = { 62 | createTopic(topic, partitions, replication, new Properties) 63 | } 64 | 65 | /** 66 | * Create a Kafka topic with the given parameters. 67 | * 68 | * @param topic The name of the topic. 69 | * @param partitions The number of partitions for this topic. 70 | * @param replication The replication factor for (partitions of) this topic. 71 | * @param topicConfig Additional topic-level configuration settings. 72 | */ 73 | def createTopic(topic: String, partitions: Int, replication: Int, topicConfig: Properties): Unit = { 74 | AdminUtils.createTopic(zkUtils, topic, partitions, replication, topicConfig, RackAwareMode.Enforced) 75 | } 76 | 77 | def deleteTopic(topic: String) = AdminUtils.deleteTopic(zkUtils, topic) 78 | } 79 | 80 | import Utils._ 81 | 82 | object KafkaLocalServer extends LazyLogging { 83 | final val DefaultPort = 9092 84 | final val DefaultResetOnStart = true 85 | private val DEFAULT_ZK_SESSION_TIMEOUT_MS = 10 * 1000 86 | private val DEFAULT_ZK_CONNECTION_TIMEOUT_MS = 8 * 1000 87 | 88 | final val basDir = "tmp/" 89 | 90 | private final val kafkaDataFolderName = "kafka_data" 91 | 92 | def apply(cleanOnStart: Boolean, localStateDir: Option[String] = None): KafkaLocalServer = 93 | this(DefaultPort, ZooKeeperLocalServer.DefaultPort, cleanOnStart, localStateDir) 94 | 95 | def apply(kafkaPort: Int, zookeeperServerPort: Int, cleanOnStart: Boolean, localStateDir: Option[String]): KafkaLocalServer = { 96 | 97 | // delete kafka data dir on clean start 98 | val kafkaDataDir: File = (for { 99 | kdir <- dataDirectory(basDir, kafkaDataFolderName) 100 | _ <- if (cleanOnStart) deleteDirectory(kdir) else Try(()) 101 | } yield kdir) match { 102 | case Success(d) => d 103 | case Failure(ex) => throw ex 104 | } 105 | 106 | // delete kafka local state dir on clean start 107 | localStateDir.foreach { d => 108 | for { 109 | kdir <- dataDirectory("", d) 110 | _ <- if (cleanOnStart) deleteDirectory(kdir) else Try(()) 111 | } yield (()) 112 | } 113 | 114 | logger.info(s"Kafka data directory is $kafkaDataDir.") 115 | 116 | val kafkaProperties = createKafkaProperties(kafkaPort, zookeeperServerPort, kafkaDataDir) 117 | 118 | val zk = new ZooKeeperLocalServer(zookeeperServerPort, cleanOnStart) 119 | zk.start() 120 | new KafkaLocalServer(kafkaProperties, zk) 121 | } 122 | 123 | /** 124 | * Creates a Properties instance for Kafka customized with values passed in argument. 125 | */ 126 | private def createKafkaProperties(kafkaPort: Int, zookeeperServerPort: Int, dataDir: File): Properties = { 127 | 128 | // TODO: Probably should be externalized into properties. 
Was rushing this in 129 | val kafkaProperties = new Properties 130 | kafkaProperties.put(KafkaConfig.ListenersProp, s"PLAINTEXT://localhost:$kafkaPort") 131 | kafkaProperties.put(KafkaConfig.ZkConnectProp, s"localhost:$zookeeperServerPort") 132 | kafkaProperties.put(KafkaConfig.ZkConnectionTimeoutMsProp, "6000") 133 | kafkaProperties.put(KafkaConfig.BrokerIdProp, "0") 134 | kafkaProperties.put(KafkaConfig.NumNetworkThreadsProp, "3") 135 | kafkaProperties.put(KafkaConfig.NumIoThreadsProp, "8") 136 | kafkaProperties.put(KafkaConfig.SocketSendBufferBytesProp, "102400") 137 | kafkaProperties.put(KafkaConfig.SocketReceiveBufferBytesProp, "102400") 138 | kafkaProperties.put(KafkaConfig.SocketRequestMaxBytesProp, "104857600") 139 | kafkaProperties.put(KafkaConfig.NumPartitionsProp, "1") 140 | kafkaProperties.put(KafkaConfig.NumRecoveryThreadsPerDataDirProp, "1") 141 | kafkaProperties.put(KafkaConfig.OffsetsTopicReplicationFactorProp, "1") 142 | kafkaProperties.put(KafkaConfig.TransactionsTopicReplicationFactorProp, "1") 143 | kafkaProperties.put(KafkaConfig.LogRetentionTimeHoursProp, "2") 144 | kafkaProperties.put(KafkaConfig.LogSegmentBytesProp, "1073741824") 145 | kafkaProperties.put(KafkaConfig.LogCleanupIntervalMsProp, "300000") 146 | kafkaProperties.put(KafkaConfig.AutoCreateTopicsEnableProp, "true") 147 | kafkaProperties.put(KafkaConfig.ControlledShutdownEnableProp, "true") 148 | kafkaProperties.put(KafkaConfig.LogDirProp, dataDir.getAbsolutePath) 149 | 150 | kafkaProperties 151 | } 152 | } 153 | 154 | private class ZooKeeperLocalServer(port: Int, cleanOnStart: Boolean) extends LazyLogging { 155 | 156 | import KafkaLocalServer._ 157 | import ZooKeeperLocalServer._ 158 | 159 | private var zooKeeper = null.asInstanceOf[TestingServer] 160 | 161 | def start(): Unit = { 162 | // delete zookeeper data dir on clean start 163 | val zookeeperDataDir: File = (for { 164 | zdir <- dataDirectory(basDir, zookeeperDataFolderName) 165 | _ <- if (cleanOnStart) deleteDirectory(zdir) else Try(()) 166 | } yield zdir) match { 167 | case Success(d) => d 168 | case Failure(ex) => throw ex 169 | } 170 | logger.info(s"Zookeeper data directory is $zookeeperDataDir.") 171 | 172 | zooKeeper = new TestingServer(port, zookeeperDataDir, false) 173 | 174 | zooKeeper.start() // blocking operation 175 | } 176 | 177 | def stop(): Unit = { 178 | if (zooKeeper != null) 179 | try { 180 | zooKeeper.stop() 181 | zooKeeper = null.asInstanceOf[TestingServer] 182 | } 183 | catch { 184 | case _: IOException => () // nothing to do if an exception is thrown while shutting down 185 | } 186 | } 187 | 188 | def getPort(): Int = port 189 | } 190 | 191 | object ZooKeeperLocalServer { 192 | final val DefaultPort = 2181 193 | private final val zookeeperDataFolderName = "zookeeper_data" 194 | } 195 | -------------------------------------------------------------------------------- /examples/example-proc/README.md: -------------------------------------------------------------------------------- 1 | ## Example implementation of an HTTP-based Interactive Query Service 2 | 3 | The current implementation demonstrates the following usages of Kafka Streams along with an HTTP-based interactive query service: 4 | 5 | 1. Data ingestion 6 | 2. Data transformation using a Kafka Streams Processor-based implementation 7 | 3. Implementing a custom state store (based on a bloom filter) 8 | 4. Managing local state with a custom state store 9 | 5.
Interactive query service with HTTP endpoints 10 | 11 | The implementation is based on the [ClarkNet dataset](http://ita.ee.lbl.gov/html/contrib/ClarkNet-HTTP.html), which has to be downloaded to a local folder. 12 | 13 | ## Build and Run Locally 14 | 15 | By default the application runs with an embedded local Kafka server. If you want to run separate Kafka and ZooKeeper servers, change `kafka.localserver` to `false` in `application.conf`. 16 | 17 | To run the application, follow these steps. 18 | 19 | ### Build the Libraries 20 | 21 | This example application depends on [kafka-streams-scala](https://github.com/lightbend/kafka-streams-scala) and [kafka-streams-query](https://github.com/lightbend/kafka-streams-query/tree/develop/lib). Ensure that you have the proper versions of these libraries in your classpath. Note that in this example Scala 2.12.4 and Kafka 1.0.0 are used. 22 | 23 | ### Start ZooKeeper and Kafka 24 | 25 | > This is only required if the setting of `kafka.localserver` is `false` in `application.conf`. If this is set to `true`, the application runs with an embedded local Kafka server. However, note that if you want to run the application in distributed mode (see below for details of running in distributed mode), you need to run separate Kafka and ZooKeeper servers. 26 | 27 | Start ZooKeeper and Kafka, if not already running. You can download Kafka 1.0.0 for Scala 2.12 [here](https://kafka.apache.org/documentation/#quickstart), then follow the [Quick Start](https://kafka.apache.org/documentation/#quickstart) instructions for running ZooKeeper and Kafka, steps 1 and 2. 28 | 29 | ### Download the ClarkNet dataset 30 | 31 | Download the [ClarkNet dataset](http://ita.ee.lbl.gov/html/contrib/ClarkNet-HTTP.html) and put it in a convenient local folder. 32 | 33 | ### Configure the Application Properties 34 | 35 | Copy `src/main/resources/application-proc.conf.template` to `src/main/resources/application-proc.conf`. 36 | 37 | Edit `src/main/resources/application-proc.conf` and set the entry for `directorytowatch` to match the folder name where you installed the ClarkNet dataset. 38 | 39 | Note that you can run the application with a bundled local Kafka server by setting `kafka.localserver` to `true` in the `application.conf` file. 40 | 41 | ### Create the Kafka Topics 42 | 43 | > This is only required if the setting of `kafka.localserver` is `false` in `application.conf`. If this is set to `true`, the application runs with an embedded local Kafka server and creates all necessary topics on its own. However, note that if you want to run the application in distributed mode (see below for details of running in distributed mode), you need to run separate Kafka and ZooKeeper servers. 44 | 45 | Create the topics using the `kafka-topics.sh` command that comes with the Kafka distribution. We'll refer to the directory where you installed Kafka as `$KAFKA_HOME`. Run the following commands: 46 | 47 | ```bash 48 | $KAFKA_HOME/bin/kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 1 --partitions 1 --topic logerr-proc 49 | $KAFKA_HOME/bin/kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 1 --partitions 1 --topic server-log-proc 50 | ``` 51 | 52 | ### Run the Application! 53 | 54 | Now run the application as follows: 55 | 56 | ```bash 57 | $ sbt 58 | > clean 59 | > compile 60 | > proc 61 | ``` 62 | 63 | This will start the application.
Now you can query the global state using `curl`: 64 | 65 | ```bash 66 | $ ## We are querying against a bloom filter based store which checks membership. 67 | $ ## Since world.std.com is a hostkey present in the dataset, we get true here. 68 | $ curl http://localhost:7071/weblog/access/check/world.std.com 69 | true 70 | $ 71 | $ ## We are querying against a bloom filter based store which checks membership. 72 | $ ## Since world.stx.co is not a valid hostkey in the dataset, we get false 73 | $ ## here. 74 | $ curl http://localhost:7071/weblog/access/check/world.stx.co 75 | false 76 | ``` 77 | 78 | ## Run in Distributed Mode 79 | 80 | The HTTP query layer is designed to work even when your application runs in distributed mode. Running your Kafka Streams application in distributed mode means that all the instances must have the same application id. 81 | 82 | > In order to run the application in distributed mode, you need to run external Kafka and ZooKeeper servers. Set `kafka.localserver` to `false` to enable this mode. 83 | 84 | Here are the steps that you need to follow to run the application in distributed mode. We assume here that you are running both instances on the same node with different port numbers. It's fairly easy to scale this out to different nodes. 85 | 86 | ### Step 1: Build and configure for distribution 87 | 88 | ```bash 89 | $ sbt 90 | > procPackage/universal:packageZipTarball 91 | ``` 92 | 93 | This creates a distribution under a folder `/build`. 94 | 95 | ```bash 96 | $ pwd 97 | 98 | $ cd build/proc/target/universal 99 | $ ls 100 | procpackage-0.0.1.tgz 101 | ## unpack the distribution 102 | $ tar xvfz procpackage-0.0.1.tgz 103 | $ cd procpackage-0.0.1 104 | $ ls 105 | bin conf lib 106 | $ cd conf 107 | $ ls 108 | application.conf logback.xml 109 | ## change the above 2 files based on your requirements. 110 | $ cd .. 111 | $ pwd 112 | <...>/procpackage-0.0.1 113 | ``` 114 | 115 | ### Step 2: Run the first instance of the application 116 | Ensure the following: 117 | 118 | 1. ZooKeeper and Kafka are running 119 | 2. All topics mentioned above are created 120 | 3. The folder mentioned in `directoryToWatch` in `application.conf` has the data file 121 | 122 | ```bash 123 | $ pwd 124 | <...>/procpackage-0.0.1 125 | $ bin/procpackage 126 | ``` 127 | 128 | This starts the first instance of the application. After some time you will see the host access information from the data file printed to the console. 129 | 130 | In the log file, created under `<...>/procpackage-0.0.1/logs`, check if the REST service has started and note the host and port details. It should be something like `localhost:7070` (the default setting in `application.conf`). 131 | 132 | ### Step 3: Run the second instance of the application 133 | 134 | If you decide to run multiple instances of the application you may choose to split the dataset into 2 parts and keep them in different folders. You also need to copy the current distribution to some other folder and start the second instance from there, since you need to run it with changed settings in `application.conf`. Say we want to copy it to a folder named `clarknet-2`.
135 | 136 | ```bash 137 | $ cp /build/proc/target/universal/procpackage-0.0.1.tgz clarknet-2 138 | $ cd clarknet-2 139 | $ tar xvfz procpackage-0.0.1.tgz 140 | ## unpack the distribution 141 | $ cd procpackage-0.0.1 142 | $ ls 143 | bin conf lib 144 | $ cd conf 145 | $ ls 146 | application.conf logback.xml 147 | ## change the above 2 files based on your requirements. 148 | $ cd .. 149 | $ pwd 150 | <...>/procpackage-0.0.1 151 | ``` 152 | 153 | The following settings need to be changed in `application.conf` before you can run the second instance: 154 | 155 | 1. `dcos.kafka.statestoredir` - This is the folder where the local state information gets persisted by Kafka Streams. It has to be different for every new instance you set up. 156 | 2. `dcos.kafka.loader.directorytowatch` - The data folder to watch, since we want the 2 instances to ingest different data. 157 | 3. `dcos.http.interface` and `dcos.http.port` - The REST service endpoints. If both instances run on the same node, the interface can be `localhost` for both. 158 | 159 | ```bash 160 | $ pwd 161 | <...>/procpackage-0.0.1 162 | $ bin/procpackage 163 | ``` 164 | 165 | This will start the second instance. Check the log file to verify that the REST endpoints are properly started. 166 | 167 | ### Step 4: Query 168 | 169 | The idea of a distributed interactive query interface is to allow the user to query for *all* keys using *any* of the endpoints where the REST services are running. Assume that the 2 instances are running at `localhost:7070` and `localhost:7071`. 170 | 171 | Here are a few examples: 172 | 173 | ```bash 174 | ## world.std.com was loaded by the first instance of the app 175 | ## Querying using the endpoint of the first instance gives the correct result 176 | $ curl localhost:7070/weblog/access/check/world.std.com 177 | true 178 | 179 | ## we get the correct result even if we query using the endpoint of the second instance 180 | $ curl localhost:7071/weblog/access/check/world.std.com 181 | true 182 | 183 | ## ppp19.glas.apc.org was loaded by the second instance of the app 184 | ## Querying using the endpoint of the first instance also gives the correct result 185 | $ curl localhost:7070/weblog/access/check/ppp19.glas.apc.org 186 | true 187 | ``` 188 | -------------------------------------------------------------------------------- /examples/example-dsl/src/main/scala/com/lightbend/kafka/scala/iq/example/WeblogProcessing.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright (C) 2018 Lightbend Inc.
3 | */ 4 | 5 | package com.lightbend.kafka.scala.iq.example 6 | 7 | import java.io.StringWriter 8 | import java.time.format.DateTimeFormatter 9 | import java.util.Properties 10 | import java.util.concurrent.Executors 11 | 12 | import io.confluent.kafka.serializers.AbstractKafkaAvroSerDeConfig 13 | 14 | import akka.actor.ActorSystem 15 | import akka.stream.ActorMaterializer 16 | 17 | import com.lightbend.kafka.scala.iq.http.{HttpRequester, KeyValueFetcher } 18 | import com.lightbend.kafka.scala.iq.services.{ MetadataService, LocalStateStoreQuery } 19 | 20 | import config.KStreamConfig._ 21 | import http.{ WeblogDSLHttpService, SummaryInfoFetcher } 22 | import models.{LogParseUtil, LogRecord} 23 | 24 | import de.heikoseeberger.akkahttpcirce.FailFastCirceSupport 25 | 26 | import org.apache.kafka.clients.consumer.ConsumerConfig 27 | import org.apache.kafka.common.serialization.Serdes 28 | import org.apache.kafka.streams.kstream._ 29 | import org.apache.kafka.streams.Consumed 30 | import org.apache.kafka.streams.state.HostInfo 31 | import org.apache.kafka.streams.{KafkaStreams, StreamsConfig} 32 | 33 | import com.lightbend.kafka.scala.streams._ 34 | 35 | import scala.concurrent.ExecutionContext 36 | import scala.util.{Failure, Success} 37 | 38 | import serializers.AppSerializers 39 | 40 | object WeblogProcessing extends WeblogWorkflow with AppSerializers with FailFastCirceSupport { 41 | 42 | final val ACCESS_COUNT_PER_HOST_STORE = "access-count-per-host" 43 | final val PAYLOAD_SIZE_PER_HOST_STORE = "payload-size-per-host" 44 | final val WINDOWED_ACCESS_COUNT_PER_HOST_STORE = "windowed-access-count-per-host" 45 | final val WINDOWED_PAYLOAD_SIZE_PER_HOST_STORE = "windowed-payload-size-per-host" 46 | 47 | def main(args: Array[String]): Unit = workflow() 48 | 49 | override def startRestProxy(streams: KafkaStreams, hostInfo: HostInfo, 50 | actorSystem: ActorSystem, materializer: ActorMaterializer): WeblogDSLHttpService = { 51 | 52 | implicit val system = actorSystem 53 | 54 | lazy val defaultParallelism: Int = { 55 | val rt = Runtime.getRuntime 56 | rt.availableProcessors() * 4 57 | } 58 | 59 | def defaultExecutionContext(parallelism: Int = defaultParallelism): ExecutionContext = 60 | ExecutionContext.fromExecutor(Executors.newFixedThreadPool(parallelism)) 61 | 62 | val executionContext = defaultExecutionContext() 63 | 64 | // service for fetching metadata information 65 | val metadataService = new MetadataService(streams) 66 | 67 | // service for fetching from local state store 68 | val localStateStoreQuery = new LocalStateStoreQuery[String, Long] 69 | 70 | // http service for request handling 71 | val httpRequester = new HttpRequester(system, materializer, executionContext) 72 | 73 | implicit val ss = stringSerializer 74 | val restService = new WeblogDSLHttpService( 75 | hostInfo, 76 | new SummaryInfoFetcher( 77 | new KeyValueFetcher(metadataService, localStateStoreQuery, httpRequester, streams, executionContext, hostInfo) 78 | ), 79 | system, materializer, executionContext 80 | ) 81 | restService.start() 82 | restService 83 | } 84 | 85 | override def createStreams(config: ConfigData): KafkaStreams = { 86 | // Kafka stream configuration 87 | val streamingConfig = { 88 | val settings = new Properties 89 | settings.put(StreamsConfig.APPLICATION_ID_CONFIG, "kstream-weblog-processing") 90 | settings.put(StreamsConfig.BOOTSTRAP_SERVERS_CONFIG, config.brokers) 91 | 92 | config.schemaRegistryUrl.foreach{ url => 93 | settings.put(AbstractKafkaAvroSerDeConfig.SCHEMA_REGISTRY_URL_CONFIG, url) 94 | } 95 | 96 
| settings.put(StreamsConfig.DEFAULT_KEY_SERDE_CLASS_CONFIG, Serdes.ByteArray.getClass.getName) 97 | settings.put(StreamsConfig.DEFAULT_VALUE_SERDE_CLASS_CONFIG, Serdes.String.getClass.getName) 98 | 99 | // setting offset reset to earliest so that we can re-run the demo code with the same pre-loaded data 100 | // Note: To re-run the demo, you need to use the offset reset tool: 101 | // https://cwiki.apache.org/confluence/display/KAFKA/Kafka+Streams+Application+Reset+Tool 102 | settings.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest") 103 | 104 | // need this for query service 105 | val endpointHostName = translateHostInterface(config.httpInterface) 106 | logger.info(s"Endpoint host name $endpointHostName") 107 | 108 | settings.put(StreamsConfig.APPLICATION_SERVER_CONFIG, s"$endpointHostName:${config.httpPort}") 109 | 110 | // default is /tmp/kafka-streams 111 | settings.put(StreamsConfig.STATE_DIR_CONFIG, config.stateStoreDir) 112 | 113 | // Set the commit interval to 500ms so that any changes are flushed frequently and the summary 114 | // data are updated with low latency. 115 | settings.put(StreamsConfig.COMMIT_INTERVAL_MS_CONFIG, "500") 116 | 117 | settings 118 | } 119 | 120 | implicit val builder = new StreamsBuilderS 121 | 122 | generateLogRecords(config) 123 | 124 | // 125 | // assumption : the topic contains serialized records of LogRecord (serialized through logRecordSerde) 126 | val logRecords = 127 | builder.stream(List(config.toTopic), Consumed.`with`(byteArraySerde, logRecordSerde)) 128 | 129 | generateAvro(logRecords, config) 130 | hostCountSummary(logRecords, config) 131 | totalPayloadPerHostSummary(logRecords, config) 132 | 133 | new KafkaStreams(builder.build(), streamingConfig) 134 | } 135 | 136 | /** 137 | * Clean and format input data. Redirect records that cause a parsing error to the error topic. 138 | */ 139 | def generateLogRecords(config: ConfigData)(implicit builder: StreamsBuilderS): Unit = { 140 | 141 | // will read network data from `fromTopic` 142 | val logs = builder.stream[Array[Byte], String](config.fromTopic) 143 | 144 | def predicateValid: (Array[Byte], Extracted) => Boolean = { (_, value) => 145 | value match { 146 | case ValidLogRecord(_) => true 147 | case _ => false 148 | } 149 | } 150 | 151 | def predicateError: (Array[Byte], Extracted) => Boolean = { (_, value) => 152 | value match { 153 | case ValueError(_, _) => true 154 | case _ => false 155 | } 156 | } 157 | 158 | // extract values after transformation 159 | val filtered = logs.mapValues { record => 160 | LogParseUtil.parseLine(record) match { 161 | case Success(r) => ValidLogRecord(r) 162 | case Failure(ex) => ValueError(ex, record) 163 | } 164 | }.branch(predicateValid, predicateError) 165 | 166 | // push the labelled data 167 | filtered(0).mapValues { 168 | case ValidLogRecord(r) => r 169 | case _ => ??? // should never happen since we pre-emptively filtered with `branch` 170 | }.to(config.toTopic, Produced.`with`(byteArraySerde, logRecordSerde)) 171 | 172 | // push the extraction errors 173 | filtered(1).mapValues { 174 | case ValueError(_, v) => 175 | val writer = new StringWriter() 176 | (writer.toString, v) 177 | case _ => ??? 
// should never happen since we pre-emptively filtered with `branch` 178 | }.to(config.errorTopic, Produced.`with`(byteArraySerde, tuple2StringSerde)) 179 | } 180 | 181 | sealed abstract class Extracted { } 182 | final case class ValidLogRecord(record: LogRecord) extends Extracted 183 | final case class ValueError(exception: Throwable, originalRecord: String) extends Extracted 184 | 185 | def generateAvro(logRecords: KStreamS[Array[Byte], LogRecord], config: ConfigData): Unit = { 186 | logRecords.mapValues(makeAvro) 187 | .to(config.avroTopic, Produced.`with`(byteArraySerde, logRecordAvroSerde(config.schemaRegistryUrl))) 188 | } 189 | 190 | /** 191 | * Transform a LogRecord into an Avro SpecificRecord, LogRecordAvro, generated by the Avro compiler 192 | */ 193 | def makeAvro(record: LogRecord): LogRecordAvro = 194 | LogRecordAvro.newBuilder() 195 | .setHost(record.host) 196 | .setClientId(record.clientId) 197 | .setUser(record.user) 198 | .setTimestamp(record.timestamp.format(DateTimeFormatter.ofPattern("yyyy MM dd"))) 199 | .setMethod(record.method) 200 | .setEndpoint(record.endpoint) 201 | .setProtocol(record.protocol) 202 | .setHttpReplyCode(record.httpReplyCode) 203 | .setPayloadSize(record.payloadSize) 204 | .build() 205 | 206 | /** 207 | * Summary count of number of times each host has been accessed 208 | */ 209 | def hostCountSummary(logRecords: KStreamS[Array[Byte], LogRecord], config: ConfigData)(implicit builder: StreamsBuilderS): Unit = { 210 | 211 | val groupedStream = 212 | logRecords.mapValues(_.host) 213 | .map((_, value) => (value, value)) 214 | .groupByKey(Serialized.`with`(stringSerde, stringSerde)) 215 | 216 | // since this is a KTable (changelog stream), only the latest summarized information 217 | // for a host will be the correct one - all earlier records will be considered out of date 218 | // 219 | // materialize the summarized information into a topic 220 | groupedStream.count(ACCESS_COUNT_PER_HOST_STORE, Some(stringSerde)) 221 | .toStream.to(config.summaryAccessTopic, Produced.`with`(stringSerde, longSerde)) 222 | 223 | groupedStream.windowedBy(TimeWindows.of(60000)) 224 | .count(WINDOWED_ACCESS_COUNT_PER_HOST_STORE, Some(stringSerde)) 225 | .toStream.to(config.windowedSummaryAccessTopic, Produced.`with`(windowedStringSerde, longSerde)) 226 | 227 | // print the topic info (for debugging) 228 | builder.stream(List(config.summaryAccessTopic), Consumed.`with`(stringSerde, longSerde)) 229 | .print(Printed.toSysOut[String, Long].withKeyValueMapper { new KeyValueMapper[String, Long, String]() { 230 | def apply(key: String, value: Long) = s"""$key / $value""" 231 | }}) 232 | } 233 | 234 | /** 235 | * Aggregate value of payloadSize per host 236 | */ 237 | def totalPayloadPerHostSummary(logRecords: KStreamS[Array[Byte], LogRecord], config: ConfigData)(implicit builder: StreamsBuilderS): Unit = { 238 | val groupedStream = 239 | logRecords.mapValues(record => (record.host, record.payloadSize)) 240 | .map { case (_, (host, size)) => (host, size) } 241 | .groupByKey(Serialized.`with`(stringSerde, longSerde)) 242 | 243 | // materialize the summarized information into a topic 244 | groupedStream 245 | .aggregate( 246 | () => 0L, 247 | (_: String, s: Long, agg: Long) => s + agg, 248 | Materialized.as(PAYLOAD_SIZE_PER_HOST_STORE) 249 | .withKeySerde(stringSerde) 250 | .withValueSerde(longSerde) 251 | ) 252 | .toStream.to(config.summaryPayloadTopic, Produced.`with`(stringSerde, longSerde)) 253 | 254 | groupedStream 255 | .windowedBy(TimeWindows.of(60000)) 256 | .aggregate( 257 | () 
=> 0L, 258 | (_: String, s: Long, agg: Long) => s + agg, 259 | Materialized.as(WINDOWED_PAYLOAD_SIZE_PER_HOST_STORE) 260 | .withKeySerde(stringSerde) 261 | .withValueSerde(longSerde) 262 | ) 263 | .toStream.to(config.windowedSummaryPayloadTopic, Produced.`with`(windowedStringSerde, longSerde)) 264 | 265 | builder.stream(List(config.summaryPayloadTopic), Consumed.`with`(stringSerde, longSerde)) 266 | .print(Printed.toSysOut[String, Long].withKeyValueMapper { new KeyValueMapper[String, Long, String]() { 267 | def apply(key: String, value: Long) = s"""$key / $value""" 268 | }}) 269 | } 270 | } 271 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /examples/example-dsl/README.md: -------------------------------------------------------------------------------- 1 | ## Example Implementation of HTTP-based Interactive Query Service 2 | 3 | This example demonstrates the following features in Kafka Streams along with an HTTP based interactive query service: 4 | 5 | 1. Data ingestion 6 | 2. Data transformation using a Kafka Streams DSL-based implementation 7 | 3. Managing local state with key-value stores 8 | 4. 
Interactive query service with HTTP endpoints 9 | 10 | The implementation is based on the [ClarkNet dataset](http://ita.ee.lbl.gov/html/contrib/ClarkNet-HTTP.html), which must be downloaded to a local folder. 11 | 12 | ## Build and Run Locally 13 | 14 | By default, the application runs with an embedded local Kafka server. If you want to run separate Kafka and ZooKeeper servers, change `kafka.localserver` to `false` in `application.conf`. 15 | 16 | To run the application, follow these steps. 17 | 18 | ### Build the Libraries 19 | 20 | This example application depends on [kafka-streams-scala](https://github.com/lightbend/kafka-streams-scala) and [kafka-streams-query](https://github.com/lightbend/kafka-streams-query/tree/develop/lib). Ensure that you have the proper versions of these libraries on your classpath. This example uses Scala 2.12.4 and Kafka 1.0.0. 21 | 22 | If you've made local changes to `kafka-streams-query`, you'll need to publish them to your local Ivy repository using `sbt publishLocal` from within the `./lib/` directory. 23 | 24 | ### Start ZooKeeper and Kafka 25 | 26 | > This is only required if the setting of `kafka.localserver` is `false` in `application.conf`. If this is set to `true`, the application runs with an embedded local Kafka server. However, note that if you want to run the application in distributed mode (see below for details), you need to run separate Kafka and ZooKeeper servers. 27 | 28 | Start ZooKeeper and Kafka, if not already running. You can download Kafka 1.0.0 for Scala 2.12 [here](https://kafka.apache.org/documentation/#quickstart), then follow the [Quick Start](https://kafka.apache.org/documentation/#quickstart) instructions for running ZooKeeper and Kafka, steps 1 and 2. 29 | 30 | ### Download the ClarkNet dataset 31 | 32 | Download the [ClarkNet dataset](http://ita.ee.lbl.gov/html/contrib/ClarkNet-HTTP.html) and put it in a convenient local folder. 33 | 34 | ### Configure the Application Properties 35 | 36 | Copy `src/main/resources/application-dsl.conf.template` to `src/main/resources/application-dsl.conf`. 37 | 38 | Edit `src/main/resources/application-dsl.conf` and set the entry for `directorytowatch` to match the folder name where you installed the ClarkNet dataset. 39 | 40 | Note that you can run the application with a bundled local Kafka server by setting `kafka.localserver` to `true` in the `application.conf` file. 41 | 42 | ### Create the Kafka Topics 43 | 44 | > This is only required if the setting of `kafka.localserver` is `false` in `application.conf`. If this is set to 45 | `true`, the application runs with an embedded local Kafka server and creates all necessary topics on its own. However, 46 | note that if you want to run the application in distributed mode (see below for details), you need to run separate 47 | Kafka and ZooKeeper servers. If you're running in distributed mode, topics should 48 | have more than one partition. 49 | 50 | Create the topics using the `kafka-topics.sh` command that comes with the Kafka distribution. We'll refer to the directory where you installed Kafka as `$KAFKA_HOME`.
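If you prefer to create the topics programmatically rather than from the shell, here is a minimal, illustrative sketch using Kafka's `AdminClient` (available in Kafka 1.0.0). It is not part of the example project; the topic names, partition count, and replication factor simply mirror the shell commands that follow.

```scala
import java.util.Properties
import org.apache.kafka.clients.admin.{AdminClient, AdminClientConfig, NewTopic}
import scala.collection.JavaConverters._

object CreateExampleTopics extends App {
  // assumes a broker reachable at localhost:9092, as elsewhere in this README
  val props = new Properties()
  props.put(AdminClientConfig.BOOTSTRAP_SERVERS_CONFIG, "localhost:9092")
  val admin = AdminClient.create(props)

  // same topics, partition count and replication factor as the kafka-topics.sh commands below
  val topics = List(
    "logerr-dsl", "server-log-dsl", "processed-log",
    "summary-access-log", "windowed-summary-access-log",
    "summary-payload-log", "windowed-summary-payload-log", "avro-topic"
  ).map(new NewTopic(_, 3, 1.toShort))

  admin.createTopics(topics.asJava).all().get() // block until the topics exist
  admin.close()
}
```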
Run the following commands: 51 | 52 | ```bash 53 | $KAFKA_HOME/bin/kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 1 --partitions 3 --topic logerr-dsl 54 | $KAFKA_HOME/bin/kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 1 --partitions 3 --topic server-log-dsl 55 | $KAFKA_HOME/bin/kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 1 --partitions 3 --topic processed-log 56 | $KAFKA_HOME/bin/kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 1 --partitions 3 --topic summary-access-log 57 | $KAFKA_HOME/bin/kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 1 --partitions 3 --topic windowed-summary-access-log 58 | $KAFKA_HOME/bin/kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 1 --partitions 3 --topic summary-payload-log 59 | $KAFKA_HOME/bin/kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 1 --partitions 3 --topic windowed-summary-payload-log 60 | $KAFKA_HOME/bin/kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 1 --partitions 3 --topic avro-topic 61 | ``` 62 | 63 | ### Run the Application! 64 | 65 | Now run the application as follows: 66 | 67 | ```bash 68 | $ sbt 69 | > clean 70 | > compile 71 | > dsl 72 | ``` 73 | 74 | This will start the application. Now you can query the global state using `curl`: 75 | 76 | ```bash 77 | $ ## The example application has a timer to `touch` the files in the watched 78 | $ ## directory 1 minute after the app starts to trigger the streaming to begin. Touch 79 | $ ## the ClarkNet dataset again, or add new files, to stream more entries. 80 | $ 81 | $ ## Fetch the number of accesses made to the host world.std.com as per the downloaded 82 | $ ## data file 83 | $ curl http://localhost:7070/weblog/access/world.std.com 84 | 15 85 | $ 86 | $ ## If you specify ALL as the key name then it will fetch a list of all key-values 87 | $ ## from all the stores that have the access information for the same application id 88 | $ curl http://localhost:7070/weblog/access/ALL 89 | [["204.249.225.59",1],["access9.accsyst.com",2],["cssu24.cs.ust.hk",1],["cyclom1-1-6.intersource.com",1],["d24-1.cpe.Brisbane.aone.net.au",1],["er6.rutgers.edu",1],["world.std.com",3]] 90 | $ 91 | $ ## If you specify COUNT as the key name then it will fetch the approximate total 92 | $ ## number of entries across all the stores that have the access information 93 | $ ## for the same application id 94 | $ curl http://localhost:7070/weblog/access/COUNT 95 | 7 96 | $ ## Query access counts by a range of keys. The "from" key must be less than the "to" 97 | $ ## key. For example, "a.com" < "z.org" 98 | $ curl http://localhost:7070/weblog/access/range/a.com/z.org 99 | [["access9.accsyst.com",4],["cssu24.cs.ust.hk",2],["cyclom1-1-6.intersource.com",2],["d24-1.cpe.Brisbane.aone.net.au",2],["er6.rutgers.edu",2],["reddit.com",2],["world.std.com",6]] 100 | $ 101 | $ ## Query a time window for a key. The "from" and "to" parameters must be represented 102 | $ ## as milliseconds since epoch (a long number). The "from" time must be less than the 103 | $ ## "to" time. Stream elements are windowed using ingest time and not event time. For 104 | $ ## example, get all time windows for world.std.com from epoch 0 to current epoch. 105 | $ curl http://localhost:7070/weblog/access/win/world.std.com/0/$(date +%s%3N) 106 | [[1517518200000,6],[1517518260000,3]] 107 | $ ## 108 | $ ## Fetch the number of bytes in the reply for queries to the host 109 | $ ## world.std.com as per the downloaded data file 110 | $ curl http://localhost:7070/weblog/bytes/world.std.com 111 | 124532 112 | ``` 113 |
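For reference, here is a minimal, illustrative sketch of what a windowed query such as `/weblog/access/win/<host>/<from>/<to>` does under the hood: it fetches all time windows for a key from a windowed state store through the Kafka Streams interactive-query API. This is not taken from the example sources; the store name and the way a `KafkaStreams` handle is obtained are assumptions.

```scala
import org.apache.kafka.streams.KafkaStreams
import org.apache.kafka.streams.state.QueryableStoreTypes

// Hypothetical helper: return (windowStartMs, count) pairs for a host key,
// e.g. List((1517518200000L, 6L), (1517518260000L, 3L)) as in the curl output above.
def windowedAccessCounts(streams: KafkaStreams, storeName: String,
                         host: String, fromMs: Long, toMs: Long): List[(Long, Long)] = {
  val store = streams.store(storeName, QueryableStoreTypes.windowStore[String, java.lang.Long]())
  val iter  = store.fetch(host, fromMs, toMs)   // windows with start time in [fromMs, toMs]
  try {
    var acc = List.empty[(Long, Long)]
    while (iter.hasNext) {
      val kv = iter.next()                      // key = window start timestamp, value = count
      acc = (kv.key.longValue(), kv.value.longValue()) :: acc
    }
    acc.reverse
  } finally iter.close()
}
```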
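This works because each instance can use Kafka Streams metadata to discover which instance hosts a given key, and then either answer from its own local state store or forward the request to the owner. Below is a minimal, illustrative sketch of that mechanism; it is not the library's actual implementation, and the store name, host, and port wiring are assumptions.

```scala
import org.apache.kafka.common.serialization.Serdes
import org.apache.kafka.streams.KafkaStreams
import org.apache.kafka.streams.state.QueryableStoreTypes

// Hypothetical helper: decide whether a key can be answered from this instance's
// local store, or which endpoint owns it, using Kafka Streams metadata.
class DistributedAccessCountLookup(streams: KafkaStreams, storeName: String,
                                   thisHost: String, thisPort: Int) {

  // Right(count) if the key is local, Left(url) pointing at the owning instance otherwise
  def lookup(hostKey: String): Either[String, Long] = {
    val meta = streams.metadataForKey(storeName, hostKey, Serdes.String().serializer())
    if (meta.host() == thisHost && meta.port() == thisPort) {
      val store = streams.store(storeName, QueryableStoreTypes.keyValueStore[String, java.lang.Long]())
      Right(Option(store.get(hostKey)).map(_.longValue()).getOrElse(0L))
    } else {
      // the REST layer would forward the HTTP call to the owning instance
      Left(s"http://${meta.host()}:${meta.port()}/weblog/access/$hostKey")
    }
  }
}
```

Since any instance can resolve the owner of any key, a `curl` against either endpoint returns the same answer, as the examples below show.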
207 | Here are a few examples: 208 | 209 | ```bash 210 | ## world.std.com was loaded by the first instance of the app 211 | ## Querying the endpoint of the first instance gives the correct result 212 | $ curl localhost:7070/weblog/access/world.std.com 213 | 14 214 | 215 | ## we get the correct result even if we query using the endpoint of the second instance 216 | $ curl localhost:7071/weblog/access/world.std.com 217 | 14 218 | 219 | ## ppp19.glas.apc.org was loaded by the second instance of the app 220 | ## Querying the endpoint of the first instance also gives the correct result 221 | $ curl localhost:7070/weblog/access/ppp19.glas.apc.org 222 | 17 223 | ``` 224 | 225 | ### Step 5: Clean up the application's Kafka Streams internal topics 226 | 227 | When running in distributed mode, Kafka Streams state stores are backed by internal Kafka Streams topics so that state 228 | can be restored on a different instance of the app if there's a failure. To reset to a clean state you can use the 229 | [Kafka Streams Application Reset tool](https://cwiki.apache.org/confluence/display/KAFKA/Kafka+Streams+Application+Reset+Tool). 230 | This will delete the internal Kafka Streams topics associated with a specified application id. Note that you must have 231 | `delete.topic.enable` set to `true` in your broker configuration to delete topics. 232 | 233 | An example run of this tool: 234 | 235 | ``` 236 | $ ./kafka-streams-application-reset.sh \ 237 | --application-id kstream-weblog-processing \ 238 | --bootstrap-servers kafka-0-broker:9092 \ 239 | --zookeeper localhost:2181 240 | No input or intermediate topics specified. Skipping seek. 241 | Deleting all internal/auto-created topics for application kstream-weblog-processing 242 | Topic kstream-weblog-processing-windowed-access-count-per-host-changelog is marked for deletion.
251 | Note: This will have no impact if delete.topic.enable is not set to true. 252 | Topic kstream-weblog-processing-windowed-access-count-per-host-repartition is marked for deletion. 253 | Note: This will have no impact if delete.topic.enable is not set to true. 254 | Topic kstream-weblog-processing-payload-size-per-host-changelog is marked for deletion. 255 | Note: This will have no impact if delete.topic.enable is not set to true. 256 | Topic kstream-weblog-processing-access-count-per-host-repartition is marked for deletion. 257 | Note: This will have no impact if delete.topic.enable is not set to true. 258 | Done. 259 | ``` --------------------------------------------------------------------------------