├── .gitignore
├── LICENCE
├── README.md
├── build.sbt
├── docker
│   └── Dockerfile
├── postgresql
│   └── ds2.backup
├── project
│   └── plugins.sbt
└── src
    └── main
        └── scala
            └── io
                └── scalac
                    └── spark
                        ├── AvroConsumer.scala
                        └── Model.scala

/.gitignore:
--------------------------------------------------------------------------------
*.class
*.log

# sbt specific
.cache/
.history/
.lib/
dist/*
target/
lib_managed/
src_managed/
project/boot/
project/plugins/project/

#IntelliJ IDEA specific
.idea/

#Spark local
/spark/

--------------------------------------------------------------------------------
/LICENCE:
--------------------------------------------------------------------------------
The MIT License (MIT)

Copyright (c) 2015 ScalaC

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# spark-kafka-avro
POC: Spark consumer for bottledwater-pg Kafka Avro topics

This is a proof of concept of streaming a whole Postgres database with [Bottled Water](http://blog.confluent.io/2015/04/23/bottled-water-real-time-integration-of-postgresql-and-kafka/) to Kafka and then consuming the Kafka topics in Spark.

*This code is not production ready. Use at your own risk!*

Bottled Water
-------------
Bottled Water uses the [logical decoding](http://www.postgresql.org/docs/9.4/static/logicaldecoding.html)
feature (introduced in PostgreSQL 9.4) to extract a consistent snapshot and a continuous stream
of change events from a database. The data is extracted at a row level, and encoded using
[Avro](http://avro.apache.org/). A client program connects to your database, extracts this data,
and relays it to [Kafka](http://kafka.apache.org/) (you could also integrate it with other systems
if you wish, but Kafka is pretty awesome).

Key features of Bottled Water are:

* Works with any PostgreSQL database (version 9.4 or later). There are no restrictions on your
  database schema.
* No schema changes are required, no triggers or additional tables. (However, you do need to be
  able to install a PostgreSQL extension on the database server. More on this below.)
* Negligible impact on database performance.
* Transactionally consistent output. That means: writes appear only when they are committed to the
  database (writes by aborted transactions are discarded), writes appear in the same order as they
  were committed (no race conditions).
* Fault-tolerant: does not lose data, even if processes crash, machines die, the network is
  interrupted, etc.

Prerequisites
-------------
The whole environment runs in Docker, so you must install Docker on your workstation. Check out the installation instructions at https://docs.docker.com

Running in Docker
-----------------
Start ZooKeeper:

    $ docker run -d --name zookeeper --hostname zookeeper confluent/zookeeper

> Note: The `schema-registry` and `bottledwater` containers below are linked to a `kafka` container, so a Kafka broker (for example the `confluent/kafka` image, linked to ZooKeeper) must be running as well.

Start the Avro schema-registry:

    $ docker run -d --name schema-registry --hostname schema-registry \
        --link zookeeper:zookeeper --link kafka:kafka \
        --env SCHEMA_REGISTRY_AVRO_COMPATIBILITY_LEVEL=none confluent/schema-registry

Start the Postgres server and map the container port 5432 to 32768 on your local machine:

    $ docker run -d -p 32768:5432 --name postgres --hostname postgres confluent/postgres-bw:0.1

> Note: If you are using the boot2docker virtual machine on OS X, Windows or Linux, you'll need to use the IP of the virtual host instead of `localhost`. You can get it by running the following outside of the boot2docker shell (i.e. from your command line or terminal application):

    $ boot2docker ip

The `postgres-bw` image extends the
[official Postgres docker image](https://registry.hub.docker.com/_/postgres/) and adds
Bottled Water support. However, before Bottled Water can be used, it first needs to be
enabled. To do this, start a `psql` shell for the Postgres database:

    $ docker run -it --rm --link postgres:postgres postgres:9.4 sh -c \
        'exec psql -h "$POSTGRES_PORT_5432_TCP_ADDR" -p "$POSTGRES_PORT_5432_TCP_PORT" -U postgres'

When the prompt appears, enable the `bottledwater` extension and create the `ds2` database:

    CREATE EXTENSION bottledwater;
    CREATE DATABASE ds2;
    \q

To have some data to play with, we will import a modified example [Dell DVD Store](http://linux.dell.com/dvdstore/) database into Postgres. The database dump can be found in `postgresql/ds2.backup`. The easiest way to import the dump is to install [pgAdmin](http://pgadmin.org). Alternatively you can import the dump from your terminal (requires the Postgres client tools on your local machine; use the boot2docker IP instead of `localhost` if applicable):

    $ pg_restore --dbname=ds2 --host=localhost --port=32768 -U postgres postgresql/ds2.backup

The next step is to start the Bottled Water client, which relays data from Postgres to Kafka:

    $ docker run -d --name bottledwater --hostname bottledwater --link postgres:postgres \
        --env POSTGRES_DBNAME=ds2 --env POSTGRES_USER=postgres \
        --link kafka:kafka --link schema-registry:schema-registry sibex/bottledwater:0.1

Bottled Water takes a snapshot of the database and then continues to watch Postgres for any data changes.
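
> Note: Bottled Water creates one Kafka topic per table. As an optional quick check you can list the topics with the same `confluent/tools` image used below — a sketch, assuming the standard `kafka-topics` script is bundled in that image:

    $ docker run -it --rm --link zookeeper:zookeeper confluent/tools \
        kafka-topics --zookeeper zookeeper:2181 --list
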
You can see the data that has been extracted from Postgres by consuming from Kafka:

    $ docker run -it --rm --link zookeeper:zookeeper --link kafka:kafka \
        --link schema-registry:schema-registry confluent/tools \
        kafka-avro-console-consumer --property print.key=true --topic categories --from-beginning

Now it's time to build the `Spark Streaming` application:

    $ sbt assembly

> Note: For the sake of development speed, download [Spark](http://d3kbcqa49mib13.cloudfront.net/spark-1.4.0-bin-hadoop2.4.tgz) and extract it into the `spark` folder in the project's main directory; the `Dockerfile` expects the folder `spark/spark-1.4.0-bin-hadoop2.4` to be there. The `Dockerfile` could later be changed to download and extract `Spark` each time the `Docker` image gets built.

Build the `Docker` image with the deployed `Spark Streaming` application (JAR):

    $ docker build -f docker/Dockerfile -t spark-kafka-avro .

Finally, run the application:

    $ docker run -it --rm --name spark --hostname spark --link postgres:postgres \
        --env POSTGRES_DBNAME=ds2 --env POSTGRES_USER=postgres \
        --link kafka:kafka --link schema-registry:schema-registry spark-kafka-avro

Before creating another `Docker` image, the previous one should be removed:

    $ docker rmi spark-kafka-avro

Developed by [Scalac](https://scalac.io/?utm_source=scalac_github&utm_campaign=scalac1&utm_medium=web)

--------------------------------------------------------------------------------
/build.sbt:
--------------------------------------------------------------------------------
val metaSettings = Seq(
  name := "spark-kafka-avro",
  description := "POC: Spark consumer for bottledwater-pg Kafka Avro topics",
  version := "1.0.0"
)

val scalaSettings = Seq(
  scalaVersion := "2.10.5",
  scalacOptions ++= Seq("-feature", "-unchecked", "-deprecation", "-encoding", "utf8")
)

val deploymentSettings = Seq(
  mainClass in assembly := Some("io.scalac.spark.AvroConsumer"),
  assemblyOutputPath in assembly := file("target/deploy/spark-kafka-avro.jar"),
  test in assembly := {}
)

val repositories = Seq(
  "confluent" at "http://packages.confluent.io/maven/",
  Resolver.sonatypeRepo("public")
)

val dependencies = Seq(
  "org.apache.spark" % "spark-streaming_2.10" % "1.4.0" % "provided",
  "org.apache.spark" % "spark-streaming-kafka_2.10" % "1.4.0"
    exclude("org.spark-project.spark", "unused"),
  "org.apache.avro" % "avro" % "1.7.7",
  "io.confluent" % "kafka-avro-serializer" % "1.0",
  "com.github.scopt" %% "scopt" % "3.3.0",
  "joda-time" % "joda-time" % "2.8.1"
)

lazy val root = (project in file(".")).
  settings(metaSettings: _*).
  settings(scalaSettings: _*).
  settings(deploymentSettings: _*).
  settings(resolvers ++= repositories).
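  // The "confluent" resolver above is required for io.confluent:kafka-avro-serializer,
  // which is not published to Maven Central. spark-streaming is marked "provided"
  // because the spark-submit runtime in the Docker image supplies it.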
  settings(libraryDependencies ++= dependencies)

--------------------------------------------------------------------------------
/docker/Dockerfile:
--------------------------------------------------------------------------------
FROM java:7
MAINTAINER Sebastian Bach

COPY target/deploy/spark-kafka-avro.jar /srv/

#RUN curl -SL http://d3kbcqa49mib13.cloudfront.net/spark-1.4.0-bin-hadoop2.4.tgz \
#      | tar -xzC /srv \
#      && mv /srv/spark-1.4.0-bin-hadoop2.4 /srv/spark \
#      && sed -e s/log4j.rootCategory=INFO/log4j.rootCategory=WARN/g /srv/spark/conf/log4j.properties.template > /srv/spark/conf/log4j.properties

COPY spark/spark-1.4.0-bin-hadoop2.4 /srv/spark
RUN sed -e s/log4j.rootCategory=INFO/log4j.rootCategory=WARN/g /srv/spark/conf/log4j.properties.template > /srv/spark/conf/log4j.properties

CMD /srv/spark/bin/spark-submit \
    --class io.scalac.spark.AvroConsumer /srv/spark-kafka-avro.jar \
    --postgres "hostaddr=${POSTGRES_PORT_5432_TCP_ADDR} port=${POSTGRES_PORT_5432_TCP_PORT} dbname=${POSTGRES_DBNAME} user=${POSTGRES_USER}" \
    --broker ${KAFKA_PORT_9092_TCP_ADDR}:${KAFKA_PORT_9092_TCP_PORT} \
    --schema-registry http://${SCHEMA_REGISTRY_PORT_8081_TCP_ADDR}:${SCHEMA_REGISTRY_PORT_8081_TCP_PORT}

--------------------------------------------------------------------------------
/postgresql/ds2.backup:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ScalaConsultants/spark-kafka-avro/07d2bcdd032c83306afcbc8a746b390d356f21c8/postgresql/ds2.backup

--------------------------------------------------------------------------------
/project/plugins.sbt:
--------------------------------------------------------------------------------
addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.13.0")

--------------------------------------------------------------------------------
/src/main/scala/io/scalac/spark/AvroConsumer.scala:
--------------------------------------------------------------------------------
package io.scalac.spark

import io.confluent.kafka.serializers.KafkaAvroDecoder
import org.apache.spark.SparkConf
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{StreamingContext, Seconds}

case class Config(postgres: Map[String, String] = Map(), broker: String = "", schemaRegistry: String = "")

object AvroConsumer {

  def main(args: Array[String]) {
    val parser = new scopt.OptionParser[Config]("spark") {
      opt[Map[String, String]]("postgres") required() action { (x, c) =>
        c.copy(postgres = x)
      } keyValueName("<key>", "<value>") text ("are PostgreSQL connection params")

      opt[String]("broker") required() action { (x, c) =>
        c.copy(broker = x)
      } valueName ("<broker1>,<broker2>") text ("is a list of one or more Kafka brokers")

      opt[String]("schema-registry") required() action { (x, c) =>
        c.copy(schemaRegistry = x)
      } valueName ("<url>") text ("is the Avro schema-registry URL")
    }

    parser.parse(args, Config()) match {
      case Some(config) => process(config)
      case _ =>
    }
  }

  def process(config: Config) = {
    val conf = new SparkConf().setAppName("DirectKafkaAvroConsumer")
    val ssc = new StreamingContext(conf, Seconds(10))
    val kafkaParams = Map[String, String](
      "auto.offset.reset" -> "smallest",
      "metadata.broker.list" -> config.broker,
      "schema.registry.url" -> config.schemaRegistry)
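    // "smallest" makes the direct stream start from the earliest available offsets, so the
    // initial Bottled Water snapshot is consumed as well as subsequent change events.
    // KafkaAvroDecoder looks up the writer schemas via "schema.registry.url" when it
    // decodes the Avro-encoded keys and values.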
    val topicSet = Set("categories")
    val messages = KafkaUtils.createDirectStream[Object, Object, KafkaAvroDecoder, KafkaAvroDecoder](ssc, kafkaParams, topicSet)

    // Convert each Avro (key, value) message into a DatabaseSchema entity and print each batch
    val lines = messages.map(AvroConverter.convert(_))
    lines.print()

    ssc.start()
    ssc.awaitTermination()
  }
}

--------------------------------------------------------------------------------
/src/main/scala/io/scalac/spark/Model.scala:
--------------------------------------------------------------------------------
package io.scalac.spark

import org.apache.avro.generic.{IndexedRecord, GenericRecord}
import org.joda.time.DateTime

// Case classes mirroring the Dell DVD Store (ds2) tables consumed from Kafka
object DatabaseSchema {
  sealed class Entity
  case class Category(categoryId: Int, name: String) extends Entity
  case class CustomerHistory(historyId: Int, customerId: Int, orderId: Int, productId: Int) extends Entity
  case class Customer(customerId: Int, firstName: String, lastName: String) extends Entity
  case class Inventory(productId: Int, inStock: Int, sales: Int) extends Entity
  case class Order(orderId: Int, date: DateTime, customerId: Int, amount: Double, tax: Double, total: Double) extends Entity
  case class OrderLine(orderLineId: Int, pos: Int, orderId: Int, quantity: Int) extends Entity
  case class Product(productId: Int, category: String, name: String, price: Double) extends Entity
}

object AvroConverter {
  def category(record: GenericRecord) = {
    DatabaseSchema.Category(
      record.get("category").asInstanceOf[Int],
      record.get("categoryname").toString)
  }

  def order(record: GenericRecord) = {
    DatabaseSchema.Order(
      record.get("orderid").asInstanceOf[Int],
      date(record.get("orderdate").asInstanceOf[GenericRecord]),
      record.get("customerid").asInstanceOf[Int],
      record.get("netamount").asInstanceOf[Double],
      record.get("tax").asInstanceOf[Double],
      record.get("totalamount").asInstanceOf[Double])
  }

  // The order date arrives as a nested Avro record with year/month/day fields
  def date(record: GenericRecord) = {
    new DateTime(
      record.get("year").asInstanceOf[Int],
      record.get("month").asInstanceOf[Int],
      record.get("day").asInstanceOf[Int], 0, 0)
  }

  // The message key is the row's primary-key record; its schema name
  // (e.g. "categories_pkey") tells us which table the change came from
  def convert(message: (Object, Object)) = {
    val (k, v) = message
    val name = k.asInstanceOf[IndexedRecord].getSchema.getName
    val value = v.asInstanceOf[GenericRecord]
    name match {
      case "categories_pkey" => category(value)
      case "orders_pkey" => order(value)
      case n => throw new Exception(s"unknown key '$n'")
    }
  }
}

--------------------------------------------------------------------------------