├── .gitignore
├── LICENCE
├── README.md
├── build.sbt
├── docker
│   └── Dockerfile
├── postgresql
│   └── ds2.backup
├── project
│   └── plugins.sbt
└── src
    └── main
        └── scala
            └── io
                └── scalac
                    └── spark
                        ├── AvroConsumer.scala
                        └── Model.scala

/.gitignore:
--------------------------------------------------------------------------------
*.class
*.log

# sbt specific
.cache/
.history/
.lib/
dist/*
target/
lib_managed/
src_managed/
project/boot/
project/plugins/project/

#IntelliJ IDEA specific
.idea/

#Spark local
/spark/

--------------------------------------------------------------------------------
/LICENCE:
--------------------------------------------------------------------------------
The MIT License (MIT)

Copyright (c) 2015 ScalaC

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# spark-kafka-avro
POC: Spark consumer for bottledwater-pg Kafka Avro topics

This is a proof of concept of streaming a whole Postgres database with [Bottled Water](http://blog.confluent.io/2015/04/23/bottled-water-real-time-integration-of-postgresql-and-kafka/) to Kafka and then consuming the Kafka topics in Spark.

*This code is not production ready. Use at your own risk!*

Bottled Water
-------------
Bottled Water uses the [logical decoding](http://www.postgresql.org/docs/9.4/static/logicaldecoding.html)
feature (introduced in PostgreSQL 9.4) to extract a consistent snapshot and a continuous stream
of change events from a database. The data is extracted at a row level, and encoded using
[Avro](http://avro.apache.org/). A client program connects to your database, extracts this data,
and relays it to [Kafka](http://kafka.apache.org/) (you could also integrate it with other systems
if you wish, but Kafka is pretty awesome).

Key features of Bottled Water are:

* Works with any PostgreSQL database (version 9.4 or later). There are no restrictions on your
  database schema.
* No schema changes are required, no triggers or additional tables. (However, you do need to be
  able to install a PostgreSQL extension on the database server. More on this below.)
* Negligible impact on database performance.
* Transactionally consistent output. That means: writes appear only when they are committed to the
  database (writes by aborted transactions are discarded), writes appear in the same order as they
  were committed (no race conditions).
* Fault-tolerant: does not lose data, even if processes crash, machines die, the network is
  interrupted, etc.

Prerequisites
-------------
The whole environment runs in Docker, so you must install Docker on your workstation. Check out the installation instructions at https://docs.docker.com

Running in Docker
-----------------
Start ZooKeeper:

    $ docker run -d --name zookeeper --hostname zookeeper confluent/zookeeper

> Note: The `schema-registry` and `bottledwater` containers below are linked to a `kafka` container, so a Kafka broker (for example the `confluent/kafka` image, linked to ZooKeeper) must be running as well.

Start the Avro schema-registry:

    $ docker run -d --name schema-registry --hostname schema-registry \
        --link zookeeper:zookeeper --link kafka:kafka \
        --env SCHEMA_REGISTRY_AVRO_COMPATIBILITY_LEVEL=none confluent/schema-registry

Start the Postgres server and map the container port 5432 to 32768 on your local machine:

    $ docker run -d -p 32768:5432 --name postgres --hostname postgres confluent/postgres-bw:0.1

> Note: If you are using the boot2docker virtual machine on OS X, Windows or Linux, you'll need to use the IP of the virtual host instead of `localhost`. You can get it by running the following outside of the boot2docker shell (i.e. from your command line or terminal application):

    $ boot2docker ip

The `postgres-bw` image extends the
[official Postgres docker image](https://registry.hub.docker.com/_/postgres/) and adds
Bottled Water support. However, before Bottled Water can be used, it first needs to be
enabled. To do this, start a `psql` shell for the Postgres database:

    $ docker run -it --rm --link postgres:postgres postgres:9.4 sh -c \
        'exec psql -h "$POSTGRES_PORT_5432_TCP_ADDR" -p "$POSTGRES_PORT_5432_TCP_PORT" -U postgres'

When the prompt appears, enable the `bottledwater` extension and create the `ds2` database:

    CREATE EXTENSION bottledwater;
    CREATE DATABASE ds2;
    \q

To have some data to play with, we will import a modified example [Dell DVD Store](http://linux.dell.com/dvdstore/) database into Postgres. The database dump can be found in `postgresql/ds2.backup`. The easiest way to import the dump is to install [pgAdmin](http://pgadmin.org). Alternatively you can import the dump from your terminal (requires the Postgres client tools on your local machine; use the boot2docker IP instead of `localhost` if applicable):

    $ pg_restore --dbname=ds2 --host=localhost --port=32768 -U postgres postgresql/ds2.backup

The next step is to start the Bottled Water client, which relays data from Postgres to Kafka:

    $ docker run -d --name bottledwater --hostname bottledwater --link postgres:postgres \
        --env POSTGRES_DBNAME=ds2 --env POSTGRES_USER=postgres \
        --link kafka:kafka --link schema-registry:schema-registry sibex/bottledwater:0.1

Bottled Water takes a snapshot of the database and then continues to watch Postgres for any data changes.
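
> Note: Bottled Water creates one Kafka topic per table. As an optional quick check you can list the topics with the same `confluent/tools` image used below — a sketch, assuming the standard `kafka-topics` script is bundled in that image:

    $ docker run -it --rm --link zookeeper:zookeeper confluent/tools \
        kafka-topics --zookeeper zookeeper:2181 --list
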
You can see the data that has been extracted from Postgres by consuming from Kafka:

    $ docker run -it --rm --link zookeeper:zookeeper --link kafka:kafka \
        --link schema-registry:schema-registry confluent/tools \
        kafka-avro-console-consumer --property print.key=true --topic categories --from-beginning

Now it's time to build the `Spark Streaming` application:

    $ sbt assembly

> Note: For the sake of development speed, download [Spark](http://d3kbcqa49mib13.cloudfront.net/spark-1.4.0-bin-hadoop2.4.tgz) and extract it into the `spark` folder in the project's main directory; the `Dockerfile` expects the folder `spark/spark-1.4.0-bin-hadoop2.4` to be there. The `Dockerfile` could later be changed to download and extract `Spark` each time the `Docker` image gets built.

Build the `Docker` image with the deployed `Spark Streaming` application (JAR):

    $ docker build -f docker/Dockerfile -t spark-kafka-avro .

Finally, run the application:

    $ docker run -it --rm --name spark --hostname spark --link postgres:postgres \
        --env POSTGRES_DBNAME=ds2 --env POSTGRES_USER=postgres \
        --link kafka:kafka --link schema-registry:schema-registry spark-kafka-avro

Before creating another `Docker` image, the previous one should be removed:

    $ docker rmi spark-kafka-avro

Developed by [Scalac](https://scalac.io/?utm_source=scalac_github&utm_campaign=scalac1&utm_medium=web)

--------------------------------------------------------------------------------
/build.sbt:
--------------------------------------------------------------------------------
val metaSettings = Seq(
  name := "spark-kafka-avro",
  description := "POC: Spark consumer for bottledwater-pg Kafka Avro topics",
  version := "1.0.0"
)

val scalaSettings = Seq(
  scalaVersion := "2.10.5",
  scalacOptions ++= Seq("-feature", "-unchecked", "-deprecation", "-encoding", "utf8")
)

val deploymentSettings = Seq(
  mainClass in assembly := Some("io.scalac.spark.AvroConsumer"),
  assemblyOutputPath in assembly := file("target/deploy/spark-kafka-avro.jar"),
  test in assembly := {}
)

val repositories = Seq(
  "confluent" at "http://packages.confluent.io/maven/",
  Resolver.sonatypeRepo("public")
)

val dependencies = Seq(
  "org.apache.spark" % "spark-streaming_2.10" % "1.4.0" % "provided",
  "org.apache.spark" % "spark-streaming-kafka_2.10" % "1.4.0"
    exclude("org.spark-project.spark", "unused"),
  "org.apache.avro" % "avro" % "1.7.7",
  "io.confluent" % "kafka-avro-serializer" % "1.0",
  "com.github.scopt" %% "scopt" % "3.3.0",
  "joda-time" % "joda-time" % "2.8.1"
)

lazy val root = (project in file(".")).
  settings(metaSettings: _*).
  settings(scalaSettings: _*).
  settings(deploymentSettings: _*).
  settings(resolvers ++= repositories).
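  // The "confluent" resolver above is required for io.confluent:kafka-avro-serializer,
  // which is not published to Maven Central. spark-streaming is marked "provided"
  // because the spark-submit runtime in the Docker image supplies it.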
  settings(libraryDependencies ++= dependencies)

--------------------------------------------------------------------------------
/docker/Dockerfile:
--------------------------------------------------------------------------------
FROM java:7
MAINTAINER Sebastian Bach

COPY target/deploy/spark-kafka-avro.jar /srv/

#RUN curl -SL http://d3kbcqa49mib13.cloudfront.net/spark-1.4.0-bin-hadoop2.4.tgz \
#      | tar -xzC /srv \
#      && mv /srv/spark-1.4.0-bin-hadoop2.4 /srv/spark \
#      && sed -e s/log4j.rootCategory=INFO/log4j.rootCategory=WARN/g /srv/spark/conf/log4j.properties.template > /srv/spark/conf/log4j.properties

COPY spark/spark-1.4.0-bin-hadoop2.4 /srv/spark
RUN sed -e s/log4j.rootCategory=INFO/log4j.rootCategory=WARN/g /srv/spark/conf/log4j.properties.template > /srv/spark/conf/log4j.properties

CMD /srv/spark/bin/spark-submit \
    --class io.scalac.spark.AvroConsumer /srv/spark-kafka-avro.jar \
    --postgres "hostaddr=${POSTGRES_PORT_5432_TCP_ADDR} port=${POSTGRES_PORT_5432_TCP_PORT} dbname=${POSTGRES_DBNAME} user=${POSTGRES_USER}" \
    --broker ${KAFKA_PORT_9092_TCP_ADDR}:${KAFKA_PORT_9092_TCP_PORT} \
    --schema-registry http://${SCHEMA_REGISTRY_PORT_8081_TCP_ADDR}:${SCHEMA_REGISTRY_PORT_8081_TCP_PORT}

--------------------------------------------------------------------------------
/postgresql/ds2.backup:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ScalaConsultants/spark-kafka-avro/07d2bcdd032c83306afcbc8a746b390d356f21c8/postgresql/ds2.backup

--------------------------------------------------------------------------------
/project/plugins.sbt:
--------------------------------------------------------------------------------
addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.13.0")

--------------------------------------------------------------------------------
/src/main/scala/io/scalac/spark/AvroConsumer.scala:
--------------------------------------------------------------------------------
package io.scalac.spark

import io.confluent.kafka.serializers.KafkaAvroDecoder
import org.apache.spark.SparkConf
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{StreamingContext, Seconds}

case class Config(postgres: Map[String, String] = Map(), broker: String = "", schemaRegistry: String = "")

object AvroConsumer {

  def main(args: Array[String]) {
    val parser = new scopt.OptionParser[Config]("spark") {
      opt[Map[String, String]]("postgres") required() action { (x, c) =>
        c.copy(postgres = x)
      } keyValueName("<key>", "<value>") text ("are PostgreSQL connection params")

      opt[String]("broker") required() action { (x, c) =>
        c.copy(broker = x)
      } valueName ("<broker1>,<broker2>") text ("is a list of one or more Kafka brokers")

      opt[String]("schema-registry") required() action { (x, c) =>
        c.copy(schemaRegistry = x)
      } valueName ("<url>") text ("is the Avro schema-registry URL")
    }

    parser.parse(args, Config()) match {
      case Some(config) => process(config)
      case _ =>
    }
  }

  def process(config: Config) = {
    val conf = new SparkConf().setAppName("DirectKafkaAvroConsumer")
    val ssc = new StreamingContext(conf, Seconds(10))
    val kafkaParams = Map[String, String](
      "auto.offset.reset" -> "smallest",
      "metadata.broker.list" -> config.broker,
      "schema.registry.url" -> config.schemaRegistry)
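    // "smallest" makes the direct stream start from the earliest available offsets, so the
    // initial Bottled Water snapshot is consumed as well as subsequent change events.
    // KafkaAvroDecoder looks up the writer schemas via "schema.registry.url" when it
    // decodes the Avro-encoded keys and values.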
    val topicSet = Set("categories")
    val messages = KafkaUtils.createDirectStream[Object, Object, KafkaAvroDecoder, KafkaAvroDecoder](ssc, kafkaParams, topicSet)

    // Convert each Avro (key, value) message into a DatabaseSchema entity and print each batch
    val lines = messages.map(AvroConverter.convert(_))
    lines.print()

    ssc.start()
    ssc.awaitTermination()
  }
}

--------------------------------------------------------------------------------
/src/main/scala/io/scalac/spark/Model.scala:
--------------------------------------------------------------------------------
package io.scalac.spark

import org.apache.avro.generic.{IndexedRecord, GenericRecord}
import org.joda.time.DateTime

// Case classes mirroring the Dell DVD Store (ds2) tables consumed from Kafka
object DatabaseSchema {
  sealed class Entity
  case class Category(categoryId: Int, name: String) extends Entity
  case class CustomerHistory(historyId: Int, customerId: Int, orderId: Int, productId: Int) extends Entity
  case class Customer(customerId: Int, firstName: String, lastName: String) extends Entity
  case class Inventory(productId: Int, inStock: Int, sales: Int) extends Entity
  case class Order(orderId: Int, date: DateTime, customerId: Int, amount: Double, tax: Double, total: Double) extends Entity
  case class OrderLine(orderLineId: Int, pos: Int, orderId: Int, quantity: Int) extends Entity
  case class Product(productId: Int, category: String, name: String, price: Double) extends Entity
}

object AvroConverter {
  def category(record: GenericRecord) = {
    DatabaseSchema.Category(
      record.get("category").asInstanceOf[Int],
      record.get("categoryname").toString)
  }

  def order(record: GenericRecord) = {
    DatabaseSchema.Order(
      record.get("orderid").asInstanceOf[Int],
      date(record.get("orderdate").asInstanceOf[GenericRecord]),
      record.get("customerid").asInstanceOf[Int],
      record.get("netamount").asInstanceOf[Double],
      record.get("tax").asInstanceOf[Double],
      record.get("totalamount").asInstanceOf[Double])
  }

  // The order date arrives as a nested Avro record with year/month/day fields
  def date(record: GenericRecord) = {
    new DateTime(
      record.get("year").asInstanceOf[Int],
      record.get("month").asInstanceOf[Int],
      record.get("day").asInstanceOf[Int], 0, 0)
  }

  // The message key is the row's primary-key record; its schema name
  // (e.g. "categories_pkey") tells us which table the change came from
  def convert(message: (Object, Object)) = {
    val (k, v) = message
    val name = k.asInstanceOf[IndexedRecord].getSchema.getName
    val value = v.asInstanceOf[GenericRecord]
    name match {
      case "categories_pkey" => category(value)
      case "orders_pkey" => order(value)
      case n => throw new Exception(s"unknown key '$n'")
    }
  }
}

--------------------------------------------------------------------------------