├── .gitignore
├── docker-spark-streaming-sql
│   ├── project
│   │   ├── build.properties
│   │   └── plugins.sbt
│   ├── docker-compose.yml
│   ├── build.sbt
│   ├── README.md
│   └── src
│       └── main
│           └── scala
│               └── com
│                   └── antlypls
│                       └── blog
│                           └── KafkaSparkDemo.scala
└── README.md

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
target/
build/

--------------------------------------------------------------------------------
/docker-spark-streaming-sql/project/build.properties:
--------------------------------------------------------------------------------
sbt.version=0.13.16

--------------------------------------------------------------------------------
/docker-spark-streaming-sql/project/plugins.sbt:
--------------------------------------------------------------------------------
addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.5")

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
Spark Demos
===========

This repository contains a collection of demo applications using [Apache Spark](https://spark.apache.org/).

--------------------------------------------------------------------------------
/docker-spark-streaming-sql/docker-compose.yml:
--------------------------------------------------------------------------------
version: '3'

services:
  zookeeper:
    image: antlypls/zookeeper

  kafka:
    image: antlypls/kafka:0.10.2.1_2.11
    depends_on:
      - zookeeper
    environment:
      KAFKA_CREATE_TOPICS: "events:1:1"
      KAFKA_ZOOKEEPER_CONNECT: zookeeper:2181

  java:
    image: openjdk:jre
    command: bash
    ports:
      - "4040:4040"
    volumes:
      - ./build:/build
    working_dir: /build
    depends_on:
      - zookeeper
      - kafka

--------------------------------------------------------------------------------
/docker-spark-streaming-sql/build.sbt:
--------------------------------------------------------------------------------
name := "kafka-spark-demo"

scalaVersion := "2.11.11"

val sparkVersion = "2.2.0"

libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-core" % sparkVersion % "provided",
  "org.apache.spark" %% "spark-sql" % sparkVersion % "provided",
  "org.apache.spark" %% "spark-streaming" % sparkVersion % "provided",
  "org.apache.spark" %% "spark-streaming-kafka-0-10" % sparkVersion excludeAll(
    ExclusionRule(organization = "org.spark-project.spark", name = "unused"),
    ExclusionRule(organization = "org.apache.spark", name = "spark-streaming"),
    ExclusionRule(organization = "org.apache.hadoop")
  )
)

target in assembly := file("build")

assemblyJarName in assembly := s"${name.value}.jar"

--------------------------------------------------------------------------------
/docker-spark-streaming-sql/README.md:
--------------------------------------------------------------------------------
Spark SQL and Spark Streaming with Kafka Demo
=============================================

This is a demo project that shows how to build an application with Spark Streaming and Spark SQL using Docker and Docker Compose.

For more details see [this post](http://blog.antlypls.com/blog/2017/10/15/using-spark-sql-and-spark-streaming-together/).

How to Run
----------

Build the fat jar: `sbt assembly`. The jar is written to the `build/` directory (see `build.sbt`), which docker-compose mounts into the `java` container at `/build`.

Run `docker-compose run --rm --service-ports java`.
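The `java` service is a plain `openjdk:jre` container and does not bundle Spark, so the `spark-2.2.0-bin-hadoop2.7` distribution referenced below has to be fetched into `/build` first. A minimal sketch, run inside the container's shell, assuming it has network access and `wget` is available (otherwise install it via `apt-get`, or use `curl -O`):

```
wget https://archive.apache.org/dist/spark/spark-2.2.0/spark-2.2.0-bin-hadoop2.7.tgz
tar -xzf spark-2.2.0-bin-hadoop2.7.tgz
```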
In the `java` container terminal run:

```
KAFKA_BROKERS=kafka:9092 \
KAFKA_GROUP_ID=spark-streaming-demo \
KAFKA_TOPIC=events \
spark-2.2.0-bin-hadoop2.7/bin/spark-submit \
--master local[*] \
--class com.antlypls.blog.KafkaSparkDemo kafka-spark-demo.jar
```

In a separate terminal, run:

```
docker exec -it $(docker-compose ps -q kafka) kafka-console-producer.sh --broker-list localhost:9092 --topic events
```

and add JSON events like:

```
{"action":"create","timestamp":"2017-10-05T23:01:17Z"}
{"action":"update","timestamp":"2017-10-05T23:01:19Z"}
{"action":"update","timestamp":"2017-10-05T23:02:51Z"}
```
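With the job running, `result.show` prints a small table for every 10-second batch. Assuming the three events above arrive within the same batch, the output looks roughly like:

```
+------+-----+
|action|count|
+------+-----+
|create|    1|
|update|    2|
+------+-----+
```

Batches that receive no events print an empty table with just the header.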

--------------------------------------------------------------------------------
/docker-spark-streaming-sql/src/main/scala/com/antlypls/blog/KafkaSparkDemo.scala:
--------------------------------------------------------------------------------
package com.antlypls.blog

import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.count
import org.apache.spark.sql.types.{StringType, StructType, TimestampType}
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
import org.apache.spark.streaming.kafka010.KafkaUtils
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
import org.apache.spark.streaming.{Seconds, StreamingContext}

object KafkaSparkDemo {
  def main(args: Array[String]): Unit = {
    // Configuration for the Kafka consumer
    val kafkaBrokers = sys.env.get("KAFKA_BROKERS")
    val kafkaGroupId = sys.env.get("KAFKA_GROUP_ID")
    val kafkaTopic = sys.env.get("KAFKA_TOPIC")

    // Verify that all settings are set
    require(kafkaBrokers.isDefined, "KAFKA_BROKERS has not been set")
    require(kafkaGroupId.isDefined, "KAFKA_GROUP_ID has not been set")
    require(kafkaTopic.isDefined, "KAFKA_TOPIC has not been set")

    // Create Spark Session
    val spark = SparkSession
      .builder()
      .appName("KafkaSparkDemo")
      .getOrCreate()

    import spark.implicits._

    // Create Streaming Context and Kafka Direct Stream with provided settings and 10-second batches
    val ssc = new StreamingContext(spark.sparkContext, Seconds(10))

    val kafkaParams = Map[String, Object](
      "bootstrap.servers" -> kafkaBrokers.get,
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> kafkaGroupId.get,
      "auto.offset.reset" -> "latest"
    )

    val topics = Array(kafkaTopic.get)
    val stream = KafkaUtils.createDirectStream[String, String](
      ssc,
      PreferConsistent,
      Subscribe[String, String](topics, kafkaParams)
    )

    // Define a schema for the JSON data
    val schema = new StructType()
      .add("action", StringType)
      .add("timestamp", TimestampType)

    // Process batches:
    // parse JSON and create a DataFrame,
    // then run the aggregation on that DataFrame and print the result
    stream.foreachRDD { (rdd, time) =>
      val data = rdd.map(record => record.value)
      val json = spark.read.schema(schema).json(data)
      val result = json.groupBy($"action").agg(count("*").alias("count"))
      result.show
    }

    // Start the stream and block until the streaming context is stopped
    ssc.start()
    ssc.awaitTermination()
  }
}
--------------------------------------------------------------------------------