├── .gitignore ├── LICENSE ├── README.md ├── build.sbt ├── docker ├── cassandra │ ├── Dockerfile │ └── docker-entrypoint.sh └── kafka │ ├── Dockerfile │ ├── scripts │ └── start-kafka.sh │ └── supervisor │ ├── kafka.conf │ └── zookeeper.conf ├── project ├── build.properties └── plugins.sbt ├── scalastyle-config.xml └── src └── main ├── resources ├── companies.csv └── log4j.properties └── scala └── knolx ├── Config.scala ├── KnolXLogger.scala ├── kafka ├── DataStreamer.scala ├── MultiDataStreamer.scala ├── StreamStaticDataGenerator.scala └── StreamStreamDataGenerator.scala └── spark ├── CassandraForeachWriter.scala ├── Device.scala ├── MultiStreamHandler.scala ├── Stock.scala ├── StreamStaticJoiner.scala ├── StreamStreamJoiner.scala ├── StreamStreamOuterJoiner.scala └── StructuredStreamingWordCount.scala /.gitignore: -------------------------------------------------------------------------------- 1 | # Simple Build Tool 2 | 3 | target/ 4 | .idea/ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Structured Streaming Application 2 | 3 | It is a reference application (which we will constantly improve) showing how to easily leverage and integrate [Spark Structured Streaming](http://spark.apache.org/docs/latest/structured-streaming-programming-guide.html), 4 | [Apache Cassandra](http://cassandra.apache.org), and [Apache Kafka](http://kafka.apache.org) for streaming computations. 
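The moving parts are: the generators under `knolx.kafka` publish test data to Kafka, and the jobs under `knolx.spark` consume it with Structured Streaming, either printing results to the console or writing them to Cassandra through a custom `ForeachWriter`. For the word-count use case below, the core of the streaming query, condensed from `src/main/scala/knolx/spark/StructuredStreamingWordCount.scala` (the topic, bootstrap servers and checkpoint directory come from the environment variables described under "Run"), looks roughly like this:

    val dataStream = spark.readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", bootstrapServer)
      .option("subscribe", topic)
      .load()

    dataStream
      .select(col("value").cast(StringType).as("word"), lit(1).as("count"))
      .groupBy(col("word"))
      .agg(sum("count").as("count"))
      .writeStream
      .outputMode(OutputMode.Update())
      .foreach(writeToCassandra)  // CassandraForeachWriter upserts each (word, count) row into $keyspace.wordcount
      .option("checkpointLocation", checkPointDir)
      .start()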
5 | 6 | ## Sample Use Case 7 | We need to calculate a streaming word count. 8 | 9 | ### Clone the repo 10 | 11 | git clone https://github.com/knoldus/structured-streaming-application.git 12 | cd structured-streaming-application 13 | 14 | ### Build the code 15 | If this is your first time running SBT, it will download a large number of dependencies, so expect the first build to take a while. 16 | 17 | cd structured-streaming-application 18 | sbt clean compile 19 | 20 | ### Setup - 4 Steps 21 | 1. [Download the latest Cassandra](http://cassandra.apache.org/download/) and extract the archive. 22 | 23 | 2. Start Cassandra - you may need to prepend the command with sudo, or chown /var/lib/cassandra first. On the command line: 24 | 25 | ./apache-cassandra-{version}/bin/cassandra -f 26 | 27 | 3. [Download Kafka 0.10.2.1](https://www.apache.org/dyn/closer.cgi?path=/kafka/0.10.2.1/kafka_2.11-0.10.2.1.tgz) 28 | 29 | 4. Start ZooKeeper and the Kafka server 30 | 31 | cd kafka_2.11-0.10.2.1 32 | bin/zookeeper-server-start.sh config/zookeeper.properties 33 | bin/kafka-server-start.sh config/server.properties 34 | 35 | ### Run 36 | #### From Command Line 37 | 1. Set environment variables, e.g.: 38 | 39 | export BOOTSTRAP_SERVERS_CONFIG="localhost:9092" 40 | export TOPIC="knolx" 41 | export CASSANDRA_HOSTS="localhost" 42 | export CASSANDRA_KEYSPACE="knolx" 43 | export SPARK_MASTER="local" 44 | export SPARK_APP_NAME="knolx" 45 | export CHECKPOINT_DIR="/tmp/knolx" 46 | 47 | 2. Start the `Structured Streaming Application` 48 | 49 | cd /path/to/structured-streaming-application 50 | sbt run 51 | Multiple main classes detected, select one to run: 52 | 53 | [1] knolx.kafka.DataStreamer 54 | [2] knolx.spark.StructuredStreamingWordCount 55 | 56 | Enter number: 2 57 | 58 | 3. Start the Kafka data feed. 59 | In a second shell run: 60 | 61 | cd /path/to/structured-streaming-application 62 | sbt run 63 | Multiple main classes detected, select one to run: 64 | 65 | [1] knolx.kafka.DataStreamer 66 | [2] knolx.spark.StructuredStreamingWordCount 67 | 68 | Enter number: 1 69 | 70 | After a few seconds you should be able to see the data by running this in the cqlsh shell: 71 | 72 | cqlsh> select * from knolx.wordcount; 73 | 74 | This confirms that data from the app has been published to Kafka and is 75 | streaming through Spark into Cassandra. 76 | -------------------------------------------------------------------------------- /build.sbt: -------------------------------------------------------------------------------- 1 | name := "structured-streaming-application" 2 | 3 | version := "0.1" 4 | 5 | scalaVersion := "2.11.12" 6 | 7 | libraryDependencies ++= Seq( 8 | "org.apache.kafka" % "kafka-clients" % "0.10.2.1" exclude("net.jpountz.lz4", "lz4"), 9 | "com.typesafe.akka" %% "akka-actor" % "2.5.14", 10 | "com.datastax.cassandra" % "cassandra-driver-core" % "3.5.1", 11 | "org.apache.spark" %% "spark-sql" % "2.3.1" excludeAll( 12 | ExclusionRule("io.netty", "netty"), 13 | ExclusionRule("commons-net", "commons-net"), 14 | ExclusionRule("com.google.guava", "guava") 15 | ), 16 | "org.apache.spark" %% "spark-sql-kafka-0-10" % "2.3.1" exclude("org.apache.kafka", "kafka-clients") 17 | ) 18 | -------------------------------------------------------------------------------- /docker/cassandra/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:16.04 2 | 3 | # Install Java.
4 | RUN apt-get -y update 5 | 6 | RUN apt-get -y install software-properties-common 7 | 8 | RUN \ 9 | echo oracle-java8-installer shared/accepted-oracle-license-v1-1 select true | debconf-set-selections && \ 10 | add-apt-repository -y ppa:webupd8team/java && \ 11 | apt-get update && \ 12 | apt-get install -y oracle-java8-installer && \ 13 | rm -rf /var/lib/apt/lists/* && \ 14 | rm -rf /var/cache/oracle-jdk8-installer 15 | 16 | # Install Cassandra 17 | # explicitly set user/group IDs 18 | RUN groupadd -r cassandra --gid=999 && useradd -r -g cassandra --uid=999 cassandra 19 | 20 | # grab gosu for easy step-down from root 21 | ENV GOSU_VERSION 1.7 22 | RUN set -x \ 23 | && apt-get update && apt-get install -y --no-install-recommends ca-certificates wget && rm -rf /var/lib/apt/lists/* \ 24 | && wget -O /usr/local/bin/gosu "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-$(dpkg --print-architecture)" \ 25 | && wget -O /usr/local/bin/gosu.asc "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-$(dpkg --print-architecture).asc" \ 26 | && export GNUPGHOME="$(mktemp -d)" \ 27 | && gpg --keyserver ha.pool.sks-keyservers.net --recv-keys B42F6819007F00F88E364FD4036A9C25BF357DD4 \ 28 | && gpg --batch --verify /usr/local/bin/gosu.asc /usr/local/bin/gosu \ 29 | && rm -r "$GNUPGHOME" /usr/local/bin/gosu.asc \ 30 | && chmod +x /usr/local/bin/gosu \ 31 | && gosu nobody true \ 32 | && apt-get purge -y --auto-remove ca-certificates wget 33 | 34 | # solves warning: "jemalloc shared library could not be preloaded to speed up memory allocations" 35 | RUN apt-get update && apt-get install -y --no-install-recommends libjemalloc1 && rm -rf /var/lib/apt/lists/* 36 | 37 | # https://github.com/docker-library/cassandra/pull/98#issuecomment-280761137 38 | RUN { \ 39 | echo 'Package: openjdk-* ca-certificates-java'; \ 40 | echo 'Pin: release n=*-backports'; \ 41 | echo 'Pin-Priority: 990'; \ 42 | } > /etc/apt/preferences.d/java-backports 43 | 44 | # https://wiki.apache.org/cassandra/DebianPackaging#Adding_Repository_Keys 45 | ENV GPG_KEYS \ 46 | # gpg: key 0353B12C: public key "T Jake Luciani " imported 47 | 514A2AD631A57A16DD0047EC749D6EEC0353B12C \ 48 | # gpg: key FE4B2BDA: public key "Michael Shuler " imported 49 | A26E528B271F19B9E5D8E19EA278B781FE4B2BDA 50 | RUN set -ex; \ 51 | export GNUPGHOME="$(mktemp -d)"; \ 52 | for key in $GPG_KEYS; do \ 53 | gpg --keyserver ha.pool.sks-keyservers.net --recv-keys "$key"; \ 54 | done; \ 55 | gpg --export $GPG_KEYS > /etc/apt/trusted.gpg.d/cassandra.gpg; \ 56 | rm -r "$GNUPGHOME"; \ 57 | apt-key list 58 | 59 | RUN echo 'deb http://www.apache.org/dist/cassandra/debian 310x main' >> /etc/apt/sources.list.d/cassandra.list 60 | 61 | ENV CASSANDRA_VERSION 3.10 62 | 63 | RUN apt-get update \ 64 | && apt-get install -y wget \ 65 | && apt-get install -y \ 66 | cassandra="$CASSANDRA_VERSION" \ 67 | cassandra-tools="$CASSANDRA_VERSION" \ 68 | && rm -rf /var/lib/apt/lists/* 69 | 70 | # https://issues.apache.org/jira/browse/CASSANDRA-11661 71 | RUN sed -ri 's/^(JVM_PATCH_VERSION)=.*/\1=25/' /etc/cassandra/cassandra-env.sh 72 | 73 | ENV CASSANDRA_CONFIG /etc/cassandra 74 | 75 | COPY docker-entrypoint.sh /docker-entrypoint.sh 76 | RUN chmod +x /docker-entrypoint.sh 77 | ENTRYPOINT ["/docker-entrypoint.sh"] 78 | 79 | RUN mkdir -p /var/lib/cassandra "$CASSANDRA_CONFIG" \ 80 | && chown -R cassandra:cassandra /var/lib/cassandra "$CASSANDRA_CONFIG" \ 81 | && chmod 777 /var/lib/cassandra "$CASSANDRA_CONFIG" 82 | VOLUME /var/lib/cassandra 83 | 84 | # 7000: 
intra-node communication 85 | # 7001: TLS intra-node communication 86 | # 7199: JMX 87 | # 9042: CQL 88 | # 9160: thrift service 89 | EXPOSE 7000 7001 7199 9042 9160 90 | CMD ["cassandra", "-f"] 91 | -------------------------------------------------------------------------------- /docker/cassandra/docker-entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | # first arg is `-f` or `--some-option` 5 | if [ "${1:0:1}" = '-' ]; then 6 | set -- cassandra -f "$@" 7 | fi 8 | 9 | # allow the container to be started with `--user` 10 | if [ "$1" = 'cassandra' -a "$(id -u)" = '0' ]; then 11 | chown -R cassandra /var/lib/cassandra /var/log/cassandra "$CASSANDRA_CONFIG" 12 | exec gosu cassandra "$BASH_SOURCE" "$@" 13 | fi 14 | 15 | if [ "$1" = 'cassandra' ]; then 16 | : ${CASSANDRA_RPC_ADDRESS='0.0.0.0'} 17 | 18 | : ${CASSANDRA_LISTEN_ADDRESS='auto'} 19 | if [ "$CASSANDRA_LISTEN_ADDRESS" = 'auto' ]; then 20 | CASSANDRA_LISTEN_ADDRESS="$(hostname --ip-address)" 21 | fi 22 | 23 | : ${CASSANDRA_BROADCAST_ADDRESS="$CASSANDRA_LISTEN_ADDRESS"} 24 | 25 | if [ "$CASSANDRA_BROADCAST_ADDRESS" = 'auto' ]; then 26 | CASSANDRA_BROADCAST_ADDRESS="$(hostname --ip-address)" 27 | fi 28 | : ${CASSANDRA_BROADCAST_RPC_ADDRESS:=$CASSANDRA_BROADCAST_ADDRESS} 29 | 30 | if [ -n "${CASSANDRA_NAME:+1}" ]; then 31 | : ${CASSANDRA_SEEDS:="cassandra"} 32 | fi 33 | : ${CASSANDRA_SEEDS:="$CASSANDRA_BROADCAST_ADDRESS"} 34 | 35 | sed -ri 's/(- seeds:).*/\1 "'"$CASSANDRA_SEEDS"'"/' "$CASSANDRA_CONFIG/cassandra.yaml" 36 | 37 | for yaml in \ 38 | broadcast_address \ 39 | broadcast_rpc_address \ 40 | cluster_name \ 41 | endpoint_snitch \ 42 | listen_address \ 43 | num_tokens \ 44 | rpc_address \ 45 | start_rpc \ 46 | ; do 47 | var="CASSANDRA_${yaml^^}" 48 | val="${!var}" 49 | if [ "$val" ]; then 50 | sed -ri 's/^(# )?('"$yaml"':).*/\2 '"$val"'/' "$CASSANDRA_CONFIG/cassandra.yaml" 51 | fi 52 | done 53 | 54 | for rackdc in dc rack; do 55 | var="CASSANDRA_${rackdc^^}" 56 | val="${!var}" 57 | if [ "$val" ]; then 58 | sed -ri 's/^('"$rackdc"'=).*/\1 '"$val"'/' "$CASSANDRA_CONFIG/cassandra-rackdc.properties" 59 | fi 60 | done 61 | fi 62 | 63 | exec "$@" 64 | -------------------------------------------------------------------------------- /docker/kafka/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:16.04 2 | 3 | # Install Java. 
4 | RUN apt-get -y update 5 | 6 | RUN apt-get -y install software-properties-common 7 | 8 | RUN \ 9 | echo oracle-java8-installer shared/accepted-oracle-license-v1-1 select true | debconf-set-selections && \ 10 | add-apt-repository -y ppa:webupd8team/java && \ 11 | apt-get update && \ 12 | apt-get install -y oracle-java8-installer && \ 13 | rm -rf /var/lib/apt/lists/* && \ 14 | rm -rf /var/cache/oracle-jdk8-installer 15 | 16 | # Install Kafka, Zookeeper and other needed things 17 | ENV SCALA_VERSION 2.11 18 | ENV KAFKA_VERSION 0.10.2.1 19 | ENV KAFKA_HOME /opt/kafka_"$SCALA_VERSION"-"$KAFKA_VERSION" 20 | 21 | RUN apt-get update && \ 22 | apt-get install -y zookeeper wget supervisor dnsutils && \ 23 | rm -rf /var/lib/apt/lists/* && \ 24 | apt-get clean && \ 25 | wget -q http://apache.mirrors.spacedump.net/kafka/"$KAFKA_VERSION"/kafka_"$SCALA_VERSION"-"$KAFKA_VERSION".tgz -O /tmp/kafka_"$SCALA_VERSION"-"$KAFKA_VERSION".tgz && \ 26 | tar xfz /tmp/kafka_"$SCALA_VERSION"-"$KAFKA_VERSION".tgz -C /opt && \ 27 | rm /tmp/kafka_"$SCALA_VERSION"-"$KAFKA_VERSION".tgz 28 | 29 | ADD scripts/start-kafka.sh /usr/bin/start-kafka.sh 30 | RUN chmod +x /usr/bin/start-kafka.sh 31 | 32 | # Supervisor config 33 | ADD supervisor/kafka.conf supervisor/zookeeper.conf /etc/supervisor/conf.d/ 34 | 35 | # 2181 is zookeeper, 9092 is kafka 36 | EXPOSE 2181 9092 37 | 38 | CMD ["supervisord", "-n"] 39 | -------------------------------------------------------------------------------- /docker/kafka/scripts/start-kafka.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # Optional ENV variables: 4 | # * ADVERTISED_HOST: the external ip for the container, e.g. localhost 5 | # * ADVERTISED_PORT: the external port for Kafka, e.g. 9092 6 | # * ZK_CHROOT: the zookeeper chroot that's used by Kafka (without / prefix), e.g. "kafka" 7 | # * LOG_RETENTION_HOURS: the minimum age of a log file in hours to be eligible for deletion (default is 168, for 1 week) 8 | # * LOG_RETENTION_BYTES: configure the size at which segments are pruned from the log, (default is 1073741824, for 1GB) 9 | # * NUM_PARTITIONS: configure the default number of log partitions per topic 10 | 11 | # Configure advertised host/port if we run in helios 12 | if [ ! -z "$HELIOS_PORT_kafka" ]; then 13 | ADVERTISED_HOST=`echo $HELIOS_PORT_kafka | cut -d':' -f 1 | xargs -n 1 dig +short | tail -n 1` 14 | ADVERTISED_PORT=`echo $HELIOS_PORT_kafka | cut -d':' -f 2` 15 | fi 16 | 17 | # Set the external host and port 18 | if [ ! -z "$ADVERTISED_HOST" ]; then 19 | echo "advertised host: $ADVERTISED_HOST" 20 | if grep -q "^advertised.host.name" $KAFKA_HOME/config/server.properties; then 21 | sed -r -i "s/#(advertised.host.name)=(.*)/\1=$ADVERTISED_HOST/g" $KAFKA_HOME/config/server.properties 22 | else 23 | echo "advertised.host.name=$ADVERTISED_HOST" >> $KAFKA_HOME/config/server.properties 24 | fi 25 | fi 26 | if [ ! -z "$ADVERTISED_PORT" ]; then 27 | echo "advertised port: $ADVERTISED_PORT" 28 | if grep -q "^advertised.port" $KAFKA_HOME/config/server.properties; then 29 | sed -r -i "s/#(advertised.port)=(.*)/\1=$ADVERTISED_PORT/g" $KAFKA_HOME/config/server.properties 30 | else 31 | echo "advertised.port=$ADVERTISED_PORT" >> $KAFKA_HOME/config/server.properties 32 | fi 33 | fi 34 | 35 | # Set the zookeeper chroot 36 | if [ ! 
-z "$ZK_CHROOT" ]; then 37 | # wait for zookeeper to start up 38 | until /usr/share/zookeeper/bin/zkServer.sh status; do 39 | sleep 0.1 40 | done 41 | 42 | # create the chroot node 43 | echo "create /$ZK_CHROOT \"\"" | /usr/share/zookeeper/bin/zkCli.sh || { 44 | echo "can't create chroot in zookeeper, exit" 45 | exit 1 46 | } 47 | 48 | # configure kafka 49 | sed -r -i "s/(zookeeper.connect)=(.*)/\1=localhost:2181\/$ZK_CHROOT/g" $KAFKA_HOME/config/server.properties 50 | fi 51 | 52 | # Allow specification of log retention policies 53 | if [ ! -z "$LOG_RETENTION_HOURS" ]; then 54 | echo "log retention hours: $LOG_RETENTION_HOURS" 55 | sed -r -i "s/(log.retention.hours)=(.*)/\1=$LOG_RETENTION_HOURS/g" $KAFKA_HOME/config/server.properties 56 | fi 57 | if [ ! -z "$LOG_RETENTION_BYTES" ]; then 58 | echo "log retention bytes: $LOG_RETENTION_BYTES" 59 | sed -r -i "s/#(log.retention.bytes)=(.*)/\1=$LOG_RETENTION_BYTES/g" $KAFKA_HOME/config/server.properties 60 | fi 61 | 62 | # Configure the default number of log partitions per topic 63 | if [ ! -z "$NUM_PARTITIONS" ]; then 64 | echo "default number of partition: $NUM_PARTITIONS" 65 | sed -r -i "s/(num.partitions)=(.*)/\1=$NUM_PARTITIONS/g" $KAFKA_HOME/config/server.properties 66 | fi 67 | 68 | # Enable/disable auto creation of topics 69 | if [ ! -z "$AUTO_CREATE_TOPICS" ]; then 70 | echo "auto.create.topics.enable: $AUTO_CREATE_TOPICS" 71 | echo "auto.create.topics.enable=$AUTO_CREATE_TOPICS" >> $KAFKA_HOME/config/server.properties 72 | fi 73 | 74 | # Run Kafka 75 | $KAFKA_HOME/bin/kafka-server-start.sh $KAFKA_HOME/config/server.properties 76 | -------------------------------------------------------------------------------- /docker/kafka/supervisor/kafka.conf: -------------------------------------------------------------------------------- 1 | [program:kafka] 2 | command=/usr/bin/start-kafka.sh 3 | autostart=true 4 | autorestart=true 5 | -------------------------------------------------------------------------------- /docker/kafka/supervisor/zookeeper.conf: -------------------------------------------------------------------------------- 1 | [program:zookeeper] 2 | command=/usr/share/zookeeper/bin/zkServer.sh start-foreground 3 | autostart=true 4 | autorestart=true 5 | -------------------------------------------------------------------------------- /project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version = 1.0.4 -------------------------------------------------------------------------------- /project/plugins.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("org.scalastyle" %% "scalastyle-sbt-plugin" % "1.0.0") 2 | -------------------------------------------------------------------------------- /scalastyle-config.xml: -------------------------------------------------------------------------------- 1 | 2 | Scalastyle standard configuration 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | -------------------------------------------------------------------------------- 
/src/main/resources/companies.csv: -------------------------------------------------------------------------------- 1 | companyName 2 | kirloskar 3 | bajaj 4 | amul 5 | dlf -------------------------------------------------------------------------------- /src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | log4j.rootCategory=INFO, console 2 | log4j.appender.console=org.apache.log4j.ConsoleAppender 3 | log4j.appender.console.target=System.err 4 | log4j.appender.console.layout=org.apache.log4j.PatternLayout 5 | log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n -------------------------------------------------------------------------------- /src/main/scala/knolx/Config.scala: -------------------------------------------------------------------------------- 1 | package knolx 2 | 3 | /** 4 | * Copyright Knoldus Inc.. All rights reserved. 5 | */ 6 | object Config { 7 | val bootstrapServer = Option(System.getenv("BOOTSTRAP_SERVERS_CONFIG")).getOrElse("localhost:9092") 8 | val topic = Option(System.getenv("TOPIC")).getOrElse("data") 9 | val companiesTopic = System.getenv("COMPANIES_TOPIC") 10 | val stocksTopic = System.getenv("STOCKS_TOPIC") 11 | val cassandraHosts = System.getenv("CASSANDRA_HOSTS") 12 | val keyspace = System.getenv("CASSANDRA_KEYSPACE") 13 | val sparkMaster = Option(System.getenv("SPARK_MASTER")).getOrElse("local") 14 | val sparkAppName = Option(System.getenv("SPARK_APP_NAME")).getOrElse("stream") 15 | val checkPointDir = Option(System.getenv("CHECKPOINT_DIR")).getOrElse("/tmp/demo") 16 | } 17 | -------------------------------------------------------------------------------- /src/main/scala/knolx/KnolXLogger.scala: -------------------------------------------------------------------------------- 1 | package knolx 2 | 3 | import org.apache.log4j.{Level, LogManager} 4 | 5 | /** 6 | * Copyright Knoldus Inc.. All rights reserved. 7 | */ 8 | trait KnolXLogger { 9 | private val logger = LogManager.getLogger(this.getClass) 10 | logger.setLevel(Level.INFO) 11 | 12 | def debug(message: String): Unit = logger.debug(message) 13 | 14 | def info(message: String): Unit = logger.info(message) 15 | 16 | def warn(message: String): Unit = logger.warn(message) 17 | 18 | def error(message: String, throwable: Throwable): Unit = logger.error(message, throwable) 19 | } 20 | -------------------------------------------------------------------------------- /src/main/scala/knolx/kafka/DataStreamer.scala: -------------------------------------------------------------------------------- 1 | package knolx.kafka 2 | 3 | import java.util.Properties 4 | 5 | import akka.actor.ActorSystem 6 | import knolx.Config.{bootstrapServer, topic} 7 | import knolx.KnolXLogger 8 | import org.apache.kafka.clients.producer.{KafkaProducer, ProducerConfig, ProducerRecord} 9 | import org.apache.kafka.common.serialization.StringSerializer 10 | 11 | import scala.concurrent.ExecutionContext.Implicits.global 12 | import scala.concurrent.duration.DurationInt 13 | import scala.util.Random 14 | 15 | /** 16 | * Copyright Knoldus Inc.. All rights reserved. 
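 * DataStreamer publishes one randomly chosen word from a small fixed vocabulary to the configured Kafka topic every 200 milliseconds; it is the data feed for StructuredStreamingWordCount.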
17 | */ 18 | object DataStreamer extends App with KnolXLogger { 19 | val system = ActorSystem("DataStreamer") 20 | val props = new Properties() 21 | props.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, bootstrapServer) 22 | props.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, classOf[StringSerializer].getName) 23 | props.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, classOf[StringSerializer].getName) 24 | 25 | val producer = new KafkaProducer[String, String](props) 26 | 27 | val someWords = List("about", "above", "after", "again", "against") 28 | 29 | info("Streaming data into Kafka...") 30 | system.scheduler.schedule(0 seconds, 200 milliseconds) { 31 | Random.shuffle(someWords).headOption.foreach { word => 32 | producer.send(new ProducerRecord[String, String](topic, word)) 33 | } 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /src/main/scala/knolx/kafka/MultiDataStreamer.scala: -------------------------------------------------------------------------------- 1 | package knolx.kafka 2 | 3 | import java.util.Properties 4 | 5 | import akka.actor.ActorSystem 6 | import knolx.Config.{bootstrapServer, topic} 7 | import knolx.KnolXLogger 8 | import org.apache.kafka.clients.producer.{KafkaProducer, ProducerConfig, ProducerRecord} 9 | import org.apache.kafka.common.serialization.StringSerializer 10 | 11 | import scala.concurrent.ExecutionContext.Implicits.global 12 | import scala.concurrent.duration.DurationInt 13 | import scala.language.postfixOps 14 | import scala.util.Random 15 | 16 | /** 17 | * Copyright Knoldus Inc.. All rights reserved. 18 | */ 19 | object MultiDataStreamer extends App with KnolXLogger { 20 | val system = ActorSystem("DataStreamer") 21 | 22 | val props = new Properties() 23 | props.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, bootstrapServer) 24 | props.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, classOf[StringSerializer].getName) 25 | props.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, classOf[StringSerializer].getName) 26 | 27 | val producer = new KafkaProducer[String, String](props) 28 | 29 | info("Streaming data into Kafka...") 30 | system.scheduler.schedule(0 seconds, 3000 milliseconds) { 31 | (1 to Random.nextInt(100)).foreach { id => 32 | producer.send(new ProducerRecord[String, String](topic,s"device$id", (Math.random * 2 + 1).toString)) 33 | } 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /src/main/scala/knolx/kafka/StreamStaticDataGenerator.scala: -------------------------------------------------------------------------------- 1 | package knolx.kafka 2 | 3 | import java.util.Properties 4 | 5 | import akka.actor.ActorSystem 6 | import knolx.Config.{bootstrapServer, topic} 7 | import knolx.KnolXLogger 8 | import knolx.spark.Stock 9 | import org.apache.kafka.clients.producer.{KafkaProducer, ProducerConfig, ProducerRecord} 10 | import org.apache.kafka.common.serialization.StringSerializer 11 | import org.json4s.NoTypeHints 12 | import org.json4s.jackson.Serialization 13 | import org.json4s.jackson.Serialization.write 14 | 15 | import scala.concurrent.ExecutionContext.Implicits.global 16 | import scala.concurrent.duration.DurationInt 17 | import scala.util.Random 18 | 19 | /** 20 | * Copyright Knoldus Inc.. All rights reserved. 
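 * StreamStaticDataGenerator publishes a JSON-encoded Stock(stockName, numberOfShares, orderType) for every company to the Kafka topic every 5 seconds; it feeds the stream-static join demo (StreamStaticJoiner).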
21 | */ 22 | object StreamStaticDataGenerator extends App with KnolXLogger { 23 | val system = ActorSystem("DataStreamer") 24 | val props = new Properties() 25 | props.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, bootstrapServer) 26 | props.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, classOf[StringSerializer].getName) 27 | props.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, classOf[StringSerializer].getName) 28 | 29 | val producer = new KafkaProducer[String, String](props) 30 | 31 | val companyNames = List("kirloskar", "bajaj", "amul", "dlf", "ebay") 32 | val orderTypes = List("buy", "sell") 33 | val numberOfSharesList = List(1, 2, 3, 4, 5, 6, 7, 8, 9) 34 | 35 | implicit val formats = Serialization.formats(NoTypeHints) 36 | info("Streaming data into Kafka...") 37 | system.scheduler.schedule(0 seconds, 5 seconds) { 38 | companyNames.foreach { name => 39 | val stock = Stock(name, Random.shuffle(numberOfSharesList).head, Random.shuffle(orderTypes).head) 40 | producer.send(new ProducerRecord[String, String](topic, write(stock))) 41 | } 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /src/main/scala/knolx/kafka/StreamStreamDataGenerator.scala: -------------------------------------------------------------------------------- 1 | package knolx.kafka 2 | 3 | import java.util.Properties 4 | 5 | import akka.actor.ActorSystem 6 | import knolx.Config._ 7 | import knolx.KnolXLogger 8 | import knolx.spark.Stock 9 | import org.apache.kafka.clients.producer.{KafkaProducer, ProducerConfig, ProducerRecord} 10 | import org.apache.kafka.common.serialization.StringSerializer 11 | import org.json4s.NoTypeHints 12 | import org.json4s.jackson.Serialization 13 | import org.json4s.jackson.Serialization.write 14 | 15 | import scala.concurrent.ExecutionContext.Implicits.global 16 | import scala.concurrent.duration.DurationInt 17 | import scala.util.Random 18 | 19 | /** 20 | * Copyright Knoldus Inc.. All rights reserved. 
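 * StreamStreamDataGenerator publishes a random subset of the company names to companiesTopic every 20 seconds and JSON-encoded Stock orders to stocksTopic every 5 seconds; it feeds the stream-stream join demos.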
21 | */ 22 | object StreamStreamDataGenerator extends App with KnolXLogger { 23 | val system = ActorSystem("DataStreamer") 24 | val props = new Properties() 25 | props.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, bootstrapServer) 26 | props.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, classOf[StringSerializer].getName) 27 | props.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, classOf[StringSerializer].getName) 28 | 29 | val producer = new KafkaProducer[String, String](props) 30 | 31 | val companyNames = List("kirloskar", "bajaj", "amul", "dlf", "ebay") 32 | val orderTypes = List("buy", "sell") 33 | val numberOfSharesList = List(1, 2, 3, 4, 5, 6, 7, 8, 9) 34 | val randomCompanyNames = Random.shuffle(companyNames).drop(Random.shuffle((1 to 3).toList).head) 35 | 36 | implicit val formats = Serialization.formats(NoTypeHints) 37 | 38 | info("Streaming companies listed into Kafka...") 39 | system.scheduler.schedule(0 seconds, 20 seconds) { 40 | randomCompanyNames.foreach { name => 41 | producer.send(new ProducerRecord[String, String](companiesTopic, name)) 42 | } 43 | } 44 | 45 | info("Streaming stocks data into Kafka...") 46 | system.scheduler.schedule(0 seconds, 5 seconds) { 47 | companyNames.foreach { name => 48 | val stock = Stock(name, Random.shuffle(numberOfSharesList).head, Random.shuffle(orderTypes).head) 49 | producer.send(new ProducerRecord[String, String](stocksTopic, write(stock))) 50 | } 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /src/main/scala/knolx/spark/CassandraForeachWriter.scala: -------------------------------------------------------------------------------- 1 | package knolx.spark 2 | 3 | import com.datastax.driver.core.{Cluster, Session} 4 | import knolx.Config.{cassandraHosts, keyspace} 5 | import org.apache.spark.sql.{ForeachWriter, Row} 6 | 7 | /** 8 | * Copyright Knoldus Inc.. All rights reserved. 9 | */ 10 | object CassandraForeachWriter extends Serializable { 11 | val writeToCassandra = new ForeachWriter[Row] { 12 | private var cluster: Cluster = _ 13 | private var session: Session = _ 14 | 15 | override def process(row: Row): Unit = { 16 | val word = row.getString(0) 17 | val count = row.getLong(1) 18 | 19 | session.execute(s"insert into $keyspace.wordcount (word, count) values ('$word', $count);") 20 | } 21 | 22 | override def close(errorOrNull: Throwable): Unit = { 23 | session.close() 24 | session.getCluster.close() 25 | } 26 | 27 | override def open(partitionId: Long, version: Long): Boolean = { 28 | cluster = Cluster.builder.addContactPoints(cassandraHosts).build 29 | session = cluster.newSession() 30 | true 31 | } 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /src/main/scala/knolx/spark/Device.scala: -------------------------------------------------------------------------------- 1 | package knolx.spark 2 | 3 | case class Device(powerConsumed: Double) 4 | -------------------------------------------------------------------------------- /src/main/scala/knolx/spark/MultiStreamHandler.scala: -------------------------------------------------------------------------------- 1 | package knolx.spark 2 | 3 | import knolx.Config._ 4 | import knolx.KnolXLogger 5 | import org.apache.spark.sql.functions.col 6 | import org.apache.spark.sql.streaming.{GroupState, GroupStateTimeout, OutputMode} 7 | import org.apache.spark.sql.types.StringType 8 | import org.apache.spark.sql.{Encoders, SparkSession} 9 | 10 | /** 11 | * Copyright Knoldus, Inc. 2018. 
All rights reserved. 12 | */ 13 | case class CurrentPowerConsumption(kwh: Double) 14 | 15 | case class PowerConsumptionStatus(numOfReadings: Long, total: Double, avg: Double, status: String) { 16 | def compute(newReadings: List[Double]) = { 17 | val newTotal = newReadings.sum + total 18 | val newNumOfReadings = numOfReadings + newReadings.size 19 | val newAvg = newTotal / newNumOfReadings.toDouble 20 | 21 | PowerConsumptionStatus(newNumOfReadings, newTotal, newAvg, "ON") 22 | } 23 | } 24 | 25 | object MultiStreamHandler extends App with KnolXLogger { 26 | info("Creating Spark Session") 27 | val spark = SparkSession.builder().master(sparkMaster).appName(sparkAppName).getOrCreate() 28 | spark.sparkContext.setLogLevel("WARN") 29 | 30 | val updateStateFunc = 31 | (deviceId: String, newReadings: Iterator[(String, CurrentPowerConsumption)], state: GroupState[PowerConsumptionStatus]) => { 32 | val data = newReadings.toList.map { case(_, reading) => reading }.map(_.kwh) 33 | 34 | lazy val initialPowerConsumptionStatus = PowerConsumptionStatus(0L, 0D, 0D, "OFF") 35 | val currentState = state.getOption.fold(initialPowerConsumptionStatus.compute(data))(_.compute(data)) 36 | 37 | val currentStatus = 38 | if(state.hasTimedOut) { 39 | // If we do not receive any reading, for a device, we will assume that it is OFF. 40 | currentState.copy(status = "OFF") 41 | } else { 42 | state.setTimeoutDuration("10 seconds") 43 | currentState 44 | } 45 | 46 | state.update(currentStatus) 47 | (deviceId, currentStatus) 48 | } 49 | 50 | info("Creating Streaming DF...") 51 | val dataStream = 52 | spark 53 | .readStream 54 | .format("kafka") 55 | .option("kafka.bootstrap.servers", bootstrapServer) 56 | .option("subscribe", topic) 57 | .option("failOnDataLoss", false) 58 | .option("includeTimestamp", true) 59 | .load() 60 | 61 | info("Writing data to Console...") 62 | import spark.implicits._ 63 | 64 | implicit val currentPowerConsumptionEncoder = Encoders.kryo[CurrentPowerConsumption] 65 | implicit val powerConsumptionStatusEncoder = Encoders.kryo[PowerConsumptionStatus] 66 | 67 | val query = 68 | dataStream 69 | .select(col("key").cast(StringType).as("key"), col("value").cast(StringType).as("value")) 70 | .as[(String, String)] 71 | .map { case(deviceId, unit) => 72 | (deviceId, CurrentPowerConsumption(Option(unit).fold(0D)(_.toDouble))) 73 | } 74 | .groupByKey { case(deviceId, _) => deviceId } 75 | .mapGroupsWithState[PowerConsumptionStatus, (String, PowerConsumptionStatus)](GroupStateTimeout.ProcessingTimeTimeout())(updateStateFunc) 76 | .toDF("deviceId", "current_status") 77 | .writeStream 78 | .format("console") 79 | .option("truncate", false) 80 | .outputMode(OutputMode.Update()) 81 | .option("checkpointLocation", checkPointDir) 82 | .start() 83 | 84 | info("Waiting for the query to terminate...") 85 | query.awaitTermination() 86 | query.stop() 87 | } 88 | -------------------------------------------------------------------------------- /src/main/scala/knolx/spark/Stock.scala: -------------------------------------------------------------------------------- 1 | package knolx.spark 2 | 3 | case class Stock(stockName: String, numberOfShares: Int, orderType: String) 4 | -------------------------------------------------------------------------------- /src/main/scala/knolx/spark/StreamStaticJoiner.scala: -------------------------------------------------------------------------------- 1 | package knolx.spark 2 | 3 | import knolx.Config._ 4 | import knolx.KnolXLogger 5 | import org.apache.spark.sql.SparkSession 6 | import 
org.apache.spark.sql.catalyst.ScalaReflection 7 | import org.apache.spark.sql.functions.{col, from_json} 8 | import org.apache.spark.sql.types.StructType 9 | 10 | /** 11 | * Copyright Knoldus Inc.. All rights reserved. 12 | */ 13 | object StreamStaticJoiner extends App with KnolXLogger { 14 | info("Creating Spark Session") 15 | val spark = SparkSession.builder().master(sparkMaster).appName(sparkAppName).getOrCreate() 16 | spark.sparkContext.setLogLevel("WARN") 17 | 18 | info("Static Dataframe") 19 | val companiesDF = spark.read.option("header", "true").csv("src/main/resources/companies.csv") 20 | companiesDF.show(false) 21 | 22 | info("Original Streaming Dataframe") 23 | val schema = ScalaReflection.schemaFor[Stock].dataType.asInstanceOf[StructType] 24 | val stockStreamDF = 25 | spark 26 | .readStream 27 | .format("kafka") 28 | .option("kafka.bootstrap.servers", bootstrapServer) 29 | .option("subscribe", topic) 30 | .load() 31 | .select(from_json(col("value").cast("string"), schema).as("value")) 32 | .select("value.*") 33 | 34 | stockStreamDF.printSchema() 35 | stockStreamDF.writeStream.format("console").start() 36 | 37 | info("Filtered Streaming Dataframe") 38 | val filteredStockStreamDF = stockStreamDF.join(companiesDF, col("stockName") === col("companyName")) // the static side exposes companyName, the streaming side stockName 39 | val filteredStockStreamingQuery = filteredStockStreamDF.writeStream.format("console").start() 40 | 41 | info("Waiting for the query to terminate...") 42 | filteredStockStreamingQuery.awaitTermination() 43 | filteredStockStreamingQuery.stop() 44 | } 45 | -------------------------------------------------------------------------------- /src/main/scala/knolx/spark/StreamStreamJoiner.scala: -------------------------------------------------------------------------------- 1 | package knolx.spark 2 | 3 | import knolx.Config._ 4 | import knolx.KnolXLogger 5 | import org.apache.spark.sql.SparkSession 6 | import org.apache.spark.sql.catalyst.ScalaReflection 7 | import org.apache.spark.sql.functions.{col, expr, from_json} 8 | import org.apache.spark.sql.types.StructType 9 | 10 | /** 11 | * Copyright Knoldus Inc.. All rights reserved.
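 * StreamStreamJoiner inner-joins the stocks stream with the companies stream on company name, keeping stock records whose timestamp lies within 20 seconds of the matching company record's timestamp.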
12 | */ 13 | object StreamStreamJoiner extends App with KnolXLogger { 14 | info("Creating Spark Session") 15 | val spark = SparkSession.builder().master(sparkMaster).appName(sparkAppName).getOrCreate() 16 | spark.sparkContext.setLogLevel("WARN") 17 | 18 | info("Streaming companies Dataframe") 19 | val companiesDF = 20 | spark 21 | .readStream 22 | .format("kafka") 23 | .option("kafka.bootstrap.servers", bootstrapServer) 24 | .option("subscribe", companiesTopic) 25 | .load() 26 | .select(col("value").cast("string").as("companyName"), 27 | col("timestamp").as("companyTradingTime")) 28 | 29 | companiesDF.writeStream.format("console").option("truncate", false).start() 30 | 31 | info("Original Streaming Dataframe") 32 | val schema = ScalaReflection.schemaFor[Stock].dataType.asInstanceOf[StructType] 33 | val stockStreamDF = 34 | spark 35 | .readStream 36 | .format("kafka") 37 | .option("kafka.bootstrap.servers", bootstrapServer) 38 | .option("subscribe", stocksTopic) 39 | .load() 40 | .select(from_json(col("value").cast("string"), schema).as("value"), 41 | col("timestamp").as("stockInputTime")) 42 | .select("value.*", "stockInputTime") 43 | 44 | info("Filtered Streaming Dataframe") 45 | val filteredStockStreamDF = stockStreamDF.join(companiesDF, 46 | expr("companyName = stockName AND stockInputTime >= companyTradingTime AND stockInputTime <= companyTradingTime + interval 20 seconds")) 47 | val filteredStockStreamingQuery = filteredStockStreamDF.writeStream.format("console").option("truncate", false).start() 48 | 49 | info("Waiting for the query to terminate...") 50 | filteredStockStreamingQuery.awaitTermination() 51 | filteredStockStreamingQuery.stop() 52 | } 53 | -------------------------------------------------------------------------------- /src/main/scala/knolx/spark/StreamStreamOuterJoiner.scala: -------------------------------------------------------------------------------- 1 | package knolx.spark 2 | 3 | import knolx.Config._ 4 | import knolx.KnolXLogger 5 | import org.apache.spark.sql.SparkSession 6 | import org.apache.spark.sql.catalyst.ScalaReflection 7 | import org.apache.spark.sql.functions.{col, expr, from_json} 8 | import org.apache.spark.sql.types.StructType 9 | 10 | /** 11 | * Copyright Knoldus Inc.. All rights reserved. 
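 * StreamStreamOuterJoiner runs the same time-bounded join as StreamStreamJoiner, but as a left outer join with 10-second watermarks on both streams, so stock records without a matching company are still emitted (with nulls on the company side).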
12 | */ 13 | object StreamStreamOuterJoiner extends App with KnolXLogger { 14 | info("Creating Spark Session") 15 | val spark = SparkSession.builder().master(sparkMaster).appName(sparkAppName).getOrCreate() 16 | spark.sparkContext.setLogLevel("WARN") 17 | 18 | info("Streaming companies Dataframe") 19 | val companiesDF = 20 | spark 21 | .readStream 22 | .format("kafka") 23 | .option("kafka.bootstrap.servers", bootstrapServer) 24 | .option("subscribe", companiesTopic) 25 | .load() 26 | .select(col("value").cast("string").as("companyName"), 27 | col("timestamp").as("companyTradingTime")) 28 | .withWatermark("companyTradingTime", "10 seconds") 29 | 30 | companiesDF.writeStream.format("console").option("truncate", false).start() 31 | 32 | info("Original Streaming Dataframe") 33 | val schema = ScalaReflection.schemaFor[Stock].dataType.asInstanceOf[StructType] 34 | val stockStreamDF = 35 | spark 36 | .readStream 37 | .format("kafka") 38 | .option("kafka.bootstrap.servers", bootstrapServer) 39 | .option("subscribe", stocksTopic) 40 | .load() 41 | .select(from_json(col("value").cast("string"), schema).as("value"), 42 | col("timestamp").as("stockInputTime")) 43 | .select("value.*", "stockInputTime") 44 | .withWatermark("stockInputTime", "10 seconds") 45 | 46 | info("Filtered Streaming Dataframe") 47 | val filteredStockStreamDF = stockStreamDF.join(companiesDF, 48 | expr("companyName = stockName AND stockInputTime >= companyTradingTime AND stockInputTime <= companyTradingTime + interval 20 seconds"), 49 | joinType = "leftOuter") 50 | val filteredStockStreamingQuery = filteredStockStreamDF.writeStream.format("console").option("truncate", false).start() 51 | 52 | info("Waiting for the query to terminate...") 53 | filteredStockStreamingQuery.awaitTermination() 54 | filteredStockStreamingQuery.stop() 55 | } 56 | -------------------------------------------------------------------------------- /src/main/scala/knolx/spark/StructuredStreamingWordCount.scala: -------------------------------------------------------------------------------- 1 | package knolx.spark 2 | 3 | import com.datastax.driver.core.Cluster 4 | import knolx.Config._ 5 | import knolx.KnolXLogger 6 | import knolx.spark.CassandraForeachWriter.writeToCassandra 7 | import org.apache.spark.sql.SparkSession 8 | import org.apache.spark.sql.functions.{col, lit, sum} 9 | import org.apache.spark.sql.streaming.OutputMode 10 | import org.apache.spark.sql.types.StringType 11 | 12 | /** 13 | * Copyright Knoldus Inc.. All rights reserved. 
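 * StructuredStreamingWordCount creates the Cassandra keyspace and wordcount table, reads words from Kafka, maintains a running count per word in Update output mode, and writes each updated count to Cassandra via CassandraForeachWriter.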
14 | */ 15 | object StructuredStreamingWordCount extends App with KnolXLogger { 16 | val cluster = Cluster.builder.addContactPoints(cassandraHosts).build 17 | val session = cluster.newSession() 18 | 19 | info("Creating Keyspace and tables in Cassandra...") 20 | session.execute(s"CREATE KEYSPACE IF NOT EXISTS $keyspace WITH " + 21 | "replication = {'class':'SimpleStrategy','replication_factor':1};") 22 | 23 | session.execute(s"CREATE TABLE IF NOT EXISTS $keyspace.wordcount ( word text PRIMARY KEY, count int );") 24 | 25 | info("Closing DB connection...") 26 | session.close() 27 | session.getCluster.close() 28 | 29 | info("Creating Spark Session") 30 | val spark = SparkSession.builder().master(sparkMaster).appName(sparkAppName).getOrCreate() 31 | spark.sparkContext.setLogLevel("WARN") 32 | 33 | info("Creating Streaming DF...") 34 | val dataStream = 35 | spark 36 | .readStream 37 | .format("kafka") 38 | .option("kafka.bootstrap.servers", bootstrapServer) 39 | .option("subscribe", topic) 40 | .load() 41 | 42 | info("Writing data to Cassandra...") 43 | val query = 44 | dataStream 45 | .select(col("value").cast(StringType).as("word"), lit(1).as("count")) 46 | .groupBy(col("word")) 47 | .agg(sum("count").as("count")) 48 | .writeStream 49 | .outputMode(OutputMode.Update()) 50 | .foreach(writeToCassandra) 51 | .option("checkpointLocation", checkPointDir) 52 | .start() 53 | 54 | info("Waiting for the query to terminate...") 55 | query.awaitTermination() 56 | query.stop() 57 | } 58 | --------------------------------------------------------------------------------