├── .bsp
│   └── sbt.json
├── .gitignore
├── .idea
│   ├── .gitignore
│   ├── codeStyles
│   │   ├── Project.xml
│   │   └── codeStyleConfig.xml
│   ├── misc.xml
│   ├── modules.xml
│   ├── modules
│   │   ├── flink-essentials-build.iml
│   │   └── flink-essentials.iml
│   ├── runConfigurations.xml
│   ├── sbt.xml
│   ├── scala_compiler.xml
│   └── vcs.xml
├── README.md
├── build.sbt
├── docker
│   ├── cassandra
│   │   ├── cql.sh
│   │   └── docker-compose.yml
│   ├── flink
│   │   ├── application-cluster
│   │   │   └── docker-compose.yml
│   │   └── session-cluster
│   │       └── docker-compose.yml
│   ├── kafka
│   │   └── docker-compose.yml
│   └── postgres
│       └── docker-compose.yml
├── project
│   └── build.properties
└── src
    └── main
        ├── resources
        │   └── logback.xml
        └── scala
            ├── generators
            │   ├── gaming
            │   │   └── gaming.scala
            │   └── shopping
            │       └── package.scala
            ├── part1recap
            │   └── ScalaRecap.scala
            ├── part2datastreams
            │   ├── EssentialStreams.scala
            │   ├── MultipleStreams.scala
            │   ├── Partitions.scala
            │   ├── TimeBasedTransformations.scala
            │   ├── Triggers.scala
            │   └── WindowFunctions.scala
            ├── part3state
            │   ├── BroadcastState.scala
            │   ├── Checkpoints.scala
            │   ├── KeyedState.scala
            │   └── RichFunctions.scala
            ├── part4io
            │   ├── CassandraIntegration.scala
            │   ├── CustomSinks.scala
            │   ├── CustomSources.scala
            │   ├── JDBCIntegration.scala
            │   ├── KafkaIntegration.scala
            │   └── SideOutputs.scala
            └── playground
                └── Playground.scala

/.bsp/sbt.json:
--------------------------------------------------------------------------------
1 | {"name":"sbt","version":"1.6.2","bspVersion":"2.0.0-M5","languages":["scala"],"argv":["/Users/daniel/Library/Java/JavaVirtualMachines/adopt-openjdk-11.0.11/Contents/Home/bin/java","-Xms100m","-Xmx100m","-classpath","/Users/daniel/Library/Application Support/JetBrains/IdeaIC2021.2/plugins/Scala/launcher/sbt-launch.jar","xsbt.boot.Boot","-bsp","--sbt-launch-jar=/Users/daniel/Library/Application%20Support/JetBrains/IdeaIC2021.2/plugins/Scala/launcher/sbt-launch.jar"]}
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | ## output dirs
2 | output/
3 | checkpoints/
4 | 
5 | # Created by https://www.toptal.com/developers/gitignore/api/intellij,sbt,scala,java
6 | # Edit at https://www.toptal.com/developers/gitignore?templates=intellij,sbt,scala,java
7 | 
8 | ### Intellij ###
9 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider
10 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
11 | 
12 | # User-specific stuff
13 | .idea/**/workspace.xml
14 | .idea/**/tasks.xml
15 | .idea/**/usage.statistics.xml
16 | .idea/**/dictionaries
17 | .idea/**/shelf
18 | 
19 | # AWS User-specific
20 | .idea/**/aws.xml
21 | 
22 | # Generated files
23 | .idea/**/contentModel.xml
24 | 
25 | # Sensitive or high-churn files
26 | .idea/**/dataSources/
27 | .idea/**/dataSources.ids
28 | .idea/**/dataSources.local.xml
29 | .idea/**/sqlDataSources.xml
30 | .idea/**/dynamic.xml
31 | .idea/**/uiDesigner.xml
32 | .idea/**/dbnavigator.xml
33 | 
34 | # Gradle
35 | .idea/**/gradle.xml
36 | .idea/**/libraries
37 | 
38 | # Gradle and Maven with auto-import
39 | # When using Gradle or Maven with auto-import, you should exclude module files,
40 | # since they will be recreated, and may cause churn. Uncomment if using
41 | # auto-import.
42 | # .idea/artifacts
43 | # .idea/compiler.xml
44 | # .idea/jarRepositories.xml
45 | # .idea/modules.xml
46 | # .idea/*.iml
47 | # .idea/modules
48 | # *.iml
49 | # *.ipr
50 | 
51 | # CMake
52 | cmake-build-*/
53 | 
54 | # Mongo Explorer plugin
55 | .idea/**/mongoSettings.xml
56 | 
57 | # File-based project format
58 | *.iws
59 | 
60 | # IntelliJ
61 | out/
62 | 
63 | # mpeltonen/sbt-idea plugin
64 | .idea_modules/
65 | 
66 | # JIRA plugin
67 | atlassian-ide-plugin.xml
68 | 
69 | # Cursive Clojure plugin
70 | .idea/replstate.xml
71 | 
72 | # SonarLint plugin
73 | .idea/sonarlint/
74 | 
75 | # Crashlytics plugin (for Android Studio and IntelliJ)
76 | com_crashlytics_export_strings.xml
77 | crashlytics.properties
78 | crashlytics-build.properties
79 | fabric.properties
80 | 
81 | # Editor-based Rest Client
82 | .idea/httpRequests
83 | 
84 | # Android studio 3.1+ serialized cache file
85 | .idea/caches/build_file_checksums.ser
86 | 
87 | ### Intellij Patch ###
88 | # Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721
89 | 
90 | # *.iml
91 | # modules.xml
92 | # .idea/misc.xml
93 | # *.ipr
94 | 
95 | # Sonarlint plugin
96 | # https://plugins.jetbrains.com/plugin/7973-sonarlint
97 | .idea/**/sonarlint/
98 | 
99 | # SonarQube Plugin
100 | # https://plugins.jetbrains.com/plugin/7238-sonarqube-community-plugin
101 | .idea/**/sonarIssues.xml
102 | 
103 | # Markdown Navigator plugin
104 | # https://plugins.jetbrains.com/plugin/7896-markdown-navigator-enhanced
105 | .idea/**/markdown-navigator.xml
106 | .idea/**/markdown-navigator-enh.xml
107 | .idea/**/markdown-navigator/
108 | 
109 | # Cache file creation bug
110 | # See https://youtrack.jetbrains.com/issue/JBR-2257
111 | .idea/$CACHE_FILE$
112 | 
113 | # CodeStream plugin
114 | # https://plugins.jetbrains.com/plugin/12206-codestream
115 | .idea/codestream.xml
116 | 
117 | ### Java ###
118 | # Compiled class file
119 | *.class
120 | 
121 | # Log file
122 | *.log
123 | 
124 | # BlueJ files
125 | *.ctxt
126 | 
127 | # Mobile Tools for Java (J2ME)
128 | .mtj.tmp/
129 | 
130 | # Package Files #
131 | *.jar
132 | *.war
133 | *.nar
134 | *.ear
135 | *.zip
136 | *.tar.gz
137 | *.rar
138 | 
139 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml
140 | hs_err_pid*
141 | replay_pid*
142 | 
143 | ### SBT ###
144 | # Simple Build Tool
145 | # http://www.scala-sbt.org/release/docs/Getting-Started/Directories.html#configuring-version-control
146 | 
147 | dist/*
148 | target/
149 | lib_managed/
150 | src_managed/
151 | project/boot/
152 | project/plugins/project/
153 | .history
154 | .cache
155 | .lib/
156 | 
157 | ### Scala ###
158 | 
159 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml
160 | 
161 | # End of https://www.toptal.com/developers/gitignore/api/intellij,sbt,scala,java
162 | 
--------------------------------------------------------------------------------
/.idea/.gitignore:
--------------------------------------------------------------------------------
1 | # Default ignored files
2 | /shelf/
3 | /workspace.xml
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # The official repository for the Rock the JVM Flink course for Scala developers
2 | 
3 | This repository contains the code we wrote during [Rock the JVM's Flink course](https://rockthejvm.com/course/flink). Unless explicitly mentioned, the code in this repository is exactly what was caught on camera.
4 | 
5 | ## How to install
6 | 
7 | - install [IntelliJ IDEA](https://jetbrains.com/idea)
8 | - install [Docker](https://www.docker.com/products/docker-desktop)
9 | - either clone the repo or download as zip
10 | - open with IntelliJ as an SBT project
11 | - (optionally) in the `docker` directory, navigate to each subdir (except for `flink`) and run `docker-compose up`
12 | 
13 | ### How to start
14 | 
15 | Clone this repository and checkout the `start` tag by running the following in the repo folder:
16 | 
17 | ```
18 | git checkout start
19 | ```
20 | 
21 | To see the final code, run:
22 | 
23 | ```
24 | git checkout master
25 | ```
26 | 
27 | ### How to run an intermediate state
28 | 
29 | The repository was built while recording the lectures. Before each lecture, I tagged the corresponding commit so you can easily go back to an earlier state of the repo!
30 | 
31 | The tags are as follows:
32 | 
33 | - `start`
34 | - `1.1-scala-recap`
35 | - `2.1-essential-streams`
36 | - `2.2-essential-streams-exercise`
37 | - `2.3-essential-streams-explicit`
38 | - `2.5-window-functions`
39 | - `2.6-window-functions-part-2`
40 | - `2.7-window-functions-exercise`
41 | - `2.8-time-based-transformations`
42 | - `2.9-triggers`
43 | - `2.10-multiple-streams`
44 | - `2.11-partitions`
45 | - `3.2-rich-functions`
46 | - `3.3-keyed-state`
47 | - `3.4-keyed-state-2`
48 | - `3.5-broadcast-state`
49 | - `3.6-checkpoint`
50 | - `4.1-kafka`
51 | - `4.2-jdbc`
52 | - `4.3-cassandra`
53 | - `4.4-source-functions`
54 | - `4.5-custom-sinks`
55 | - `4.6-side-outputs`
56 | 
57 | When you watch a lecture, you can `git checkout` the appropriate tag and the repo will go back to the exact code I had when I started the lecture.
58 | 
59 | ### For questions or suggestions
60 | 
61 | If you have changes to suggest to this repo, either
62 | - submit a GitHub issue
63 | - tell me in the course Q/A forum
64 | - submit a pull request!
65 | -------------------------------------------------------------------------------- /build.sbt: -------------------------------------------------------------------------------- 1 | name := "flink-essentials" 2 | 3 | version := "0.1" 4 | 5 | scalaVersion := "2.12.15" 6 | 7 | val flinkVersion = "1.13.2" 8 | val postgresVersion = "42.2.2" 9 | val logbackVersion = "1.2.10" 10 | 11 | val flinkDependencies = Seq( 12 | "org.apache.flink" %% "flink-clients" % flinkVersion, 13 | "org.apache.flink" %% "flink-scala" % flinkVersion, 14 | "org.apache.flink" %% "flink-streaming-scala" % flinkVersion, 15 | ) 16 | 17 | val flinkConnectors = Seq( 18 | "org.apache.flink" %% "flink-connector-kafka" % flinkVersion, 19 | "org.apache.flink" %% "flink-connector-cassandra" % flinkVersion, 20 | "org.apache.flink" %% "flink-connector-jdbc" % flinkVersion, 21 | "org.postgresql" % "postgresql" % postgresVersion 22 | ) 23 | 24 | val logging = Seq( 25 | "ch.qos.logback" % "logback-core" % logbackVersion, 26 | "ch.qos.logback" % "logback-classic" % logbackVersion 27 | ) 28 | 29 | libraryDependencies ++= flinkDependencies ++ flinkConnectors ++ logging 30 | -------------------------------------------------------------------------------- /docker/cassandra/cql.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | echo "================== Help for cqlsh =========================" 3 | echo "DESCRIBE tables : Prints all tables in the current keyspace" 4 | echo "DESCRIBE keyspaces : Prints all keyspaces in the current cluster" 5 | echo "DESCRIBE : Prints table detail information" 6 | echo "help : for more cqlsh commands" 7 | echo "help [cqlsh command] : Gives information about cqlsh commands" 8 | echo "quit : quit" 9 | echo "==================================================================" 10 | docker exec -it rockthejvm-flink-cassandra cqlsh -------------------------------------------------------------------------------- /docker/cassandra/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '2' 2 | services: 3 | cassandra: 4 | image: cassandra:3 5 | container_name: rockthejvm-flink-cassandra 6 | ports: 7 | - "7000:7000" 8 | - "9042:9042" 9 | environment: 10 | - "CASSANDRA_CLUSTER_NAME=OUR_DOCKERIZED_CASSANDRA_SINGLE_NODE_CLUSTER" -------------------------------------------------------------------------------- /docker/flink/application-cluster/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "2.2" 2 | services: 3 | jobmanager: 4 | image: flink:latest 5 | ports: 6 | - "8081:8081" 7 | command: standalone-job --job-classname com.job.ClassName 8 | volumes: 9 | - ../artifacts:/opt/flink/usrlib 10 | environment: 11 | - | 12 | FLINK_PROPERTIES= 13 | jobmanager.rpc.address: jobmanager 14 | parallelism.default: 2 15 | 16 | taskmanager: 17 | image: flink:latest 18 | depends_on: 19 | - jobmanager 20 | command: taskmanager 21 | scale: 1 22 | volumes: 23 | - ../artifacts:/opt/flink/usrlib 24 | environment: 25 | - | 26 | FLINK_PROPERTIES= 27 | jobmanager.rpc.address: jobmanager 28 | taskmanager.numberOfTaskSlots: 2 29 | parallelism.default: 2 -------------------------------------------------------------------------------- /docker/flink/session-cluster/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "2.2" 2 | services: 3 | jobmanager: 4 | image: flink:latest 5 | ports: 6 | - "8081:8081" 7 | volumes: 
8 |       - ../artifacts:/opt/flink/usrlib
9 |     command: jobmanager
10 |     environment:
11 |       - |
12 |         FLINK_PROPERTIES=
13 |         jobmanager.rpc.address: jobmanager
14 | 
15 |   taskmanager:
16 |     image: flink:latest
17 |     depends_on:
18 |       - jobmanager
19 |     command: taskmanager
20 |     scale: 1
21 |     environment:
22 |       - |
23 |         FLINK_PROPERTIES=
24 |         jobmanager.rpc.address: jobmanager
25 |         taskmanager.numberOfTaskSlots: 2
--------------------------------------------------------------------------------
/docker/kafka/docker-compose.yml:
--------------------------------------------------------------------------------
1 | version: '2'
2 | services:
3 |   zookeeper:
4 |     image: confluentinc/cp-zookeeper:6.2.0
5 |     hostname: zookeeper
6 |     container_name: rockthejvm-flink-zookeeper
7 |     ports:
8 |       - "2181:2181"
9 |     environment:
10 |       ZOOKEEPER_CLIENT_PORT: 2181
11 |       ZOOKEEPER_TICK_TIME: 2000
12 | 
13 |   kafka:
14 |     image: confluentinc/cp-kafka:6.2.0
15 |     hostname: broker
16 |     container_name: rockthejvm-flink-broker
17 |     depends_on:
18 |       - zookeeper
19 |     ports:
20 |       - "29092:29092"
21 |       - "9092:9092"
22 |       - "9101:9101"
23 |     environment:
24 |       KAFKA_BROKER_ID: 1
25 |       KAFKA_ZOOKEEPER_CONNECT: 'zookeeper:2181'
26 |       KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: PLAINTEXT:PLAINTEXT,PLAINTEXT_HOST:PLAINTEXT
27 |       KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://kafka:29092,PLAINTEXT_HOST://localhost:9092
28 |       KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1
29 |       KAFKA_TRANSACTION_STATE_LOG_MIN_ISR: 1
30 |       KAFKA_TRANSACTION_STATE_LOG_REPLICATION_FACTOR: 1
31 |       KAFKA_GROUP_INITIAL_REBALANCE_DELAY_MS: 0
32 |       KAFKA_JMX_PORT: 9101
33 |       KAFKA_JMX_HOSTNAME: localhost
--------------------------------------------------------------------------------
/docker/postgres/docker-compose.yml:
--------------------------------------------------------------------------------
1 | version: '2'
2 | 
3 | services:
4 |   postgres:
5 |     image: postgres:latest
6 |     container_name: rockthejvm-flink-postgres
7 |     environment:
8 |       - "TZ=Europe/Amsterdam"
9 |       - "POSTGRES_USER=docker"
10 |       - "POSTGRES_PASSWORD=docker"
11 |     ports:
12 |       - "5432:5432"
13 | 
--------------------------------------------------------------------------------
/project/build.properties:
--------------------------------------------------------------------------------
1 | sbt.version = 1.6.2
--------------------------------------------------------------------------------
/src/main/resources/logback.xml:
--------------------------------------------------------------------------------
7 | %d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n
--------------------------------------------------------------------------------
/src/main/scala/generators/gaming/gaming.scala:
--------------------------------------------------------------------------------
1 | package generators
2 | 
3 | import java.time.Instant
4 | import java.util.UUID
5 | import scala.concurrent.duration.FiniteDuration
6 | 
7 | /**
8 |  * A package describing events related to a multiplayer game.
9 |  * We analyze some essential Flink features based on these data types.
10 | */ 11 | package object gaming { 12 | 13 | sealed trait ServerEvent { 14 | def eventTime: Instant 15 | def getId: String 16 | } 17 | 18 | sealed trait GameType 19 | case object OneVsOne extends GameType 20 | case object TwoVsTwo extends GameType 21 | case object ThreeVsThree extends GameType 22 | case object FourVsFour extends GameType 23 | 24 | final case class GameStarted( 25 | eventTime: Instant, 26 | gameId: UUID, 27 | playerIds: Vector[UUID], 28 | mapId: String, 29 | regionId: String, 30 | gameType: GameType 31 | ) extends ServerEvent { 32 | override def getId: String = s"game|$gameId" 33 | } 34 | 35 | final case class GameFinished( 36 | eventTime: Instant, 37 | gameId: UUID 38 | ) extends ServerEvent { 39 | override def getId: String = s"game|$gameId" 40 | } 41 | 42 | final case class PlayerRegistered( 43 | eventTime: Instant, 44 | playerId: UUID, 45 | nickname: String 46 | ) extends ServerEvent { 47 | override def getId: String = s"player|$playerId|$nickname" 48 | } 49 | 50 | final case class PlayerOnline( 51 | eventTime: Instant, 52 | playerId: UUID, 53 | nickname: String 54 | ) extends ServerEvent { 55 | override def getId: String = s"player|$playerId|$nickname" 56 | } 57 | 58 | final case class PlayerIsLookingForAGame( 59 | eventTime: Instant, 60 | playerId: UUID, 61 | gameType: GameType 62 | ) extends ServerEvent { 63 | override def getId: String = s"player|$playerId" 64 | } 65 | 66 | final case class PlayerOffline( 67 | eventTime: Instant, 68 | playerId: UUID, 69 | nickname: String 70 | ) extends ServerEvent { 71 | override def getId: String = s"player|$playerId|$nickname" 72 | } 73 | 74 | case class Player(playerId: UUID, nickname: String) { 75 | 76 | def register(d: FiniteDuration)(implicit startTime: Instant): PlayerRegistered = 77 | PlayerRegistered(startTime.plusMillis(d.toMillis), playerId, nickname) 78 | 79 | def online(d: FiniteDuration)(implicit startTime: Instant): PlayerOnline = 80 | PlayerOnline(startTime.plusMillis(d.toMillis), playerId, nickname) 81 | 82 | def offline(d: FiniteDuration)(implicit startTime: Instant): PlayerOffline = 83 | PlayerOffline(startTime.plusMillis(d.toMillis), playerId, nickname) 84 | 85 | def lookingForAGame( 86 | startTime: Instant, 87 | d: FiniteDuration, 88 | gameType: GameType 89 | ): PlayerIsLookingForAGame = 90 | PlayerIsLookingForAGame( 91 | startTime.plusMillis(d.toMillis), 92 | playerId, 93 | gameType 94 | ) 95 | } 96 | 97 | val bob: Player = Player(UUID.randomUUID(), "bob") 98 | val sam: Player = Player(UUID.randomUUID(), "sam") 99 | val rob: Player = Player(UUID.randomUUID(), "rob") 100 | val alice: Player = Player(UUID.randomUUID(), "alice") 101 | val mary: Player = Player(UUID.randomUUID(), "mary") 102 | val carl: Player = Player(UUID.randomUUID(), "carl") 103 | } 104 | -------------------------------------------------------------------------------- /src/main/scala/generators/shopping/package.scala: -------------------------------------------------------------------------------- 1 | package generators 2 | 3 | import org.apache.flink.streaming.api.functions.source.{ 4 | RichParallelSourceFunction, 5 | SourceFunction 6 | } 7 | 8 | import java.util.UUID 9 | import scala.annotation.tailrec 10 | import org.apache.flink.streaming.api.watermark.Watermark 11 | 12 | /** 13 | * A package describing data types and generator functions for shopping cart events. 14 | * We analyze a variety of scenarios in the course, and these generators were built for this purpose. 
15 | */ 16 | package object shopping { 17 | 18 | sealed trait ShoppingCartEvent { 19 | def userId: String 20 | 21 | def time: java.time.Instant 22 | } 23 | 24 | case class AddToShoppingCartEvent( 25 | userId: String, 26 | sku: String, 27 | quantity: Int, 28 | time: java.time.Instant 29 | ) extends ShoppingCartEvent 30 | 31 | case class RemovedFromShoppingCartEvent( 32 | userId: String, 33 | sku: String, 34 | quantity: Int, 35 | time: java.time.Instant 36 | ) extends ShoppingCartEvent 37 | 38 | 39 | class ShoppingCartEventsGenerator( 40 | sleepMillisPerEvent: Int, 41 | batchSize: Int, 42 | baseInstant: java.time.Instant = java.time.Instant.now() 43 | ) extends SourceFunction[ShoppingCartEvent] { 44 | 45 | import ShoppingCartEventsGenerator._ 46 | 47 | @volatile private var running = true 48 | 49 | @tailrec 50 | private def run( 51 | startId: Long, 52 | ctx: SourceFunction.SourceContext[ShoppingCartEvent] 53 | ): Unit = 54 | if (running) { 55 | generateRandomEvents(startId).foreach(ctx.collect) 56 | Thread.sleep(batchSize * sleepMillisPerEvent) 57 | run(startId + batchSize, ctx) 58 | } 59 | 60 | private def generateRandomEvents(id: Long): Seq[AddToShoppingCartEvent] = { 61 | val events = (1 to batchSize) 62 | .map(_ => 63 | AddToShoppingCartEvent( 64 | getRandomUser, 65 | UUID.randomUUID().toString, 66 | getRandomQuantity, 67 | baseInstant.plusSeconds(id) 68 | ) 69 | ) 70 | 71 | events 72 | } 73 | 74 | override def run( 75 | ctx: SourceFunction.SourceContext[ShoppingCartEvent] 76 | ): Unit = run(0, ctx) 77 | 78 | override def cancel(): Unit = { 79 | running = false 80 | } 81 | } 82 | 83 | class SingleShoppingCartEventsGenerator( 84 | sleepMillisBetweenEvents: Int, 85 | baseInstant: java.time.Instant = java.time.Instant.now(), 86 | extraDelayInMillisOnEveryTenEvents: Option[Long] = None, 87 | sourceId: Option[String] = None, 88 | generateRemoved: Boolean = false 89 | ) extends EventGenerator[ShoppingCartEvent]( 90 | sleepMillisBetweenEvents, 91 | SingleShoppingCartEventsGenerator.generateEvent( 92 | generateRemoved, 93 | () => sourceId 94 | .map(sId => s"${sId}_${UUID.randomUUID()}") 95 | .getOrElse(UUID.randomUUID().toString), 96 | baseInstant 97 | ), 98 | baseInstant, 99 | extraDelayInMillisOnEveryTenEvents 100 | ) 101 | 102 | object SingleShoppingCartEventsGenerator { 103 | 104 | import ShoppingCartEventsGenerator._ 105 | 106 | def generateEvent 107 | : (Boolean, () => String, java.time.Instant) => Long => ShoppingCartEvent = 108 | (generateRemoved, skuGen, baseInstant) => 109 | id => 110 | if (!generateRemoved || scala.util.Random.nextBoolean()) 111 | AddToShoppingCartEvent( 112 | getRandomUser, 113 | skuGen(), 114 | getRandomQuantity, 115 | baseInstant.plusSeconds(id) 116 | ) 117 | else 118 | RemovedFromShoppingCartEvent( 119 | getRandomUser, 120 | skuGen(), 121 | getRandomQuantity, 122 | baseInstant.plusSeconds(id) 123 | ) 124 | } 125 | 126 | class EventGenerator[T]( 127 | sleepMillisBetweenEvents: Int, 128 | generator: Long => T, 129 | baseInstant: java.time.Instant, 130 | extraDelayInMillisOnEveryTenEvents: Option[Long] = None 131 | ) extends RichParallelSourceFunction[T] { 132 | @volatile private var running = true 133 | 134 | @tailrec 135 | private def run( 136 | id: Long, 137 | ctx: SourceFunction.SourceContext[T] 138 | ): Unit = 139 | if (running) { 140 | ctx.collect( 141 | generator(id) 142 | ) 143 | // this generator emits a watermark mimicking the same logic of 144 | // incrementing each element's timestamp 145 | ctx.emitWatermark(new 
Watermark(baseInstant.plusSeconds(id).toEpochMilli)) 146 | Thread.sleep(sleepMillisBetweenEvents) 147 | if (id % 10 == 0) extraDelayInMillisOnEveryTenEvents.foreach(Thread.sleep) 148 | run(id + 1, ctx) 149 | } 150 | 151 | override def run(ctx: SourceFunction.SourceContext[T]): Unit = 152 | run(1, ctx) 153 | 154 | override def cancel(): Unit = { 155 | running = false 156 | } 157 | } 158 | 159 | object ShoppingCartEventsGenerator { 160 | val users: Vector[String] = Vector("Bob", "Alice", "Sam", "Tom", "Diana") 161 | 162 | def getRandomUser: String = users(scala.util.Random.nextInt(users.length)) 163 | 164 | def getRandomQuantity: Int = scala.util.Random.nextInt(10) 165 | } 166 | 167 | sealed trait CatalogEvent { 168 | def userId: String 169 | 170 | def time: java.time.Instant 171 | } 172 | 173 | case class ProductDetailsViewed( 174 | userId: String, 175 | time: java.time.Instant, 176 | productId: String 177 | ) extends CatalogEvent 178 | 179 | class CatalogEventsGenerator( 180 | sleepMillisBetweenEvents: Int, 181 | baseInstant: java.time.Instant = java.time.Instant.now(), 182 | extraDelayInMillisOnEveryTenEvents: Option[Long] = None 183 | ) extends EventGenerator[CatalogEvent]( 184 | sleepMillisBetweenEvents, 185 | id => 186 | ProductDetailsViewed( 187 | ShoppingCartEventsGenerator.getRandomUser, 188 | baseInstant.plusSeconds(id), 189 | UUID.randomUUID().toString 190 | ), 191 | baseInstant, 192 | extraDelayInMillisOnEveryTenEvents 193 | ) 194 | } -------------------------------------------------------------------------------- /src/main/scala/part1recap/ScalaRecap.scala: -------------------------------------------------------------------------------- 1 | package part1recap 2 | 3 | import java.util.concurrent.Executors 4 | import scala.concurrent.{ExecutionContext, Future} 5 | import scala.util.{Try, Success, Failure} 6 | 7 | object ScalaRecap { 8 | 9 | // value 10 | val aBoolean: Boolean = false 11 | var aVariable: Int = 56 12 | aVariable += 1 13 | 14 | // expressions 15 | val anIfExpression: String = if (2 > 3) "bigger" else "smaller" 16 | 17 | // instructions vs expressions 18 | val theUnit: Unit = println("Hello, Scala") // Unit === "void" 19 | 20 | // OOP 21 | class Animal 22 | class Cat extends Animal 23 | trait Carnivore { 24 | def eat(animal: Animal): Unit 25 | } 26 | 27 | // inheritance: extends <= 1 class, but inherit from >= 0 traits 28 | class Crocodile extends Animal with Carnivore { 29 | override def eat(animal: Animal): Unit = println("eating this poor fellow") 30 | } 31 | 32 | // singleton 33 | object MySingleton 34 | 35 | // companions 36 | object Carnivore 37 | 38 | // case classes 39 | case class Person(name: String, age: Int) 40 | 41 | // generics 42 | class MyList[A] // can add variance modifiers - not important for this course 43 | 44 | // method notation 45 | // croc.eat(animal) OR croc eat animal 46 | val three = 1 + 2 47 | val three_v2 = 1.+(2) 48 | 49 | // FP 50 | val incrementer: Int => Int = x => x + 1 51 | val incremented = incrementer(4) // 5, same as incrementer.apply(4) 52 | 53 | // map flatMap filter = HOFs 54 | val processedList = List(1,2,3).map(incrementer) // [2,3,4] 55 | val aLongerList = List(1,2,3).flatMap(x => List(x, x + 1)) // [1,2, 2,3, 3,4] 56 | 57 | // for-comprehensions 58 | val checkerboard = List(1,2,3).flatMap(n => List('a', 'b', 'c').map(c => (n, c))) 59 | val checkerboard_v2 = for { 60 | n <- List(1,2,3) 61 | c <- List('a', 'b', 'c') 62 | } yield (n, c) // same 63 | 64 | // options and try 65 | val anOption: Option[Int] = Option(/* something 
that might be null*/ 43) 66 | val doubleOption = anOption.map(_ * 2) 67 | 68 | val anAttempt: Try[Int] = Try(12) 69 | val modifiedAttempt = anAttempt.map(_ * 10) 70 | 71 | // pattern matching 72 | val anUnknown: Any = 45 73 | val medal = anUnknown match { 74 | case 1 => "gold" 75 | case 2 => "silver" 76 | case 3 => "bronze" 77 | case _ => "no medal" 78 | } 79 | 80 | val optionDescription = anOption match { 81 | case Some(value) => s"the option is not empty: $value" 82 | case None => "the option is empty" 83 | } 84 | 85 | // Futures 86 | implicit val ec: ExecutionContext = ExecutionContext.fromExecutorService(Executors.newFixedThreadPool(8)) 87 | val aFuture = Future(/* something to be evaluated on another thread*/ 1 + 41) 88 | 89 | // register callback when it finishes 90 | aFuture.onComplete { 91 | case Success(value) => println(s"the async meaning of life is $value") 92 | case Failure(exception) => println(s"the meaning of value failed: $exception") 93 | } 94 | 95 | val aPartialFunction: PartialFunction[Try[Int], Unit] = { 96 | case Success(value) => println(s"the async meaning of life is $value") 97 | case Failure(exception) => println(s"the meaning of value failed: $exception") 98 | } 99 | 100 | // map, flatMap, filter, ... 101 | val doubledAsyncMOL: Future[Int] = aFuture.map(_ * 2) 102 | 103 | // implicits 104 | 105 | // 1 - implicit arguments and values 106 | implicit val timeout: Int = 3000 // implicit val == given instance 107 | def setTimeout(f: () => Unit)(implicit tout: Int) = { // (using tout: Int) 108 | Thread.sleep(tout) 109 | f() 110 | } 111 | 112 | setTimeout(() => println("timeout")) // (timeout) 113 | 114 | // 2 - extension methods 115 | implicit class MyRichInt(number: Int) { // implicit class = extension 116 | def isEven: Boolean = number % 2 == 0 117 | } 118 | 119 | val is2Even = 2.isEven // new RichInt(2).isEven 120 | 121 | // 3 - conversions 122 | implicit def string2Person(name: String): Person = 123 | Person(name, 57) 124 | 125 | val daniel: Person = "Daniel" // string2Person("Daniel") 126 | 127 | def main(args: Array[String]): Unit = { 128 | 129 | } 130 | } 131 | -------------------------------------------------------------------------------- /src/main/scala/part2datastreams/EssentialStreams.scala: -------------------------------------------------------------------------------- 1 | package part2datastreams 2 | 3 | import org.apache.flink.api.common.functions.{FlatMapFunction, MapFunction, ReduceFunction} 4 | import org.apache.flink.api.common.serialization.SimpleStringEncoder 5 | import org.apache.flink.core.fs.Path 6 | import org.apache.flink.streaming.api.functions.ProcessFunction 7 | import org.apache.flink.streaming.api.functions.sink.filesystem.StreamingFileSink 8 | import org.apache.flink.streaming.api.scala._ 9 | import org.apache.flink.util.Collector 10 | 11 | object EssentialStreams { 12 | 13 | def applicationTemplate(): Unit = { 14 | // execution environment 15 | val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment 16 | 17 | // in between, add any sort of computations 18 | import org.apache.flink.streaming.api.scala._ // import TypeInformation for the data of your DataStreams 19 | val simpleNumberStream: DataStream[Int] = env.fromElements(1,2,3,4) 20 | 21 | // perform some actions 22 | simpleNumberStream.print() 23 | 24 | // at the end 25 | env.execute() // trigger all the computations that were DESCRIBED earlier 26 | } 27 | 28 | // transformations 29 | def demoTransformations(): Unit = { 30 | val env: 
StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment 31 | val numbers: DataStream[Int] = env.fromElements(1,2,3,4,5) 32 | 33 | // checking parallelism 34 | println(s"Current parallelism: ${env.getParallelism}") 35 | // set different parallelism 36 | env.setParallelism(2) 37 | println(s"New parallelism: ${env.getParallelism}") 38 | 39 | // map 40 | val doubledNumbers: DataStream[Int] = numbers.map(_ * 2) 41 | 42 | // flatMap 43 | val expandedNumbers: DataStream[Int] = numbers.flatMap(n => List(n, n + 1)) 44 | 45 | // filter 46 | val filteredNumbers: DataStream[Int] = numbers 47 | .filter(_ % 2 == 0) 48 | /* you can set parallelism here*/.setParallelism(4) 49 | 50 | val finalData = expandedNumbers.writeAsText("output/expandedStream") // directory with 12 files 51 | // set parallelism in the sink 52 | finalData.setParallelism(3) 53 | 54 | env.execute() 55 | } 56 | 57 | /** 58 | * Exercise: FizzBuzz on Flink 59 | * - take a stream of 100 natural numbers 60 | * - for every number 61 | * - if n % 3 == 0 then return "fizz" 62 | * - if n % 5 == 0 => "buzz" 63 | * - if both => "fizzbuzz" 64 | * - write the numbers for which you said "fizzbuzz" to a file 65 | */ 66 | case class FizzBuzzResult(n: Long, output: String) 67 | 68 | def fizzBuzzExercise(): Unit = { 69 | val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment 70 | val numbers = env.fromSequence(1, 100) 71 | 72 | // map 73 | val fizzbuzz = numbers 74 | .map { n => 75 | val output = 76 | if (n % 3 == 0 && n % 5 == 0) "fizzbuzz" 77 | else if (n % 3 == 0) "fizz" 78 | else if (n % 5 == 0) "buzz" 79 | else s"$n" 80 | FizzBuzzResult(n, output) 81 | } 82 | .filter(_.output == "fizzbuzz") // DataStream[FizzBuzzResult] 83 | .map(_.n) // DataStream[Long] 84 | 85 | // alternative to 86 | // fizzbuzz.writeAsText("output/fizzbuzz.txt").setParallelism(1) 87 | 88 | // add a SINK 89 | fizzbuzz.addSink( 90 | StreamingFileSink 91 | .forRowFormat( 92 | new Path("output/streaming_sink"), 93 | new SimpleStringEncoder[Long]("UTF-8") 94 | ) 95 | .build() 96 | ).setParallelism(1) 97 | 98 | env.execute() 99 | } 100 | 101 | // explicit transformations 102 | def demoExplicitTransformations(): Unit = { 103 | val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment 104 | val numbers = env.fromSequence(1, 100) 105 | 106 | // map 107 | val doubledNumbers = numbers.map(_ * 2) 108 | 109 | // explicit version 110 | val doubledNumbers_v2 = numbers.map(new MapFunction[Long, Long] { 111 | // declare fields, methods, ... 112 | override def map(value: Long) = value * 2 113 | }) 114 | 115 | // flatMap 116 | val expandedNumbers = numbers.flatMap(n => Range.Long(1, n, 1).toList) 117 | 118 | // explicit version 119 | val expandedNumbers_v2 = numbers.flatMap(new FlatMapFunction[Long, Long] { 120 | // declare fields, methods, ... 
121 | override def flatMap(n: Long, out: Collector[Long]) = 122 | Range.Long(1, n, 1).foreach { i => 123 | out.collect(i) // imperative style - pushes the new element downstream 124 | } 125 | }) 126 | 127 | // process method 128 | // ProcessFunction is THE MOST GENERAL function to process elements in Flink 129 | val expandedNumbers_v3 = numbers.process(new ProcessFunction[Long, Long] { 130 | override def processElement(n: Long, ctx: ProcessFunction[Long, Long]#Context, out: Collector[Long]) = 131 | Range.Long(1, n, 1).foreach { i => 132 | out.collect(i) 133 | } 134 | }) 135 | 136 | // reduce 137 | // happens on keyed streams 138 | /* 139 | [ 1, false 140 | 2, true 141 | 142 | 100, true 143 | 144 | true => 2, 6, 12, 20, ... 145 | false => 1, 4, 9, 16, ... 146 | */ 147 | val keyedNumbers: KeyedStream[Long, Boolean] = numbers.keyBy(n => n % 2 == 0) 148 | 149 | // reduce - FP approach 150 | val sumByKey = keyedNumbers.reduce(_ + _) // sum up all the elements BY KEY 151 | 152 | // reduce - explicit approach 153 | val sumByKey_v2 = keyedNumbers.reduce(new ReduceFunction[Long] { 154 | // additional fields, methods... 155 | override def reduce(x: Long, y: Long): Long = x + y 156 | }) 157 | 158 | sumByKey_v2.print() 159 | env.execute() 160 | } 161 | 162 | def main(args: Array[String]): Unit = { 163 | demoExplicitTransformations() 164 | } 165 | } 166 | -------------------------------------------------------------------------------- /src/main/scala/part2datastreams/MultipleStreams.scala: -------------------------------------------------------------------------------- 1 | package part2datastreams 2 | 3 | import generators.shopping._ 4 | import org.apache.flink.api.common.eventtime.{SerializableTimestampAssigner, WatermarkStrategy} 5 | import org.apache.flink.streaming.api.functions.co.{CoProcessFunction, ProcessJoinFunction} 6 | import org.apache.flink.streaming.api.scala._ 7 | import org.apache.flink.streaming.api.windowing.assigners.TumblingProcessingTimeWindows 8 | import org.apache.flink.streaming.api.windowing.time.Time 9 | import org.apache.flink.util.Collector 10 | 11 | object MultipleStreams { 12 | 13 | /* 14 | - union 15 | - window join 16 | - interval join 17 | - connect 18 | */ 19 | 20 | // Unioning = combining the output of multiple streams into just one 21 | def demoUnion(): Unit = { 22 | val env = StreamExecutionEnvironment.getExecutionEnvironment 23 | 24 | // define two streams of the same type 25 | val shoppingCartEventsKafka: DataStream[ShoppingCartEvent] = 26 | env.addSource(new SingleShoppingCartEventsGenerator(300, sourceId = Option("kafka"))) 27 | 28 | val shoppingCartEventsFiles: DataStream[ShoppingCartEvent] = 29 | env.addSource(new SingleShoppingCartEventsGenerator(1000, sourceId = Option("files"))) 30 | 31 | val combinedShoppingCartEventStream: DataStream[ShoppingCartEvent] = 32 | shoppingCartEventsKafka.union(shoppingCartEventsFiles) 33 | 34 | combinedShoppingCartEventStream.print() 35 | env.execute() 36 | } 37 | 38 | // window join = elements belong to the same window + some join condition 39 | def demoWindowJoins(): Unit = { 40 | val env = StreamExecutionEnvironment.getExecutionEnvironment 41 | 42 | val shoppingCartEvents = env.addSource(new SingleShoppingCartEventsGenerator(1000, sourceId = Option("kafka"))) 43 | val catalogEvents = env.addSource(new CatalogEventsGenerator(200)) 44 | 45 | val joinedStream = shoppingCartEvents 46 | .join(catalogEvents) 47 | // provide a join condition 48 | .where(shoppingCartEvent => shoppingCartEvent.userId) 49 | .equalTo(catalogEvent => 
catalogEvent.userId) 50 | // provide the same window grouping 51 | .window(TumblingProcessingTimeWindows.of(Time.seconds(5))) 52 | // do something with correlated events 53 | .apply { 54 | (shoppingCartEvent, catalogEvent) => 55 | s"User ${shoppingCartEvent.userId} browsed at ${catalogEvent.time} and bought at ${shoppingCartEvent.time}" 56 | } 57 | 58 | joinedStream.print() 59 | env.execute() 60 | } 61 | 62 | // interval joins = correlation between events A and B if durationMin < timeA - timeB < durationMax 63 | // involves EVENT TIME 64 | // only works on KEYED STREAMS 65 | 66 | def demoIntervalJoins(): Unit = { 67 | val env = StreamExecutionEnvironment.getExecutionEnvironment 68 | 69 | // we need to extract event times from both streams 70 | val shoppingCartEvents = 71 | env.addSource(new SingleShoppingCartEventsGenerator(300, sourceId = Option("kafka"))) 72 | .assignTimestampsAndWatermarks( 73 | WatermarkStrategy.forBoundedOutOfOrderness(java.time.Duration.ofMillis(500)) 74 | .withTimestampAssigner(new SerializableTimestampAssigner[ShoppingCartEvent] { 75 | override def extractTimestamp(element: ShoppingCartEvent, recordTimestamp: Long) = 76 | element.time.toEpochMilli 77 | }) 78 | ) 79 | .keyBy(_.userId) 80 | 81 | val catalogEvents = env.addSource(new CatalogEventsGenerator(500)) 82 | .assignTimestampsAndWatermarks( 83 | WatermarkStrategy.forBoundedOutOfOrderness(java.time.Duration.ofMillis(500)) 84 | .withTimestampAssigner(new SerializableTimestampAssigner[CatalogEvent] { 85 | override def extractTimestamp(element: CatalogEvent, recordTimestamp: Long) = 86 | element.time.toEpochMilli 87 | }) 88 | ) 89 | .keyBy(_.userId) 90 | 91 | val intervalJoinedStream = shoppingCartEvents 92 | .intervalJoin(catalogEvents) 93 | .between(Time.seconds(-2), Time.seconds(2)) 94 | .lowerBoundExclusive() // interval is by default inclusive 95 | .upperBoundExclusive() 96 | .process(new ProcessJoinFunction[ShoppingCartEvent, CatalogEvent, String] { 97 | override def processElement( 98 | left: ShoppingCartEvent, 99 | right: CatalogEvent, 100 | ctx: ProcessJoinFunction[ShoppingCartEvent, CatalogEvent, String]#Context, 101 | out: Collector[String] 102 | ) = 103 | out.collect(s"User ${left.userId} browsed at ${right.time} and bought at ${left.time}") 104 | }) 105 | 106 | intervalJoinedStream.print() 107 | env.execute() 108 | } 109 | 110 | // connect = two streams are treated with the same "operator" 111 | def demoConnect(): Unit = { 112 | val env = StreamExecutionEnvironment.getExecutionEnvironment 113 | 114 | // two separate streams 115 | val shoppingCartEvents = env.addSource(new SingleShoppingCartEventsGenerator(100)).setParallelism(1) 116 | val catalogEvents = env.addSource(new CatalogEventsGenerator(1000)).setParallelism(1) 117 | 118 | // connect the streams 119 | val connectedStream: ConnectedStreams[ShoppingCartEvent, CatalogEvent] = shoppingCartEvents.connect(catalogEvents) 120 | 121 | // variables - will use single-threaded 122 | env.setParallelism(1) 123 | env.setMaxParallelism(1) 124 | 125 | val ratioStream: DataStream[Double] = connectedStream.process( 126 | new CoProcessFunction[ShoppingCartEvent, CatalogEvent, Double] { 127 | var shoppingCartEventCount = 0 128 | var catalogEventCount = 0 129 | 130 | override def processElement1( 131 | value: ShoppingCartEvent, 132 | ctx: CoProcessFunction[ShoppingCartEvent, CatalogEvent, Double]#Context, 133 | out: Collector[Double] 134 | ) = { 135 | shoppingCartEventCount += 1 136 | out.collect(shoppingCartEventCount * 100.0 / (shoppingCartEventCount + 
catalogEventCount)) 137 | } 138 | 139 | override def processElement2( 140 | value: CatalogEvent, 141 | ctx: CoProcessFunction[ShoppingCartEvent, CatalogEvent, Double]#Context, 142 | out: Collector[Double] 143 | ) = { 144 | catalogEventCount += 1 145 | out.collect(shoppingCartEventCount * 100.0 / (shoppingCartEventCount + catalogEventCount)) 146 | } 147 | } 148 | ) 149 | 150 | ratioStream.print() 151 | env.execute() 152 | } 153 | 154 | def main(args: Array[String]): Unit = { 155 | demoConnect() 156 | } 157 | } 158 | -------------------------------------------------------------------------------- /src/main/scala/part2datastreams/Partitions.scala: -------------------------------------------------------------------------------- 1 | package part2datastreams 2 | 3 | import generators.shopping._ 4 | import org.apache.flink.api.common.functions.Partitioner 5 | import org.apache.flink.streaming.api.scala._ 6 | 7 | object Partitions { 8 | 9 | // splitting = partitioning 10 | 11 | def demoPartitioner(): Unit = { 12 | val env = StreamExecutionEnvironment.getExecutionEnvironment 13 | 14 | val shoppingCartEvents: DataStream[ShoppingCartEvent] = 15 | env.addSource(new SingleShoppingCartEventsGenerator(100)) // ~10 events/s 16 | 17 | // partitioner = logic to split the data 18 | val partitioner = new Partitioner[String] { 19 | override def partition(key: String, numPartitions: Int): Int = { // invoked on every event 20 | // hash code % number of partitions ~ even distribution 21 | println(s"Number of max partitions: $numPartitions") 22 | key.hashCode % numPartitions 23 | } 24 | } 25 | 26 | val partitionedStream = shoppingCartEvents.partitionCustom( 27 | partitioner, 28 | event => event.userId 29 | ) 30 | 31 | /* 32 | Bad because 33 | - you lose parallelism 34 | - you risk overloading the task with the disproportionate data 35 | 36 | Good for e.g. 
sending HTTP requests 37 | */ 38 | val badPartitioner = new Partitioner[String] { 39 | override def partition(key: String, numPartitions: Int): Int = { // invoked on every event 40 | numPartitions - 1 // last partition index 41 | } 42 | } 43 | 44 | val badPartitionedStream = shoppingCartEvents.partitionCustom( 45 | badPartitioner, 46 | event => event.userId 47 | ) 48 | // redistribution of data evenly - involves data transfer through network 49 | .shuffle 50 | 51 | badPartitionedStream.print() 52 | env.execute() 53 | } 54 | 55 | 56 | def main(args: Array[String]): Unit = { 57 | demoPartitioner() 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /src/main/scala/part2datastreams/TimeBasedTransformations.scala: -------------------------------------------------------------------------------- 1 | package part2datastreams 2 | 3 | import generators.shopping._ 4 | import org.apache.flink.api.common.eventtime.{SerializableTimestampAssigner, Watermark, WatermarkGenerator, WatermarkOutput, WatermarkStrategy} 5 | import org.apache.flink.streaming.api.scala._ 6 | import org.apache.flink.streaming.api.scala.function.ProcessAllWindowFunction 7 | import org.apache.flink.streaming.api.windowing.assigners.{TumblingEventTimeWindows, TumblingProcessingTimeWindows} 8 | import org.apache.flink.streaming.api.windowing.time.Time 9 | import org.apache.flink.streaming.api.windowing.windows.TimeWindow 10 | import org.apache.flink.util.Collector 11 | 12 | import java.time.Instant 13 | 14 | object TimeBasedTransformations { 15 | 16 | val env = StreamExecutionEnvironment.getExecutionEnvironment 17 | 18 | val shoppingCartEvents: DataStream[ShoppingCartEvent] = env.addSource( 19 | new ShoppingCartEventsGenerator( 20 | sleepMillisPerEvent = 100, 21 | batchSize = 5, 22 | baseInstant = Instant.parse("2022-02-15T00:00:00.000Z") 23 | ) 24 | ) 25 | 26 | // 1. Event time = the moment the event was CREATED 27 | // 2. 
Processing time = the moment the event ARRIVES AT FLINK 28 | 29 | class CountByWindowAll extends ProcessAllWindowFunction[ShoppingCartEvent, String, TimeWindow] { 30 | override def process(context: Context, elements: Iterable[ShoppingCartEvent], out: Collector[String]): Unit = { 31 | val window = context.window 32 | out.collect(s"Window [${window.getStart} - ${window.getEnd}] ${elements.size}") 33 | } 34 | } 35 | /* 36 | Group by window, every 3s, tumbling (non-overlapping), PROCESSING TIME 37 | */ 38 | /* 39 | With processing time 40 | - we don't care when the event was created 41 | - multiple runs generate different results 42 | */ 43 | def demoProcessingTime(): Unit = { 44 | def groupedEventsByWindow = shoppingCartEvents.windowAll(TumblingProcessingTimeWindows.of(Time.seconds(3))) 45 | def countEventsByWindow: DataStream[String] = groupedEventsByWindow.process(new CountByWindowAll) 46 | countEventsByWindow.print() 47 | env.execute() 48 | } 49 | 50 | /* 51 | With event time 52 | - we NEED to care about handling late data - done with watermarks 53 | - we don't care about Flink internal time 54 | - we might see faster results 55 | - same events + different runs => same results 56 | */ 57 | def demoEventTime(): Unit = { 58 | val groupedEventsByWindow = shoppingCartEvents 59 | .assignTimestampsAndWatermarks( 60 | WatermarkStrategy 61 | .forBoundedOutOfOrderness(java.time.Duration.ofMillis(500)) // max delay < 500 millis 62 | .withTimestampAssigner(new SerializableTimestampAssigner[ShoppingCartEvent] { 63 | override def extractTimestamp(element: ShoppingCartEvent, recordTimestamp: Long) = element.time.toEpochMilli 64 | }) 65 | ) 66 | .windowAll(TumblingEventTimeWindows.of(Time.seconds(3))) 67 | 68 | def countEventsByWindow: DataStream[String] = groupedEventsByWindow.process(new CountByWindowAll) 69 | countEventsByWindow.print() 70 | env.execute() 71 | } 72 | 73 | /** 74 | Custom watermarks 75 | */ 76 | // with every new MAX timestamp, every new incoming element with event time < max timestamp - max delay will be discarded 77 | class BoundedOutOfOrdernessGenerator(maxDelay: Long) extends WatermarkGenerator[ShoppingCartEvent] { 78 | var currentMaxTimestamp: Long = 0L 79 | 80 | // maybe emit watermark on a particular event 81 | override def onEvent(event: ShoppingCartEvent, eventTimestamp: Long, output: WatermarkOutput) = { 82 | // ^ event being processed ^ timestamp attached to the event 83 | currentMaxTimestamp = Math.max(currentMaxTimestamp, event.time.toEpochMilli) 84 | // emitting a watermark is NOT mandatory 85 | // output.emitWatermark(new Watermark(event.time.toEpochMilli)) // every new event older than THIS EVENT will be discarded 86 | } 87 | 88 | // Flink can also call onPeriodicEmit regularly - up to us to maybe emit a watermark at these times 89 | override def onPeriodicEmit(output: WatermarkOutput) = 90 | output.emitWatermark(new Watermark(currentMaxTimestamp - maxDelay - 1)) 91 | } 92 | 93 | def demoEventTime_v2(): Unit = { 94 | // control how often Flink calls onPeriodicEmit 95 | env.getConfig.setAutoWatermarkInterval(1000L) // call onPeriodicEmit every 1s 96 | 97 | val groupedEventsByWindow = shoppingCartEvents 98 | .assignTimestampsAndWatermarks( 99 | WatermarkStrategy 100 | .forGenerator(_ => new BoundedOutOfOrdernessGenerator(500L)) 101 | .withTimestampAssigner(new SerializableTimestampAssigner[ShoppingCartEvent] { 102 | override def extractTimestamp(element: ShoppingCartEvent, recordTimestamp: Long) = element.time.toEpochMilli 103 | }) 104 | ) 105 | 
.windowAll(TumblingEventTimeWindows.of(Time.seconds(3))) 106 | 107 | def countEventsByWindow: DataStream[String] = groupedEventsByWindow.process(new CountByWindowAll) 108 | countEventsByWindow.print() 109 | env.execute() 110 | } 111 | 112 | def main(args: Array[String]): Unit = { 113 | demoEventTime_v2() 114 | } 115 | } 116 | -------------------------------------------------------------------------------- /src/main/scala/part2datastreams/Triggers.scala: -------------------------------------------------------------------------------- 1 | package part2datastreams 2 | 3 | import generators.shopping._ 4 | import org.apache.flink.streaming.api.scala._ 5 | import org.apache.flink.streaming.api.scala.function.ProcessAllWindowFunction 6 | import org.apache.flink.streaming.api.windowing.assigners.TumblingProcessingTimeWindows 7 | import org.apache.flink.streaming.api.windowing.time.Time 8 | import org.apache.flink.streaming.api.windowing.triggers.{CountTrigger, PurgingTrigger} 9 | import org.apache.flink.streaming.api.windowing.windows.TimeWindow 10 | import org.apache.flink.util.Collector 11 | 12 | object Triggers { 13 | 14 | // Triggers -> WHEN a window function is executed 15 | 16 | val env = StreamExecutionEnvironment.getExecutionEnvironment 17 | 18 | def demoCountTrigger(): Unit = { 19 | val shoppingCartEvents: DataStream[String] = env 20 | .addSource(new ShoppingCartEventsGenerator(500, 2)) // 2 events/second 21 | .windowAll(TumblingProcessingTimeWindows.of(Time.seconds(5))) // 10 events/window 22 | .trigger(CountTrigger.of[TimeWindow](5)) // the window function runs every 5 elements 23 | .process(new CountByWindowAll) // runs twice for the same window 24 | 25 | shoppingCartEvents.print() 26 | env.execute() 27 | } 28 | /* 29 | 12> Window [1646129900000 - 1646129905000] 2 30 | 1> Window [1646129905000 - 1646129910000] 10 31 | 2> Window [1646129910000 - 1646129915000] 10 32 | 3> Window [1646129915000 - 1646129920000] 10 33 | 4> Window [1646129920000 - 1646129925000] 10 34 | 35 | with trigger 36 | 6> Window [1646130165000 - 1646130170000] 5 <- trigger running on the window 65000-70000 for the first time 37 | 7> Window [1646130165000 - 1646130170000] 10 <- second trigger FOR THE SAME WINDOW 38 | 8> Window [1646130170000 - 1646130175000] 5 39 | 9> Window [1646130170000 - 1646130175000] 10 40 | 10> Window [1646130175000 - 1646130180000] 5 41 | 11> Window [1646130175000 - 1646130180000] 10 42 | */ 43 | 44 | // purging trigger - clear the window when it fires 45 | 46 | def demoPurgingTrigger(): Unit = { 47 | val shoppingCartEvents: DataStream[String] = env 48 | .addSource(new ShoppingCartEventsGenerator(500, 2)) // 2 events/second 49 | .windowAll(TumblingProcessingTimeWindows.of(Time.seconds(5))) // 10 events/window 50 | .trigger(PurgingTrigger.of(CountTrigger.of[TimeWindow](5))) // the window function runs every 5 elements, THEN CLEARS THE WINDOW 51 | .process(new CountByWindowAll) // runs twice for the same window 52 | 53 | shoppingCartEvents.print() 54 | env.execute() 55 | } 56 | 57 | /* 58 | with purging trigger 59 | 60 | 12> Window [1646134290000 - 1646134295000] 5 61 | 1> Window [1646134295000 - 1646134300000] 5 62 | 2> Window [1646134295000 - 1646134300000] 5 63 | 3> Window [1646134300000 - 1646134305000] 5 64 | 4> Window [1646134300000 - 1646134305000] 5 65 | 5> Window [1646134305000 - 1646134310000] 5 66 | 6> Window [1646134305000 - 1646134310000] 5 67 | */ 68 | 69 | /* 70 | Other triggers: 71 | - EventTimeTrigger - happens by default when the watermark is > window end time (automatic for 
event time windows) 72 | - ProcessingTimeTrigger - fires when the current system time > window end time (automatic for processing time windows) 73 | - custom triggers - powerful APIs for custom firing behavior 74 | */ 75 | 76 | def main(args: Array[String]): Unit = { 77 | demoPurgingTrigger() 78 | } 79 | 80 | } 81 | 82 | // copied from Time Based Transformations 83 | class CountByWindowAll extends ProcessAllWindowFunction[ShoppingCartEvent, String, TimeWindow] { 84 | override def process(context: Context, elements: Iterable[ShoppingCartEvent], out: Collector[String]): Unit = { 85 | val window = context.window 86 | out.collect(s"Window [${window.getStart} - ${window.getEnd}] ${elements.size}") 87 | } 88 | } 89 | -------------------------------------------------------------------------------- /src/main/scala/part2datastreams/WindowFunctions.scala: -------------------------------------------------------------------------------- 1 | package part2datastreams 2 | 3 | import generators.gaming._ 4 | import org.apache.flink.api.common.eventtime.{SerializableTimestampAssigner, WatermarkStrategy} 5 | import org.apache.flink.api.common.functions.AggregateFunction 6 | import org.apache.flink.streaming.api.scala._ 7 | import org.apache.flink.streaming.api.scala.function.{AllWindowFunction, ProcessAllWindowFunction, ProcessWindowFunction, WindowFunction} 8 | import org.apache.flink.streaming.api.windowing.assigners.{EventTimeSessionWindows, GlobalWindows, SlidingEventTimeWindows, TumblingEventTimeWindows} 9 | import org.apache.flink.streaming.api.windowing.time.Time 10 | import org.apache.flink.streaming.api.windowing.triggers.CountTrigger 11 | import org.apache.flink.streaming.api.windowing.windows.{GlobalWindow, TimeWindow} 12 | import org.apache.flink.util.Collector 13 | 14 | import java.time.Instant 15 | import scala.concurrent.duration._ 16 | 17 | object WindowFunctions { 18 | 19 | // use-case: stream of events for a gaming session 20 | 21 | val env = StreamExecutionEnvironment.getExecutionEnvironment 22 | 23 | implicit val serverStartTime: Instant = 24 | Instant.parse("2022-02-02T00:00:00.000Z") 25 | 26 | val events: List[ServerEvent] = List( 27 | bob.register(2.seconds), // player "Bob" registered 2s after the server started 28 | bob.online(2.seconds), 29 | sam.register(3.seconds), 30 | sam.online(4.seconds), 31 | rob.register(4.seconds), 32 | alice.register(4.seconds), 33 | mary.register(6.seconds), 34 | mary.online(6.seconds), 35 | carl.register(8.seconds), 36 | rob.online(10.seconds), 37 | alice.online(10.seconds), 38 | carl.online(10.seconds) 39 | ) 40 | 41 | val eventStream: DataStream[ServerEvent] = env 42 | .fromCollection(events) 43 | .assignTimestampsAndWatermarks( // extract timestamps for events (event time) + watermarks 44 | WatermarkStrategy 45 | .forBoundedOutOfOrderness(java.time.Duration.ofMillis(500)) // once you get an event with time T, you will NOT accept further events with time < T - 500 46 | .withTimestampAssigner(new SerializableTimestampAssigner[ServerEvent] { 47 | override def extractTimestamp(element: ServerEvent, recordTimestamp: Long) = 48 | element.eventTime.toEpochMilli 49 | }) 50 | ) 51 | 52 | // how many players were registered every 3 seconds? 
53 | // [0...3s] [3s...6s] [6s...9s] 54 | val threeSecondsTumblingWindow = eventStream.windowAll(TumblingEventTimeWindows.of(Time.seconds(3))) 55 | /* 56 | |----0----|----1----|--------2--------|--------3--------|---------4---------|---5---|--------6--------|---7---|--------8--------|--9--|------10-------|------11------| 57 | | | | bob registered | sam registered | sam online | | mary registered | | carl registered | | rob online | | 58 | | | | bob online | | rob registered | | mary online | | | | alice online | | 59 | | | | | | alice registered | | | | | | carl online | | 60 | ^|------------ window one ----------- + -------------- window two ----------------- + ------------- window three -------------- + ----------- window four ----------|^ 61 | | | | | | 62 | | 1 registrations | 3 registrations | 2 registration | 0 registrations | 63 | | 1643760000000 - 1643760003000 | 1643760005000 - 1643760006000 | 1643760006000 - 1643760009000 | 1643760009000 - 1643760012000 | 64 | */ 65 | 66 | // count by windowAll 67 | class CountByWindowAll extends AllWindowFunction[ServerEvent, String, TimeWindow] { 68 | // ^ input ^ output ^ window type 69 | override def apply(window: TimeWindow, input: Iterable[ServerEvent], out: Collector[String]): Unit = { 70 | val registrationEventCount = input.count(event => event.isInstanceOf[PlayerRegistered]) 71 | out.collect(s"Window [${window.getStart} - ${window.getEnd}] $registrationEventCount") 72 | } 73 | } 74 | 75 | def demoCountByWindow(): Unit = { 76 | val registrationsPerThreeSeconds: DataStream[String] = threeSecondsTumblingWindow.apply(new CountByWindowAll) 77 | registrationsPerThreeSeconds.print() 78 | env.execute() 79 | } 80 | 81 | // alternative: process window function which offers a much richer API (lower-level) 82 | class CountByWindowAllV2 extends ProcessAllWindowFunction[ServerEvent, String, TimeWindow] { 83 | override def process(context: Context, elements: Iterable[ServerEvent], out: Collector[String]): Unit = { 84 | val window = context.window 85 | val registrationEventCount = elements.count(event => event.isInstanceOf[PlayerRegistered]) 86 | out.collect(s"Window [${window.getStart} - ${window.getEnd}] $registrationEventCount") 87 | } 88 | } 89 | 90 | def demoCountByWindow_v2(): Unit = { 91 | val registrationsPerThreeSeconds: DataStream[String] = threeSecondsTumblingWindow.process(new CountByWindowAllV2) 92 | registrationsPerThreeSeconds.print() 93 | env.execute() 94 | } 95 | 96 | // alternative 2: aggregate function 97 | class CountByWindowV3 extends AggregateFunction[ServerEvent, Long, Long] { 98 | // ^ input ^ acc ^ output 99 | 100 | // start counting from 0 101 | override def createAccumulator(): Long = 0L 102 | 103 | // every element increases accumulator by 1 104 | override def add(value: ServerEvent, accumulator: Long) = 105 | if (value.isInstanceOf[PlayerRegistered]) accumulator + 1 106 | else accumulator 107 | 108 | // push a final output out of the final accumulator 109 | override def getResult(accumulator: Long) = accumulator 110 | 111 | // accum1 + accum2 = a bigger accumulator 112 | override def merge(a: Long, b: Long) = a + b 113 | } 114 | 115 | def demoCountByWindow_v3(): Unit = { 116 | val registrationsPerThreeSeconds: DataStream[Long] = threeSecondsTumblingWindow.aggregate(new CountByWindowV3) 117 | registrationsPerThreeSeconds.print() 118 | env.execute() 119 | } 120 | 121 | /** 122 | * Keyed streams and window functions 123 | */ 124 | // each element will be assigned to a "mini-stream" for its own key 125 | val streamByType: 
KeyedStream[ServerEvent, String] = eventStream.keyBy(e => e.getClass.getSimpleName) 126 | 127 | // for every key, we'll have a separate window allocation 128 | val threeSecondsTumblingWindowByType = streamByType.window(TumblingEventTimeWindows.of(Time.seconds(3))) 129 | 130 | /* 131 | === Registration Events Stream === 132 | |----0----|----1----|--------2--------|--------3--------|---------4---------|---5---|--------6--------|---7---|--------8--------|--9--|------10-------|------11------| 133 | | | | bob registered | sam registered | rob registered | | mary registered | | carl registered | | | | 134 | | | | | | alice registered | | | | | | | | 135 | ^|------------ window one ----------- + -------------- window two ----------------- + ------------- window three -------------- + ----------- window four ----------|^ 136 | | 1 registration | 3 registrations | 2 registrations | 0 registrations | 137 | | 1643760000000 - 1643760003000 | 1643760003000 - 1643760006000 | 1643760006000 - 1643760009000 | 1643760009000 - 1643760012000 | 138 | 139 | === Online Events Stream === 140 | |----0----|----1----|--------2--------|--------3--------|---------4---------|---5---|--------6--------|---7---|--------8--------|--9--|------10-------|------11------| 141 | | | | bob online | | sam online | | mary online | | | | rob online | carl online | 142 | | | | | | | | | | | | alice online | | 143 | ^|------------ window one ----------- + -------------- window two ----------------- + ------------- window three -------------- + ----------- window four ----------|^ 144 | | 1 online | 1 online | 1 online | 3 online | 145 | | 1643760000000 - 1643760003000 | 1643760005000 - 1643760006000 | 1643760006000 - 1643760009000 | 1643760009000 - 1643760012000 | 146 | */ 147 | 148 | class CountByWindow extends WindowFunction[ServerEvent, String, String, TimeWindow] { 149 | override def apply(key: String, window: TimeWindow, input: Iterable[ServerEvent], out: Collector[String]): Unit = 150 | out.collect(s"$key: $window, ${input.size}") 151 | } 152 | 153 | def demoCountByTypeByWindow(): Unit = { 154 | val finalStream = threeSecondsTumblingWindowByType.apply(new CountByWindow) 155 | finalStream.print() 156 | env.execute() 157 | } 158 | 159 | // alternative: process function for windows 160 | class CountByWindowV2 extends ProcessWindowFunction[ServerEvent, String, String, TimeWindow] { 161 | override def process(key: String, context: Context, elements: Iterable[ServerEvent], out: Collector[String]): Unit = 162 | out.collect(s"$key: ${context.window}, ${elements.size}") 163 | } 164 | 165 | def demoCountByTypeByWindow_v2(): Unit = { 166 | val finalStream = threeSecondsTumblingWindowByType.process(new CountByWindowV2) 167 | finalStream.print() 168 | env.execute() 169 | } 170 | 171 | // one task processes all the data for a particular key 172 | 173 | /** 174 | * Sliding Windows 175 | */ 176 | 177 | // how many players were registered every 3 seconds, UPDATED EVERY 1s? 178 | // [0s...3s] [1s...4s] [2s...5s] ... 
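// sliding windows of size 3s with a 1s slide overlap, so a single event is counted in up to 3 consecutive windows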
179 | 180 | /* 181 | |----0----|----1----|--------2--------|--------3--------|---------4---------|---5---|--------6--------|---7---|--------8--------|--9--|------10-------|------11------| 182 | | | | bob registered | sam registered | sam online | | mary registered | | carl registered | | rob online | carl online | 183 | | | | bob online | | rob registered | | mary online | | | | alice online | | 184 | | | | | | alice registered | | | | | | | | 185 | ^|------------ window one ----------- + 186 | 1 registration 187 | 188 | + ---------------- window two --------------- + 189 | 2 registrations 190 | 191 | + ------------------- window three ------------------- + 192 | 4 registrations 193 | 194 | + ---------------- window four --------------- + 195 | 3 registrations 196 | 197 | + ---------------- window five -------------- + 198 | 3 registrations 199 | 200 | + ---------- window six -------- + 201 | 1 registration 202 | 203 | + ------------ window seven ----------- + 204 | 2 registrations 205 | 206 | + ------- window eight------- + 207 | 1 registration 208 | 209 | + ----------- window nine ----------- + 210 | 1 registration 211 | 212 | + ---------- window ten --------- + 213 | 0 registrations 214 | */ 215 | 216 | def demoSlidingAllWindows(): Unit = { 217 | val windowSize: Time = Time.seconds(3) 218 | val slidingTime: Time = Time.seconds(1) 219 | 220 | val slidingWindowsAll = eventStream.windowAll(SlidingEventTimeWindows.of(windowSize, slidingTime)) 221 | 222 | // process the windowed stream with similar window functions 223 | val registrationCountByWindow = slidingWindowsAll.apply(new CountByWindowAll) 224 | 225 | // similar to the other example 226 | registrationCountByWindow.print() 227 | env.execute() 228 | } 229 | 230 | /** 231 | * Session windows = groups of events with NO MORE THAN a certain time gap in between them 232 | */ 233 | // how many registration events do we have NO MORE THAN 1s apart? 
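// a session window stays open while events keep arriving within the gap; after 1s of silence the window closes and the next event starts a new one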
234 | /* 235 | |----0----|----1----|--------2--------|--------3--------|---------4---------|---5---|--------6--------|---7---|--------8--------|--9--|------10-------|------11------| 236 | | | | bob registered | sam registered | sam online | | mary registered | | carl registered | | rob online | | 237 | | | | bob online | | rob registered | | mary online | | | | alice online | | 238 | | | | | | alice registered | | | | | | carl online | | 239 | 240 | after filtering: 241 | 242 | +---------+---------+-----------------+-----------------+-------------------+-------+-----------------+-------+-----------------+-----+---------------+--------------+ 243 | | | | bob registered | sam registered | rob registered | | mary registered | | carl registered | | N/A | | 244 | | | | | | alice registered | | | | | | | | 245 | ^ ----------------- window 1 -------------------------- ^ ^ -- window 2 --- ^ ^ -- window 3 --- ^ ^ -- window 4 - ^ 246 | */ 247 | 248 | def demoSessionWindows(): Unit = { 249 | val groupBySessionWindows = eventStream.windowAll(EventTimeSessionWindows.withGap(Time.seconds(1))) 250 | 251 | // operate any kind of window function 252 | val countBySessionWindows = groupBySessionWindows.apply(new CountByWindowAll) 253 | 254 | // same things as before 255 | countBySessionWindows.print() 256 | env.execute() 257 | } 258 | 259 | /** 260 | * Global window 261 | */ 262 | // how many registration events do we have every 10 events 263 | 264 | class CountByGlobalWindowAll extends AllWindowFunction[ServerEvent, String, GlobalWindow] { 265 | // ^ input ^ output ^ window type 266 | override def apply(window: GlobalWindow, input: Iterable[ServerEvent], out: Collector[String]): Unit = { 267 | val registrationEventCount = input.count(event => event.isInstanceOf[PlayerRegistered]) 268 | out.collect(s"Window [$window] $registrationEventCount") 269 | } 270 | } 271 | 272 | def demoGlobalWindow(): Unit = { 273 | val globalWindowEvents = eventStream 274 | .windowAll(GlobalWindows.create()) 275 | .trigger(CountTrigger.of[GlobalWindow](10)) 276 | .apply(new CountByGlobalWindowAll) 277 | 278 | globalWindowEvents.print() 279 | env.execute() 280 | } 281 | 282 | /** 283 | * Exercise: what was the time window (continuous 2s) when we had THE MOST registration events? 284 | * - what kind of window functions should we use? ALL WINDOW FUNCTION 285 | * - what kind of windows should we use? 
SLIDING WINDOWS 286 | */ 287 | class KeepWindowAndCountFunction extends AllWindowFunction[ServerEvent, (TimeWindow, Long), TimeWindow] { 288 | override def apply(window: TimeWindow, input: Iterable[ServerEvent], out: Collector[(TimeWindow, Long)]): Unit = 289 | out.collect((window, input.size)) 290 | } 291 | 292 | def windowFunctionsExercise(): Unit = { 293 | val slidingWindows: DataStream[(TimeWindow, Long)] = eventStream 294 | .filter(_.isInstanceOf[PlayerRegistered]) 295 | .windowAll(SlidingEventTimeWindows.of(Time.seconds(2), Time.seconds(1))) 296 | .apply(new KeepWindowAndCountFunction) 297 | 298 | val localWindows: List[(TimeWindow, Long)] = slidingWindows.executeAndCollect().toList 299 | val bestWindow: (TimeWindow, Long) = localWindows.maxBy(_._2) 300 | println(s"The best window is ${bestWindow._1} with ${bestWindow._2} registration events.") 301 | } 302 | 303 | def main(args: Array[String]): Unit = { 304 | windowFunctionsExercise() 305 | } 306 | } 307 | -------------------------------------------------------------------------------- /src/main/scala/part3state/BroadcastState.scala: -------------------------------------------------------------------------------- 1 | package part3state 2 | 3 | import generators.shopping._ 4 | import org.apache.flink.api.common.state.MapStateDescriptor 5 | import org.apache.flink.streaming.api.datastream.BroadcastStream 6 | import org.apache.flink.streaming.api.functions.co.KeyedBroadcastProcessFunction 7 | import org.apache.flink.streaming.api.functions.source.SourceFunction 8 | import org.apache.flink.streaming.api.scala._ 9 | import org.apache.flink.util.Collector 10 | 11 | object BroadcastState { 12 | 13 | val env = StreamExecutionEnvironment.getExecutionEnvironment 14 | val shoppingCartEvents = env.addSource(new SingleShoppingCartEventsGenerator(100)) 15 | val eventsByUser = shoppingCartEvents.keyBy(_.userId) 16 | 17 | // issue a warning if quantity > threshold 18 | def purchaseWarnings(): Unit = { 19 | val threshold = 2 20 | 21 | val notificationsStream = eventsByUser 22 | .filter(_.isInstanceOf[AddToShoppingCartEvent]) 23 | .filter(_.asInstanceOf[AddToShoppingCartEvent].quantity > threshold) 24 | .map(event => event match { 25 | case AddToShoppingCartEvent(userId, sku, quantity, _) => 26 | s"User $userId attempting to purchase $quantity items of $sku when threshold is $threshold" 27 | case _ => "" 28 | }) 29 | 30 | notificationsStream.print() 31 | env.execute() 32 | } 33 | 34 | // ... if the threshold CHANGES over time? 
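// a hard-coded constant cannot be updated once the job is running, so we feed new thresholds in as a second (control) stream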
35 | // thresholds will be BROADCAST 36 | 37 | def changingThresholds(): Unit = { 38 | val thresholds: DataStream[Int] = env.addSource(new SourceFunction[Int] { 39 | override def run(ctx: SourceFunction.SourceContext[Int]) = 40 | List(2,0,4,5,6,3).foreach { newThreshold => 41 | Thread.sleep(1000) 42 | ctx.collect(newThreshold) 43 | } 44 | 45 | override def cancel() = () 46 | }) 47 | 48 | // broadcast state is ALWAYS a map 49 | val broadcastStateDescriptor = new MapStateDescriptor[String, Int]("thresholds", classOf[String], classOf[Int]) 50 | val broadcastThresholds: BroadcastStream[Int] = thresholds.broadcast(broadcastStateDescriptor) 51 | 52 | val notificationsStream = eventsByUser 53 | .connect(broadcastThresholds) 54 | .process(new KeyedBroadcastProcessFunction[String, ShoppingCartEvent, Int, String] { 55 | // ^ key ^ first event ^ broadcast ^ output 56 | val thresholdsDescriptor = new MapStateDescriptor[String, Int]("thresholds", classOf[String], classOf[Int]) 57 | 58 | override def processBroadcastElement( 59 | newThreshold: Int, 60 | ctx: KeyedBroadcastProcessFunction[String, ShoppingCartEvent, Int, String]#Context, 61 | out: Collector[String] 62 | ) = { 63 | println(s"Threshold about to be changed -- $newThreshold") 64 | // fetch the broadcast state = distributed variable 65 | val stateThresholds = ctx.getBroadcastState(thresholdsDescriptor) 66 | // update the state 67 | stateThresholds.put("quantity-threshold", newThreshold) 68 | } 69 | 70 | 71 | override def processElement( 72 | event: ShoppingCartEvent, 73 | ctx: KeyedBroadcastProcessFunction[String, ShoppingCartEvent, Int, String]#ReadOnlyContext, 74 | out: Collector[String] 75 | ) = { 76 | event match { 77 | case AddToShoppingCartEvent(userId, sku, quantity, time) => 78 | val currentThreshold: Int = ctx.getBroadcastState(thresholdsDescriptor).get("quantity-threshold") 79 | if (quantity > currentThreshold) 80 | out.collect(s"User $userId attempting to purchase $quantity items of $sku when threshold is $currentThreshold") 81 | case _ => 82 | } 83 | } 84 | }) 85 | 86 | 87 | notificationsStream.print() 88 | env.execute() 89 | } 90 | 91 | def main(args: Array[String]): Unit = { 92 | changingThresholds() 93 | } 94 | } 95 | -------------------------------------------------------------------------------- /src/main/scala/part3state/Checkpoints.scala: -------------------------------------------------------------------------------- 1 | package part3state 2 | 3 | import generators.shopping._ 4 | import org.apache.flink.api.common.functions.FlatMapFunction 5 | import org.apache.flink.api.common.state.{CheckpointListener, ValueState, ValueStateDescriptor} 6 | import org.apache.flink.runtime.state.{FunctionInitializationContext, FunctionSnapshotContext} 7 | import org.apache.flink.streaming.api.checkpoint.CheckpointedFunction 8 | import org.apache.flink.streaming.api.scala._ 9 | import org.apache.flink.util.Collector 10 | 11 | object Checkpoints { 12 | 13 | val env = StreamExecutionEnvironment.getExecutionEnvironment 14 | 15 | // set checkpoint intervals 16 | env.getCheckpointConfig.setCheckpointInterval(5000) // a checkpoint triggered every 5s 17 | // set checkpoint storage 18 | env.getCheckpointConfig.setCheckpointStorage("file:///Users/daniel/dev/rockthejvm/courses/flink-essentials/checkpoints") 19 | 20 | /* 21 | Keep track of the NUMBER OF AddToCart events PER USER, when quantity > a threshold (e.g. 
managing stock) 22 | Persist the data (state) via checkpoints 23 | */ 24 | 25 | val shoppingCartEvents = 26 | env.addSource(new SingleShoppingCartEventsGenerator(sleepMillisBetweenEvents = 100, generateRemoved = true)) 27 | 28 | val eventsByUser = shoppingCartEvents 29 | .keyBy(_.userId) 30 | .flatMap(new HighQuantityCheckpointedFunction(5)) 31 | 32 | 33 | def main(args: Array[String]): Unit = { 34 | eventsByUser.print() 35 | env.execute() 36 | } 37 | } 38 | 39 | class HighQuantityCheckpointedFunction(val threshold: Long) 40 | extends FlatMapFunction[ShoppingCartEvent, (String, Long)] 41 | with CheckpointedFunction 42 | with CheckpointListener { 43 | 44 | var stateCount: ValueState[Long] = _ // instantiated PER KEY 45 | 46 | override def flatMap(event: ShoppingCartEvent, out: Collector[(String, Long)]): Unit = 47 | event match { 48 | case AddToShoppingCartEvent(userId, _, quantity, _) => 49 | if (quantity > threshold) { 50 | // update state 51 | val newUserEventCount = stateCount.value() + 1 52 | stateCount.update(newUserEventCount) 53 | 54 | // push output 55 | out.collect((userId, newUserEventCount)) 56 | } 57 | case _ => // do nothing 58 | } 59 | 60 | // invoked when the checkpoint is TRIGGERED 61 | override def snapshotState(context: FunctionSnapshotContext): Unit = 62 | println(s"CHECKPOINT AT ${context.getCheckpointTimestamp}") 63 | 64 | // lifecycle method to initialize state (~ open() in RichFunctions) 65 | override def initializeState(context: FunctionInitializationContext): Unit = { 66 | val stateCountDescriptor = new ValueStateDescriptor[Long]("impossibleOrderCount", classOf[Long]) 67 | stateCount = context.getKeyedStateStore.getState(stateCountDescriptor) 68 | } 69 | 70 | override def notifyCheckpointComplete(checkpointId: Long): Unit = () 71 | override def notifyCheckpointAborted(checkpointId: Long): Unit = () 72 | } 73 | -------------------------------------------------------------------------------- /src/main/scala/part3state/KeyedState.scala: -------------------------------------------------------------------------------- 1 | package part3state 2 | 3 | import generators.shopping._ 4 | import org.apache.flink.api.common.state.{ListState, ListStateDescriptor, MapState, MapStateDescriptor, StateTtlConfig, ValueState, ValueStateDescriptor} 5 | import org.apache.flink.api.common.time.Time 6 | import org.apache.flink.configuration.Configuration 7 | import org.apache.flink.streaming.api.functions.KeyedProcessFunction 8 | import org.apache.flink.streaming.api.scala._ 9 | import org.apache.flink.util.Collector 10 | 11 | object KeyedState { 12 | 13 | val env = StreamExecutionEnvironment.getExecutionEnvironment 14 | val shoppingCartEvents = env.addSource( 15 | new SingleShoppingCartEventsGenerator( 16 | sleepMillisBetweenEvents = 100, // ~ 10 events/s 17 | generateRemoved = true 18 | ) 19 | ) 20 | 21 | val eventsPerUser: KeyedStream[ShoppingCartEvent, String] = shoppingCartEvents.keyBy(_.userId) 22 | 23 | def demoValueState(): Unit = { 24 | /* 25 | How many events PER USER have been generated? 
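First with a naive local variable (lost on failure and invisible to other task slots), then with Flink-managed ValueState, which is kept per key and can be checkpointed/restored.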
26 | */ 27 | 28 | val numEventsPerUserNaive = eventsPerUser.process( 29 | new KeyedProcessFunction[String, ShoppingCartEvent, String] { // instantiated ONCE PER KEY 30 | // ^ key ^ event ^ result 31 | 32 | var nEventsForThisUser = 0 33 | 34 | override def processElement( 35 | value: ShoppingCartEvent, 36 | ctx: KeyedProcessFunction[String, ShoppingCartEvent, String]#Context, 37 | out: Collector[String] 38 | ): Unit = { 39 | nEventsForThisUser += 1 40 | out.collect(s"User ${value.userId} - $nEventsForThisUser") 41 | } 42 | } 43 | ) 44 | 45 | /* 46 | Problems with local vars 47 | - they are local, so other nodes don't see them 48 | - if a node crashes, the var disappears 49 | */ 50 | 51 | val numEventsPerUserStream = eventsPerUser.process( 52 | new KeyedProcessFunction[String, ShoppingCartEvent, String] { 53 | 54 | // can call .value to get current state 55 | // can call .update(newValue) to overwrite 56 | var stateCounter: ValueState[Long] = _ // a value state per key=userId 57 | 58 | override def open(parameters: Configuration): Unit = { 59 | // initialize all state 60 | stateCounter = getRuntimeContext // from RichFunction 61 | .getState(new ValueStateDescriptor[Long]("events-counter", classOf[Long])) 62 | } 63 | 64 | override def processElement( 65 | value: ShoppingCartEvent, 66 | ctx: KeyedProcessFunction[String, ShoppingCartEvent, String]#Context, 67 | out: Collector[String] 68 | ) = { 69 | val nEventsForThisUser = stateCounter.value() 70 | stateCounter.update(nEventsForThisUser + 1) 71 | out.collect(s"User ${value.userId} - ${nEventsForThisUser + 1}") 72 | } 73 | } 74 | ) 75 | 76 | numEventsPerUserStream.print() 77 | env.execute() 78 | } 79 | 80 | // ListState 81 | def demoListState(): Unit = { 82 | // store all the events per user id 83 | val allEventsPerUserStream = eventsPerUser.process( 84 | new KeyedProcessFunction[String, ShoppingCartEvent, String] { 85 | // create state here 86 | /* 87 | Capabilities 88 | - add(value) 89 | - addAll(list) 90 | - update(new list) - overwriting 91 | - get() 92 | */ 93 | var stateEventsForUser: ListState[ShoppingCartEvent] = _ // once per key 94 | // you need to be careful to keep the size of the list BOUNDED 95 | 96 | // initialization of state here 97 | override def open(parameters: Configuration): Unit = 98 | stateEventsForUser = getRuntimeContext.getListState( 99 | new ListStateDescriptor[ShoppingCartEvent]("shopping-cart-events", classOf[ShoppingCartEvent]) 100 | ) 101 | 102 | override def processElement( 103 | event: ShoppingCartEvent, 104 | ctx: KeyedProcessFunction[String, ShoppingCartEvent, String]#Context, 105 | out: Collector[String] 106 | ) = { 107 | stateEventsForUser.add(event) 108 | // import the Scala converters for collections 109 | // Scala 2.12 110 | import scala.collection.JavaConverters._ // implicit converters (extension methods) 111 | // Scala 2.13 & Scala 3 112 | // import scala.jdk.CollectionConverters._ 113 | 114 | val currentEvents: Iterable[ShoppingCartEvent] = stateEventsForUser.get() // does not return a plain List, but a Java Iterable 115 | .asScala // convert to a Scala Iterable 116 | 117 | out.collect(s"User ${event.userId} - [${currentEvents.mkString(", ")}]") 118 | } 119 | } 120 | ) 121 | 122 | allEventsPerUserStream.print() 123 | env.execute() 124 | } 125 | 126 | // MapState 127 | def demoMapState(): Unit = { 128 | // count how many events PER TYPE were ingested PER USER 129 | val streamOfCountsPerType = eventsPerUser.process( 130 | new KeyedProcessFunction[String, ShoppingCartEvent, String] { 131 | // Scala 
collection converters 132 | import scala.collection.JavaConverters._ // implicit converters (extension methods) 133 | 134 | // create the state 135 | var stateCountsPerEventType: MapState[String, Long] = _ // keep this bounded 136 | 137 | // initialize the state 138 | override def open(parameters: Configuration): Unit = { 139 | stateCountsPerEventType = getRuntimeContext.getMapState( 140 | new MapStateDescriptor[String, Long]( 141 | "per-type-counter", 142 | classOf[String], 143 | classOf[Long] 144 | ) 145 | ) 146 | } 147 | 148 | override def processElement( 149 | event: ShoppingCartEvent, 150 | ctx: KeyedProcessFunction[String, ShoppingCartEvent, String]#Context, 151 | out: Collector[String] 152 | ) = { 153 | // fetch the type of the event 154 | val eventType = event.getClass.getSimpleName 155 | // updating the state 156 | if (stateCountsPerEventType.contains(eventType)) { 157 | val oldCount = stateCountsPerEventType.get(eventType) 158 | val newCount = oldCount + 1 159 | stateCountsPerEventType.put(eventType, newCount) 160 | } else { 161 | stateCountsPerEventType.put(eventType, 1) 162 | } 163 | 164 | // push some output 165 | out.collect(s"${ctx.getCurrentKey} - ${stateCountsPerEventType.entries().asScala.mkString(", ")}") 166 | } 167 | } 168 | ) 169 | 170 | streamOfCountsPerType.print() 171 | env.execute() 172 | } 173 | 174 | // clear the state manually 175 | // clear the state at a regular interval 176 | 177 | def demoListStateWithClearance(): Unit = { 178 | val allEventsPerUserStream = eventsPerUser.process( 179 | new KeyedProcessFunction[String, ShoppingCartEvent, String] { 180 | import scala.collection.JavaConverters._ // implicit converters (extension methods) 181 | 182 | // if more than 10 elements, clear the list 183 | var stateEventsForUser: ListState[ShoppingCartEvent] = _ 184 | 185 | // initialization of state here 186 | override def open(parameters: Configuration): Unit = { 187 | val descriptor = new ListStateDescriptor[ShoppingCartEvent]("shopping-cart-events", classOf[ShoppingCartEvent]) 188 | // time to live = cleared if it's not modified after a certain time 189 | descriptor.enableTimeToLive( 190 | StateTtlConfig.newBuilder(Time.hours(1)) // clears the state after 1h 191 | .setUpdateType(StateTtlConfig.UpdateType.OnCreateAndWrite) // specify when the timer resets 192 | .setStateVisibility(StateTtlConfig.StateVisibility.ReturnExpiredIfNotCleanedUp) 193 | .build() 194 | ) 195 | 196 | stateEventsForUser = getRuntimeContext.getListState(descriptor) 197 | 198 | } 199 | 200 | override def processElement( 201 | event: ShoppingCartEvent, 202 | ctx: KeyedProcessFunction[String, ShoppingCartEvent, String]#Context, 203 | out: Collector[String] 204 | ) = { 205 | stateEventsForUser.add(event) 206 | val currentEvents = stateEventsForUser.get().asScala.toList 207 | if (currentEvents.size > 10) 208 | stateEventsForUser.clear() // clearing is not done immediately 209 | 210 | out.collect(s"User ${event.userId} - [${currentEvents.mkString(", ")}]") 211 | } 212 | } 213 | ) 214 | 215 | allEventsPerUserStream.print() 216 | env.execute() 217 | } 218 | 219 | def main(args: Array[String]): Unit = { 220 | demoListStateWithClearance() 221 | } 222 | } 223 | -------------------------------------------------------------------------------- /src/main/scala/part3state/RichFunctions.scala: -------------------------------------------------------------------------------- 1 | package part3state 2 | 3 | import generators.shopping._ 4 | import org.apache.flink.api.common.functions.{FlatMapFunction, MapFunction, 
RichFlatMapFunction, RichMapFunction} 5 | import org.apache.flink.configuration.Configuration 6 | import org.apache.flink.streaming.api.functions.ProcessFunction 7 | import org.apache.flink.streaming.api.scala._ 8 | import org.apache.flink.util.Collector 9 | 10 | object RichFunctions { 11 | 12 | val env = StreamExecutionEnvironment.getExecutionEnvironment 13 | env.setParallelism(1) 14 | 15 | val numbersStream: DataStream[Int] = env.fromElements(1,2,3,4,5,6) 16 | 17 | // pure FP 18 | val tenxNumbers: DataStream[Int] = numbersStream.map(_ * 10) 19 | 20 | // "explicit" map functions 21 | val tenxNumbers_v2: DataStream[Int] = numbersStream.map(new MapFunction[Int, Int] { 22 | override def map(value: Int) = value * 10 23 | }) 24 | 25 | // Rich Map function 26 | val tenxNumbers_v3: DataStream[Int] = numbersStream.map(new RichMapFunction[Int, Int] { 27 | override def map(value: Int) = value * 10 28 | }) 29 | 30 | // Rich map function + lifecycle methods 31 | val tenxNumbersWithLifecycle: DataStream[Int] = numbersStream.map(new RichMapFunction[Int, Int] { 32 | override def map(value: Int) = value * 10 // mandatory override 33 | 34 | // optional overrides: lifecycle methods open/close 35 | // called BEFORE data goes through 36 | override def open(parameters: Configuration): Unit = 37 | println("Starting my work!!") 38 | 39 | // invoked AFTER all the data 40 | override def close(): Unit = 41 | println("Finishing my work...") 42 | }) 43 | 44 | // ProcessFunction - the most general function abstraction in Flink 45 | val tenxNumbersProcess: DataStream[Int] = numbersStream.process(new ProcessFunction[Int, Int] { 46 | override def processElement(value: Int, ctx: ProcessFunction[Int, Int]#Context, out: Collector[Int]) = 47 | out.collect(value * 10) 48 | 49 | // can also override the lifecycle methods 50 | override def open(parameters: Configuration): Unit = 51 | println("Process function starting") 52 | 53 | override def close(): Unit = 54 | println("Closing process function") 55 | }) 56 | 57 | /** 58 | * Exercise: "explode" all purchase events to a single item 59 | * [("boots", 2), (iPhone, 1)] -> 60 | * ["boots", "boots", iPhone] 61 | * - lambdas 62 | * - explicit functions 63 | * - rich functions 64 | * - process functions 65 | */ 66 | def exercise(): Unit = { 67 | val exerciseEnv = StreamExecutionEnvironment.getExecutionEnvironment 68 | val shoppingCartStream: DataStream[AddToShoppingCartEvent] = exerciseEnv.addSource(new SingleShoppingCartEventsGenerator(100)) // ~10 events/s 69 | .filter(_.isInstanceOf[AddToShoppingCartEvent]) 70 | .map(_.asInstanceOf[AddToShoppingCartEvent]) 71 | 72 | // 1 - lambdas: flatMap 73 | val itemsPurchasedStream: DataStream[String] = 74 | shoppingCartStream.flatMap(event => (1 to event.quantity).map(_ => event.sku)) 75 | 76 | // 2 - explicit flatMap function 77 | val itemsPurchasedStream_v2: DataStream[String] = 78 | shoppingCartStream.flatMap(new FlatMapFunction[AddToShoppingCartEvent, String] { 79 | override def flatMap(event: AddToShoppingCartEvent, out: Collector[String]) = 80 | (1 to event.quantity).map(_ => event.sku).foreach(out.collect) 81 | }) 82 | 83 | // 3 - rich flatMap function 84 | val itemsPurchasedStream_v3: DataStream[String] = 85 | shoppingCartStream.flatMap(new RichFlatMapFunction[AddToShoppingCartEvent, String] { 86 | override def flatMap(event: AddToShoppingCartEvent, out: Collector[String]) = 87 | (1 to event.quantity).map(_ => event.sku).foreach(out.collect) 88 | 89 | override def open(parameters: Configuration): Unit = 90 | println("Processing with 
rich flatMap function") 91 | 92 | override def close(): Unit = 93 | println("Finishing rich flatMap function") 94 | }) 95 | 96 | // 4 - process function 97 | val itemsPurchasedStream_v4: DataStream[String] = 98 | shoppingCartStream.process(new ProcessFunction[AddToShoppingCartEvent, String] { 99 | override def processElement(event: AddToShoppingCartEvent, ctx: ProcessFunction[AddToShoppingCartEvent, String]#Context, out: Collector[String]) = 100 | (1 to event.quantity).map(_ => event.sku).foreach(out.collect) 101 | }) 102 | 103 | itemsPurchasedStream_v3.print() 104 | exerciseEnv.execute() 105 | } 106 | 107 | 108 | def main(args: Array[String]): Unit = { 109 | exercise() 110 | } 111 | } 112 | -------------------------------------------------------------------------------- /src/main/scala/part4io/CassandraIntegration.scala: -------------------------------------------------------------------------------- 1 | package part4io 2 | 3 | import org.apache.flink.streaming.api.scala._ 4 | import org.apache.flink.streaming.connectors.cassandra.CassandraSink 5 | 6 | object CassandraIntegration { 7 | 8 | val env = StreamExecutionEnvironment.getExecutionEnvironment 9 | 10 | case class Person(name: String, age: Int) 11 | 12 | // write data to Cassandra 13 | def demoWriteDataToCassandra(): Unit = { 14 | val people = env.fromElements( 15 | Person("Daniel", 99), 16 | Person("Alice", 12), 17 | Person("Julie", 14), 18 | Person("Mom", 54), 19 | ) 20 | 21 | // we can only write TUPLES to Cassandra 22 | val personTuples: DataStream[(String, Int)] = people.map(p => (p.name, p.age)) 23 | 24 | // write the data 25 | CassandraSink.addSink(personTuples) // builder pattern 26 | .setQuery("insert into rtjvm.people(name, age) values (?, ?)") 27 | .setHost("localhost") 28 | .build() 29 | 30 | env.execute() 31 | } 32 | 33 | 34 | def main(args: Array[String]): Unit = { 35 | demoWriteDataToCassandra() 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /src/main/scala/part4io/CustomSinks.scala: -------------------------------------------------------------------------------- 1 | package part4io 2 | 3 | import org.apache.flink.configuration.Configuration 4 | import org.apache.flink.streaming.api.functions.sink.{RichSinkFunction, SinkFunction} 5 | import org.apache.flink.streaming.api.scala._ 6 | 7 | import java.io.{FileWriter, PrintWriter} 8 | import java.net.{ServerSocket, Socket} 9 | import java.util.Scanner 10 | 11 | object CustomSinks { 12 | 13 | val env = StreamExecutionEnvironment.getExecutionEnvironment 14 | val stringStream: DataStream[String] = env.fromElements( 15 | "This is an example of a sink function", 16 | "some other string", 17 | "Daniel says this is ok" 18 | ) 19 | 20 | // push the strings to a file sink 21 | 22 | // instantiated once per thread 23 | class FileSink(path: String) extends RichSinkFunction[String] { 24 | /* 25 | - hold state 26 | - lifecycle methods 27 | */ 28 | 29 | var writer: PrintWriter = _ 30 | 31 | // called once per event in the datastream 32 | override def invoke(event: String, context: SinkFunction.Context): Unit = { 33 | writer.println(event) 34 | writer.flush() 35 | } 36 | 37 | override def open(parameters: Configuration): Unit = { 38 | // initialize resources 39 | writer = new PrintWriter(new FileWriter(path, true)) // append mode 40 | } 41 | 42 | override def close(): Unit = { 43 | // close resources 44 | writer.close() 45 | } 46 | } 47 | 48 | def demoFileSink(): Unit = { 49 | stringStream.addSink(new 
FileSink("output/demoFileSink.txt")) 50 | stringStream.print() 51 | env.execute() 52 | } 53 | 54 | /** 55 | * Create a sink function that will push data (as strings) to a socket sink. 56 | */ 57 | class SocketSink(host: String, port: Int) extends RichSinkFunction[String] { 58 | var socket: Socket = _ 59 | var writer: PrintWriter = _ 60 | 61 | override def invoke(value: String, context: SinkFunction.Context): Unit = { 62 | writer.println(value) 63 | writer.flush() 64 | } 65 | 66 | override def open(parameters: Configuration): Unit = { 67 | socket = new Socket(host, port) 68 | writer = new PrintWriter(socket.getOutputStream) 69 | } 70 | 71 | override def close(): Unit = { 72 | socket.close() // closes the writer as well 73 | } 74 | } 75 | 76 | def demoSocketSink(): Unit = { 77 | stringStream.addSink(new SocketSink("localhost", 12345)).setParallelism(1) 78 | stringStream.print() 79 | env.execute() 80 | } 81 | 82 | def main(args: Array[String]): Unit = { 83 | demoSocketSink() 84 | } 85 | } 86 | 87 | /* 88 | - start data receiver 89 | - start flink 90 | */ 91 | object DataReceiver { 92 | def main(args: Array[String]): Unit = { 93 | val server = new ServerSocket(12345) 94 | println("Waiting for Flink to connect...") 95 | val socket = server.accept() 96 | val reader = new Scanner(socket.getInputStream) 97 | println("Flink connected. Reading...") 98 | 99 | while (reader.hasNextLine) { 100 | println(s"> ${reader.nextLine()}") 101 | } 102 | 103 | socket.close() 104 | println("All data read. Closing app.") 105 | server.close() 106 | } 107 | } 108 | -------------------------------------------------------------------------------- /src/main/scala/part4io/CustomSources.scala: -------------------------------------------------------------------------------- 1 | package part4io 2 | 3 | import org.apache.flink.configuration.Configuration 4 | import org.apache.flink.streaming.api.functions.source.{RichParallelSourceFunction, RichSourceFunction, SourceFunction} 5 | import org.apache.flink.streaming.api.scala._ 6 | 7 | import java.io.PrintStream 8 | import java.net.{ServerSocket, Socket} 9 | import java.util.Scanner 10 | import scala.util.Random 11 | 12 | object CustomSources { 13 | val env = StreamExecutionEnvironment.getExecutionEnvironment 14 | 15 | // source of numbers, randomly generated 16 | class RandomNumberGeneratorSource(minEventsPerSeconds: Double) 17 | extends RichParallelSourceFunction[Long] { 18 | 19 | // create local fields/methods 20 | val maxSleepTime = (1000 / minEventsPerSeconds).toLong 21 | var isRunning: Boolean = true 22 | 23 | // called ONCE, when the function is instantiated 24 | // SourceFunction/RichSourceFunction runs on a (single) dedicated thread 25 | 26 | // Parallel function is called ONCE PER THREAD, each instance has its own thread 27 | override def run(ctx: SourceFunction.SourceContext[Long]): Unit = 28 | while (isRunning) { 29 | val sleepTime = Math.abs(Random.nextLong() % maxSleepTime) 30 | val nextNumber = Random.nextLong() 31 | Thread.sleep(sleepTime) 32 | 33 | // push something to the output 34 | ctx.collect(nextNumber) 35 | } 36 | 37 | // called at application shutdown 38 | // contract: the run method should stop immediately 39 | override def cancel(): Unit = 40 | isRunning = false 41 | 42 | // capability of lifecycle methods - initialize state ... 
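// open() is called once per parallel source instance before run(), and close() after the source stops - the right place to acquire and release resources (connections, files, etc.)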
43 | override def open(parameters: Configuration): Unit = 44 | println(s"[${Thread.currentThread().getName}] starting source function") 45 | override def close(): Unit = 46 | println(s"[${Thread.currentThread().getName}] closing source function") 47 | 48 | // can hold state - ValueState, ListState, MapState 49 | } 50 | 51 | def demoSourceFunction(): Unit = { 52 | val numbersStream: DataStream[Long] = env.addSource(new RandomNumberGeneratorSource(10)).setParallelism(10) 53 | numbersStream.print() 54 | env.execute() 55 | } 56 | 57 | /** 58 | * Create a source function that reads data from a socket. 59 | */ 60 | 61 | class SocketStringSource(host: String, port: Int) extends RichSourceFunction[String] { 62 | // whenever you manage a resource, use a RichSourceFunction 63 | var socket: Socket = _ 64 | var isRunning = true 65 | 66 | override def run(ctx: SourceFunction.SourceContext[String]): Unit = { 67 | val scanner = new Scanner(socket.getInputStream) 68 | while (isRunning && scanner.hasNextLine) { 69 | ctx.collect(scanner.nextLine()) 70 | } 71 | } 72 | 73 | override def cancel(): Unit = 74 | isRunning = false 75 | 76 | override def open(parameters: Configuration): Unit = 77 | socket = new Socket(host, port) 78 | 79 | override def close(): Unit = 80 | socket.close() 81 | } 82 | 83 | def demoSocketSource(): Unit = { 84 | val socketStringStream = env.addSource(new SocketStringSource("localhost", 12345)) 85 | socketStringStream.print() 86 | env.execute() 87 | } 88 | 89 | def main(args: Array[String]): Unit = { 90 | demoSocketSource() 91 | } 92 | } 93 | 94 | /* 95 | - start DataSender 96 | - start Flink 97 | - DataSender -> Flink 98 | */ 99 | 100 | object DataSender { 101 | def main(args: Array[String]): Unit = { 102 | val serverSocket = new ServerSocket(12345) 103 | println("Waiting for Flink to connect...") 104 | 105 | val socket = serverSocket.accept() 106 | println("Flink connected. Sending data...") 107 | 108 | val printer = new PrintStream(socket.getOutputStream) 109 | printer.println("Hello from the other side...") 110 | Thread.sleep(3000) 111 | printer.println("Almost ready...") 112 | Thread.sleep(500) 113 | (1 to 10).foreach { i => 114 | Thread.sleep(200) 115 | printer.println(s"Number $i") 116 | } 117 | 118 | println("Data sending completed.") 119 | serverSocket.close() 120 | } 121 | } -------------------------------------------------------------------------------- /src/main/scala/part4io/JDBCIntegration.scala: -------------------------------------------------------------------------------- 1 | package part4io 2 | 3 | import org.apache.flink.connector.jdbc.{JdbcConnectionOptions, JdbcSink, JdbcStatementBuilder} 4 | import org.apache.flink.streaming.api.scala._ 5 | 6 | import java.sql.PreparedStatement 7 | 8 | object JDBCIntegration { 9 | 10 | val env = StreamExecutionEnvironment.getExecutionEnvironment 11 | 12 | case class Person(name: String, age: Int) 13 | 14 | // write data to JDBC 15 | def demoWriteToJDBC(): Unit = { 16 | val people = env.fromElements( 17 | Person("Daniel", 99), 18 | Person("Alice", 1), 19 | Person("Bob", 10), 20 | Person("Mary Jane", 43) 21 | ) 22 | 23 | val jdbcSink = JdbcSink.sink[Person]( 24 | // 1 - SQL statement 25 | "insert into people (name, age) values (?, ?)", 26 | new JdbcStatementBuilder[Person] { // the way to expand the wildcards with actual values 27 | override def accept(statement: PreparedStatement, person: Person): Unit = { 28 | statement.setString(1, person.name) // the first ? 
is replaced with person.name 29 | statement.setInt(2, person.age) // similar 30 | } 31 | }, 32 | new JdbcConnectionOptions.JdbcConnectionOptionsBuilder() 33 | .withUrl("jdbc:postgresql://localhost:5432/rtjvm") 34 | .withDriverName("org.postgresql.Driver") 35 | .withUsername("docker") 36 | .withPassword("docker") 37 | .build() 38 | ) 39 | 40 | // push the data through the sink 41 | people.addSink(jdbcSink) 42 | people.print() 43 | env.execute() 44 | } 45 | 46 | def main(args: Array[String]): Unit = { 47 | demoWriteToJDBC() 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /src/main/scala/part4io/KafkaIntegration.scala: -------------------------------------------------------------------------------- 1 | package part4io 2 | 3 | import org.apache.commons.lang3.CharSet 4 | import org.apache.flink.api.common.eventtime.WatermarkStrategy 5 | import org.apache.flink.api.common.serialization.{DeserializationSchema, SerializationSchema, SimpleStringSchema} 6 | import org.apache.flink.api.common.typeinfo.TypeInformation 7 | import org.apache.flink.connector.kafka.source.KafkaSource 8 | import org.apache.flink.connector.kafka.source.enumerator.initializer.OffsetsInitializer 9 | import org.apache.flink.streaming.api.scala._ 10 | import org.apache.flink.streaming.connectors.kafka.FlinkKafkaProducer 11 | 12 | object KafkaIntegration { 13 | 14 | val env = StreamExecutionEnvironment.getExecutionEnvironment 15 | 16 | // read simple data (strings) from a Kafka topic 17 | def readStrings(): Unit = { 18 | val kafkaSource = KafkaSource.builder[String]() 19 | .setBootstrapServers("localhost:9092") 20 | .setTopics("events") 21 | .setGroupId("events-group") 22 | .setStartingOffsets(OffsetsInitializer.earliest()) 23 | .setValueOnlyDeserializer(new SimpleStringSchema()) 24 | .build() 25 | 26 | val kafkaStrings: DataStream[String] = env.fromSource(kafkaSource, WatermarkStrategy.noWatermarks(), "Kafka Source") 27 | 28 | // use the DS 29 | kafkaStrings.print() 30 | env.execute() 31 | } 32 | 33 | // read custom data 34 | case class Person(name: String, age: Int) 35 | class PersonDeserializer extends DeserializationSchema[Person] { 36 | override def deserialize(message: Array[Byte]): Person = { 37 | // format: name,age 38 | val string = new String(message) 39 | val tokens = string.split(",") 40 | val name = tokens(0) 41 | val age = tokens(1) 42 | Person(name, age.toInt) 43 | } 44 | 45 | override def isEndOfStream(nextElement: Person): Boolean = false 46 | 47 | override def getProducedType: TypeInformation[Person] = implicitly[TypeInformation[Person]] 48 | } 49 | 50 | def readCustomData(): Unit = { 51 | val kafkaSource = KafkaSource.builder[Person]() 52 | .setBootstrapServers("localhost:9092") 53 | .setTopics("people") 54 | .setGroupId("people-group") 55 | .setStartingOffsets(OffsetsInitializer.earliest()) 56 | .setValueOnlyDeserializer(new PersonDeserializer) 57 | .build() 58 | 59 | val kafkaPeople: DataStream[Person] = env.fromSource(kafkaSource, WatermarkStrategy.noWatermarks(), "Kafka Source") 60 | 61 | // use the DS 62 | kafkaPeople.print() 63 | env.execute() 64 | } 65 | 66 | // write custom data 67 | // need serializer 68 | class PersonSerializer extends SerializationSchema[Person] { 69 | override def serialize(person: Person): Array[Byte] = 70 | s"${person.name},${person.age}".getBytes("UTF-8") 71 | } 72 | 73 | def writeCustomData(): Unit = { 74 | val kafkaSink = new FlinkKafkaProducer[Person]( 75 | "localhost:9092", // bootstrap server 76 | "people", // topic 77 | 
new PersonSerializer 78 | ) 79 | 80 | val peopleStream = env.fromElements( 81 | Person("Alice", 10), 82 | Person("Bob", 11), 83 | Person("Charlie", 12), 84 | ) 85 | 86 | peopleStream.addSink(kafkaSink) 87 | peopleStream.print() 88 | env.execute() 89 | } 90 | 91 | def main(args: Array[String]): Unit = { 92 | writeCustomData() 93 | } 94 | } 95 | -------------------------------------------------------------------------------- /src/main/scala/part4io/SideOutputs.scala: -------------------------------------------------------------------------------- 1 | package part4io 2 | 3 | import generators.shopping._ 4 | import org.apache.flink.streaming.api.functions.ProcessFunction 5 | import org.apache.flink.streaming.api.scala._ 6 | import org.apache.flink.util.Collector 7 | 8 | object SideOutputs { 9 | 10 | // shopping cart events 11 | // process this in 2 different ways with the same function 12 | // e.g. events for user "Alice", and all the events of everyone else 13 | 14 | val env = StreamExecutionEnvironment.getExecutionEnvironment 15 | val shoppingCartEvents = env.addSource(new SingleShoppingCartEventsGenerator(100)) 16 | 17 | // output tags - only available for ProcessFunctions 18 | val aliceTag = new OutputTag[ShoppingCartEvent]("alice-events") // name should be unique 19 | 20 | class AliceEventsFunction extends ProcessFunction[ShoppingCartEvent, ShoppingCartEvent] { 21 | override def processElement( 22 | event: ShoppingCartEvent, 23 | ctx: ProcessFunction[ShoppingCartEvent, ShoppingCartEvent]#Context, 24 | out: Collector[ShoppingCartEvent] // "primary" destination 25 | ): Unit = { 26 | if (event.userId == "Alice") { 27 | ctx.output(aliceTag, event) // collecting an event through a secondary destination 28 | } else { 29 | out.collect(event) 30 | } 31 | } 32 | } 33 | 34 | def demoSideOutput(): Unit = { 35 | val allEventsButAlices: DataStream[ShoppingCartEvent] = shoppingCartEvents.process(new AliceEventsFunction) 36 | val alicesEvents: DataStream[ShoppingCartEvent] = allEventsButAlices.getSideOutput(aliceTag) 37 | 38 | // process the datastreams separately 39 | alicesEvents.print() 40 | env.execute() 41 | } 42 | 43 | def main(args: Array[String]): Unit = { 44 | demoSideOutput() 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /src/main/scala/playground/Playground.scala: -------------------------------------------------------------------------------- 1 | package playground 2 | 3 | import org.apache.flink.streaming.api.scala._ 4 | 5 | /** 6 | * Probably the simplest Flink application imaginable. 7 | * Run this app when you first download the repository of the course. 8 | * If the app compiles, runs and prints something, then Flink is installed in your project and you're good to go. 9 | * 10 | * Feel free to modify this app as you see fit. Practice and play with the concepts you learn in the course. 11 | */ 12 | object Playground { 13 | 14 | def main(args: Array[String]): Unit = { 15 | val env = StreamExecutionEnvironment.getExecutionEnvironment 16 | val data = env.fromElements(1 to 1000: _*) 17 | data.print() 18 | env.execute() 19 | } 20 | } 21 | --------------------------------------------------------------------------------