├── .bsp
│   └── sbt.json
├── .gitignore
├── .idea
│   ├── .gitignore
│   ├── codeStyles
│   │   ├── Project.xml
│   │   └── codeStyleConfig.xml
│   ├── misc.xml
│   ├── modules.xml
│   ├── modules
│   │   ├── flink-essentials-build.iml
│   │   └── flink-essentials.iml
│   ├── runConfigurations.xml
│   ├── sbt.xml
│   ├── scala_compiler.xml
│   └── vcs.xml
├── README.md
├── build.sbt
├── docker
│   ├── cassandra
│   │   ├── cql.sh
│   │   └── docker-compose.yml
│   ├── flink
│   │   ├── application-cluster
│   │   │   └── docker-compose.yml
│   │   └── session-cluster
│   │       └── docker-compose.yml
│   ├── kafka
│   │   └── docker-compose.yml
│   └── postgres
│       └── docker-compose.yml
├── project
│   └── build.properties
└── src
    └── main
        ├── resources
        │   └── logback.xml
        └── scala
            ├── generators
            │   ├── gaming
            │   │   └── gaming.scala
            │   └── shopping
            │       └── package.scala
            ├── part1recap
            │   └── ScalaRecap.scala
            ├── part2datastreams
            │   ├── EssentialStreams.scala
            │   ├── MultipleStreams.scala
            │   ├── Partitions.scala
            │   ├── TimeBasedTransformations.scala
            │   ├── Triggers.scala
            │   └── WindowFunctions.scala
            ├── part3state
            │   ├── BroadcastState.scala
            │   ├── Checkpoints.scala
            │   ├── KeyedState.scala
            │   └── RichFunctions.scala
            ├── part4io
            │   ├── CassandraIntegration.scala
            │   ├── CustomSinks.scala
            │   ├── CustomSources.scala
            │   ├── JDBCIntegration.scala
            │   ├── KafkaIntegration.scala
            │   └── SideOutputs.scala
            └── playground
                └── Playground.scala
/.bsp/sbt.json:
--------------------------------------------------------------------------------
1 | {"name":"sbt","version":"1.6.2","bspVersion":"2.0.0-M5","languages":["scala"],"argv":["/Users/daniel/Library/Java/JavaVirtualMachines/adopt-openjdk-11.0.11/Contents/Home/bin/java","-Xms100m","-Xmx100m","-classpath","/Users/daniel/Library/Application Support/JetBrains/IdeaIC2021.2/plugins/Scala/launcher/sbt-launch.jar","xsbt.boot.Boot","-bsp","--sbt-launch-jar=/Users/daniel/Library/Application%20Support/JetBrains/IdeaIC2021.2/plugins/Scala/launcher/sbt-launch.jar"]}
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | ## output dirs
2 | output/
3 | checkpoints/
4 |
5 | # Created by https://www.toptal.com/developers/gitignore/api/intellij,sbt,scala,java
6 | # Edit at https://www.toptal.com/developers/gitignore?templates=intellij,sbt,scala,java
7 |
8 | ### Intellij ###
9 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider
10 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
11 |
12 | # User-specific stuff
13 | .idea/**/workspace.xml
14 | .idea/**/tasks.xml
15 | .idea/**/usage.statistics.xml
16 | .idea/**/dictionaries
17 | .idea/**/shelf
18 |
19 | # AWS User-specific
20 | .idea/**/aws.xml
21 |
22 | # Generated files
23 | .idea/**/contentModel.xml
24 |
25 | # Sensitive or high-churn files
26 | .idea/**/dataSources/
27 | .idea/**/dataSources.ids
28 | .idea/**/dataSources.local.xml
29 | .idea/**/sqlDataSources.xml
30 | .idea/**/dynamic.xml
31 | .idea/**/uiDesigner.xml
32 | .idea/**/dbnavigator.xml
33 |
34 | # Gradle
35 | .idea/**/gradle.xml
36 | .idea/**/libraries
37 |
38 | # Gradle and Maven with auto-import
39 | # When using Gradle or Maven with auto-import, you should exclude module files,
40 | # since they will be recreated, and may cause churn. Uncomment if using
41 | # auto-import.
42 | # .idea/artifacts
43 | # .idea/compiler.xml
44 | # .idea/jarRepositories.xml
45 | # .idea/modules.xml
46 | # .idea/*.iml
47 | # .idea/modules
48 | # *.iml
49 | # *.ipr
50 |
51 | # CMake
52 | cmake-build-*/
53 |
54 | # Mongo Explorer plugin
55 | .idea/**/mongoSettings.xml
56 |
57 | # File-based project format
58 | *.iws
59 |
60 | # IntelliJ
61 | out/
62 |
63 | # mpeltonen/sbt-idea plugin
64 | .idea_modules/
65 |
66 | # JIRA plugin
67 | atlassian-ide-plugin.xml
68 |
69 | # Cursive Clojure plugin
70 | .idea/replstate.xml
71 |
72 | # SonarLint plugin
73 | .idea/sonarlint/
74 |
75 | # Crashlytics plugin (for Android Studio and IntelliJ)
76 | com_crashlytics_export_strings.xml
77 | crashlytics.properties
78 | crashlytics-build.properties
79 | fabric.properties
80 |
81 | # Editor-based Rest Client
82 | .idea/httpRequests
83 |
84 | # Android studio 3.1+ serialized cache file
85 | .idea/caches/build_file_checksums.ser
86 |
87 | ### Intellij Patch ###
88 | # Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721
89 |
90 | # *.iml
91 | # modules.xml
92 | # .idea/misc.xml
93 | # *.ipr
94 |
95 | # Sonarlint plugin
96 | # https://plugins.jetbrains.com/plugin/7973-sonarlint
97 | .idea/**/sonarlint/
98 |
99 | # SonarQube Plugin
100 | # https://plugins.jetbrains.com/plugin/7238-sonarqube-community-plugin
101 | .idea/**/sonarIssues.xml
102 |
103 | # Markdown Navigator plugin
104 | # https://plugins.jetbrains.com/plugin/7896-markdown-navigator-enhanced
105 | .idea/**/markdown-navigator.xml
106 | .idea/**/markdown-navigator-enh.xml
107 | .idea/**/markdown-navigator/
108 |
109 | # Cache file creation bug
110 | # See https://youtrack.jetbrains.com/issue/JBR-2257
111 | .idea/$CACHE_FILE$
112 |
113 | # CodeStream plugin
114 | # https://plugins.jetbrains.com/plugin/12206-codestream
115 | .idea/codestream.xml
116 |
117 | ### Java ###
118 | # Compiled class file
119 | *.class
120 |
121 | # Log file
122 | *.log
123 |
124 | # BlueJ files
125 | *.ctxt
126 |
127 | # Mobile Tools for Java (J2ME)
128 | .mtj.tmp/
129 |
130 | # Package Files #
131 | *.jar
132 | *.war
133 | *.nar
134 | *.ear
135 | *.zip
136 | *.tar.gz
137 | *.rar
138 |
139 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml
140 | hs_err_pid*
141 | replay_pid*
142 |
143 | ### SBT ###
144 | # Simple Build Tool
145 | # http://www.scala-sbt.org/release/docs/Getting-Started/Directories.html#configuring-version-control
146 |
147 | dist/*
148 | target/
149 | lib_managed/
150 | src_managed/
151 | project/boot/
152 | project/plugins/project/
153 | .history
154 | .cache
155 | .lib/
156 |
157 | ### Scala ###
158 |
159 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml
160 |
161 | # End of https://www.toptal.com/developers/gitignore/api/intellij,sbt,scala,java
162 |
--------------------------------------------------------------------------------
/.idea/.gitignore:
--------------------------------------------------------------------------------
1 | # Default ignored files
2 | /shelf/
3 | /workspace.xml
4 |
--------------------------------------------------------------------------------
/.idea/codeStyles/Project.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/codeStyles/codeStyleConfig.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/modules/flink-essentials-build.iml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/modules/flink-essentials.iml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/runConfigurations.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/sbt.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/scala_compiler.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # The official repository for the Rock the JVM Flink course for Scala developers
2 |
3 | This repository contains the code we wrote during [Rock the JVM's Flink course](https://rockthejvm.com/course/flink). Unless explicitly mentioned, the code in this repository is exactly what was caught on camera.
4 |
5 | ## How to install
6 |
7 | - install [IntelliJ IDEA](https://jetbrains.com/idea)
8 | - install [Docker](https://www.docker.com/products/docker-desktop)
9 | - either clone the repo or download as zip
10 | - open with IntelliJ as an SBT project
11 | - (optional) in the `docker` directory, navigate to each subdirectory (except for `flink`) and run `docker-compose up` (see the example below)
12 |
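For example, to spin up the Kafka containers used in the integration lectures (the `cassandra` and `postgres` directories work the same way):

```
cd docker/kafka
docker-compose up
```
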
13 | ### How to start
14 |
15 | Clone this repository and checkout the `start` tag by running the following in the repo folder:
16 |
17 | ```
18 | git checkout start
19 | ```
20 |
21 | To see the final code, run:
22 |
23 | ```
24 | git checkout master
25 | ```
26 |
27 | ### How to run an intermediate state
28 |
29 | The repository was built while recording the lectures. Before each lecture, I tagged the corresponding commit so you can easily go back to an earlier state of the repo!
30 |
31 | The tags are as follows:
32 |
33 | - `start`
34 | - `1.1-scala-recap`
35 | - `2.1-essential-streams`
36 | - `2.2-essential-streams-exercise`
37 | - `2.3-essential-streams-explicit`
38 | - `2.5-window-functions`
39 | - `2.6-window-functions-part-2`
40 | - `2.7-window-functions-exercise`
41 | - `2.8-time-based-transformations`
42 | - `2.9-triggers`
43 | - `2.10-multiple-streams`
44 | - `2.11-partitions`
45 | - `3.2-rich-functions`
46 | - `3.3-keyed-state`
47 | - `3.4-keyed-state-2`
48 | - `3.5-broadcast-state`
49 | - `3.6-checkpoint`
50 | - `4.1-kafka`
51 | - `4.2-jdbc`
52 | - `4.3-cassandra`
53 | - `4.4-source-functions`
54 | - `4.5-custom-sinks`
55 | - `4.6-side-outputs`
56 |
57 | When you watch a lecture, you can `git checkout` the appropriate tag and the repo will go back to the exact code I had when I started the lecture.
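For example, to get the code exactly as it was at the start of the triggers lecture:

```
git checkout 2.9-triggers
```
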
58 |
59 | ### For questions or suggestions
60 |
61 | If you have changes to suggest to this repo, you can:
62 | - submit a GitHub issue
63 | - tell me in the course Q/A forum
64 | - submit a pull request!
65 |
--------------------------------------------------------------------------------
/build.sbt:
--------------------------------------------------------------------------------
1 | name := "flink-essentials"
2 |
3 | version := "0.1"
4 |
5 | scalaVersion := "2.12.15"
6 |
7 | val flinkVersion = "1.13.2"
8 | val postgresVersion = "42.2.2"
9 | val logbackVersion = "1.2.10"
10 |
11 | val flinkDependencies = Seq(
12 | "org.apache.flink" %% "flink-clients" % flinkVersion,
13 | "org.apache.flink" %% "flink-scala" % flinkVersion,
14 | "org.apache.flink" %% "flink-streaming-scala" % flinkVersion,
15 | )
16 |
17 | val flinkConnectors = Seq(
18 | "org.apache.flink" %% "flink-connector-kafka" % flinkVersion,
19 | "org.apache.flink" %% "flink-connector-cassandra" % flinkVersion,
20 | "org.apache.flink" %% "flink-connector-jdbc" % flinkVersion,
21 | "org.postgresql" % "postgresql" % postgresVersion
22 | )
23 |
24 | val logging = Seq(
25 | "ch.qos.logback" % "logback-core" % logbackVersion,
26 | "ch.qos.logback" % "logback-classic" % logbackVersion
27 | )
28 |
29 | libraryDependencies ++= flinkDependencies ++ flinkConnectors ++ logging
30 |
--------------------------------------------------------------------------------
/docker/cassandra/cql.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | echo "================== Help for cqlsh ========================="
3 | echo "DESCRIBE tables : Prints all tables in the current keyspace"
4 | echo "DESCRIBE keyspaces : Prints all keyspaces in the current cluster"
5 | echo "DESCRIBE <table>    : Prints table detail information"
6 | echo "help : for more cqlsh commands"
7 | echo "help [cqlsh command] : Gives information about cqlsh commands"
8 | echo "quit : quit"
9 | echo "=================================================================="
10 | docker exec -it rockthejvm-flink-cassandra cqlsh
--------------------------------------------------------------------------------
/docker/cassandra/docker-compose.yml:
--------------------------------------------------------------------------------
1 | version: '2'
2 | services:
3 | cassandra:
4 | image: cassandra:3
5 | container_name: rockthejvm-flink-cassandra
6 | ports:
7 | - "7000:7000"
8 | - "9042:9042"
9 | environment:
10 | - "CASSANDRA_CLUSTER_NAME=OUR_DOCKERIZED_CASSANDRA_SINGLE_NODE_CLUSTER"
--------------------------------------------------------------------------------
/docker/flink/application-cluster/docker-compose.yml:
--------------------------------------------------------------------------------
1 | version: "2.2"
2 | services:
3 | jobmanager:
4 | image: flink:latest
5 | ports:
6 | - "8081:8081"
7 | command: standalone-job --job-classname com.job.ClassName
8 | volumes:
9 | - ../artifacts:/opt/flink/usrlib
10 | environment:
11 | - |
12 | FLINK_PROPERTIES=
13 | jobmanager.rpc.address: jobmanager
14 | parallelism.default: 2
15 |
16 | taskmanager:
17 | image: flink:latest
18 | depends_on:
19 | - jobmanager
20 | command: taskmanager
21 | scale: 1
22 | volumes:
23 | - ../artifacts:/opt/flink/usrlib
24 | environment:
25 | - |
26 | FLINK_PROPERTIES=
27 | jobmanager.rpc.address: jobmanager
28 | taskmanager.numberOfTaskSlots: 2
29 | parallelism.default: 2
--------------------------------------------------------------------------------
/docker/flink/session-cluster/docker-compose.yml:
--------------------------------------------------------------------------------
1 | version: "2.2"
2 | services:
3 | jobmanager:
4 | image: flink:latest
5 | ports:
6 | - "8081:8081"
7 | volumes:
8 | - ../artifacts:/opt/flink/usrlib
9 | command: jobmanager
10 | environment:
11 | - |
12 | FLINK_PROPERTIES=
13 | jobmanager.rpc.address: jobmanager
14 |
15 | taskmanager:
16 | image: flink:latest
17 | depends_on:
18 | - jobmanager
19 | command: taskmanager
20 | scale: 1
21 | environment:
22 | - |
23 | FLINK_PROPERTIES=
24 | jobmanager.rpc.address: jobmanager
25 | taskmanager.numberOfTaskSlots: 2
--------------------------------------------------------------------------------
/docker/kafka/docker-compose.yml:
--------------------------------------------------------------------------------
1 | version: '2'
2 | services:
3 | zookeeper:
4 | image: confluentinc/cp-zookeeper:6.2.0
5 | hostname: zookeeper
6 | container_name: rockthejvm-flink-zookeeper
7 | ports:
8 | - "2181:2181"
9 | environment:
10 | ZOOKEEPER_CLIENT_PORT: 2181
11 | ZOOKEEPER_TICK_TIME: 2000
12 |
13 | kafka:
14 | image: confluentinc/cp-kafka:6.2.0
15 | hostname: broker
16 | container_name: rockthejvm-flink-broker
17 | depends_on:
18 | - zookeeper
19 | ports:
20 | - "29092:29092"
21 | - "9092:9092"
22 | - "9101:9101"
23 | environment:
24 | KAFKA_BROKER_ID: 1
25 | KAFKA_ZOOKEEPER_CONNECT: 'zookeeper:2181'
26 | KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: PLAINTEXT:PLAINTEXT,PLAINTEXT_HOST:PLAINTEXT
27 | KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://kafka:29092,PLAINTEXT_HOST://localhost:9092
28 | KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1
29 | KAFKA_TRANSACTION_STATE_LOG_MIN_ISR: 1
30 | KAFKA_TRANSACTION_STATE_LOG_REPLICATION_FACTOR: 1
31 | KAFKA_GROUP_INITIAL_REBALANCE_DELAY_MS: 0
32 | KAFKA_JMX_PORT: 9101
33 | KAFKA_JMX_HOSTNAME: localhost
--------------------------------------------------------------------------------
/docker/postgres/docker-compose.yml:
--------------------------------------------------------------------------------
1 | version: '2'
2 |
3 | services:
4 | postgres:
5 | image: postgres:latest
6 | container_name: rockthejvm-flink-postgres
7 | environment:
8 | - "TZ=Europe/Amsterdam"
9 | - "POSTGRES_USER=docker"
10 | - "POSTGRES_PASSWORD=docker"
11 | ports:
12 | - "5432:5432"
13 |
--------------------------------------------------------------------------------
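
Unlike `docker/cassandra`, there is no helper script here for opening a database shell. If you want one, something along these lines should work against the container and user defined above (a sketch, not part of the repo):

```
docker exec -it rockthejvm-flink-postgres psql -U docker
```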
/project/build.properties:
--------------------------------------------------------------------------------
1 | sbt.version = 1.6.2
--------------------------------------------------------------------------------
/src/main/resources/logback.xml:
--------------------------------------------------------------------------------
7 | %d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n
--------------------------------------------------------------------------------
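
Only the encoder pattern of `logback.xml` survived above. A minimal logback configuration consistent with that pattern might look like the following; this is an assumption for illustration, not necessarily the original file:

```xml
<configuration>
    <appender name="STDOUT" class="ch.qos.logback.core.ConsoleAppender">
        <encoder>
            <pattern>%d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n</pattern>
        </encoder>
    </appender>
    <!-- assumed log level; the original setting is unknown -->
    <root level="INFO">
        <appender-ref ref="STDOUT"/>
    </root>
</configuration>
```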
/src/main/scala/generators/gaming/gaming.scala:
--------------------------------------------------------------------------------
1 | package generators
2 |
3 | import java.time.Instant
4 | import java.util.UUID
5 | import scala.concurrent.duration.FiniteDuration
6 |
7 | /**
8 | * A package describing events related to a multiplayer game.
9 | * We analyze some essential Flink features based on these data types.
10 | */
11 | package object gaming {
12 |
13 | sealed trait ServerEvent {
14 | def eventTime: Instant
15 | def getId: String
16 | }
17 |
18 | sealed trait GameType
19 | case object OneVsOne extends GameType
20 | case object TwoVsTwo extends GameType
21 | case object ThreeVsThree extends GameType
22 | case object FourVsFour extends GameType
23 |
24 | final case class GameStarted(
25 | eventTime: Instant,
26 | gameId: UUID,
27 | playerIds: Vector[UUID],
28 | mapId: String,
29 | regionId: String,
30 | gameType: GameType
31 | ) extends ServerEvent {
32 | override def getId: String = s"game|$gameId"
33 | }
34 |
35 | final case class GameFinished(
36 | eventTime: Instant,
37 | gameId: UUID
38 | ) extends ServerEvent {
39 | override def getId: String = s"game|$gameId"
40 | }
41 |
42 | final case class PlayerRegistered(
43 | eventTime: Instant,
44 | playerId: UUID,
45 | nickname: String
46 | ) extends ServerEvent {
47 | override def getId: String = s"player|$playerId|$nickname"
48 | }
49 |
50 | final case class PlayerOnline(
51 | eventTime: Instant,
52 | playerId: UUID,
53 | nickname: String
54 | ) extends ServerEvent {
55 | override def getId: String = s"player|$playerId|$nickname"
56 | }
57 |
58 | final case class PlayerIsLookingForAGame(
59 | eventTime: Instant,
60 | playerId: UUID,
61 | gameType: GameType
62 | ) extends ServerEvent {
63 | override def getId: String = s"player|$playerId"
64 | }
65 |
66 | final case class PlayerOffline(
67 | eventTime: Instant,
68 | playerId: UUID,
69 | nickname: String
70 | ) extends ServerEvent {
71 | override def getId: String = s"player|$playerId|$nickname"
72 | }
73 |
74 | case class Player(playerId: UUID, nickname: String) {
75 |
76 | def register(d: FiniteDuration)(implicit startTime: Instant): PlayerRegistered =
77 | PlayerRegistered(startTime.plusMillis(d.toMillis), playerId, nickname)
78 |
79 | def online(d: FiniteDuration)(implicit startTime: Instant): PlayerOnline =
80 | PlayerOnline(startTime.plusMillis(d.toMillis), playerId, nickname)
81 |
82 | def offline(d: FiniteDuration)(implicit startTime: Instant): PlayerOffline =
83 | PlayerOffline(startTime.plusMillis(d.toMillis), playerId, nickname)
84 |
85 | def lookingForAGame(
86 | startTime: Instant,
87 | d: FiniteDuration,
88 | gameType: GameType
89 | ): PlayerIsLookingForAGame =
90 | PlayerIsLookingForAGame(
91 | startTime.plusMillis(d.toMillis),
92 | playerId,
93 | gameType
94 | )
95 | }
96 |
97 | val bob: Player = Player(UUID.randomUUID(), "bob")
98 | val sam: Player = Player(UUID.randomUUID(), "sam")
99 | val rob: Player = Player(UUID.randomUUID(), "rob")
100 | val alice: Player = Player(UUID.randomUUID(), "alice")
101 | val mary: Player = Player(UUID.randomUUID(), "mary")
102 | val carl: Player = Player(UUID.randomUUID(), "carl")
103 | }
104 |
--------------------------------------------------------------------------------
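
The `Player` helpers above pick up an implicit `startTime`. A short usage sketch, mirroring how the events are built later in `part2datastreams/WindowFunctions.scala` (the object name is just for illustration):

```scala
import generators.gaming._

import java.time.Instant
import scala.concurrent.duration._

object GamingEventsDemo {
  // the implicit Instant is picked up by register/online/offline
  implicit val serverStartTime: Instant = Instant.parse("2022-02-02T00:00:00.000Z")

  val firstEvent: ServerEvent = bob.register(2.seconds) // PlayerRegistered, 2s after the server started
  val secondEvent: ServerEvent = bob.online(2.seconds)  // PlayerOnline, same timestamp
}
```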
/src/main/scala/generators/shopping/package.scala:
--------------------------------------------------------------------------------
1 | package generators
2 |
3 | import org.apache.flink.streaming.api.functions.source.{
4 | RichParallelSourceFunction,
5 | SourceFunction
6 | }
7 |
8 | import java.util.UUID
9 | import scala.annotation.tailrec
10 | import org.apache.flink.streaming.api.watermark.Watermark
11 |
12 | /**
13 | * A package describing data types and generator functions for shopping cart events.
14 | * We analyze a variety of scenarios in the course, and these generators were built for this purpose.
15 | */
16 | package object shopping {
17 |
18 | sealed trait ShoppingCartEvent {
19 | def userId: String
20 |
21 | def time: java.time.Instant
22 | }
23 |
24 | case class AddToShoppingCartEvent(
25 | userId: String,
26 | sku: String,
27 | quantity: Int,
28 | time: java.time.Instant
29 | ) extends ShoppingCartEvent
30 |
31 | case class RemovedFromShoppingCartEvent(
32 | userId: String,
33 | sku: String,
34 | quantity: Int,
35 | time: java.time.Instant
36 | ) extends ShoppingCartEvent
37 |
38 |
39 | class ShoppingCartEventsGenerator(
40 | sleepMillisPerEvent: Int,
41 | batchSize: Int,
42 | baseInstant: java.time.Instant = java.time.Instant.now()
43 | ) extends SourceFunction[ShoppingCartEvent] {
44 |
45 | import ShoppingCartEventsGenerator._
46 |
47 | @volatile private var running = true
48 |
49 | @tailrec
50 | private def run(
51 | startId: Long,
52 | ctx: SourceFunction.SourceContext[ShoppingCartEvent]
53 | ): Unit =
54 | if (running) {
55 | generateRandomEvents(startId).foreach(ctx.collect)
56 | Thread.sleep(batchSize * sleepMillisPerEvent)
57 | run(startId + batchSize, ctx)
58 | }
59 |
60 | private def generateRandomEvents(id: Long): Seq[AddToShoppingCartEvent] = {
61 | val events = (1 to batchSize)
62 | .map(_ =>
63 | AddToShoppingCartEvent(
64 | getRandomUser,
65 | UUID.randomUUID().toString,
66 | getRandomQuantity,
67 | baseInstant.plusSeconds(id)
68 | )
69 | )
70 |
71 | events
72 | }
73 |
74 | override def run(
75 | ctx: SourceFunction.SourceContext[ShoppingCartEvent]
76 | ): Unit = run(0, ctx)
77 |
78 | override def cancel(): Unit = {
79 | running = false
80 | }
81 | }
82 |
83 | class SingleShoppingCartEventsGenerator(
84 | sleepMillisBetweenEvents: Int,
85 | baseInstant: java.time.Instant = java.time.Instant.now(),
86 | extraDelayInMillisOnEveryTenEvents: Option[Long] = None,
87 | sourceId: Option[String] = None,
88 | generateRemoved: Boolean = false
89 | ) extends EventGenerator[ShoppingCartEvent](
90 | sleepMillisBetweenEvents,
91 | SingleShoppingCartEventsGenerator.generateEvent(
92 | generateRemoved,
93 | () => sourceId
94 | .map(sId => s"${sId}_${UUID.randomUUID()}")
95 | .getOrElse(UUID.randomUUID().toString),
96 | baseInstant
97 | ),
98 | baseInstant,
99 | extraDelayInMillisOnEveryTenEvents
100 | )
101 |
102 | object SingleShoppingCartEventsGenerator {
103 |
104 | import ShoppingCartEventsGenerator._
105 |
106 | def generateEvent
107 | : (Boolean, () => String, java.time.Instant) => Long => ShoppingCartEvent =
108 | (generateRemoved, skuGen, baseInstant) =>
109 | id =>
110 | if (!generateRemoved || scala.util.Random.nextBoolean())
111 | AddToShoppingCartEvent(
112 | getRandomUser,
113 | skuGen(),
114 | getRandomQuantity,
115 | baseInstant.plusSeconds(id)
116 | )
117 | else
118 | RemovedFromShoppingCartEvent(
119 | getRandomUser,
120 | skuGen(),
121 | getRandomQuantity,
122 | baseInstant.plusSeconds(id)
123 | )
124 | }
125 |
126 | class EventGenerator[T](
127 | sleepMillisBetweenEvents: Int,
128 | generator: Long => T,
129 | baseInstant: java.time.Instant,
130 | extraDelayInMillisOnEveryTenEvents: Option[Long] = None
131 | ) extends RichParallelSourceFunction[T] {
132 | @volatile private var running = true
133 |
134 | @tailrec
135 | private def run(
136 | id: Long,
137 | ctx: SourceFunction.SourceContext[T]
138 | ): Unit =
139 | if (running) {
140 | ctx.collect(
141 | generator(id)
142 | )
143 | // this generator emits a watermark mimicking the same logic of
144 | // incrementing each element's timestamp
145 | ctx.emitWatermark(new Watermark(baseInstant.plusSeconds(id).toEpochMilli))
146 | Thread.sleep(sleepMillisBetweenEvents)
147 | if (id % 10 == 0) extraDelayInMillisOnEveryTenEvents.foreach(Thread.sleep)
148 | run(id + 1, ctx)
149 | }
150 |
151 | override def run(ctx: SourceFunction.SourceContext[T]): Unit =
152 | run(1, ctx)
153 |
154 | override def cancel(): Unit = {
155 | running = false
156 | }
157 | }
158 |
159 | object ShoppingCartEventsGenerator {
160 | val users: Vector[String] = Vector("Bob", "Alice", "Sam", "Tom", "Diana")
161 |
162 | def getRandomUser: String = users(scala.util.Random.nextInt(users.length))
163 |
164 | def getRandomQuantity: Int = scala.util.Random.nextInt(10)
165 | }
166 |
167 | sealed trait CatalogEvent {
168 | def userId: String
169 |
170 | def time: java.time.Instant
171 | }
172 |
173 | case class ProductDetailsViewed(
174 | userId: String,
175 | time: java.time.Instant,
176 | productId: String
177 | ) extends CatalogEvent
178 |
179 | class CatalogEventsGenerator(
180 | sleepMillisBetweenEvents: Int,
181 | baseInstant: java.time.Instant = java.time.Instant.now(),
182 | extraDelayInMillisOnEveryTenEvents: Option[Long] = None
183 | ) extends EventGenerator[CatalogEvent](
184 | sleepMillisBetweenEvents,
185 | id =>
186 | ProductDetailsViewed(
187 | ShoppingCartEventsGenerator.getRandomUser,
188 | baseInstant.plusSeconds(id),
189 | UUID.randomUUID().toString
190 | ),
191 | baseInstant,
192 | extraDelayInMillisOnEveryTenEvents
193 | )
194 | }
--------------------------------------------------------------------------------
/src/main/scala/part1recap/ScalaRecap.scala:
--------------------------------------------------------------------------------
1 | package part1recap
2 |
3 | import java.util.concurrent.Executors
4 | import scala.concurrent.{ExecutionContext, Future}
5 | import scala.util.{Try, Success, Failure}
6 |
7 | object ScalaRecap {
8 |
9 | // value
10 | val aBoolean: Boolean = false
11 | var aVariable: Int = 56
12 | aVariable += 1
13 |
14 | // expressions
15 | val anIfExpression: String = if (2 > 3) "bigger" else "smaller"
16 |
17 | // instructions vs expressions
18 | val theUnit: Unit = println("Hello, Scala") // Unit === "void"
19 |
20 | // OOP
21 | class Animal
22 | class Cat extends Animal
23 | trait Carnivore {
24 | def eat(animal: Animal): Unit
25 | }
26 |
27 | // inheritance: extends <= 1 class, but inherit from >= 0 traits
28 | class Crocodile extends Animal with Carnivore {
29 | override def eat(animal: Animal): Unit = println("eating this poor fellow")
30 | }
31 |
32 | // singleton
33 | object MySingleton
34 |
35 | // companions
36 | object Carnivore
37 |
38 | // case classes
39 | case class Person(name: String, age: Int)
40 |
41 | // generics
42 | class MyList[A] // can add variance modifiers - not important for this course
43 |
44 | // method notation
45 | // croc.eat(animal) OR croc eat animal
46 | val three = 1 + 2
47 | val three_v2 = 1.+(2)
48 |
49 | // FP
50 | val incrementer: Int => Int = x => x + 1
51 | val incremented = incrementer(4) // 5, same as incrementer.apply(4)
52 |
53 | // map flatMap filter = HOFs
54 | val processedList = List(1,2,3).map(incrementer) // [2,3,4]
55 | val aLongerList = List(1,2,3).flatMap(x => List(x, x + 1)) // [1,2, 2,3, 3,4]
56 |
57 | // for-comprehensions
58 | val checkerboard = List(1,2,3).flatMap(n => List('a', 'b', 'c').map(c => (n, c)))
59 | val checkerboard_v2 = for {
60 | n <- List(1,2,3)
61 | c <- List('a', 'b', 'c')
62 | } yield (n, c) // same
63 |
64 | // options and try
65 | val anOption: Option[Int] = Option(/* something that might be null*/ 43)
66 | val doubleOption = anOption.map(_ * 2)
67 |
68 | val anAttempt: Try[Int] = Try(12)
69 | val modifiedAttempt = anAttempt.map(_ * 10)
70 |
71 | // pattern matching
72 | val anUnknown: Any = 45
73 | val medal = anUnknown match {
74 | case 1 => "gold"
75 | case 2 => "silver"
76 | case 3 => "bronze"
77 | case _ => "no medal"
78 | }
79 |
80 | val optionDescription = anOption match {
81 | case Some(value) => s"the option is not empty: $value"
82 | case None => "the option is empty"
83 | }
84 |
85 | // Futures
86 | implicit val ec: ExecutionContext = ExecutionContext.fromExecutorService(Executors.newFixedThreadPool(8))
87 | val aFuture = Future(/* something to be evaluated on another thread*/ 1 + 41)
88 |
89 | // register callback when it finishes
90 | aFuture.onComplete {
91 | case Success(value) => println(s"the async meaning of life is $value")
92 | case Failure(exception) => println(s"the meaning of value failed: $exception")
93 | }
94 |
95 | val aPartialFunction: PartialFunction[Try[Int], Unit] = {
96 | case Success(value) => println(s"the async meaning of life is $value")
97 | case Failure(exception) => println(s"the meaning of value failed: $exception")
98 | }
99 |
100 | // map, flatMap, filter, ...
101 | val doubledAsyncMOL: Future[Int] = aFuture.map(_ * 2)
102 |
103 | // implicits
104 |
105 | // 1 - implicit arguments and values
106 | implicit val timeout: Int = 3000 // implicit val == given instance
107 | def setTimeout(f: () => Unit)(implicit tout: Int) = { // (using tout: Int)
108 | Thread.sleep(tout)
109 | f()
110 | }
111 |
112 | setTimeout(() => println("timeout")) // (timeout)
113 |
114 | // 2 - extension methods
115 | implicit class MyRichInt(number: Int) { // implicit class = extension
116 | def isEven: Boolean = number % 2 == 0
117 | }
118 |
119 |   val is2Even = 2.isEven // new MyRichInt(2).isEven
120 |
121 | // 3 - conversions
122 | implicit def string2Person(name: String): Person =
123 | Person(name, 57)
124 |
125 | val daniel: Person = "Daniel" // string2Person("Daniel")
126 |
127 | def main(args: Array[String]): Unit = {
128 |
129 | }
130 | }
131 |
--------------------------------------------------------------------------------
/src/main/scala/part2datastreams/EssentialStreams.scala:
--------------------------------------------------------------------------------
1 | package part2datastreams
2 |
3 | import org.apache.flink.api.common.functions.{FlatMapFunction, MapFunction, ReduceFunction}
4 | import org.apache.flink.api.common.serialization.SimpleStringEncoder
5 | import org.apache.flink.core.fs.Path
6 | import org.apache.flink.streaming.api.functions.ProcessFunction
7 | import org.apache.flink.streaming.api.functions.sink.filesystem.StreamingFileSink
8 | import org.apache.flink.streaming.api.scala._
9 | import org.apache.flink.util.Collector
10 |
11 | object EssentialStreams {
12 |
13 | def applicationTemplate(): Unit = {
14 | // execution environment
15 | val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
16 |
17 | // in between, add any sort of computations
18 | import org.apache.flink.streaming.api.scala._ // import TypeInformation for the data of your DataStreams
19 | val simpleNumberStream: DataStream[Int] = env.fromElements(1,2,3,4)
20 |
21 | // perform some actions
22 | simpleNumberStream.print()
23 |
24 | // at the end
25 | env.execute() // trigger all the computations that were DESCRIBED earlier
26 | }
27 |
28 | // transformations
29 | def demoTransformations(): Unit = {
30 | val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
31 | val numbers: DataStream[Int] = env.fromElements(1,2,3,4,5)
32 |
33 | // checking parallelism
34 | println(s"Current parallelism: ${env.getParallelism}")
35 | // set different parallelism
36 | env.setParallelism(2)
37 | println(s"New parallelism: ${env.getParallelism}")
38 |
39 | // map
40 | val doubledNumbers: DataStream[Int] = numbers.map(_ * 2)
41 |
42 | // flatMap
43 | val expandedNumbers: DataStream[Int] = numbers.flatMap(n => List(n, n + 1))
44 |
45 | // filter
46 | val filteredNumbers: DataStream[Int] = numbers
47 | .filter(_ % 2 == 0)
48 | /* you can set parallelism here*/.setParallelism(4)
49 |
50 | val finalData = expandedNumbers.writeAsText("output/expandedStream") // directory with 12 files
51 | // set parallelism in the sink
52 | finalData.setParallelism(3)
53 |
54 | env.execute()
55 | }
56 |
57 | /**
58 | * Exercise: FizzBuzz on Flink
59 | * - take a stream of 100 natural numbers
60 | * - for every number
61 | * - if n % 3 == 0 then return "fizz"
62 | * - if n % 5 == 0 => "buzz"
63 | * - if both => "fizzbuzz"
64 | * - write the numbers for which you said "fizzbuzz" to a file
65 | */
66 | case class FizzBuzzResult(n: Long, output: String)
67 |
68 | def fizzBuzzExercise(): Unit = {
69 | val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
70 | val numbers = env.fromSequence(1, 100)
71 |
72 | // map
73 | val fizzbuzz = numbers
74 | .map { n =>
75 | val output =
76 | if (n % 3 == 0 && n % 5 == 0) "fizzbuzz"
77 | else if (n % 3 == 0) "fizz"
78 | else if (n % 5 == 0) "buzz"
79 | else s"$n"
80 | FizzBuzzResult(n, output)
81 | }
82 | .filter(_.output == "fizzbuzz") // DataStream[FizzBuzzResult]
83 | .map(_.n) // DataStream[Long]
84 |
85 | // alternative to
86 | // fizzbuzz.writeAsText("output/fizzbuzz.txt").setParallelism(1)
87 |
88 | // add a SINK
89 | fizzbuzz.addSink(
90 | StreamingFileSink
91 | .forRowFormat(
92 | new Path("output/streaming_sink"),
93 | new SimpleStringEncoder[Long]("UTF-8")
94 | )
95 | .build()
96 | ).setParallelism(1)
97 |
98 | env.execute()
99 | }
100 |
101 | // explicit transformations
102 | def demoExplicitTransformations(): Unit = {
103 | val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
104 | val numbers = env.fromSequence(1, 100)
105 |
106 | // map
107 | val doubledNumbers = numbers.map(_ * 2)
108 |
109 | // explicit version
110 | val doubledNumbers_v2 = numbers.map(new MapFunction[Long, Long] {
111 | // declare fields, methods, ...
112 | override def map(value: Long) = value * 2
113 | })
114 |
115 | // flatMap
116 | val expandedNumbers = numbers.flatMap(n => Range.Long(1, n, 1).toList)
117 |
118 | // explicit version
119 | val expandedNumbers_v2 = numbers.flatMap(new FlatMapFunction[Long, Long] {
120 | // declare fields, methods, ...
121 | override def flatMap(n: Long, out: Collector[Long]) =
122 | Range.Long(1, n, 1).foreach { i =>
123 | out.collect(i) // imperative style - pushes the new element downstream
124 | }
125 | })
126 |
127 | // process method
128 | // ProcessFunction is THE MOST GENERAL function to process elements in Flink
129 | val expandedNumbers_v3 = numbers.process(new ProcessFunction[Long, Long] {
130 | override def processElement(n: Long, ctx: ProcessFunction[Long, Long]#Context, out: Collector[Long]) =
131 | Range.Long(1, n, 1).foreach { i =>
132 | out.collect(i)
133 | }
134 | })
135 |
136 | // reduce
137 | // happens on keyed streams
138 | /*
139 | [ 1, false
140 | 2, true
141 |
142 | 100, true
143 |
144 | true => 2, 6, 12, 20, ...
145 | false => 1, 4, 9, 16, ...
146 | */
147 | val keyedNumbers: KeyedStream[Long, Boolean] = numbers.keyBy(n => n % 2 == 0)
148 |
149 | // reduce - FP approach
150 | val sumByKey = keyedNumbers.reduce(_ + _) // sum up all the elements BY KEY
151 |
152 | // reduce - explicit approach
153 | val sumByKey_v2 = keyedNumbers.reduce(new ReduceFunction[Long] {
154 | // additional fields, methods...
155 | override def reduce(x: Long, y: Long): Long = x + y
156 | })
157 |
158 | sumByKey_v2.print()
159 | env.execute()
160 | }
161 |
162 | def main(args: Array[String]): Unit = {
163 | demoExplicitTransformations()
164 | }
165 | }
166 |
--------------------------------------------------------------------------------
/src/main/scala/part2datastreams/MultipleStreams.scala:
--------------------------------------------------------------------------------
1 | package part2datastreams
2 |
3 | import generators.shopping._
4 | import org.apache.flink.api.common.eventtime.{SerializableTimestampAssigner, WatermarkStrategy}
5 | import org.apache.flink.streaming.api.functions.co.{CoProcessFunction, ProcessJoinFunction}
6 | import org.apache.flink.streaming.api.scala._
7 | import org.apache.flink.streaming.api.windowing.assigners.TumblingProcessingTimeWindows
8 | import org.apache.flink.streaming.api.windowing.time.Time
9 | import org.apache.flink.util.Collector
10 |
11 | object MultipleStreams {
12 |
13 | /*
14 | - union
15 | - window join
16 | - interval join
17 | - connect
18 | */
19 |
20 | // Unioning = combining the output of multiple streams into just one
21 | def demoUnion(): Unit = {
22 | val env = StreamExecutionEnvironment.getExecutionEnvironment
23 |
24 | // define two streams of the same type
25 | val shoppingCartEventsKafka: DataStream[ShoppingCartEvent] =
26 | env.addSource(new SingleShoppingCartEventsGenerator(300, sourceId = Option("kafka")))
27 |
28 | val shoppingCartEventsFiles: DataStream[ShoppingCartEvent] =
29 | env.addSource(new SingleShoppingCartEventsGenerator(1000, sourceId = Option("files")))
30 |
31 | val combinedShoppingCartEventStream: DataStream[ShoppingCartEvent] =
32 | shoppingCartEventsKafka.union(shoppingCartEventsFiles)
33 |
34 | combinedShoppingCartEventStream.print()
35 | env.execute()
36 | }
37 |
38 | // window join = elements belong to the same window + some join condition
39 | def demoWindowJoins(): Unit = {
40 | val env = StreamExecutionEnvironment.getExecutionEnvironment
41 |
42 | val shoppingCartEvents = env.addSource(new SingleShoppingCartEventsGenerator(1000, sourceId = Option("kafka")))
43 | val catalogEvents = env.addSource(new CatalogEventsGenerator(200))
44 |
45 | val joinedStream = shoppingCartEvents
46 | .join(catalogEvents)
47 | // provide a join condition
48 | .where(shoppingCartEvent => shoppingCartEvent.userId)
49 | .equalTo(catalogEvent => catalogEvent.userId)
50 | // provide the same window grouping
51 | .window(TumblingProcessingTimeWindows.of(Time.seconds(5)))
52 | // do something with correlated events
53 | .apply {
54 | (shoppingCartEvent, catalogEvent) =>
55 | s"User ${shoppingCartEvent.userId} browsed at ${catalogEvent.time} and bought at ${shoppingCartEvent.time}"
56 | }
57 |
58 | joinedStream.print()
59 | env.execute()
60 | }
61 |
62 | // interval joins = correlation between events A and B if durationMin < timeA - timeB < durationMax
63 | // involves EVENT TIME
64 | // only works on KEYED STREAMS
65 |
66 | def demoIntervalJoins(): Unit = {
67 | val env = StreamExecutionEnvironment.getExecutionEnvironment
68 |
69 | // we need to extract event times from both streams
70 | val shoppingCartEvents =
71 | env.addSource(new SingleShoppingCartEventsGenerator(300, sourceId = Option("kafka")))
72 | .assignTimestampsAndWatermarks(
73 | WatermarkStrategy.forBoundedOutOfOrderness(java.time.Duration.ofMillis(500))
74 | .withTimestampAssigner(new SerializableTimestampAssigner[ShoppingCartEvent] {
75 | override def extractTimestamp(element: ShoppingCartEvent, recordTimestamp: Long) =
76 | element.time.toEpochMilli
77 | })
78 | )
79 | .keyBy(_.userId)
80 |
81 | val catalogEvents = env.addSource(new CatalogEventsGenerator(500))
82 | .assignTimestampsAndWatermarks(
83 | WatermarkStrategy.forBoundedOutOfOrderness(java.time.Duration.ofMillis(500))
84 | .withTimestampAssigner(new SerializableTimestampAssigner[CatalogEvent] {
85 | override def extractTimestamp(element: CatalogEvent, recordTimestamp: Long) =
86 | element.time.toEpochMilli
87 | })
88 | )
89 | .keyBy(_.userId)
90 |
91 | val intervalJoinedStream = shoppingCartEvents
92 | .intervalJoin(catalogEvents)
93 | .between(Time.seconds(-2), Time.seconds(2))
94 | .lowerBoundExclusive() // interval is by default inclusive
95 | .upperBoundExclusive()
96 | .process(new ProcessJoinFunction[ShoppingCartEvent, CatalogEvent, String] {
97 | override def processElement(
98 | left: ShoppingCartEvent,
99 | right: CatalogEvent,
100 | ctx: ProcessJoinFunction[ShoppingCartEvent, CatalogEvent, String]#Context,
101 | out: Collector[String]
102 | ) =
103 | out.collect(s"User ${left.userId} browsed at ${right.time} and bought at ${left.time}")
104 | })
105 |
106 | intervalJoinedStream.print()
107 | env.execute()
108 | }
109 |
110 | // connect = two streams are treated with the same "operator"
111 | def demoConnect(): Unit = {
112 | val env = StreamExecutionEnvironment.getExecutionEnvironment
113 |
114 | // two separate streams
115 | val shoppingCartEvents = env.addSource(new SingleShoppingCartEventsGenerator(100)).setParallelism(1)
116 | val catalogEvents = env.addSource(new CatalogEventsGenerator(1000)).setParallelism(1)
117 |
118 | // connect the streams
119 | val connectedStream: ConnectedStreams[ShoppingCartEvent, CatalogEvent] = shoppingCartEvents.connect(catalogEvents)
120 |
121 | // variables - will use single-threaded
122 | env.setParallelism(1)
123 | env.setMaxParallelism(1)
124 |
125 | val ratioStream: DataStream[Double] = connectedStream.process(
126 | new CoProcessFunction[ShoppingCartEvent, CatalogEvent, Double] {
127 | var shoppingCartEventCount = 0
128 | var catalogEventCount = 0
129 |
130 | override def processElement1(
131 | value: ShoppingCartEvent,
132 | ctx: CoProcessFunction[ShoppingCartEvent, CatalogEvent, Double]#Context,
133 | out: Collector[Double]
134 | ) = {
135 | shoppingCartEventCount += 1
136 | out.collect(shoppingCartEventCount * 100.0 / (shoppingCartEventCount + catalogEventCount))
137 | }
138 |
139 | override def processElement2(
140 | value: CatalogEvent,
141 | ctx: CoProcessFunction[ShoppingCartEvent, CatalogEvent, Double]#Context,
142 | out: Collector[Double]
143 | ) = {
144 | catalogEventCount += 1
145 | out.collect(shoppingCartEventCount * 100.0 / (shoppingCartEventCount + catalogEventCount))
146 | }
147 | }
148 | )
149 |
150 | ratioStream.print()
151 | env.execute()
152 | }
153 |
154 | def main(args: Array[String]): Unit = {
155 | demoConnect()
156 | }
157 | }
158 |
--------------------------------------------------------------------------------
/src/main/scala/part2datastreams/Partitions.scala:
--------------------------------------------------------------------------------
1 | package part2datastreams
2 |
3 | import generators.shopping._
4 | import org.apache.flink.api.common.functions.Partitioner
5 | import org.apache.flink.streaming.api.scala._
6 |
7 | object Partitions {
8 |
9 | // splitting = partitioning
10 |
11 | def demoPartitioner(): Unit = {
12 | val env = StreamExecutionEnvironment.getExecutionEnvironment
13 |
14 | val shoppingCartEvents: DataStream[ShoppingCartEvent] =
15 | env.addSource(new SingleShoppingCartEventsGenerator(100)) // ~10 events/s
16 |
17 | // partitioner = logic to split the data
18 | val partitioner = new Partitioner[String] {
19 | override def partition(key: String, numPartitions: Int): Int = { // invoked on every event
20 | // hash code % number of partitions ~ even distribution
21 | println(s"Number of max partitions: $numPartitions")
22 | key.hashCode % numPartitions
23 | }
24 | }
25 |
26 | val partitionedStream = shoppingCartEvents.partitionCustom(
27 | partitioner,
28 | event => event.userId
29 | )
30 |
31 | /*
32 | Bad because
33 | - you lose parallelism
34 | - you risk overloading the task with the disproportionate data
35 |
36 | Good for e.g. sending HTTP requests
37 | */
38 | val badPartitioner = new Partitioner[String] {
39 | override def partition(key: String, numPartitions: Int): Int = { // invoked on every event
40 | numPartitions - 1 // last partition index
41 | }
42 | }
43 |
44 | val badPartitionedStream = shoppingCartEvents.partitionCustom(
45 | badPartitioner,
46 | event => event.userId
47 | )
48 | // redistribution of data evenly - involves data transfer through network
49 | .shuffle
50 |
51 | badPartitionedStream.print()
52 | env.execute()
53 | }
54 |
55 |
56 | def main(args: Array[String]): Unit = {
57 | demoPartitioner()
58 | }
59 | }
60 |
--------------------------------------------------------------------------------
/src/main/scala/part2datastreams/TimeBasedTransformations.scala:
--------------------------------------------------------------------------------
1 | package part2datastreams
2 |
3 | import generators.shopping._
4 | import org.apache.flink.api.common.eventtime.{SerializableTimestampAssigner, Watermark, WatermarkGenerator, WatermarkOutput, WatermarkStrategy}
5 | import org.apache.flink.streaming.api.scala._
6 | import org.apache.flink.streaming.api.scala.function.ProcessAllWindowFunction
7 | import org.apache.flink.streaming.api.windowing.assigners.{TumblingEventTimeWindows, TumblingProcessingTimeWindows}
8 | import org.apache.flink.streaming.api.windowing.time.Time
9 | import org.apache.flink.streaming.api.windowing.windows.TimeWindow
10 | import org.apache.flink.util.Collector
11 |
12 | import java.time.Instant
13 |
14 | object TimeBasedTransformations {
15 |
16 | val env = StreamExecutionEnvironment.getExecutionEnvironment
17 |
18 | val shoppingCartEvents: DataStream[ShoppingCartEvent] = env.addSource(
19 | new ShoppingCartEventsGenerator(
20 | sleepMillisPerEvent = 100,
21 | batchSize = 5,
22 | baseInstant = Instant.parse("2022-02-15T00:00:00.000Z")
23 | )
24 | )
25 |
26 | // 1. Event time = the moment the event was CREATED
27 | // 2. Processing time = the moment the event ARRIVES AT FLINK
28 |
29 | class CountByWindowAll extends ProcessAllWindowFunction[ShoppingCartEvent, String, TimeWindow] {
30 | override def process(context: Context, elements: Iterable[ShoppingCartEvent], out: Collector[String]): Unit = {
31 | val window = context.window
32 | out.collect(s"Window [${window.getStart} - ${window.getEnd}] ${elements.size}")
33 | }
34 | }
35 | /*
36 | Group by window, every 3s, tumbling (non-overlapping), PROCESSING TIME
37 | */
38 | /*
39 | With processing time
40 | - we don't care when the event was created
41 | - multiple runs generate different results
42 | */
43 | def demoProcessingTime(): Unit = {
44 | def groupedEventsByWindow = shoppingCartEvents.windowAll(TumblingProcessingTimeWindows.of(Time.seconds(3)))
45 | def countEventsByWindow: DataStream[String] = groupedEventsByWindow.process(new CountByWindowAll)
46 | countEventsByWindow.print()
47 | env.execute()
48 | }
49 |
50 | /*
51 | With event time
52 | - we NEED to care about handling late data - done with watermarks
53 | - we don't care about Flink internal time
54 | - we might see faster results
55 | - same events + different runs => same results
56 | */
57 | def demoEventTime(): Unit = {
58 | val groupedEventsByWindow = shoppingCartEvents
59 | .assignTimestampsAndWatermarks(
60 | WatermarkStrategy
61 | .forBoundedOutOfOrderness(java.time.Duration.ofMillis(500)) // max delay < 500 millis
62 | .withTimestampAssigner(new SerializableTimestampAssigner[ShoppingCartEvent] {
63 | override def extractTimestamp(element: ShoppingCartEvent, recordTimestamp: Long) = element.time.toEpochMilli
64 | })
65 | )
66 | .windowAll(TumblingEventTimeWindows.of(Time.seconds(3)))
67 |
68 | def countEventsByWindow: DataStream[String] = groupedEventsByWindow.process(new CountByWindowAll)
69 | countEventsByWindow.print()
70 | env.execute()
71 | }
72 |
73 | /**
74 | Custom watermarks
75 | */
76 | // with every new MAX timestamp, every new incoming element with event time < max timestamp - max delay will be discarded
77 | class BoundedOutOfOrdernessGenerator(maxDelay: Long) extends WatermarkGenerator[ShoppingCartEvent] {
78 | var currentMaxTimestamp: Long = 0L
79 |
80 | // maybe emit watermark on a particular event
81 | override def onEvent(event: ShoppingCartEvent, eventTimestamp: Long, output: WatermarkOutput) = {
82 | // ^ event being processed ^ timestamp attached to the event
83 | currentMaxTimestamp = Math.max(currentMaxTimestamp, event.time.toEpochMilli)
84 | // emitting a watermark is NOT mandatory
85 | // output.emitWatermark(new Watermark(event.time.toEpochMilli)) // every new event older than THIS EVENT will be discarded
86 | }
87 |
88 | // Flink can also call onPeriodicEmit regularly - up to us to maybe emit a watermark at these times
89 | override def onPeriodicEmit(output: WatermarkOutput) =
90 | output.emitWatermark(new Watermark(currentMaxTimestamp - maxDelay - 1))
91 | }
92 |
93 | def demoEventTime_v2(): Unit = {
94 | // control how often Flink calls onPeriodicEmit
95 | env.getConfig.setAutoWatermarkInterval(1000L) // call onPeriodicEmit every 1s
96 |
97 | val groupedEventsByWindow = shoppingCartEvents
98 | .assignTimestampsAndWatermarks(
99 | WatermarkStrategy
100 | .forGenerator(_ => new BoundedOutOfOrdernessGenerator(500L))
101 | .withTimestampAssigner(new SerializableTimestampAssigner[ShoppingCartEvent] {
102 | override def extractTimestamp(element: ShoppingCartEvent, recordTimestamp: Long) = element.time.toEpochMilli
103 | })
104 | )
105 | .windowAll(TumblingEventTimeWindows.of(Time.seconds(3)))
106 |
107 | def countEventsByWindow: DataStream[String] = groupedEventsByWindow.process(new CountByWindowAll)
108 | countEventsByWindow.print()
109 | env.execute()
110 | }
111 |
112 | def main(args: Array[String]): Unit = {
113 | demoEventTime_v2()
114 | }
115 | }
116 |
--------------------------------------------------------------------------------
/src/main/scala/part2datastreams/Triggers.scala:
--------------------------------------------------------------------------------
1 | package part2datastreams
2 |
3 | import generators.shopping._
4 | import org.apache.flink.streaming.api.scala._
5 | import org.apache.flink.streaming.api.scala.function.ProcessAllWindowFunction
6 | import org.apache.flink.streaming.api.windowing.assigners.TumblingProcessingTimeWindows
7 | import org.apache.flink.streaming.api.windowing.time.Time
8 | import org.apache.flink.streaming.api.windowing.triggers.{CountTrigger, PurgingTrigger}
9 | import org.apache.flink.streaming.api.windowing.windows.TimeWindow
10 | import org.apache.flink.util.Collector
11 |
12 | object Triggers {
13 |
14 | // Triggers -> WHEN a window function is executed
15 |
16 | val env = StreamExecutionEnvironment.getExecutionEnvironment
17 |
18 | def demoCountTrigger(): Unit = {
19 | val shoppingCartEvents: DataStream[String] = env
20 | .addSource(new ShoppingCartEventsGenerator(500, 2)) // 2 events/second
21 | .windowAll(TumblingProcessingTimeWindows.of(Time.seconds(5))) // 10 events/window
22 | .trigger(CountTrigger.of[TimeWindow](5)) // the window function runs every 5 elements
23 | .process(new CountByWindowAll) // runs twice for the same window
24 |
25 | shoppingCartEvents.print()
26 | env.execute()
27 | }
28 | /*
29 | 12> Window [1646129900000 - 1646129905000] 2
30 | 1> Window [1646129905000 - 1646129910000] 10
31 | 2> Window [1646129910000 - 1646129915000] 10
32 | 3> Window [1646129915000 - 1646129920000] 10
33 | 4> Window [1646129920000 - 1646129925000] 10
34 |
35 | with trigger
36 | 6> Window [1646130165000 - 1646130170000] 5 <- trigger running on the window 65000-70000 for the first time
37 | 7> Window [1646130165000 - 1646130170000] 10 <- second trigger FOR THE SAME WINDOW
38 | 8> Window [1646130170000 - 1646130175000] 5
39 | 9> Window [1646130170000 - 1646130175000] 10
40 | 10> Window [1646130175000 - 1646130180000] 5
41 | 11> Window [1646130175000 - 1646130180000] 10
42 | */
43 |
44 | // purging trigger - clear the window when it fires
45 |
46 | def demoPurgingTrigger(): Unit = {
47 | val shoppingCartEvents: DataStream[String] = env
48 | .addSource(new ShoppingCartEventsGenerator(500, 2)) // 2 events/second
49 | .windowAll(TumblingProcessingTimeWindows.of(Time.seconds(5))) // 10 events/window
50 | .trigger(PurgingTrigger.of(CountTrigger.of[TimeWindow](5))) // the window function runs every 5 elements, THEN CLEARS THE WINDOW
51 | .process(new CountByWindowAll) // runs twice for the same window
52 |
53 | shoppingCartEvents.print()
54 | env.execute()
55 | }
56 |
57 | /*
58 | with purging trigger
59 |
60 | 12> Window [1646134290000 - 1646134295000] 5
61 | 1> Window [1646134295000 - 1646134300000] 5
62 | 2> Window [1646134295000 - 1646134300000] 5
63 | 3> Window [1646134300000 - 1646134305000] 5
64 | 4> Window [1646134300000 - 1646134305000] 5
65 | 5> Window [1646134305000 - 1646134310000] 5
66 | 6> Window [1646134305000 - 1646134310000] 5
67 | */
68 |
69 | /*
70 | Other triggers:
71 | - EventTimeTrigger - happens by default when the watermark is > window end time (automatic for event time windows)
72 | - ProcessingTimeTrigger - fires when the current system time > window end time (automatic for processing time windows)
73 | - custom triggers - powerful APIs for custom firing behavior
74 | */
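  // A minimal sketch of a custom trigger (not part of the original lecture code):
  // it fires the window function on every element and ignores timers.
  import org.apache.flink.streaming.api.windowing.triggers.{Trigger, TriggerResult}

  class FireOnEveryElementTrigger extends Trigger[ShoppingCartEvent, TimeWindow] {
    override def onElement(element: ShoppingCartEvent, timestamp: Long, window: TimeWindow, ctx: Trigger.TriggerContext): TriggerResult =
      TriggerResult.FIRE // evaluate the window function now, keep the window contents

    override def onProcessingTime(time: Long, window: TimeWindow, ctx: Trigger.TriggerContext): TriggerResult =
      TriggerResult.CONTINUE // no processing-time timers used

    override def onEventTime(time: Long, window: TimeWindow, ctx: Trigger.TriggerContext): TriggerResult =
      TriggerResult.CONTINUE // no event-time timers used

    override def clear(window: TimeWindow, ctx: Trigger.TriggerContext): Unit =
      () // no custom state to clean up

    // could be plugged into the pipelines above with .trigger(new FireOnEveryElementTrigger)
  }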
75 |
76 | def main(args: Array[String]): Unit = {
77 | demoPurgingTrigger()
78 | }
79 |
80 | }
81 |
82 | // copied from Time Based Transformations
83 | class CountByWindowAll extends ProcessAllWindowFunction[ShoppingCartEvent, String, TimeWindow] {
84 | override def process(context: Context, elements: Iterable[ShoppingCartEvent], out: Collector[String]): Unit = {
85 | val window = context.window
86 | out.collect(s"Window [${window.getStart} - ${window.getEnd}] ${elements.size}")
87 | }
88 | }
89 |
--------------------------------------------------------------------------------
/src/main/scala/part2datastreams/WindowFunctions.scala:
--------------------------------------------------------------------------------
1 | package part2datastreams
2 |
3 | import generators.gaming._
4 | import org.apache.flink.api.common.eventtime.{SerializableTimestampAssigner, WatermarkStrategy}
5 | import org.apache.flink.api.common.functions.AggregateFunction
6 | import org.apache.flink.streaming.api.scala._
7 | import org.apache.flink.streaming.api.scala.function.{AllWindowFunction, ProcessAllWindowFunction, ProcessWindowFunction, WindowFunction}
8 | import org.apache.flink.streaming.api.windowing.assigners.{EventTimeSessionWindows, GlobalWindows, SlidingEventTimeWindows, TumblingEventTimeWindows}
9 | import org.apache.flink.streaming.api.windowing.time.Time
10 | import org.apache.flink.streaming.api.windowing.triggers.CountTrigger
11 | import org.apache.flink.streaming.api.windowing.windows.{GlobalWindow, TimeWindow}
12 | import org.apache.flink.util.Collector
13 |
14 | import java.time.Instant
15 | import scala.concurrent.duration._
16 |
17 | object WindowFunctions {
18 |
19 | // use-case: stream of events for a gaming session
20 |
21 | val env = StreamExecutionEnvironment.getExecutionEnvironment
22 |
23 | implicit val serverStartTime: Instant =
24 | Instant.parse("2022-02-02T00:00:00.000Z")
25 |
26 | val events: List[ServerEvent] = List(
27 | bob.register(2.seconds), // player "Bob" registered 2s after the server started
28 | bob.online(2.seconds),
29 | sam.register(3.seconds),
30 | sam.online(4.seconds),
31 | rob.register(4.seconds),
32 | alice.register(4.seconds),
33 | mary.register(6.seconds),
34 | mary.online(6.seconds),
35 | carl.register(8.seconds),
36 | rob.online(10.seconds),
37 | alice.online(10.seconds),
38 | carl.online(10.seconds)
39 | )
40 |
41 | val eventStream: DataStream[ServerEvent] = env
42 | .fromCollection(events)
43 | .assignTimestampsAndWatermarks( // extract timestamps for events (event time) + watermarks
44 | WatermarkStrategy
45 | .forBoundedOutOfOrderness(java.time.Duration.ofMillis(500)) // once you get an event with time T, you will NOT accept further events with time < T - 500
46 | .withTimestampAssigner(new SerializableTimestampAssigner[ServerEvent] {
47 | override def extractTimestamp(element: ServerEvent, recordTimestamp: Long) =
48 | element.eventTime.toEpochMilli
49 | })
50 | )
51 |
52 | // how many players were registered every 3 seconds?
53 | // [0...3s] [3s...6s] [6s...9s]
54 | val threeSecondsTumblingWindow = eventStream.windowAll(TumblingEventTimeWindows.of(Time.seconds(3)))
55 | /*
56 | |----0----|----1----|--------2--------|--------3--------|---------4---------|---5---|--------6--------|---7---|--------8--------|--9--|------10-------|------11------|
57 | | | | bob registered | sam registered | sam online | | mary registered | | carl registered | | rob online | |
58 | | | | bob online | | rob registered | | mary online | | | | alice online | |
59 | | | | | | alice registered | | | | | | carl online | |
60 | ^|------------ window one ----------- + -------------- window two ----------------- + ------------- window three -------------- + ----------- window four ----------|^
61 | | | | | |
62 |      |             1 registration            |        3 registrations         |             2 registrations             |            0 registrations           |
63 |      |        1643760000000 - 1643760003000  | 1643760003000 - 1643760006000  |     1643760006000 - 1643760009000       |   1643760009000 - 1643760012000      |
64 | */
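  // [Added note] The boundaries above are epoch milliseconds: 1643760000000 == Instant.parse("2022-02-02T00:00:00.000Z").toEpochMilli,
  // i.e. the serverStartTime defined above; each tumbling window spans serverStartTime plus a multiple of 3000 ms.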
65 |
66 | // count by windowAll
67 | class CountByWindowAll extends AllWindowFunction[ServerEvent, String, TimeWindow] {
68 | // ^ input ^ output ^ window type
69 | override def apply(window: TimeWindow, input: Iterable[ServerEvent], out: Collector[String]): Unit = {
70 | val registrationEventCount = input.count(event => event.isInstanceOf[PlayerRegistered])
71 | out.collect(s"Window [${window.getStart} - ${window.getEnd}] $registrationEventCount")
72 | }
73 | }
74 |
75 | def demoCountByWindow(): Unit = {
76 | val registrationsPerThreeSeconds: DataStream[String] = threeSecondsTumblingWindow.apply(new CountByWindowAll)
77 | registrationsPerThreeSeconds.print()
78 | env.execute()
79 | }
80 |
81 | // alternative: process window function which offers a much richer API (lower-level)
82 | class CountByWindowAllV2 extends ProcessAllWindowFunction[ServerEvent, String, TimeWindow] {
83 | override def process(context: Context, elements: Iterable[ServerEvent], out: Collector[String]): Unit = {
84 | val window = context.window
85 | val registrationEventCount = elements.count(event => event.isInstanceOf[PlayerRegistered])
86 | out.collect(s"Window [${window.getStart} - ${window.getEnd}] $registrationEventCount")
87 | }
88 | }
89 |
90 | def demoCountByWindow_v2(): Unit = {
91 | val registrationsPerThreeSeconds: DataStream[String] = threeSecondsTumblingWindow.process(new CountByWindowAllV2)
92 | registrationsPerThreeSeconds.print()
93 | env.execute()
94 | }
95 |
96 | // alternative 2: aggregate function
97 | class CountByWindowV3 extends AggregateFunction[ServerEvent, Long, Long] {
98 | // ^ input ^ acc ^ output
99 |
100 | // start counting from 0
101 | override def createAccumulator(): Long = 0L
102 |
103 | // every element increases accumulator by 1
104 | override def add(value: ServerEvent, accumulator: Long) =
105 | if (value.isInstanceOf[PlayerRegistered]) accumulator + 1
106 | else accumulator
107 |
108 | // push a final output out of the final accumulator
109 | override def getResult(accumulator: Long) = accumulator
110 |
111 | // accum1 + accum2 = a bigger accumulator
112 | override def merge(a: Long, b: Long) = a + b
113 | }
114 |
115 | def demoCountByWindow_v3(): Unit = {
116 | val registrationsPerThreeSeconds: DataStream[Long] = threeSecondsTumblingWindow.aggregate(new CountByWindowV3)
117 | registrationsPerThreeSeconds.print()
118 | env.execute()
119 | }
120 |
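  // [Added sketch, not part of the original lesson] The AggregateFunction above loses the window bounds in its output.
  // The aggregate method also has an overload taking a ProcessAllWindowFunction applied to the pre-aggregated value,
  // so the count can be tagged with its window. Assumes the same threeSecondsTumblingWindow and CountByWindowV3.
  def demoCountByWindow_v4(): Unit = {
    val registrationsPerThreeSeconds: DataStream[String] = threeSecondsTumblingWindow.aggregate(
      new CountByWindowV3, // incremental pre-aggregation: one Long per window
      new ProcessAllWindowFunction[Long, String, TimeWindow] {
        override def process(context: Context, elements: Iterable[Long], out: Collector[String]): Unit =
          out.collect(s"Window [${context.window.getStart} - ${context.window.getEnd}] ${elements.head}")
      }
    )
    registrationsPerThreeSeconds.print()
    env.execute()
  }
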
121 | /**
122 | * Keyed streams and window functions
123 | */
124 | // each element will be assigned to a "mini-stream" for its own key
125 | val streamByType: KeyedStream[ServerEvent, String] = eventStream.keyBy(e => e.getClass.getSimpleName)
126 |
127 | // for every key, we'll have a separate window allocation
128 | val threeSecondsTumblingWindowByType = streamByType.window(TumblingEventTimeWindows.of(Time.seconds(3)))
129 |
130 | /*
131 | === Registration Events Stream ===
132 | |----0----|----1----|--------2--------|--------3--------|---------4---------|---5---|--------6--------|---7---|--------8--------|--9--|------10-------|------11------|
133 | | | | bob registered | sam registered | rob registered | | mary registered | | carl registered | | | |
134 | | | | | | alice registered | | | | | | | |
135 | ^|------------ window one ----------- + -------------- window two ----------------- + ------------- window three -------------- + ----------- window four ----------|^
136 | | 1 registration | 3 registrations | 2 registrations | 0 registrations |
137 | | 1643760000000 - 1643760003000 | 1643760003000 - 1643760006000 | 1643760006000 - 1643760009000 | 1643760009000 - 1643760012000 |
138 |
139 | === Online Events Stream ===
140 | |----0----|----1----|--------2--------|--------3--------|---------4---------|---5---|--------6--------|---7---|--------8--------|--9--|------10-------|------11------|
141 | | | | bob online | | sam online | | mary online | | | | rob online | carl online |
142 | | | | | | | | | | | | alice online | |
143 | ^|------------ window one ----------- + -------------- window two ----------------- + ------------- window three -------------- + ----------- window four ----------|^
144 | | 1 online | 1 online | 1 online | 3 online |
145 |      |        1643760000000 - 1643760003000  | 1643760003000 - 1643760006000  |     1643760006000 - 1643760009000       |   1643760009000 - 1643760012000      |
146 | */
147 |
148 | class CountByWindow extends WindowFunction[ServerEvent, String, String, TimeWindow] {
149 | override def apply(key: String, window: TimeWindow, input: Iterable[ServerEvent], out: Collector[String]): Unit =
150 | out.collect(s"$key: $window, ${input.size}")
151 | }
152 |
153 | def demoCountByTypeByWindow(): Unit = {
154 | val finalStream = threeSecondsTumblingWindowByType.apply(new CountByWindow)
155 | finalStream.print()
156 | env.execute()
157 | }
158 |
159 | // alternative: process function for windows
160 | class CountByWindowV2 extends ProcessWindowFunction[ServerEvent, String, String, TimeWindow] {
161 | override def process(key: String, context: Context, elements: Iterable[ServerEvent], out: Collector[String]): Unit =
162 | out.collect(s"$key: ${context.window}, ${elements.size}")
163 | }
164 |
165 | def demoCountByTypeByWindow_v2(): Unit = {
166 | val finalStream = threeSecondsTumblingWindowByType.process(new CountByWindowV2)
167 | finalStream.print()
168 | env.execute()
169 | }
170 |
171 | // one task processes all the data for a particular key
172 |
173 | /**
174 | * Sliding Windows
175 | */
176 |
177 | // how many players were registered every 3 seconds, UPDATED EVERY 1s?
178 | // [0s...3s] [1s...4s] [2s...5s] ...
179 |
180 | /*
181 | |----0----|----1----|--------2--------|--------3--------|---------4---------|---5---|--------6--------|---7---|--------8--------|--9--|------10-------|------11------|
182 | | | | bob registered | sam registered | sam online | | mary registered | | carl registered | | rob online | carl online |
183 | | | | bob online | | rob registered | | mary online | | | | alice online | |
184 | | | | | | alice registered | | | | | | | |
185 | ^|------------ window one ----------- +
186 | 1 registration
187 |
188 | + ---------------- window two --------------- +
189 | 2 registrations
190 |
191 | + ------------------- window three ------------------- +
192 | 4 registrations
193 |
194 | + ---------------- window four --------------- +
195 | 3 registrations
196 |
197 | + ---------------- window five -------------- +
198 | 3 registrations
199 |
200 | + ---------- window six -------- +
201 | 1 registration
202 |
203 | + ------------ window seven ----------- +
204 | 2 registrations
205 |
206 | + ------- window eight------- +
207 | 1 registration
208 |
209 | + ----------- window nine ----------- +
210 | 1 registration
211 |
212 | + ---------- window ten --------- +
213 | 0 registrations
214 | */
215 |
216 | def demoSlidingAllWindows(): Unit = {
217 | val windowSize: Time = Time.seconds(3)
218 | val slidingTime: Time = Time.seconds(1)
219 |
220 | val slidingWindowsAll = eventStream.windowAll(SlidingEventTimeWindows.of(windowSize, slidingTime))
221 |
222 | // process the windowed stream with similar window functions
223 | val registrationCountByWindow = slidingWindowsAll.apply(new CountByWindowAll)
224 |
225 | // similar to the other example
226 | registrationCountByWindow.print()
227 | env.execute()
228 | }
229 |
230 | /**
231 | * Session windows = groups of events with NO MORE THAN a certain time gap in between them
232 | */
233 | // how many registration events do we have NO MORE THAN 1s apart?
234 | /*
235 | |----0----|----1----|--------2--------|--------3--------|---------4---------|---5---|--------6--------|---7---|--------8--------|--9--|------10-------|------11------|
236 | | | | bob registered | sam registered | sam online | | mary registered | | carl registered | | rob online | |
237 | | | | bob online | | rob registered | | mary online | | | | alice online | |
238 | | | | | | alice registered | | | | | | carl online | |
239 |
240 | after filtering:
241 |
242 | +---------+---------+-----------------+-----------------+-------------------+-------+-----------------+-------+-----------------+-----+---------------+--------------+
243 | | | | bob registered | sam registered | rob registered | | mary registered | | carl registered | | N/A | |
244 | | | | | | alice registered | | | | | | | |
245 | ^ ----------------- window 1 -------------------------- ^ ^ -- window 2 --- ^ ^ -- window 3 --- ^ ^ -- window 4 - ^
246 | */
247 |
248 | def demoSessionWindows(): Unit = {
249 | val groupBySessionWindows = eventStream.windowAll(EventTimeSessionWindows.withGap(Time.seconds(1)))
250 |
251 |     // apply any kind of window function
252 | val countBySessionWindows = groupBySessionWindows.apply(new CountByWindowAll)
253 |
254 | // same things as before
255 | countBySessionWindows.print()
256 | env.execute()
257 | }
258 |
259 | /**
260 | * Global window
261 | */
262 | // how many registration events do we have every 10 events
263 |
264 | class CountByGlobalWindowAll extends AllWindowFunction[ServerEvent, String, GlobalWindow] {
265 | // ^ input ^ output ^ window type
266 | override def apply(window: GlobalWindow, input: Iterable[ServerEvent], out: Collector[String]): Unit = {
267 | val registrationEventCount = input.count(event => event.isInstanceOf[PlayerRegistered])
268 | out.collect(s"Window [$window] $registrationEventCount")
269 | }
270 | }
271 |
272 | def demoGlobalWindow(): Unit = {
273 | val globalWindowEvents = eventStream
274 | .windowAll(GlobalWindows.create())
275 | .trigger(CountTrigger.of[GlobalWindow](10))
276 | .apply(new CountByGlobalWindowAll)
277 |
278 | globalWindowEvents.print()
279 | env.execute()
280 | }
281 |
282 | /**
283 | * Exercise: what was the time window (continuous 2s) when we had THE MOST registration events?
284 | * - what kind of window functions should we use? ALL WINDOW FUNCTION
285 | * - what kind of windows should we use? SLIDING WINDOWS
286 | */
287 | class KeepWindowAndCountFunction extends AllWindowFunction[ServerEvent, (TimeWindow, Long), TimeWindow] {
288 | override def apply(window: TimeWindow, input: Iterable[ServerEvent], out: Collector[(TimeWindow, Long)]): Unit =
289 | out.collect((window, input.size))
290 | }
291 |
292 | def windowFunctionsExercise(): Unit = {
293 | val slidingWindows: DataStream[(TimeWindow, Long)] = eventStream
294 | .filter(_.isInstanceOf[PlayerRegistered])
295 | .windowAll(SlidingEventTimeWindows.of(Time.seconds(2), Time.seconds(1)))
296 | .apply(new KeepWindowAndCountFunction)
297 |
298 | val localWindows: List[(TimeWindow, Long)] = slidingWindows.executeAndCollect().toList
299 | val bestWindow: (TimeWindow, Long) = localWindows.maxBy(_._2)
300 | println(s"The best window is ${bestWindow._1} with ${bestWindow._2} registration events.")
301 | }
302 |
303 | def main(args: Array[String]): Unit = {
304 | windowFunctionsExercise()
305 | }
306 | }
307 |
--------------------------------------------------------------------------------
/src/main/scala/part3state/BroadcastState.scala:
--------------------------------------------------------------------------------
1 | package part3state
2 |
3 | import generators.shopping._
4 | import org.apache.flink.api.common.state.MapStateDescriptor
5 | import org.apache.flink.streaming.api.datastream.BroadcastStream
6 | import org.apache.flink.streaming.api.functions.co.KeyedBroadcastProcessFunction
7 | import org.apache.flink.streaming.api.functions.source.SourceFunction
8 | import org.apache.flink.streaming.api.scala._
9 | import org.apache.flink.util.Collector
10 |
11 | object BroadcastState {
12 |
13 | val env = StreamExecutionEnvironment.getExecutionEnvironment
14 | val shoppingCartEvents = env.addSource(new SingleShoppingCartEventsGenerator(100))
15 | val eventsByUser = shoppingCartEvents.keyBy(_.userId)
16 |
17 | // issue a warning if quantity > threshold
18 | def purchaseWarnings(): Unit = {
19 | val threshold = 2
20 |
21 | val notificationsStream = eventsByUser
22 | .filter(_.isInstanceOf[AddToShoppingCartEvent])
23 | .filter(_.asInstanceOf[AddToShoppingCartEvent].quantity > threshold)
24 | .map(event => event match {
25 | case AddToShoppingCartEvent(userId, sku, quantity, _) =>
26 | s"User $userId attempting to purchase $quantity items of $sku when threshold is $threshold"
27 | case _ => ""
28 | })
29 |
30 | notificationsStream.print()
31 | env.execute()
32 | }
33 |
34 |   // what if the threshold CHANGES over time?
35 | // thresholds will be BROADCAST
36 |
37 | def changingThresholds(): Unit = {
38 | val thresholds: DataStream[Int] = env.addSource(new SourceFunction[Int] {
39 | override def run(ctx: SourceFunction.SourceContext[Int]) =
40 | List(2,0,4,5,6,3).foreach { newThreshold =>
41 | Thread.sleep(1000)
42 | ctx.collect(newThreshold)
43 | }
44 |
45 | override def cancel() = ()
46 | })
47 |
48 | // broadcast state is ALWAYS a map
49 | val broadcastStateDescriptor = new MapStateDescriptor[String, Int]("thresholds", classOf[String], classOf[Int])
50 | val broadcastThresholds: BroadcastStream[Int] = thresholds.broadcast(broadcastStateDescriptor)
51 |
52 | val notificationsStream = eventsByUser
53 | .connect(broadcastThresholds)
54 | .process(new KeyedBroadcastProcessFunction[String, ShoppingCartEvent, Int, String] {
55 | // ^ key ^ first event ^ broadcast ^ output
56 | val thresholdsDescriptor = new MapStateDescriptor[String, Int]("thresholds", classOf[String], classOf[Int])
57 |
58 | override def processBroadcastElement(
59 | newThreshold: Int,
60 | ctx: KeyedBroadcastProcessFunction[String, ShoppingCartEvent, Int, String]#Context,
61 | out: Collector[String]
62 | ) = {
63 | println(s"Threshold about to be changed -- $newThreshold")
64 | // fetch the broadcast state = distributed variable
65 | val stateThresholds = ctx.getBroadcastState(thresholdsDescriptor)
66 | // update the state
67 | stateThresholds.put("quantity-threshold", newThreshold)
68 | }
69 |
70 |
71 | override def processElement(
72 | event: ShoppingCartEvent,
73 | ctx: KeyedBroadcastProcessFunction[String, ShoppingCartEvent, Int, String]#ReadOnlyContext,
74 | out: Collector[String]
75 | ) = {
76 | event match {
77 | case AddToShoppingCartEvent(userId, sku, quantity, time) =>
78 | val currentThreshold: Int = ctx.getBroadcastState(thresholdsDescriptor).get("quantity-threshold")
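                // [Added note] before the first broadcast element arrives, the map has no "quantity-threshold" entry;
                // get(...) then returns null, which Scala unboxes to 0 here, so early AddToCart events may all warn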
79 | if (quantity > currentThreshold)
80 | out.collect(s"User $userId attempting to purchase $quantity items of $sku when threshold is $currentThreshold")
81 | case _ =>
82 | }
83 | }
84 | })
85 |
86 |
87 | notificationsStream.print()
88 | env.execute()
89 | }
90 |
91 | def main(args: Array[String]): Unit = {
92 | changingThresholds()
93 | }
94 | }
95 |
--------------------------------------------------------------------------------
/src/main/scala/part3state/Checkpoints.scala:
--------------------------------------------------------------------------------
1 | package part3state
2 |
3 | import generators.shopping._
4 | import org.apache.flink.api.common.functions.FlatMapFunction
5 | import org.apache.flink.api.common.state.{CheckpointListener, ValueState, ValueStateDescriptor}
6 | import org.apache.flink.runtime.state.{FunctionInitializationContext, FunctionSnapshotContext}
7 | import org.apache.flink.streaming.api.checkpoint.CheckpointedFunction
8 | import org.apache.flink.streaming.api.scala._
9 | import org.apache.flink.util.Collector
10 |
11 | object Checkpoints {
12 |
13 | val env = StreamExecutionEnvironment.getExecutionEnvironment
14 |
15 | // set checkpoint intervals
16 | env.getCheckpointConfig.setCheckpointInterval(5000) // a checkpoint triggered every 5s
17 | // set checkpoint storage
18 | env.getCheckpointConfig.setCheckpointStorage("file:///Users/daniel/dev/rockthejvm/courses/flink-essentials/checkpoints")
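  // [Added sketch, not part of the original lesson] Other commonly used settings on the same CheckpointConfig, left
  // commented out to keep the lesson's behavior (CheckpointingMode needs import org.apache.flink.streaming.api.CheckpointingMode):
  // env.getCheckpointConfig.setCheckpointingMode(CheckpointingMode.EXACTLY_ONCE) // exactly-once is the default guarantee
  // env.getCheckpointConfig.setMinPauseBetweenCheckpoints(1000)                  // wait at least 1s after a checkpoint completes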
19 |
20 | /*
21 | Keep track of the NUMBER OF AddToCart events PER USER, when quantity > a threshold (e.g. managing stock)
22 | Persist the data (state) via checkpoints
23 | */
24 |
25 | val shoppingCartEvents =
26 | env.addSource(new SingleShoppingCartEventsGenerator(sleepMillisBetweenEvents = 100, generateRemoved = true))
27 |
28 | val eventsByUser = shoppingCartEvents
29 | .keyBy(_.userId)
30 | .flatMap(new HighQuantityCheckpointedFunction(5))
31 |
32 |
33 | def main(args: Array[String]): Unit = {
34 | eventsByUser.print()
35 | env.execute()
36 | }
37 | }
38 |
39 | class HighQuantityCheckpointedFunction(val threshold: Long)
40 | extends FlatMapFunction[ShoppingCartEvent, (String, Long)]
41 | with CheckpointedFunction
42 | with CheckpointListener {
43 |
44 | var stateCount: ValueState[Long] = _ // instantiated PER KEY
45 |
46 | override def flatMap(event: ShoppingCartEvent, out: Collector[(String, Long)]): Unit =
47 | event match {
48 | case AddToShoppingCartEvent(userId, _, quantity, _) =>
49 | if (quantity > threshold) {
50 | // update state
51 | val newUserEventCount = stateCount.value() + 1
52 | stateCount.update(newUserEventCount)
53 |
54 | // push output
55 | out.collect((userId, newUserEventCount))
56 | }
57 | case _ => // do nothing
58 | }
59 |
60 | // invoked when the checkpoint is TRIGGERED
61 | override def snapshotState(context: FunctionSnapshotContext): Unit =
62 | println(s"CHECKPOINT AT ${context.getCheckpointTimestamp}")
63 |
64 | // lifecycle method to initialize state (~ open() in RichFunctions)
65 | override def initializeState(context: FunctionInitializationContext): Unit = {
66 | val stateCountDescriptor = new ValueStateDescriptor[Long]("impossibleOrderCount", classOf[Long])
67 | stateCount = context.getKeyedStateStore.getState(stateCountDescriptor)
68 | }
69 |
70 | override def notifyCheckpointComplete(checkpointId: Long): Unit = ()
71 | override def notifyCheckpointAborted(checkpointId: Long): Unit = ()
72 | }
73 |
--------------------------------------------------------------------------------
/src/main/scala/part3state/KeyedState.scala:
--------------------------------------------------------------------------------
1 | package part3state
2 |
3 | import generators.shopping._
4 | import org.apache.flink.api.common.state.{ListState, ListStateDescriptor, MapState, MapStateDescriptor, StateTtlConfig, ValueState, ValueStateDescriptor}
5 | import org.apache.flink.api.common.time.Time
6 | import org.apache.flink.configuration.Configuration
7 | import org.apache.flink.streaming.api.functions.KeyedProcessFunction
8 | import org.apache.flink.streaming.api.scala._
9 | import org.apache.flink.util.Collector
10 |
11 | object KeyedState {
12 |
13 | val env = StreamExecutionEnvironment.getExecutionEnvironment
14 | val shoppingCartEvents = env.addSource(
15 | new SingleShoppingCartEventsGenerator(
16 | sleepMillisBetweenEvents = 100, // ~ 10 events/s
17 | generateRemoved = true
18 | )
19 | )
20 |
21 | val eventsPerUser: KeyedStream[ShoppingCartEvent, String] = shoppingCartEvents.keyBy(_.userId)
22 |
23 | def demoValueState(): Unit = {
24 | /*
25 | How many events PER USER have been generated?
26 | */
27 |
28 | val numEventsPerUserNaive = eventsPerUser.process(
29 | new KeyedProcessFunction[String, ShoppingCartEvent, String] { // instantiated ONCE PER KEY
30 | // ^ key ^ event ^ result
31 |
32 | var nEventsForThisUser = 0
33 |
34 | override def processElement(
35 | value: ShoppingCartEvent,
36 | ctx: KeyedProcessFunction[String, ShoppingCartEvent, String]#Context,
37 | out: Collector[String]
38 | ): Unit = {
39 | nEventsForThisUser += 1
40 | out.collect(s"User ${value.userId} - $nEventsForThisUser")
41 | }
42 | }
43 | )
44 |
45 | /*
46 | Problems with local vars
47 | - they are local, so other nodes don't see them
48 | - if a node crashes, the var disappears
49 | */
50 |
51 | val numEventsPerUserStream = eventsPerUser.process(
52 | new KeyedProcessFunction[String, ShoppingCartEvent, String] {
53 |
54 | // can call .value to get current state
55 | // can call .update(newValue) to overwrite
56 | var stateCounter: ValueState[Long] = _ // a value state per key=userId
57 |
58 | override def open(parameters: Configuration): Unit = {
59 | // initialize all state
60 | stateCounter = getRuntimeContext // from RichFunction
61 | .getState(new ValueStateDescriptor[Long]("events-counter", classOf[Long]))
62 | }
63 |
64 | override def processElement(
65 | value: ShoppingCartEvent,
66 | ctx: KeyedProcessFunction[String, ShoppingCartEvent, String]#Context,
67 | out: Collector[String]
68 | ) = {
69 | val nEventsForThisUser = stateCounter.value()
70 | stateCounter.update(nEventsForThisUser + 1)
71 | out.collect(s"User ${value.userId} - ${nEventsForThisUser + 1}")
72 | }
73 | }
74 | )
75 |
76 | numEventsPerUserStream.print()
77 | env.execute()
78 | }
79 |
80 | // ListState
81 | def demoListState(): Unit = {
82 | // store all the events per user id
83 | val allEventsPerUserStream = eventsPerUser.process(
84 | new KeyedProcessFunction[String, ShoppingCartEvent, String] {
85 | // create state here
86 | /*
87 | Capabilities
88 | - add(value)
89 | - addAll(list)
90 | - update(new list) - overwriting
91 | - get()
92 | */
93 | var stateEventsForUser: ListState[ShoppingCartEvent] = _ // once per key
94 | // you need to be careful to keep the size of the list BOUNDED
95 |
96 | // initialization of state here
97 | override def open(parameters: Configuration): Unit =
98 | stateEventsForUser = getRuntimeContext.getListState(
99 | new ListStateDescriptor[ShoppingCartEvent]("shopping-cart-events", classOf[ShoppingCartEvent])
100 | )
101 |
102 | override def processElement(
103 | event: ShoppingCartEvent,
104 | ctx: KeyedProcessFunction[String, ShoppingCartEvent, String]#Context,
105 | out: Collector[String]
106 | ) = {
107 | stateEventsForUser.add(event)
108 | // import the Scala converters for collections
109 | // Scala 2.12
110 | import scala.collection.JavaConverters._ // implicit converters (extension methods)
111 | // Scala 2.13 & Scala 3
112 | // import scala.jdk.CollectionConverters._
113 |
114 | val currentEvents: Iterable[ShoppingCartEvent] = stateEventsForUser.get() // does not return a plain List, but a Java Iterable
115 | .asScala // convert to a Scala Iterable
116 |
117 | out.collect(s"User ${event.userId} - [${currentEvents.mkString(", ")}]")
118 | }
119 | }
120 | )
121 |
122 | allEventsPerUserStream.print()
123 | env.execute()
124 | }
125 |
126 | // MapState
127 | def demoMapState(): Unit = {
128 | // count how many events PER TYPE were ingested PER USER
129 | val streamOfCountsPerType = eventsPerUser.process(
130 | new KeyedProcessFunction[String, ShoppingCartEvent, String] {
131 | // Scala collection converters
132 | import scala.collection.JavaConverters._ // implicit converters (extension methods)
133 |
134 | // create the state
135 | var stateCountsPerEventType: MapState[String, Long] = _ // keep this bounded
136 |
137 | // initialize the state
138 | override def open(parameters: Configuration): Unit = {
139 | stateCountsPerEventType = getRuntimeContext.getMapState(
140 | new MapStateDescriptor[String, Long](
141 | "per-type-counter",
142 | classOf[String],
143 | classOf[Long]
144 | )
145 | )
146 | }
147 |
148 | override def processElement(
149 | event: ShoppingCartEvent,
150 | ctx: KeyedProcessFunction[String, ShoppingCartEvent, String]#Context,
151 | out: Collector[String]
152 | ) = {
153 | // fetch the type of the event
154 | val eventType = event.getClass.getSimpleName
155 | // updating the state
156 | if (stateCountsPerEventType.contains(eventType)) {
157 | val oldCount = stateCountsPerEventType.get(eventType)
158 | val newCount = oldCount + 1
159 | stateCountsPerEventType.put(eventType, newCount)
160 | } else {
161 | stateCountsPerEventType.put(eventType, 1)
162 | }
163 |
164 | // push some output
165 | out.collect(s"${ctx.getCurrentKey} - ${stateCountsPerEventType.entries().asScala.mkString(", ")}")
166 | }
167 | }
168 | )
169 |
170 | streamOfCountsPerType.print()
171 | env.execute()
172 | }
173 |
174 | // clear the state manually
175 | // clear the state at a regular interval
176 |
177 | def demoListStateWithClearance(): Unit = {
178 | val allEventsPerUserStream = eventsPerUser.process(
179 | new KeyedProcessFunction[String, ShoppingCartEvent, String] {
180 | import scala.collection.JavaConverters._ // implicit converters (extension methods)
181 |
182 | // if more than 10 elements, clear the list
183 | var stateEventsForUser: ListState[ShoppingCartEvent] = _
184 |
185 | // initialization of state here
186 | override def open(parameters: Configuration): Unit = {
187 | val descriptor = new ListStateDescriptor[ShoppingCartEvent]("shopping-cart-events", classOf[ShoppingCartEvent])
188 | // time to live = cleared if it's not modified after a certain time
189 | descriptor.enableTimeToLive(
190 | StateTtlConfig.newBuilder(Time.hours(1)) // clears the state after 1h
191 | .setUpdateType(StateTtlConfig.UpdateType.OnCreateAndWrite) // specify when the timer resets
192 | .setStateVisibility(StateTtlConfig.StateVisibility.ReturnExpiredIfNotCleanedUp)
193 | .build()
194 | )
195 |
196 | stateEventsForUser = getRuntimeContext.getListState(descriptor)
197 |
198 | }
199 |
200 | override def processElement(
201 | event: ShoppingCartEvent,
202 | ctx: KeyedProcessFunction[String, ShoppingCartEvent, String]#Context,
203 | out: Collector[String]
204 | ) = {
205 | stateEventsForUser.add(event)
206 | val currentEvents = stateEventsForUser.get().asScala.toList
207 | if (currentEvents.size > 10)
208 | stateEventsForUser.clear() // clearing is not done immediately
209 |
210 | out.collect(s"User ${event.userId} - [${currentEvents.mkString(", ")}]")
211 | }
212 | }
213 | )
214 |
215 | allEventsPerUserStream.print()
216 | env.execute()
217 | }
218 |
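  // [Added sketch, not part of the original lesson] Clearing state at a regular interval can also be done with timers:
  // a processing-time timer is registered per key, and onTimer clears that key's state when it fires.
  // Assumes the same eventsPerUser stream; not wired into main.
  def demoValueStateWithTimerClearance(): Unit = {
    val countsWithReset = eventsPerUser.process(
      new KeyedProcessFunction[String, ShoppingCartEvent, String] {
        var stateCounter: ValueState[Long] = _

        override def open(parameters: Configuration): Unit =
          stateCounter = getRuntimeContext.getState(new ValueStateDescriptor[Long]("events-counter-with-reset", classOf[Long]))

        override def processElement(
          event: ShoppingCartEvent,
          ctx: KeyedProcessFunction[String, ShoppingCartEvent, String]#Context,
          out: Collector[String]
        ): Unit = {
          stateCounter.update(stateCounter.value() + 1)
          // schedule a cleanup for this key 5 seconds from now (processing time)
          ctx.timerService().registerProcessingTimeTimer(ctx.timerService().currentProcessingTime() + 5000)
          out.collect(s"User ${event.userId} - ${stateCounter.value()}")
        }

        override def onTimer(
          timestamp: Long,
          ctx: KeyedProcessFunction[String, ShoppingCartEvent, String]#OnTimerContext,
          out: Collector[String]
        ): Unit =
          stateCounter.clear() // reset the counter for this key
      }
    )

    countsWithReset.print()
    env.execute()
  }
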
219 | def main(args: Array[String]): Unit = {
220 | demoListStateWithClearance()
221 | }
222 | }
223 |
--------------------------------------------------------------------------------
/src/main/scala/part3state/RichFunctions.scala:
--------------------------------------------------------------------------------
1 | package part3state
2 |
3 | import generators.shopping._
4 | import org.apache.flink.api.common.functions.{FlatMapFunction, MapFunction, RichFlatMapFunction, RichMapFunction}
5 | import org.apache.flink.configuration.Configuration
6 | import org.apache.flink.streaming.api.functions.ProcessFunction
7 | import org.apache.flink.streaming.api.scala._
8 | import org.apache.flink.util.Collector
9 |
10 | object RichFunctions {
11 |
12 | val env = StreamExecutionEnvironment.getExecutionEnvironment
13 | env.setParallelism(1)
14 |
15 | val numbersStream: DataStream[Int] = env.fromElements(1,2,3,4,5,6)
16 |
17 | // pure FP
18 | val tenxNumbers: DataStream[Int] = numbersStream.map(_ * 10)
19 |
20 | // "explicit" map functions
21 | val tenxNumbers_v2: DataStream[Int] = numbersStream.map(new MapFunction[Int, Int] {
22 | override def map(value: Int) = value * 10
23 | })
24 |
25 | // Rich Map function
26 | val tenxNumbers_v3: DataStream[Int] = numbersStream.map(new RichMapFunction[Int, Int] {
27 | override def map(value: Int) = value * 10
28 | })
29 |
30 | // Rich map function + lifecycle methods
31 | val tenxNumbersWithLifecycle: DataStream[Int] = numbersStream.map(new RichMapFunction[Int, Int] {
32 | override def map(value: Int) = value * 10 // mandatory override
33 |
34 | // optional overrides: lifecycle methods open/close
35 | // called BEFORE data goes through
36 | override def open(parameters: Configuration): Unit =
37 | println("Starting my work!!")
38 |
39 | // invoked AFTER all the data
40 | override def close(): Unit =
41 | println("Finishing my work...")
42 | })
43 |
44 | // ProcessFunction - the most general function abstraction in Flink
45 | val tenxNumbersProcess: DataStream[Int] = numbersStream.process(new ProcessFunction[Int, Int] {
46 | override def processElement(value: Int, ctx: ProcessFunction[Int, Int]#Context, out: Collector[Int]) =
47 | out.collect(value * 10)
48 |
49 | // can also override the lifecycle methods
50 | override def open(parameters: Configuration): Unit =
51 | println("Process function starting")
52 |
53 | override def close(): Unit =
54 | println("Closing process function")
55 | })
56 |
57 | /**
58 | * Exercise: "explode" all purchase events to a single item
59 |    *   [("boots", 2), ("iPhone", 1)] ->
60 |    *   ["boots", "boots", "iPhone"]
61 | * - lambdas
62 | * - explicit functions
63 | * - rich functions
64 | * - process functions
65 | */
66 | def exercise(): Unit = {
67 | val exerciseEnv = StreamExecutionEnvironment.getExecutionEnvironment
68 | val shoppingCartStream: DataStream[AddToShoppingCartEvent] = exerciseEnv.addSource(new SingleShoppingCartEventsGenerator(100)) // ~10 events/s
69 | .filter(_.isInstanceOf[AddToShoppingCartEvent])
70 | .map(_.asInstanceOf[AddToShoppingCartEvent])
71 |
72 | // 1 - lambdas: flatMap
73 | val itemsPurchasedStream: DataStream[String] =
74 | shoppingCartStream.flatMap(event => (1 to event.quantity).map(_ => event.sku))
75 |
76 | // 2 - explicit flatMap function
77 | val itemsPurchasedStream_v2: DataStream[String] =
78 | shoppingCartStream.flatMap(new FlatMapFunction[AddToShoppingCartEvent, String] {
79 | override def flatMap(event: AddToShoppingCartEvent, out: Collector[String]) =
80 | (1 to event.quantity).map(_ => event.sku).foreach(out.collect)
81 | })
82 |
83 | // 3 - rich flatMap function
84 | val itemsPurchasedStream_v3: DataStream[String] =
85 | shoppingCartStream.flatMap(new RichFlatMapFunction[AddToShoppingCartEvent, String] {
86 | override def flatMap(event: AddToShoppingCartEvent, out: Collector[String]) =
87 | (1 to event.quantity).map(_ => event.sku).foreach(out.collect)
88 |
89 | override def open(parameters: Configuration): Unit =
90 | println("Processing with rich flatMap function")
91 |
92 | override def close(): Unit =
93 | println("Finishing rich flatMap function")
94 | })
95 |
96 | // 4 - process function
97 | val itemsPurchasedStream_v4: DataStream[String] =
98 | shoppingCartStream.process(new ProcessFunction[AddToShoppingCartEvent, String] {
99 | override def processElement(event: AddToShoppingCartEvent, ctx: ProcessFunction[AddToShoppingCartEvent, String]#Context, out: Collector[String]) =
100 | (1 to event.quantity).map(_ => event.sku).foreach(out.collect)
101 | })
102 |
103 | itemsPurchasedStream_v3.print()
104 | exerciseEnv.execute()
105 | }
106 |
107 |
108 | def main(args: Array[String]): Unit = {
109 | exercise()
110 | }
111 | }
112 |
--------------------------------------------------------------------------------
/src/main/scala/part4io/CassandraIntegration.scala:
--------------------------------------------------------------------------------
1 | package part4io
2 |
3 | import org.apache.flink.streaming.api.scala._
4 | import org.apache.flink.streaming.connectors.cassandra.CassandraSink
5 |
6 | object CassandraIntegration {
7 |
8 | val env = StreamExecutionEnvironment.getExecutionEnvironment
9 |
10 | case class Person(name: String, age: Int)
11 |
12 | // write data to Cassandra
13 | def demoWriteDataToCassandra(): Unit = {
14 | val people = env.fromElements(
15 | Person("Daniel", 99),
16 | Person("Alice", 12),
17 | Person("Julie", 14),
18 | Person("Mom", 54),
19 | )
20 |
21 | // we can only write TUPLES to Cassandra
22 | val personTuples: DataStream[(String, Int)] = people.map(p => (p.name, p.age))
23 |
24 | // write the data
25 | CassandraSink.addSink(personTuples) // builder pattern
26 | .setQuery("insert into rtjvm.people(name, age) values (?, ?)")
27 | .setHost("localhost")
28 | .build()
29 |
30 | env.execute()
31 | }
32 |
33 |
34 | def main(args: Array[String]): Unit = {
35 | demoWriteDataToCassandra()
36 | }
37 | }
38 |
--------------------------------------------------------------------------------
/src/main/scala/part4io/CustomSinks.scala:
--------------------------------------------------------------------------------
1 | package part4io
2 |
3 | import org.apache.flink.configuration.Configuration
4 | import org.apache.flink.streaming.api.functions.sink.{RichSinkFunction, SinkFunction}
5 | import org.apache.flink.streaming.api.scala._
6 |
7 | import java.io.{FileWriter, PrintWriter}
8 | import java.net.{ServerSocket, Socket}
9 | import java.util.Scanner
10 |
11 | object CustomSinks {
12 |
13 | val env = StreamExecutionEnvironment.getExecutionEnvironment
14 | val stringStream: DataStream[String] = env.fromElements(
15 | "This is an example of a sink function",
16 | "some other string",
17 | "Daniel says this is ok"
18 | )
19 |
20 | // push the strings to a file sink
21 |
22 | // instantiated once per thread
23 | class FileSink(path: String) extends RichSinkFunction[String] {
24 | /*
25 | - hold state
26 | - lifecycle methods
27 | */
28 |
29 | var writer: PrintWriter = _
30 |
31 | // called once per event in the datastream
32 | override def invoke(event: String, context: SinkFunction.Context): Unit = {
33 | writer.println(event)
34 | writer.flush()
35 | }
36 |
37 | override def open(parameters: Configuration): Unit = {
38 | // initialize resources
39 | writer = new PrintWriter(new FileWriter(path, true)) // append mode
40 | }
41 |
42 | override def close(): Unit = {
43 | // close resources
44 | writer.close()
45 | }
46 | }
47 |
48 | def demoFileSink(): Unit = {
49 | stringStream.addSink(new FileSink("output/demoFileSink.txt"))
50 | stringStream.print()
51 | env.execute()
52 | }
53 |
54 | /**
55 | * Create a sink function that will push data (as strings) to a socket sink.
56 | */
57 | class SocketSink(host: String, port: Int) extends RichSinkFunction[String] {
58 | var socket: Socket = _
59 | var writer: PrintWriter = _
60 |
61 | override def invoke(value: String, context: SinkFunction.Context): Unit = {
62 | writer.println(value)
63 | writer.flush()
64 | }
65 |
66 | override def open(parameters: Configuration): Unit = {
67 | socket = new Socket(host, port)
68 | writer = new PrintWriter(socket.getOutputStream)
69 | }
70 |
71 | override def close(): Unit = {
72 | socket.close() // closes the writer as well
73 | }
74 | }
75 |
76 | def demoSocketSink(): Unit = {
77 | stringStream.addSink(new SocketSink("localhost", 12345)).setParallelism(1)
78 | stringStream.print()
79 | env.execute()
80 | }
81 |
82 | def main(args: Array[String]): Unit = {
83 | demoSocketSink()
84 | }
85 | }
86 |
87 | /*
88 | - start data receiver
89 | - start flink
90 | */
91 | object DataReceiver {
92 | def main(args: Array[String]): Unit = {
93 | val server = new ServerSocket(12345)
94 | println("Waiting for Flink to connect...")
95 | val socket = server.accept()
96 | val reader = new Scanner(socket.getInputStream)
97 | println("Flink connected. Reading...")
98 |
99 | while (reader.hasNextLine) {
100 | println(s"> ${reader.nextLine()}")
101 | }
102 |
103 | socket.close()
104 | println("All data read. Closing app.")
105 | server.close()
106 | }
107 | }
108 |
--------------------------------------------------------------------------------
/src/main/scala/part4io/CustomSources.scala:
--------------------------------------------------------------------------------
1 | package part4io
2 |
3 | import org.apache.flink.configuration.Configuration
4 | import org.apache.flink.streaming.api.functions.source.{RichParallelSourceFunction, RichSourceFunction, SourceFunction}
5 | import org.apache.flink.streaming.api.scala._
6 |
7 | import java.io.PrintStream
8 | import java.net.{ServerSocket, Socket}
9 | import java.util.Scanner
10 | import scala.util.Random
11 |
12 | object CustomSources {
13 | val env = StreamExecutionEnvironment.getExecutionEnvironment
14 |
15 | // source of numbers, randomly generated
16 | class RandomNumberGeneratorSource(minEventsPerSeconds: Double)
17 | extends RichParallelSourceFunction[Long] {
18 |
19 | // create local fields/methods
20 | val maxSleepTime = (1000 / minEventsPerSeconds).toLong
21 | var isRunning: Boolean = true
22 |
23 | // called ONCE, when the function is instantiated
24 | // SourceFunction/RichSourceFunction runs on a (single) dedicated thread
25 |
26 | // Parallel function is called ONCE PER THREAD, each instance has its own thread
27 | override def run(ctx: SourceFunction.SourceContext[Long]): Unit =
28 | while (isRunning) {
29 | val sleepTime = Math.abs(Random.nextLong() % maxSleepTime)
30 | val nextNumber = Random.nextLong()
31 | Thread.sleep(sleepTime)
32 |
33 | // push something to the output
34 | ctx.collect(nextNumber)
35 | }
36 |
37 | // called at application shutdown
38 | // contract: the run method should stop immediately
39 | override def cancel(): Unit =
40 | isRunning = false
41 |
42 | // capability of lifecycle methods - initialize state ...
43 | override def open(parameters: Configuration): Unit =
44 | println(s"[${Thread.currentThread().getName}] starting source function")
45 | override def close(): Unit =
46 | println(s"[${Thread.currentThread().getName}] closing source function")
47 |
48 | // can hold state - ValueState, ListState, MapState
49 | }
50 |
51 | def demoSourceFunction(): Unit = {
52 | val numbersStream: DataStream[Long] = env.addSource(new RandomNumberGeneratorSource(10)).setParallelism(10)
53 | numbersStream.print()
54 | env.execute()
55 | }
56 |
57 | /**
58 | * Create a source function that reads data from a socket.
59 | */
60 |
61 | class SocketStringSource(host: String, port: Int) extends RichSourceFunction[String] {
62 | // whenever you manage a resource, use a RichSourceFunction
63 | var socket: Socket = _
64 | var isRunning = true
65 |
66 | override def run(ctx: SourceFunction.SourceContext[String]): Unit = {
67 | val scanner = new Scanner(socket.getInputStream)
68 | while (isRunning && scanner.hasNextLine) {
69 | ctx.collect(scanner.nextLine())
70 | }
71 | }
72 |
73 | override def cancel(): Unit =
74 | isRunning = false
75 |
76 | override def open(parameters: Configuration): Unit =
77 | socket = new Socket(host, port)
78 |
79 | override def close(): Unit =
80 | socket.close()
81 | }
82 |
83 | def demoSocketSource(): Unit = {
84 | val socketStringStream = env.addSource(new SocketStringSource("localhost", 12345))
85 | socketStringStream.print()
86 | env.execute()
87 | }
88 |
89 | def main(args: Array[String]): Unit = {
90 | demoSocketSource()
91 | }
92 | }
93 |
94 | /*
95 | - start DataSender
96 | - start Flink
97 | - DataSender -> Flink
98 | */
99 |
100 | object DataSender {
101 | def main(args: Array[String]): Unit = {
102 | val serverSocket = new ServerSocket(12345)
103 | println("Waiting for Flink to connect...")
104 |
105 | val socket = serverSocket.accept()
106 | println("Flink connected. Sending data...")
107 |
108 | val printer = new PrintStream(socket.getOutputStream)
109 | printer.println("Hello from the other side...")
110 | Thread.sleep(3000)
111 | printer.println("Almost ready...")
112 | Thread.sleep(500)
113 | (1 to 10).foreach { i =>
114 | Thread.sleep(200)
115 | printer.println(s"Number $i")
116 | }
117 |
118 | println("Data sending completed.")
119 | serverSocket.close()
120 | }
121 | }
--------------------------------------------------------------------------------
/src/main/scala/part4io/JDBCIntegration.scala:
--------------------------------------------------------------------------------
1 | package part4io
2 |
3 | import org.apache.flink.connector.jdbc.{JdbcConnectionOptions, JdbcSink, JdbcStatementBuilder}
4 | import org.apache.flink.streaming.api.scala._
5 |
6 | import java.sql.PreparedStatement
7 |
8 | object JDBCIntegration {
9 |
10 | val env = StreamExecutionEnvironment.getExecutionEnvironment
11 |
12 | case class Person(name: String, age: Int)
13 |
14 | // write data to JDBC
15 | def demoWriteToJDBC(): Unit = {
16 | val people = env.fromElements(
17 | Person("Daniel", 99),
18 | Person("Alice", 1),
19 | Person("Bob", 10),
20 | Person("Mary Jane", 43)
21 | )
22 |
23 | val jdbcSink = JdbcSink.sink[Person](
24 | // 1 - SQL statement
25 | "insert into people (name, age) values (?, ?)",
26 | new JdbcStatementBuilder[Person] { // the way to expand the wildcards with actual values
27 | override def accept(statement: PreparedStatement, person: Person): Unit = {
28 | statement.setString(1, person.name) // the first ? is replaced with person.name
29 | statement.setInt(2, person.age) // similar
30 | }
31 | },
32 | new JdbcConnectionOptions.JdbcConnectionOptionsBuilder()
33 | .withUrl("jdbc:postgresql://localhost:5432/rtjvm")
34 | .withDriverName("org.postgresql.Driver")
35 | .withUsername("docker")
36 | .withPassword("docker")
37 | .build()
38 | )
39 |
40 | // push the data through the sink
41 | people.addSink(jdbcSink)
42 | people.print()
43 | env.execute()
44 | }
45 |
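  // [Added sketch, not part of the original lesson] The same sink with explicit batching, via the JdbcSink.sink overload
  // that also takes JdbcExecutionOptions. Assumes import org.apache.flink.connector.jdbc.JdbcExecutionOptions and the same
  // Postgres container/credentials as above; not wired into main.
  def demoWriteToJDBCWithBatching(): Unit = {
    val people = env.fromElements(Person("Daniel", 99), Person("Alice", 1))

    val batchedJdbcSink = JdbcSink.sink[Person](
      "insert into people (name, age) values (?, ?)",
      new JdbcStatementBuilder[Person] {
        override def accept(statement: PreparedStatement, person: Person): Unit = {
          statement.setString(1, person.name)
          statement.setInt(2, person.age)
        }
      },
      JdbcExecutionOptions.builder()
        .withBatchSize(100)        // flush after 100 buffered records...
        .withBatchIntervalMs(2000) // ...or after 2 seconds, whichever comes first
        .withMaxRetries(3)
        .build(),
      new JdbcConnectionOptions.JdbcConnectionOptionsBuilder()
        .withUrl("jdbc:postgresql://localhost:5432/rtjvm")
        .withDriverName("org.postgresql.Driver")
        .withUsername("docker")
        .withPassword("docker")
        .build()
    )

    people.addSink(batchedJdbcSink)
    env.execute()
  }
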
46 | def main(args: Array[String]): Unit = {
47 | demoWriteToJDBC()
48 | }
49 | }
50 |
--------------------------------------------------------------------------------
/src/main/scala/part4io/KafkaIntegration.scala:
--------------------------------------------------------------------------------
1 | package part4io
2 |
3 | import org.apache.commons.lang3.CharSet
4 | import org.apache.flink.api.common.eventtime.WatermarkStrategy
5 | import org.apache.flink.api.common.serialization.{DeserializationSchema, SerializationSchema, SimpleStringSchema}
6 | import org.apache.flink.api.common.typeinfo.TypeInformation
7 | import org.apache.flink.connector.kafka.source.KafkaSource
8 | import org.apache.flink.connector.kafka.source.enumerator.initializer.OffsetsInitializer
9 | import org.apache.flink.streaming.api.scala._
10 | import org.apache.flink.streaming.connectors.kafka.FlinkKafkaProducer
11 |
12 | object KafkaIntegration {
13 |
14 | val env = StreamExecutionEnvironment.getExecutionEnvironment
15 |
16 | // read simple data (strings) from a Kafka topic
17 | def readStrings(): Unit = {
18 | val kafkaSource = KafkaSource.builder[String]()
19 | .setBootstrapServers("localhost:9092")
20 | .setTopics("events")
21 | .setGroupId("events-group")
22 | .setStartingOffsets(OffsetsInitializer.earliest())
23 | .setValueOnlyDeserializer(new SimpleStringSchema())
24 | .build()
25 |
26 | val kafkaStrings: DataStream[String] = env.fromSource(kafkaSource, WatermarkStrategy.noWatermarks(), "Kafka Source")
27 |
28 | // use the DS
29 | kafkaStrings.print()
30 | env.execute()
31 | }
32 |
33 | // read custom data
34 | case class Person(name: String, age: Int)
35 | class PersonDeserializer extends DeserializationSchema[Person] {
36 | override def deserialize(message: Array[Byte]): Person = {
37 | // format: name,age
38 | val string = new String(message)
39 | val tokens = string.split(",")
40 | val name = tokens(0)
41 | val age = tokens(1)
42 | Person(name, age.toInt)
43 | }
44 |
45 | override def isEndOfStream(nextElement: Person): Boolean = false
46 |
47 | override def getProducedType: TypeInformation[Person] = implicitly[TypeInformation[Person]]
48 | }
49 |
50 | def readCustomData(): Unit = {
51 | val kafkaSource = KafkaSource.builder[Person]()
52 | .setBootstrapServers("localhost:9092")
53 | .setTopics("people")
54 | .setGroupId("people-group")
55 | .setStartingOffsets(OffsetsInitializer.earliest())
56 | .setValueOnlyDeserializer(new PersonDeserializer)
57 | .build()
58 |
59 | val kafkaPeople: DataStream[Person] = env.fromSource(kafkaSource, WatermarkStrategy.noWatermarks(), "Kafka Source")
60 |
61 | // use the DS
62 | kafkaPeople.print()
63 | env.execute()
64 | }
65 |
66 | // write custom data
67 | // need serializer
68 | class PersonSerializer extends SerializationSchema[Person] {
69 | override def serialize(person: Person): Array[Byte] =
70 | s"${person.name},${person.age}".getBytes("UTF-8")
71 | }
72 |
73 | def writeCustomData(): Unit = {
74 | val kafkaSink = new FlinkKafkaProducer[Person](
75 | "localhost:9092", // bootstrap server
76 | "people", // topic
77 | new PersonSerializer
78 | )
79 |
80 | val peopleStream = env.fromElements(
81 | Person("Alice", 10),
82 | Person("Bob", 11),
83 | Person("Charlie", 12),
84 | )
85 |
86 | peopleStream.addSink(kafkaSink)
87 | peopleStream.print()
88 | env.execute()
89 | }
90 |
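  // [Added sketch, not part of the original lesson] Writing plain strings is the simpler counterpart of readStrings above:
  // a FlinkKafkaProducer with the SimpleStringSchema already imported in this file. Assumes the same local Kafka broker
  // and the "events" topic; not wired into main.
  def writeStrings(): Unit = {
    val stringSink = new FlinkKafkaProducer[String](
      "localhost:9092",        // bootstrap server
      "events",                // topic
      new SimpleStringSchema() // serializes each String as UTF-8 bytes
    )

    val stringsStream = env.fromElements("hello", "from", "Flink")
    stringsStream.addSink(stringSink)
    env.execute()
  }
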
91 | def main(args: Array[String]): Unit = {
92 | writeCustomData()
93 | }
94 | }
95 |
--------------------------------------------------------------------------------
/src/main/scala/part4io/SideOutputs.scala:
--------------------------------------------------------------------------------
1 | package part4io
2 |
3 | import generators.shopping._
4 | import org.apache.flink.streaming.api.functions.ProcessFunction
5 | import org.apache.flink.streaming.api.scala._
6 | import org.apache.flink.util.Collector
7 |
8 | object SideOutputs {
9 |
10 | // shopping cart events
11 | // process this in 2 different ways with the same function
12 | // e.g. events for user "Alice", and all the events of everyone else
13 |
14 | val env = StreamExecutionEnvironment.getExecutionEnvironment
15 | val shoppingCartEvents = env.addSource(new SingleShoppingCartEventsGenerator(100))
16 |
17 | // output tags - only available for ProcessFunctions
18 | val aliceTag = new OutputTag[ShoppingCartEvent]("alice-events") // name should be unique
19 |
20 | class AliceEventsFunction extends ProcessFunction[ShoppingCartEvent, ShoppingCartEvent] {
21 | override def processElement(
22 | event: ShoppingCartEvent,
23 | ctx: ProcessFunction[ShoppingCartEvent, ShoppingCartEvent]#Context,
24 | out: Collector[ShoppingCartEvent] // "primary" destination
25 | ): Unit = {
26 | if (event.userId == "Alice") {
27 | ctx.output(aliceTag, event) // collecting an event through a secondary destination
28 | } else {
29 | out.collect(event)
30 | }
31 | }
32 | }
33 |
34 | def demoSideOutput(): Unit = {
35 | val allEventsButAlices: DataStream[ShoppingCartEvent] = shoppingCartEvents.process(new AliceEventsFunction)
36 | val alicesEvents: DataStream[ShoppingCartEvent] = allEventsButAlices.getSideOutput(aliceTag)
37 |
38 | // process the datastreams separately
39 | alicesEvents.print()
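    // [Added note] the primary output is still available as its own stream, e.g. allEventsButAlices.print()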
40 | env.execute()
41 | }
42 |
43 | def main(args: Array[String]): Unit = {
44 | demoSideOutput()
45 | }
46 | }
47 |
--------------------------------------------------------------------------------
/src/main/scala/playground/Playground.scala:
--------------------------------------------------------------------------------
1 | package playground
2 |
3 | import org.apache.flink.streaming.api.scala._
4 |
5 | /**
6 | * Probably the simplest Flink application imaginable.
7 | * Run this app when you first download the repository of the course.
8 | * If the app compiles, runs and prints something, then Flink is installed in your project and you're good to go.
9 | *
10 | * Feel free to modify this app as you see fit. Practice and play with the concepts you learn in the course.
11 | */
12 | object Playground {
13 |
14 | def main(args: Array[String]): Unit = {
15 | val env = StreamExecutionEnvironment.getExecutionEnvironment
16 | val data = env.fromElements(1 to 1000: _*)
17 | data.print()
18 | env.execute()
19 | }
20 | }
21 |
--------------------------------------------------------------------------------