├── pyflink-doc
├── requirements.txt
├── requirements-dev.txt
└── data.py
├── learning-materials
├── cep
│ └── .gitkeep
└── table_sql
│ └── .gitkeep
├── sql-cookbook
├── requirements.txt
├── requirements-dev.txt
├── note.sh
├── src
│ └── python_udf.py
├── compose-kafka.yml
└── Dockerfile
├── .vscode
├── ltex.dictionary.en-US.txt
└── settings.json
├── sql-training
├── client-image
│ ├── VERSION
│ ├── sql-client
│ │ └── sql-client.sh
│ ├── conf
│ │ └── flink-conf.yaml
│ └── java
│ │ ├── sql-training-data-producer
│ │ └── src
│ │ │ └── main
│ │ │ └── java
│ │ │ └── com
│ │ │ └── ververica
│ │ │ └── sql_training
│ │ │ └── data_producer
│ │ │ ├── records
│ │ │ ├── TaxiRecord.java
│ │ │ ├── DriverChange.java
│ │ │ ├── Ride.java
│ │ │ └── Fare.java
│ │ │ ├── ConsolePrinter.java
│ │ │ ├── json_serde
│ │ │ ├── JsonDeserializer.java
│ │ │ └── JsonSerializer.java
│ │ │ └── FileReader.java
│ │ └── sql-training-udfs
│ │ ├── src
│ │ └── main
│ │ │ └── java
│ │ │ └── com
│ │ │ └── ververica
│ │ │ └── sql_training
│ │ │ └── udfs
│ │ │ ├── IsInNYC.java
│ │ │ ├── ToAreaId.java
│ │ │ └── ToCoords.java
│ │ └── pom.xml
├── mysql
│ └── create_tables.sql
├── minio
│ └── data
│ │ └── .minio.sys
│ │ ├── pool.bin
│ │ └── xl.meta
│ │ ├── config
│ │ ├── config.json
│ │ │ └── xl.meta
│ │ └── iam
│ │ │ └── format.json
│ │ │ └── xl.meta
│ │ ├── buckets
│ │ ├── .usage.json
│ │ │ └── xl.meta
│ │ └── .bloomcycle.bin
│ │ │ └── xl.meta
│ │ └── format.json
└── receipts.md
├── fraud-detection
├── remote
│ ├── infra
│ │ ├── key-pair
│ │ │ └── .gitkeep
│ │ ├── providers.tf
│ │ ├── s3.tf
│ │ ├── vpc.tf
│ │ ├── data.tf
│ │ ├── ddb.tf
│ │ ├── scripts
│ │ │ └── bootstrap.sh
│ │ ├── variables.tf
│ │ └── outputs.tf
│ ├── requirements.txt
│ ├── requirements-dev.txt
│ ├── docker-compose.yml
│ ├── application_properties.json
│ ├── package
│ │ └── uber-jar-for-pyflink
│ │ │ └── src
│ │ │ └── main
│ │ │ └── resources
│ │ │ └── log4j2.properties
│ └── build.sh
└── local
│ ├── requirements.txt
│ ├── requirements-dev.txt
│ ├── configs
│ ├── sink.json
│ └── ddb.json
│ ├── compose-connect.yml
│ ├── application_properties.json
│ ├── build.sh
│ └── docker-compose.yml
├── real-time-streaming-aws
├── infra
│ ├── key-pair
│ │ └── .gitkeep
│ ├── providers.tf
│ ├── vpc.tf
│ ├── data.tf
│ ├── s3.tf
│ └── scripts
│ │ └── bootstrap.sh
├── requirements-dev.txt
├── producer
│ └── requirements.txt
├── configs
│ ├── ddb.json
│ ├── note.sh
│ └── sink.json
├── loader
│ └── application_properties.json
├── exporter
│ ├── application_properties.json
│ ├── athena.sql
│ └── flinksql.sql
├── forwarder
│ ├── application_properties.json
│ └── flinksql.sql
├── compose-ui.yml
├── build.sh
├── download.sh
├── Dockerfile
├── package
│ ├── lab2-pipeline
│ │ └── src
│ │ │ └── main
│ │ │ └── resources
│ │ │ └── log4j2.properties
│ ├── lab3-pipeline
│ │ └── src
│ │ │ └── main
│ │ │ └── resources
│ │ │ └── log4j2.properties
│ └── lab4-pipeline
│ │ └── src
│ │ └── main
│ │ └── resources
│ │ └── log4j2.properties
├── compose-extra.yml
└── compose-msk.yml
├── stream-processing-with-pyflink
├── src
│ ├── jars
│ │ └── .gitkeep
│ ├── chapter7
│ │ ├── queryable_state.py
│ │ ├── utils
│ │ │ ├── type_helper.py
│ │ │ └── model.py
│ │ ├── checkpointed_function.py
│ │ └── operator_list_state_function.py
│ ├── chapter1
│ │ └── utils
│ │ │ ├── type_helper.py
│ │ │ └── model.py
│ ├── chapter5
│ │ ├── utils
│ │ │ ├── type_helper.py
│ │ │ └── model.py
│ │ ├── rolling_sum.py
│ │ ├── basic_transformations.py
│ │ └── keyed_transformations.py
│ ├── chapter6
│ │ ├── utils
│ │ │ ├── type_helper.py
│ │ │ └── model.py
│ │ └── test_window_functions_reduce.py
│ └── manage_topics.py
├── requirements.txt
├── requirements-dev.txt
├── README.md
└── Dockerfile
├── pyflink-getting-started-on-aws
├── remote
│ ├── infra
│ │ ├── key-pair
│ │ │ └── .gitkeep
│ │ ├── providers.tf
│ │ ├── s3.tf
│ │ ├── vpc.tf
│ │ ├── data.tf
│ │ ├── scripts
│ │ │ └── bootstrap.sh
│ │ ├── variables.tf
│ │ └── outputs.tf
│ ├── requirements-dev.txt
│ ├── requirements.txt
│ ├── compose-ui.yml
│ ├── application_properties.json
│ ├── build.sh
│ ├── Dockerfile
│ ├── package
│ │ └── uber-jar-for-pyflink
│ │ │ └── src
│ │ │ └── main
│ │ │ └── resources
│ │ │ └── log4j2.properties
│ └── compose-flink.yml
└── local
│ ├── requirements.txt
│ ├── requirements-dev.txt
│ ├── build.sh
│ ├── application_properties.json
│ ├── Dockerfile
│ ├── compose-flink.yml
│ └── compose-kafka.yml
├── stream-processing-with-flink
├── gradle.properties
├── .idea
│ ├── .gitignore
│ ├── dictionaries
│ │ └── project.xml
│ ├── kotlinc.xml
│ ├── vcs.xml
│ ├── ktlint-plugin.xml
│ ├── misc.xml
│ └── gradle.xml
├── gradle
│ └── wrapper
│ │ ├── gradle-wrapper.jar
│ │ └── gradle-wrapper.properties
├── settings.gradle.kts
├── src
│ ├── main
│ │ ├── kotlin
│ │ │ └── me
│ │ │ │ └── jaehyeon
│ │ │ │ ├── smoke
│ │ │ │ ├── SmokeLevel.kt
│ │ │ │ ├── SmokeLevelSplit.kt
│ │ │ │ ├── Alert.kt
│ │ │ │ └── SmokeLevelSourceReader.kt
│ │ │ │ ├── connector
│ │ │ │ ├── Post.kt
│ │ │ │ ├── HttpSink.kt
│ │ │ │ ├── HttpSplit.kt
│ │ │ │ └── HttpSplitEnumerator.kt
│ │ │ │ ├── sensor
│ │ │ │ ├── SensorReading.kt
│ │ │ │ └── SensorSplit.kt
│ │ │ │ ├── chapter8
│ │ │ │ └── CustomConnectors.kt
│ │ │ │ ├── chapter5
│ │ │ │ ├── RollingSum.kt
│ │ │ │ ├── KeyedTransformations.kt
│ │ │ │ └── BasicTransformations.kt
│ │ │ │ ├── misc
│ │ │ │ └── ControlStreamGenerator.kt
│ │ │ │ └── chapter6
│ │ │ │ └── MarkerBasedWatermarkGeneration.kt
│ │ └── resources
│ │ │ └── simplelogger.properties
│ └── test
│ │ └── kotlin
│ │ └── me
│ │ └── jaehyeon
│ │ └── chapter1
│ │ └── AverageSensorReadingsTest.kt
├── .gitignore
└── build.gradle.kts
├── pyflink-udemy
├── requirements-16.txt
├── requirements-16-dev.txt
├── requirements-15-dev.txt
├── seller-input
│ └── sellers.csv
├── quarterly-sales-input
│ └── quarterly_sales.csv
├── requirements-15.txt
├── s3_01_tbl_env.py
├── csv-input
│ └── locale-sales.csv
├── s3_03_csv_source.py
├── s3_02_python_source.py
├── s3_05_csv_sink.py
├── s4_13_row_operations.py
├── s4_04_aggregations.py
├── s4_01_projections.py
├── s3_04_kafka_source.py
├── s4_05_producer.py
├── s4_02_filtering.py
├── s4_03_joining.py
└── s3_06_kafka_sink.py
├── flink-sql-cookbook
├── img
│ └── sql-client.gif
├── Dockerfile
├── README.md
└── docker-compose.yml
├── datorios
├── note.sh
├── docker-compose.yml
└── README.md
├── confluent-flink-101
├── notes.sh
├── Dockerfile
├── notes.md
└── compose-flink-standalone.yml
└── README.md
/pyflink-doc/requirements.txt:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/learning-materials/cep/.gitkeep:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/sql-cookbook/requirements.txt:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/learning-materials/table_sql/.gitkeep:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/.vscode/ltex.dictionary.en-US.txt:
--------------------------------------------------------------------------------
1 | Flink
2 |
--------------------------------------------------------------------------------
/sql-training/client-image/VERSION:
--------------------------------------------------------------------------------
1 | 1.0
2 |
--------------------------------------------------------------------------------
/fraud-detection/remote/infra/key-pair/.gitkeep:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/real-time-streaming-aws/infra/key-pair/.gitkeep:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/stream-processing-with-pyflink/src/jars/.gitkeep:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/fraud-detection/local/requirements.txt:
--------------------------------------------------------------------------------
1 | kafka-python==2.0.2
2 |
--------------------------------------------------------------------------------
/fraud-detection/remote/requirements.txt:
--------------------------------------------------------------------------------
1 | kafka-python==2.0.2
2 |
--------------------------------------------------------------------------------
/pyflink-getting-started-on-aws/remote/infra/key-pair/.gitkeep:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/stream-processing-with-pyflink/requirements.txt:
--------------------------------------------------------------------------------
1 | kafka-python==2.0.2
--------------------------------------------------------------------------------
/pyflink-getting-started-on-aws/local/requirements.txt:
--------------------------------------------------------------------------------
1 | kafka-python==2.0.2
--------------------------------------------------------------------------------
/stream-processing-with-flink/gradle.properties:
--------------------------------------------------------------------------------
1 | kotlin.code.style=official
2 |
--------------------------------------------------------------------------------
/pyflink-udemy/requirements-16.txt:
--------------------------------------------------------------------------------
1 | apache-flink==1.16.1
2 | kafka-python==2.0.2
3 |
--------------------------------------------------------------------------------
/pyflink-udemy/requirements-16-dev.txt:
--------------------------------------------------------------------------------
1 | -r requirements-16.txt
2 | black
3 | pytest
4 | pytest-cov
--------------------------------------------------------------------------------
/pyflink-udemy/requirements-15-dev.txt:
--------------------------------------------------------------------------------
1 | -r requirements-15.txt
2 | black==19.10b0
3 | pytest
4 | pytest-cov
--------------------------------------------------------------------------------
/stream-processing-with-flink/.idea/.gitignore:
--------------------------------------------------------------------------------
1 | # Default ignored files
2 | /shelf/
3 | /workspace.xml
4 |
--------------------------------------------------------------------------------
/pyflink-doc/requirements-dev.txt:
--------------------------------------------------------------------------------
1 | -r requirements.txt
2 | apache-flink==1.17.1
3 | black
4 | pytest
5 | pytest-cov
--------------------------------------------------------------------------------
/sql-cookbook/requirements-dev.txt:
--------------------------------------------------------------------------------
1 | -r requirements.txt
2 | apache-flink==1.17.1
3 | black
4 | pytest
5 | pytest-cov
--------------------------------------------------------------------------------
/sql-cookbook/note.sh:
--------------------------------------------------------------------------------
1 | docker build -t flink-sql-cookbook:1.17.1-scala_2.12 .
2 |
3 | docker-compose run sql-client
4 |
5 |
--------------------------------------------------------------------------------
/sql-training/mysql/create_tables.sql:
--------------------------------------------------------------------------------
1 | CREATE TABLE AreaCnts (areaId INT NOT NULL, cnt BIGINT NOT NULL, PRIMARY KEY (areaId));
2 |
--------------------------------------------------------------------------------
/fraud-detection/local/requirements-dev.txt:
--------------------------------------------------------------------------------
1 | -r requirements.txt
2 | apache-flink==1.15.2
3 | black==19.10b0
4 | pytest
5 | pytest-cov
--------------------------------------------------------------------------------
/fraud-detection/remote/requirements-dev.txt:
--------------------------------------------------------------------------------
1 | -r requirements.txt
2 | apache-flink==1.15.2
3 | black==19.10b0
4 | pytest
5 | pytest-cov
--------------------------------------------------------------------------------
/flink-sql-cookbook/img/sql-client.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jaehyeon-kim/flink-demos/HEAD/flink-sql-cookbook/img/sql-client.gif
--------------------------------------------------------------------------------
/pyflink-getting-started-on-aws/local/requirements-dev.txt:
--------------------------------------------------------------------------------
1 | -r requirements.txt
2 | apache-flink==1.15.2
3 | black==19.10b0
4 | pytest
5 | pytest-cov
--------------------------------------------------------------------------------
/stream-processing-with-pyflink/requirements-dev.txt:
--------------------------------------------------------------------------------
1 | -r requirements.txt
2 | apache-flink==1.17.1
3 | black
4 | pytest
5 | pytest-cov
6 | ipdb
7 |
--------------------------------------------------------------------------------
/pyflink-udemy/seller-input/sellers.csv:
--------------------------------------------------------------------------------
1 | id,city,state
2 | LNK,Lincoln,Nebraska
3 | OMA,Omaha,Nebraska
4 | KC,Kansas City,Missouri
5 | DEN,Denver,Colorado
--------------------------------------------------------------------------------
/real-time-streaming-aws/requirements-dev.txt:
--------------------------------------------------------------------------------
1 | -r producer/requirements.txt
2 | apache-flink==1.17.1
3 | black
4 | boto3
5 | pytest
6 | pytest-cov
7 | ipdb
--------------------------------------------------------------------------------
/pyflink-getting-started-on-aws/remote/requirements-dev.txt:
--------------------------------------------------------------------------------
1 | -r requirements.txt
2 | apache-flink==1.15.2
3 | black==19.10b0
4 | boto3
5 | pytest
6 | pytest-cov
--------------------------------------------------------------------------------
/sql-training/minio/data/.minio.sys/pool.bin/xl.meta:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jaehyeon-kim/flink-demos/HEAD/sql-training/minio/data/.minio.sys/pool.bin/xl.meta
--------------------------------------------------------------------------------
/sql-training/client-image/sql-client/sql-client.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | ${FLINK_HOME}/bin/sql-client.sh embedded -d ${FLINK_HOME}/conf/sql-client-conf.yaml -l ${SQL_CLIENT_HOME}/lib
--------------------------------------------------------------------------------
/sql-training/minio/data/.minio.sys/config/config.json/xl.meta:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jaehyeon-kim/flink-demos/HEAD/sql-training/minio/data/.minio.sys/config/config.json/xl.meta
--------------------------------------------------------------------------------
/sql-training/minio/data/.minio.sys/buckets/.usage.json/xl.meta:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jaehyeon-kim/flink-demos/HEAD/sql-training/minio/data/.minio.sys/buckets/.usage.json/xl.meta
--------------------------------------------------------------------------------
/stream-processing-with-flink/gradle/wrapper/gradle-wrapper.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jaehyeon-kim/flink-demos/HEAD/stream-processing-with-flink/gradle/wrapper/gradle-wrapper.jar
--------------------------------------------------------------------------------
/sql-training/minio/data/.minio.sys/buckets/.bloomcycle.bin/xl.meta:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jaehyeon-kim/flink-demos/HEAD/sql-training/minio/data/.minio.sys/buckets/.bloomcycle.bin/xl.meta
--------------------------------------------------------------------------------
/sql-training/minio/data/.minio.sys/config/iam/format.json/xl.meta:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jaehyeon-kim/flink-demos/HEAD/sql-training/minio/data/.minio.sys/config/iam/format.json/xl.meta
--------------------------------------------------------------------------------
/flink-sql-cookbook/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM flink:1.20.1
2 |
3 | # add faker connector
4 | RUN wget -P /opt/flink/lib/ \
5 | https://github.com/knaufk/flink-faker/releases/download/v0.5.3/flink-faker-0.5.3.jar
--------------------------------------------------------------------------------
/pyflink-udemy/quarterly-sales-input/quarterly_sales.csv:
--------------------------------------------------------------------------------
1 | seller_id,q1,q2,q3,q4
2 | LNK,10000,12300,9600,13200
3 | OMA,18100,17600,11800,15000
4 | KC,19700,18600,21800,17300
5 | DEN,18500,19600,17200,22800
--------------------------------------------------------------------------------
/stream-processing-with-flink/settings.gradle.kts:
--------------------------------------------------------------------------------
1 | plugins {
2 | id("org.gradle.toolchains.foojay-resolver-convention") version "0.8.0"
3 | }
4 | rootProject.name = "stream-processing-with-flink"
5 |
--------------------------------------------------------------------------------
/stream-processing-with-flink/.idea/dictionaries/project.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | sideoutput
5 |
6 |
7 |
--------------------------------------------------------------------------------
/real-time-streaming-aws/producer/requirements.txt:
--------------------------------------------------------------------------------
1 | # kafka-python with IAM auth support - https://github.com/dpkp/kafka-python/pull/2255
2 | https://github.com/mattoberle/kafka-python/archive/7ff323727d99e0c33a68423300e7f88a9cf3f830.tar.gz
--------------------------------------------------------------------------------
/pyflink-getting-started-on-aws/remote/requirements.txt:
--------------------------------------------------------------------------------
1 | # kafka-python with IAM auth support - https://github.com/dpkp/kafka-python/pull/2255
2 | https://github.com/mattoberle/kafka-python/archive/7ff323727d99e0c33a68423300e7f88a9cf3f830.tar.gz
--------------------------------------------------------------------------------
/stream-processing-with-flink/.idea/kotlinc.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/stream-processing-with-flink/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/pyflink-udemy/requirements-15.txt:
--------------------------------------------------------------------------------
1 | apache-flink==1.15.4
2 | # kafka-python with IAM auth support - https://github.com/dpkp/kafka-python/pull/2255
3 | https://github.com/mattoberle/kafka-python/archive/7ff323727d99e0c33a68423300e7f88a9cf3f830.tar.gz
4 |
--------------------------------------------------------------------------------
/stream-processing-with-pyflink/src/chapter7/queryable_state.py:
--------------------------------------------------------------------------------
1 | # https://flink.apache.org/2023/10/24/announcing-the-release-of-apache-flink-1.18/#important-deprecations
2 | # Queryable State is now officially deprecated and will be dropped in Flink 2.0.
3 |
--------------------------------------------------------------------------------
/sql-training/minio/data/.minio.sys/format.json:
--------------------------------------------------------------------------------
1 | {"version":"1","format":"xl-single","id":"e28d208f-a22f-4eb4-933b-2de241ef0141","xl":{"version":"3","this":"d3684101-1f69-493b-90c9-56c4e2763ebd","sets":[["d3684101-1f69-493b-90c9-56c4e2763ebd"]],"distributionAlgo":"SIPMOD+PARITY"}}
--------------------------------------------------------------------------------
/stream-processing-with-flink/.idea/ktlint-plugin.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | DISTRACT_FREE
5 | DEFAULT
6 |
7 |
--------------------------------------------------------------------------------
/stream-processing-with-flink/gradle/wrapper/gradle-wrapper.properties:
--------------------------------------------------------------------------------
1 | #Fri Nov 21 17:12:54 AEDT 2025
2 | distributionBase=GRADLE_USER_HOME
3 | distributionPath=wrapper/dists
4 | distributionUrl=https\://services.gradle.org/distributions/gradle-8.14-bin.zip
5 | zipStoreBase=GRADLE_USER_HOME
6 | zipStorePath=wrapper/dists
7 |
--------------------------------------------------------------------------------
/fraud-detection/remote/infra/providers.tf:
--------------------------------------------------------------------------------
1 | terraform {
2 | required_version = ">= 1.0.1"
3 |
4 | required_providers {
5 | aws = {
6 | source = "hashicorp/aws"
7 | version = ">= 3.72, < 5.0"
8 | }
9 |
10 | random = {
11 | source = "hashicorp/random"
12 | version = ">= 3.0.1"
13 | }
14 | }
15 | }
16 |
--------------------------------------------------------------------------------
/real-time-streaming-aws/infra/providers.tf:
--------------------------------------------------------------------------------
1 | terraform {
2 | required_version = ">= 1.0.1"
3 |
4 | required_providers {
5 | aws = {
6 | source = "hashicorp/aws"
7 | version = ">= 3.72, < 5.0"
8 | }
9 |
10 | random = {
11 | source = "hashicorp/random"
12 | version = ">= 3.0.1"
13 | }
14 | }
15 | }
16 |
--------------------------------------------------------------------------------
/real-time-streaming-aws/configs/ddb.json:
--------------------------------------------------------------------------------
1 | {
2 | "TableName": "real-time-streaming-taxi-rides",
3 | "KeySchema": [{ "AttributeName": "id", "KeyType": "HASH" }],
4 | "AttributeDefinitions": [{ "AttributeName": "id", "AttributeType": "S" }],
5 | "ProvisionedThroughput": {
6 | "ReadCapacityUnits": 1,
7 | "WriteCapacityUnits": 1
8 | }
9 | }
10 |
--------------------------------------------------------------------------------
/pyflink-getting-started-on-aws/remote/infra/providers.tf:
--------------------------------------------------------------------------------
1 | terraform {
2 | required_version = ">= 1.0.1"
3 |
4 | required_providers {
5 | aws = {
6 | source = "hashicorp/aws"
7 | version = ">= 3.72, < 5.0"
8 | }
9 |
10 | random = {
11 | source = "hashicorp/random"
12 | version = ">= 3.0.1"
13 | }
14 | }
15 | }
16 |
--------------------------------------------------------------------------------
/stream-processing-with-flink/src/main/kotlin/me/jaehyeon/smoke/SmokeLevel.kt:
--------------------------------------------------------------------------------
1 | package me.jaehyeon.smoke
2 |
3 | /**
4 | * Represents the two possible levels of smoke detection.
5 | * This is the standard, type-safe, and idiomatic way to define
6 | * a fixed set of constants in modern Kotlin.
7 | */
8 | enum class SmokeLevel {
9 | High,
10 | Low,
11 | }
12 |
--------------------------------------------------------------------------------
/sql-cookbook/src/python_udf.py:
--------------------------------------------------------------------------------
1 | from pyflink.table import DataTypes
2 | from pyflink.table.udf import udf
3 |
4 | us_cities = {"Chicago", "Portland", "Seattle", "New York"}
5 |
6 |
7 | @udf(input_types=[DataTypes.STRING(), DataTypes.FLOAT()], result_type=DataTypes.FLOAT())
8 | def to_fahr(city, temperature):
9 | return temperature if city not in us_cities else (temperature * 9.0 / 5.0) + 32.0
10 |
--------------------------------------------------------------------------------
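
A minimal sketch (not part of the repo) of how the `to_fahr` UDF above could be registered and called from Flink SQL. The `from python_udf import to_fahr` import and the inline VALUES query are assumptions for illustration; Chicago is in `us_cities`, so only its value is converted.

from pyflink.table import EnvironmentSettings, TableEnvironment

from python_udf import to_fahr  # assumes sql-cookbook/src is on PYTHONPATH

t_env = TableEnvironment.create(EnvironmentSettings.in_streaming_mode())
t_env.create_temporary_function("to_fahr", to_fahr)

# 'Chicago' gets converted to Fahrenheit, 'Berlin' is passed through unchanged.
t_env.execute_sql(
    "SELECT city, to_fahr(city, temperature) AS temperature "
    "FROM (VALUES ('Chicago', CAST(25.0 AS FLOAT)), ('Berlin', CAST(25.0 AS FLOAT))) "
    "AS t(city, temperature)"
).print()
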
/stream-processing-with-flink/.idea/misc.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
--------------------------------------------------------------------------------
/datorios/note.sh:
--------------------------------------------------------------------------------
1 | /opt/flink/examples/python/datastream/datorios
2 | cat /opt/flink/examples/python/datastream/datorios/tumbling_count_window.py
3 |
4 |
5 | docker exec
6 |
7 | ./datorios.sh my-cluster start
8 | ./datorios.sh list
9 | ./datorios.sh my-cluster flink run /flink_jobs/CarData.jar
10 | ./datorios.sh my-cluster stop
11 |
12 | ./datorios.sh my-cluster flink run \
13 | -py /opt/flink/apps/tumbling_count_window.py
14 |
--------------------------------------------------------------------------------
/real-time-streaming-aws/configs/note.sh:
--------------------------------------------------------------------------------
1 | curl -i -X POST -H "Accept:application/json" -H "Content-Type:application/json" \
2 | http://localhost:8083/connectors/ -d @configs/sink.json
3 |
4 | curl http://localhost:8083/connectors/real-time-streaming-taxi-rides-sink/status
5 |
6 | curl -X DELETE http://localhost:8083/connectors/real-time-streaming-taxi-rides-sink
7 |
8 | aws dynamodb create-table --cli-input-json file://configs/ddb.json
9 | aws dynamodb delete-table --table-name real-time-streaming-taxi-rides
--------------------------------------------------------------------------------
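
A hedged sketch (not in the repo) of the same DynamoDB table creation done with boto3 instead of the AWS CLI; it simply feeds `configs/ddb.json` to `create_table`, and the region mirrors the one used in `configs/sink.json`.

import json

import boto3

with open("configs/ddb.json") as f:
    table_spec = json.load(f)  # TableName, KeySchema, AttributeDefinitions, ProvisionedThroughput

ddb = boto3.client("dynamodb", region_name="ap-southeast-2")
ddb.create_table(**table_spec)  # equivalent to: aws dynamodb create-table --cli-input-json
ddb.get_waiter("table_exists").wait(TableName=table_spec["TableName"])
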
/stream-processing-with-flink/src/main/resources/simplelogger.properties:
--------------------------------------------------------------------------------
1 | # Default log level for all loggers
2 | org.slf4j.simpleLogger.defaultLogLevel=warn
3 |
4 | # Log level for a specific package
5 | org.slf4j.simpleLogger.log.me.jaehyeon=info
6 |
7 | # Show date and time
8 | org.slf4j.simpleLogger.showDateTime=true
9 |
10 | # Format for date and time
11 | org.slf4j.simpleLogger.dateTimeFormat=yyyy-MM-dd HH:mm:ss.SSS
12 |
13 | # Show the thread name
14 | org.slf4j.simpleLogger.showThreadName=false
--------------------------------------------------------------------------------
/confluent-flink-101/notes.sh:
--------------------------------------------------------------------------------
1 | # build docker image
2 | docker build -t=confluent-flink-101:1.15.4 .
3 |
4 | # start docker compose services and run sql client
5 | docker-compose -f compose-kafka.yml up -d
6 | docker-compose -f compose-flink-linked.yml up -d
7 | docker-compose -f compose-flink-linked.yml run sql-client
8 |
9 |
10 | docker run --rm -it --network=kafka-network bitnami/kafka:2.8.1 \
11 | /opt/bitnami/kafka/bin/kafka-topics.sh \
12 | --bootstrap-server kafka-0:9092 \
13 | --create --topic pageviews
14 |
--------------------------------------------------------------------------------
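
A rough Python alternative (not in the repo) to the `kafka-topics.sh` call above, using kafka-python's admin client; the broker address and topic name follow the notes and assume the script runs where `kafka-0:9092` is reachable (i.e. on the kafka-network).

from kafka.admin import KafkaAdminClient, NewTopic

# Create the 'pageviews' topic against the broker used in the notes above.
admin = KafkaAdminClient(bootstrap_servers="kafka-0:9092")
admin.create_topics([NewTopic(name="pageviews", num_partitions=1, replication_factor=1)])
admin.close()
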
/stream-processing-with-flink/src/main/kotlin/me/jaehyeon/smoke/SmokeLevelSplit.kt:
--------------------------------------------------------------------------------
1 | package me.jaehyeon.smoke
2 |
3 | import org.apache.flink.api.connector.source.SourceSplit
4 | import java.io.Serializable
5 |
6 | /**
7 | * Represents a split for the SmokeLevelSource.
8 | *
9 | * Since each reader instance behaves identically, we only need a single split type.
10 | */
11 | data class SmokeLevelSplit(
12 | private val id: String = "smoke-level-split",
13 | ) : SourceSplit,
14 | Serializable {
15 | override fun splitId(): String = id
16 | }
17 |
--------------------------------------------------------------------------------
/pyflink-udemy/s3_01_tbl_env.py:
--------------------------------------------------------------------------------
1 | # batch/stream table env
2 | from pyflink.table import EnvironmentSettings, TableEnvironment
3 |
4 | batch_tbl_env = TableEnvironment.create(EnvironmentSettings.in_batch_mode())
5 | stream_tbl_env = TableEnvironment.create(EnvironmentSettings.in_streaming_mode())
6 |
7 | # from a DataStream environment
8 | from pyflink.datastream import StreamExecutionEnvironment
9 | from pyflink.table import StreamTableEnvironment
10 |
11 | ds_env = StreamExecutionEnvironment.get_execution_environment()
12 | tbl_env = StreamTableEnvironment.create(ds_env)
13 |
--------------------------------------------------------------------------------
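
A small follow-on sketch (not part of the course files): once a TableEnvironment exists, a table can be built from Python objects and executed directly. The sample rows are made up for illustration.

from pyflink.table import EnvironmentSettings, TableEnvironment

t_env = TableEnvironment.create(EnvironmentSettings.in_batch_mode())
tbl = t_env.from_elements([(1, "LNK"), (2, "OMA")], ["id", "seller"])
tbl.execute().print()  # prints the two rows as a small result table
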
/stream-processing-with-flink/src/main/kotlin/me/jaehyeon/smoke/Alert.kt:
--------------------------------------------------------------------------------
1 | package me.jaehyeon.smoke
2 |
3 | import java.io.Serializable
4 |
5 | /**
6 | * Represents a smoke level alert.
7 | * A `data class` is the idiomatic Kotlin way to create a class
8 | * whose primary purpose is to hold data. The compiler automatically
9 | * generates `equals()`, `hashCode()`, `toString()`, and `copy()` methods,
10 | * which is crucial for Flink operations.
11 | */
12 | data class Alert(
13 | val sensorId: String,
14 | val timestamp: Long,
15 | val message: String,
16 | ) : Serializable
17 |
--------------------------------------------------------------------------------
/confluent-flink-101/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM flink:1.15.4-scala_2.12
2 |
3 | RUN wget -P /opt/flink/lib/ https://repo.maven.apache.org/maven2/org/apache/flink/flink-connector-kafka/1.15.4/flink-connector-kafka-1.15.4.jar; \
4 | wget -P /opt/flink/lib/ https://repo.maven.apache.org/maven2/org/apache/kafka/kafka-clients/2.8.1/kafka-clients-2.8.1.jar; \
5 | wget -P /opt/flink/lib/ https://repo.maven.apache.org/maven2/org/apache/flink/flink-sql-connector-kafka/1.15.4/flink-sql-connector-kafka-1.15.4.jar; \
6 | wget -P /opt/flink/lib/ https://github.com/knaufk/flink-faker/releases/download/v0.5.0/flink-faker-0.5.0.jar;
7 |
--------------------------------------------------------------------------------
/fraud-detection/remote/infra/s3.tf:
--------------------------------------------------------------------------------
1 | resource "aws_s3_bucket" "default_bucket" {
2 | bucket = local.default_bucket.name
3 |
4 | force_destroy = true
5 |
6 | tags = local.tags
7 | }
8 |
9 | resource "aws_s3_bucket_acl" "default_bucket" {
10 | count = local.default_bucket.to_set_acl ? 1 : 0
11 |
12 | bucket = aws_s3_bucket.default_bucket.id
13 | acl = "private"
14 | }
15 |
16 | resource "aws_s3_bucket_server_side_encryption_configuration" "default_bucket" {
17 | bucket = aws_s3_bucket.default_bucket.bucket
18 |
19 | rule {
20 | apply_server_side_encryption_by_default {
21 | sse_algorithm = "AES256"
22 | }
23 | }
24 | }
25 |
--------------------------------------------------------------------------------
/fraud-detection/remote/infra/vpc.tf:
--------------------------------------------------------------------------------
1 | module "vpc" {
2 | source = "terraform-aws-modules/vpc/aws"
3 | version = "~> 3.14"
4 |
5 | name = "${local.name}-vpc"
6 | cidr = local.vpc.cidr
7 |
8 | azs = local.vpc.azs
9 | public_subnets = [for k, v in local.vpc.azs : cidrsubnet(local.vpc.cidr, 3, k)]
10 | private_subnets = [for k, v in local.vpc.azs : cidrsubnet(local.vpc.cidr, 3, k + 3)]
11 |
12 | enable_nat_gateway = true
13 | create_igw = true
14 | enable_dns_hostnames = true
15 | single_nat_gateway = true
16 |
17 | private_subnet_tags = {
18 | "Tier" = "Private"
19 | }
20 |
21 | tags = local.tags
22 | }
23 |
--------------------------------------------------------------------------------
/real-time-streaming-aws/infra/vpc.tf:
--------------------------------------------------------------------------------
1 | module "vpc" {
2 | source = "terraform-aws-modules/vpc/aws"
3 | version = "~> 3.14"
4 |
5 | name = "${local.name}-vpc"
6 | cidr = local.vpc.cidr
7 |
8 | azs = local.vpc.azs
9 | public_subnets = [for k, v in local.vpc.azs : cidrsubnet(local.vpc.cidr, 3, k)]
10 | private_subnets = [for k, v in local.vpc.azs : cidrsubnet(local.vpc.cidr, 3, k + 3)]
11 |
12 | enable_nat_gateway = true
13 | create_igw = true
14 | enable_dns_hostnames = true
15 | single_nat_gateway = true
16 |
17 | private_subnet_tags = {
18 | "Tier" = "Private"
19 | }
20 |
21 | tags = local.tags
22 | }
23 |
--------------------------------------------------------------------------------
/stream-processing-with-flink/src/main/kotlin/me/jaehyeon/connector/Post.kt:
--------------------------------------------------------------------------------
1 | package me.jaehyeon.connector
2 |
3 | import kotlinx.serialization.Serializable
4 | import java.io.Serializable as JavaSerializable
5 |
6 | /**
7 | * Data class representing a Post, fetched from the external API.
8 | *
9 | * - @Serializable: For Ktor to deserialize JSON into this object.
10 | * - JavaSerializable: For Flink to send objects between TaskManagers.
11 | * An alias is used to avoid a name clash.
12 | */
13 | @Serializable
14 | data class Post(
15 | val userId: Int,
16 | val id: Int,
17 | val title: String,
18 | val body: String,
19 | ) : JavaSerializable
20 |
--------------------------------------------------------------------------------
/pyflink-getting-started-on-aws/remote/infra/s3.tf:
--------------------------------------------------------------------------------
1 | resource "aws_s3_bucket" "default_bucket" {
2 | bucket = local.default_bucket.name
3 |
4 | force_destroy = true
5 |
6 | tags = local.tags
7 | }
8 |
9 | resource "aws_s3_bucket_acl" "default_bucket" {
10 | count = local.default_bucket.to_set_acl ? 1 : 0
11 |
12 | bucket = aws_s3_bucket.default_bucket.id
13 | acl = "private"
14 | }
15 |
16 | resource "aws_s3_bucket_server_side_encryption_configuration" "default_bucket" {
17 | bucket = aws_s3_bucket.default_bucket.bucket
18 |
19 | rule {
20 | apply_server_side_encryption_by_default {
21 | sse_algorithm = "AES256"
22 | }
23 | }
24 | }
25 |
--------------------------------------------------------------------------------
/pyflink-getting-started-on-aws/remote/infra/vpc.tf:
--------------------------------------------------------------------------------
1 | module "vpc" {
2 | source = "terraform-aws-modules/vpc/aws"
3 | version = "~> 3.14"
4 |
5 | name = "${local.name}-vpc"
6 | cidr = local.vpc.cidr
7 |
8 | azs = local.vpc.azs
9 | public_subnets = [for k, v in local.vpc.azs : cidrsubnet(local.vpc.cidr, 3, k)]
10 | private_subnets = [for k, v in local.vpc.azs : cidrsubnet(local.vpc.cidr, 3, k + 3)]
11 |
12 | enable_nat_gateway = true
13 | create_igw = true
14 | enable_dns_hostnames = true
15 | single_nat_gateway = true
16 |
17 | private_subnet_tags = {
18 | "Tier" = "Private"
19 | }
20 |
21 | tags = local.tags
22 | }
23 |
--------------------------------------------------------------------------------
/stream-processing-with-pyflink/src/chapter1/utils/type_helper.py:
--------------------------------------------------------------------------------
1 | from abc import ABC, abstractstaticmethod
2 | from typing import List, Dict
3 |
4 | from pyflink.common.typeinfo import Types, TypeInformation
5 |
6 |
7 | class TypeMapping(ABC):
8 | @abstractstaticmethod
9 | def type_mapping():
10 | pass
11 |
12 |
13 | def set_type_info(type_mapping: Dict[str, TypeInformation], selects: List[str] = []):
14 | names, types = [], []
15 | for key in type_mapping.keys():
16 | if not selects or key in selects:
17 | names.append(key)
18 | types.append(type_mapping[key])
19 | return Types.ROW_NAMED(field_names=names, field_types=types)
20 |
--------------------------------------------------------------------------------
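
A hedged usage sketch (not part of the repo) of the `set_type_info` helper above; the field names and types in `type_mapping` are made up for illustration, and the import assumes the utils package is on the Python path.

from pyflink.common.typeinfo import Types

from type_helper import set_type_info  # assumes the utils directory is importable

type_mapping = {
    "id": Types.STRING(),
    "timestamp": Types.LONG(),
    "temperature": Types.DOUBLE(),
}

row_type = set_type_info(type_mapping)                   # ROW_NAMED over all fields
key_type = set_type_info(type_mapping, selects=["id"])   # ROW_NAMED over a subset
print(row_type, key_type)
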
/stream-processing-with-pyflink/src/chapter5/utils/type_helper.py:
--------------------------------------------------------------------------------
1 | from abc import ABC, abstractstaticmethod
2 | from typing import List, Dict
3 |
4 | from pyflink.common.typeinfo import Types, TypeInformation
5 |
6 |
7 | class TypeMapping(ABC):
8 | @abstractstaticmethod
9 | def type_mapping():
10 | pass
11 |
12 |
13 | def set_type_info(type_mapping: Dict[str, TypeInformation], selects: List[str] = []):
14 | names, types = [], []
15 | for key in type_mapping.keys():
16 | if not selects or key in selects:
17 | names.append(key)
18 | types.append(type_mapping[key])
19 | return Types.ROW_NAMED(field_names=names, field_types=types)
20 |
--------------------------------------------------------------------------------
/stream-processing-with-pyflink/src/chapter6/utils/type_helper.py:
--------------------------------------------------------------------------------
1 | from abc import ABC, abstractstaticmethod
2 | from typing import List, Dict
3 |
4 | from pyflink.common.typeinfo import Types, TypeInformation
5 |
6 |
7 | class TypeMapping(ABC):
8 | @abstractstaticmethod
9 | def type_mapping():
10 | pass
11 |
12 |
13 | def set_type_info(type_mapping: Dict[str, TypeInformation], selects: List[str] = []):
14 | names, types = [], []
15 | for key in type_mapping.keys():
16 | if not selects or key in selects:
17 | names.append(key)
18 | types.append(type_mapping[key])
19 | return Types.ROW_NAMED(field_names=names, field_types=types)
20 |
--------------------------------------------------------------------------------
/stream-processing-with-pyflink/src/chapter7/utils/type_helper.py:
--------------------------------------------------------------------------------
1 | from abc import ABC, abstractstaticmethod
2 | from typing import List, Dict
3 |
4 | from pyflink.common.typeinfo import Types, TypeInformation
5 |
6 |
7 | class TypeMapping(ABC):
8 | @abstractstaticmethod
9 | def type_mapping():
10 | pass
11 |
12 |
13 | def set_type_info(type_mapping: Dict[str, TypeInformation], selects: List[str] = []):
14 | names, types = [], []
15 | for key in type_mapping.keys():
16 | if not selects or key in selects:
17 | names.append(key)
18 | types.append(type_mapping[key])
19 | return Types.ROW_NAMED(field_names=names, field_types=types)
20 |
--------------------------------------------------------------------------------
/real-time-streaming-aws/loader/application_properties.json:
--------------------------------------------------------------------------------
1 | [
2 | {
3 | "PropertyGroupId": "kinesis.analytics.flink.run.options",
4 | "PropertyMap": {
5 | "python": "processor.py",
6 | "jarfile": "package/lib/lab2-pipeline-1.0.0.jar"
7 | }
8 | },
9 | {
10 | "PropertyGroupId": "source.config.0",
11 | "PropertyMap": {
12 | "table.name": "taxi_trip_source",
13 | "file.path": "s3://real-time-streaming-ap-southeast-2/taxi-csv/"
14 | }
15 | },
16 | {
17 | "PropertyGroupId": "sink.config.0",
18 | "PropertyMap": {
19 | "table.name": "taxi_trip_sink",
20 | "topic.name": "taxi-trip",
21 | "bootstrap.servers": "localhost:29092"
22 | }
23 | }
24 | ]
25 |
--------------------------------------------------------------------------------
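
A minimal sketch (an assumption, not the repo's processor code) of how a PyFlink app typically loads property groups like the ones above when run locally: parse the JSON file and look up a group by its PropertyGroupId.

import json

def load_property_map(path: str, group_id: str) -> dict:
    """Return the PropertyMap for the given PropertyGroupId, or an empty dict."""
    with open(path) as f:
        groups = json.load(f)
    for group in groups:
        if group["PropertyGroupId"] == group_id:
            return group["PropertyMap"]
    return {}

source_props = load_property_map("application_properties.json", "source.config.0")
print(source_props["table.name"], source_props["file.path"])
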
/real-time-streaming-aws/exporter/application_properties.json:
--------------------------------------------------------------------------------
1 | [
2 | {
3 | "PropertyGroupId": "kinesis.analytics.flink.run.options",
4 | "PropertyMap": {
5 | "python": "processor.py",
6 | "jarfile": "package/lib/lab3-pipeline-1.0.0.jar"
7 | }
8 | },
9 | {
10 | "PropertyGroupId": "source.config.0",
11 | "PropertyMap": {
12 | "table.name": "taxi_rides_src",
13 | "topic.name": "taxi-rides",
14 | "bootstrap.servers": "localhost:29092"
15 | }
16 | },
17 | {
18 | "PropertyGroupId": "sink.config.0",
19 | "PropertyMap": {
20 | "table.name": "taxi_rides_sink",
21 | "file.path": "s3://real-time-streaming-ap-southeast-2/taxi-rides/"
22 | }
23 | }
24 | ]
25 |
--------------------------------------------------------------------------------
/stream-processing-with-flink/src/main/kotlin/me/jaehyeon/sensor/SensorReading.kt:
--------------------------------------------------------------------------------
1 | package me.jaehyeon.sensor
2 |
3 | import java.io.Serializable
4 |
5 | /**
6 | * Represents a single sensor reading.
7 | *
8 | * This data class is the fundamental event type used throughout the Flink job.
9 | * It must be Serializable to be sent across the Flink cluster.
10 | *
11 | * @property id The unique identifier of the sensor.
12 | * @property timestamp The timestamp of the reading, in milliseconds since the epoch.
13 | * @property temperature The temperature value of the reading.
14 | */
15 | data class SensorReading(
16 | val id: String,
17 | val timestamp: Long,
18 | val temperature: Double,
19 | ) : Serializable
20 |
--------------------------------------------------------------------------------
/real-time-streaming-aws/forwarder/application_properties.json:
--------------------------------------------------------------------------------
1 | [
2 | {
3 | "PropertyGroupId": "kinesis.analytics.flink.run.options",
4 | "PropertyMap": {
5 | "python": "processor.py",
6 | "jarfile": "package/lib/lab4-pipeline-1.0.0.jar"
7 | }
8 | },
9 | {
10 | "PropertyGroupId": "source.config.0",
11 | "PropertyMap": {
12 | "table.name": "taxi_rides_src",
13 | "topic.name": "taxi-rides",
14 | "bootstrap.servers": "localhost:29092"
15 | }
16 | },
17 | {
18 | "PropertyGroupId": "sink.config.0",
19 | "PropertyMap": {
20 | "table.name": "trip_stats_sink",
21 | "os_hosts": "http://opensearch:9200",
22 | "os_index": "trip_stats"
23 | }
24 | }
25 | ]
26 |
--------------------------------------------------------------------------------
/stream-processing-with-pyflink/src/chapter5/rolling_sum.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | from pyflink.datastream import StreamExecutionEnvironment, RuntimeExecutionMode
4 |
5 | if __name__ == "__main__":
6 | """
7 | ## local execution
8 | python src/chapter5/rolling_sum.py
9 | """
10 |
11 | RUNTIME_ENV = os.getenv("RUNTIME_ENV", "local")
12 |
13 | env = StreamExecutionEnvironment.get_execution_environment()
14 | env.set_runtime_mode(RuntimeExecutionMode.STREAMING)
15 |
16 | input_stream = env.from_collection(collection=[(1, 2, 2), (2, 3, 1), (2, 2, 4), (1, 5, 3)])
17 |
18 | result_stream = input_stream.key_by(lambda e: e[0]).sum(1)
19 |
20 | result_stream.print()
21 |
22 | env.execute("Rolling Sum Example")
23 |
--------------------------------------------------------------------------------
/fraud-detection/remote/infra/data.tf:
--------------------------------------------------------------------------------
1 | ## data sources for general resources
2 | # Find the AWS identity currently in use
3 | data "aws_caller_identity" "current" {}
4 |
5 | # Region in which to deploy the solution
6 | data "aws_region" "current" {}
7 |
8 | # Availability zones to use in our solution
9 | data "aws_availability_zones" "available" {
10 | state = "available"
11 | }
12 |
13 | ## data sources for VPN
14 | # Local ip address
15 | data "http" "local_ip_address" {
16 | url = "https://ifconfig.me/ip"
17 | }
18 |
19 | # Latest Amazon linux 2 AMI
20 | data "aws_ami" "amazon_linux_2" {
21 | owners = ["amazon"]
22 | most_recent = true
23 |
24 | filter {
25 | name = "name"
26 | values = ["amzn2-ami-hvm-*-x86_64-ebs"]
27 | }
28 | }
29 |
--------------------------------------------------------------------------------
/stream-processing-with-flink/.idea/gradle.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
17 |
18 |
--------------------------------------------------------------------------------
/pyflink-getting-started-on-aws/remote/infra/data.tf:
--------------------------------------------------------------------------------
1 | ## data sources for general resources
2 | # Find the AWS identity currently in use
3 | data "aws_caller_identity" "current" {}
4 |
5 | # Region in which to deploy the solution
6 | data "aws_region" "current" {}
7 |
8 | # Availability zones to use in our solution
9 | data "aws_availability_zones" "available" {
10 | state = "available"
11 | }
12 |
13 | ## data sources for VPN
14 | # Local ip address
15 | data "http" "local_ip_address" {
16 | url = "https://ifconfig.me/ip"
17 | }
18 |
19 | # Latest Amazon linux 2 AMI
20 | data "aws_ami" "amazon_linux_2" {
21 | owners = ["amazon"]
22 | most_recent = true
23 |
24 | filter {
25 | name = "name"
26 | values = ["amzn2-ami-hvm-*-x86_64-ebs"]
27 | }
28 | }
29 |
--------------------------------------------------------------------------------
/pyflink-udemy/csv-input/locale-sales.csv:
--------------------------------------------------------------------------------
1 | seller_id,product,quantity,product_price,sales_date
2 | LNK,Toothbrush,22,3.99,2021-07-01
3 | LNK,Dental Floss,17,1.99,2021-07-01
4 | LNK,Toothpaste,8,4.99,2021-07-01
5 | OMA,Toothbrush,29,3.99,2021-07-01
6 | OMA,Toothpaste,9,4.99,2021-07-01
7 | OMA,Dental Floss,23,1.99,2021-07-01
8 | LNK,Toothbrush,25,3.99,2021-07-02
9 | LNK,Dental Floss,16,1.99,2021-07-02
10 | LNK,Toothpaste,9,4.99,2021-07-02
11 | OMA,Toothbrush,32,3.99,2021-07-02
12 | OMA,Toothpaste,13,4.99,2021-07-02
13 | OMA,Dental Floss,18,1.99,2021-07-02
14 | LNK,Toothbrush,20,3.99,2021-07-03
15 | LNK,Dental Floss,15,1.99,2021-07-03
16 | LNK,Toothpaste,11,4.99,2021-07-03
17 | OMA,Toothbrush,31,3.99,2021-07-03
18 | OMA,Toothpaste,10,4.99,2021-07-03
19 | OMA,Dental Floss,21,1.99,2021-07-03
--------------------------------------------------------------------------------
/pyflink-udemy/s3_03_csv_source.py:
--------------------------------------------------------------------------------
1 | from pyflink.table import EnvironmentSettings, TableEnvironment, DataTypes, CsvTableSource
2 |
3 | tbl_env = TableEnvironment.create(EnvironmentSettings.in_batch_mode())
4 |
5 | field_names = "seller_id,product,quantity,product_price,sales_date".split(",")
6 | field_types = [
7 | DataTypes.STRING(),
8 | DataTypes.STRING(),
9 | DataTypes.INT(),
10 | DataTypes.DOUBLE(),
11 | DataTypes.DATE(),
12 | ]
13 | source = CsvTableSource("./csv-input", field_names, field_types, ignore_first_line=True)
14 | tbl_env.register_table_source("product_locale_sales", source)
15 | tbl = tbl_env.from_path("product_locale_sales")
16 | print("\nProduct Sales Schema")
17 | tbl.print_schema()
18 | print("\nProduct Sales data")
19 | print(tbl.to_pandas())
20 |
--------------------------------------------------------------------------------
/stream-processing-with-flink/.gitignore:
--------------------------------------------------------------------------------
1 | .gradle
2 | build/
3 | !gradle/wrapper/gradle-wrapper.jar
4 | !**/src/main/**/build/
5 | !**/src/test/**/build/
6 |
7 | ### IntelliJ IDEA ###
8 | .idea/modules.xml
9 | .idea/jarRepositories.xml
10 | .idea/compiler.xml
11 | .idea/libraries/
12 | *.iws
13 | *.iml
14 | *.ipr
15 | out/
16 | !**/src/main/**/out/
17 | !**/src/test/**/out/
18 |
19 | ### Kotlin ###
20 | .kotlin
21 |
22 | ### Eclipse ###
23 | .apt_generated
24 | .classpath
25 | .factorypath
26 | .project
27 | .settings
28 | .springBeans
29 | .sts4-cache
30 | bin/
31 | !**/src/main/**/bin/
32 | !**/src/test/**/bin/
33 |
34 | ### NetBeans ###
35 | /nbproject/private/
36 | /nbbuild/
37 | /dist/
38 | /nbdist/
39 | /.nb-gradle/
40 |
41 | ### VS Code ###
42 | .vscode/
43 |
44 | ### Mac OS ###
45 | .DS_Store
--------------------------------------------------------------------------------
/pyflink-getting-started-on-aws/remote/compose-ui.yml:
--------------------------------------------------------------------------------
1 | version: "3"
2 |
3 | services:
4 | kpow:
5 | image: factorhouse/kpow-ce:91.2.1
6 | container_name: kpow
7 | ports:
8 | - "3000:3000"
9 | networks:
10 | - appnet
11 | environment:
12 | AWS_ACCESS_KEY_ID: $AWS_ACCESS_KEY_ID
13 | AWS_SECRET_ACCESS_KEY: $AWS_SECRET_ACCESS_KEY
14 | AWS_SESSION_TOKEN: $AWS_SESSION_TOKEN
15 | # kafka cluster
16 | BOOTSTRAP: $BOOTSTRAP_SERVERS
17 | SECURITY_PROTOCOL: SASL_SSL
18 | SASL_MECHANISM: AWS_MSK_IAM
19 | SASL_CLIENT_CALLBACK_HANDLER_CLASS: software.amazon.msk.auth.iam.IAMClientCallbackHandler
20 | SASL_JAAS_CONFIG: software.amazon.msk.auth.iam.IAMLoginModule required;
21 |
22 | networks:
23 | appnet:
24 | name: app-network
25 |
--------------------------------------------------------------------------------
/pyflink-getting-started-on-aws/local/build.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | SCRIPT_DIR="$(cd $(dirname "$0"); pwd)"
3 | SRC_PATH=$SCRIPT_DIR/package
4 | rm -rf $SRC_PATH && mkdir -p $SRC_PATH/lib
5 |
6 | ## Download flink sql connector kafka
7 | echo "download flink sql connector kafka..."
8 | VERSION=1.15.2
9 | FILE_NAME=flink-sql-connector-kafka-$VERSION
10 | DOWNLOAD_URL=https://repo.maven.apache.org/maven2/org/apache/flink/flink-sql-connector-kafka/$VERSION/flink-sql-connector-kafka-$VERSION.jar
11 | curl -L -o $SRC_PATH/lib/$FILE_NAME.jar ${DOWNLOAD_URL}
12 |
13 | ## Install pip packages
14 | echo "install and zip pip packages..."
15 | pip install -r requirements.txt --target $SRC_PATH/site_packages
16 |
17 | ## Package pyflink app
18 | echo "package pyflink app"
19 | zip -r kda-package.zip processor.py package/lib package/site_packages
20 |
--------------------------------------------------------------------------------
/pyflink-getting-started-on-aws/remote/application_properties.json:
--------------------------------------------------------------------------------
1 | [
2 | {
3 | "PropertyGroupId": "kinesis.analytics.flink.run.options",
4 | "PropertyMap": {
5 | "python": "processor.py",
6 | "jarfile": "package/lib/pyflink-getting-started-1.0.0.jar",
7 | "pyFiles": "package/site_packages/"
8 | }
9 | },
10 | {
11 | "PropertyGroupId": "consumer.config.0",
12 | "PropertyMap": {
13 | "table.name": "source_table",
14 | "topic.name": "stocks-in",
15 | "bootstrap.servers": "localhost:29092",
16 | "startup.mode": "earliest-offset"
17 | }
18 | },
19 | {
20 | "PropertyGroupId": "producer.config.0",
21 | "PropertyMap": {
22 | "table.name": "sink_table",
23 | "topic.name": "stocks-out",
24 | "bootstrap.servers": "localhost:29092"
25 | }
26 | }
27 | ]
28 |
--------------------------------------------------------------------------------
/pyflink-getting-started-on-aws/local/application_properties.json:
--------------------------------------------------------------------------------
1 | [
2 | {
3 | "PropertyGroupId": "kinesis.analytics.flink.run.options",
4 | "PropertyMap": {
5 | "python": "processor.py",
6 | "jarfile": "package/lib/flink-sql-connector-kinesis-1.15.2.jar",
7 | "pyFiles": "package/site_packages/"
8 | }
9 | },
10 | {
11 | "PropertyGroupId": "consumer.config.0",
12 | "PropertyMap": {
13 | "table.name": "source_table",
14 | "topic.name": "stocks-in",
15 | "bootstrap.servers": "localhost:29092",
16 | "startup.mode": "earliest-offset"
17 | }
18 | },
19 | {
20 | "PropertyGroupId": "producer.config.0",
21 | "PropertyMap": {
22 | "table.name": "sink_table",
23 | "topic.name": "stocks-out",
24 | "bootstrap.servers": "localhost:29092"
25 | }
26 | }
27 | ]
28 |
--------------------------------------------------------------------------------
/real-time-streaming-aws/compose-ui.yml:
--------------------------------------------------------------------------------
1 | version: "3"
2 |
3 | services:
4 | kpow:
5 | image: factorhouse/kpow-ce:91.5.1
6 | container_name: kpow
7 | ports:
8 | - "3000:3000"
9 | networks:
10 | - kafkanet
11 | environment:
12 | AWS_ACCESS_KEY_ID: $AWS_ACCESS_KEY_ID
13 | AWS_SECRET_ACCESS_KEY: $AWS_SECRET_ACCESS_KEY
14 | # AWS_SESSION_TOKEN: $AWS_SESSION_TOKEN
15 | BOOTSTRAP: $BOOTSTRAP_SERVERS
16 | SECURITY_PROTOCOL: SASL_SSL
17 | SASL_MECHANISM: AWS_MSK_IAM
18 | SASL_JAAS_CONFIG: software.amazon.msk.auth.iam.IAMLoginModule required;
19 | SASL_CLIENT_CALLBACK_HANDLER_CLASS: software.amazon.msk.auth.iam.IAMClientCallbackHandler
20 | env_file: # https://kpow.io/get-started/#individual
21 | - ./kpow.env
22 |
23 | networks:
24 | kafkanet:
25 | name: kafka-network
26 |
--------------------------------------------------------------------------------
/fraud-detection/remote/infra/ddb.tf:
--------------------------------------------------------------------------------
1 | resource "aws_dynamodb_table" "transactions_table" {
2 | name = "${local.name}-flagged-transactions"
3 | billing_mode = "PROVISIONED"
4 | read_capacity = 2
5 | write_capacity = 2
6 | hash_key = "transaction_id"
7 | range_key = "transaction_date"
8 |
9 | attribute {
10 | name = "transaction_id"
11 | type = "S"
12 | }
13 |
14 | attribute {
15 | name = "account_id"
16 | type = "N"
17 | }
18 |
19 | attribute {
20 | name = "transaction_date"
21 | type = "S"
22 | }
23 |
24 | global_secondary_index {
25 | name = "account"
26 | hash_key = "account_id"
27 | range_key = "transaction_date"
28 | write_capacity = 2
29 | read_capacity = 2
30 | projection_type = "ALL"
31 | }
32 |
33 | tags = local.tags
34 | }
35 |
--------------------------------------------------------------------------------
/fraud-detection/remote/docker-compose.yml:
--------------------------------------------------------------------------------
1 | version: "3"
2 |
3 | services:
4 | kpow:
5 | image: factorhouse/kpow-ce:91.2.1
6 | container_name: kpow
7 | ports:
8 | - "3000:3000"
9 | networks:
10 | - appnet
11 | environment:
12 | AWS_ACCESS_KEY_ID: $AWS_ACCESS_KEY_ID
13 | AWS_SECRET_ACCESS_KEY: $AWS_SECRET_ACCESS_KEY
14 | AWS_SESSION_TOKEN: $AWS_SESSION_TOKEN
15 | # MSK cluster
16 | BOOTSTRAP: $BOOTSTRAP_SERVERS
17 | SECURITY_PROTOCOL: SASL_SSL
18 | SASL_MECHANISM: AWS_MSK_IAM
19 | SASL_CLIENT_CALLBACK_HANDLER_CLASS: software.amazon.msk.auth.iam.IAMClientCallbackHandler
20 | SASL_JAAS_CONFIG: software.amazon.msk.auth.iam.IAMLoginModule required;
21 | # MSK connect
22 | CONNECT_AWS_REGION: $AWS_DEFAULT_REGION
23 |
24 | networks:
25 | appnet:
26 | name: app-network
27 |
--------------------------------------------------------------------------------
/sql-training/client-image/conf/flink-conf.yaml:
--------------------------------------------------------------------------------
1 | ################################################################################
2 | # Copyright 2019 Ververica GmbH
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | ################################################################################
16 |
17 | jobmanager.rpc.address: jobmanager
18 |
--------------------------------------------------------------------------------
/fraud-detection/local/configs/sink.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "transactions-sink",
3 | "config": {
4 | "connector.class": "org.apache.camel.kafkaconnector.awsddbsink.CamelAwsddbsinkSinkConnector",
5 | "tasks.max": "2",
6 | "key.converter": "org.apache.kafka.connect.json.JsonConverter",
7 | "key.converter.schemas.enable": false,
8 | "value.converter": "org.apache.kafka.connect.json.JsonConverter",
9 | "value.converter.schemas.enable": false,
10 | "topics": "flagged-transactions",
11 |
12 | "camel.kamelet.aws-ddb-sink.table": "flagged-transactions",
13 | "camel.kamelet.aws-ddb-sink.region": "ap-southeast-2",
14 | "camel.kamelet.aws-ddb-sink.operation": "PutItem",
15 | "camel.kamelet.aws-ddb-sink.writeCapacity": 1,
16 | "camel.kamelet.aws-ddb-sink.useDefaultCredentialsProvider": true,
17 | "camel.sink.unmarshal": "jackson"
18 | }
19 | }
20 |
--------------------------------------------------------------------------------
/fraud-detection/local/compose-connect.yml:
--------------------------------------------------------------------------------
1 | version: "3.5"
2 |
3 | services:
4 | kafka-connect:
5 | image: bitnami/kafka:2.8.1
6 | container_name: connect
7 | command: >
8 | /opt/bitnami/kafka/bin/connect-distributed.sh
9 | /opt/bitnami/kafka/config/connect-distributed.properties
10 | ports:
11 | - "8083:8083"
12 | networks:
13 | - kafkanet
14 | environment:
15 | AWS_ACCESS_KEY_ID: $AWS_ACCESS_KEY_ID
16 | AWS_SECRET_ACCESS_KEY: $AWS_SECRET_ACCESS_KEY
17 | AWS_SESSION_TOKEN: $AWS_SESSION_TOKEN
18 | volumes:
19 | - "./configs/connect-distributed.properties:/opt/bitnami/kafka/config/connect-distributed.properties"
20 | - "./connectors/camel-aws-ddb-sink-kafka-connector:/opt/connectors/camel-aws-ddb-sink-kafka-connector"
21 |
22 | networks:
23 | kafkanet:
24 | external: true
25 | name: kafka-network
26 |
--------------------------------------------------------------------------------
/real-time-streaming-aws/infra/data.tf:
--------------------------------------------------------------------------------
1 | ## data sources for general resources
2 | # Find the user currently in use by AWS
3 | data "aws_caller_identity" "current" {}
4 |
5 | # Region in which to deploy the solution
6 | data "aws_region" "current" {}
7 |
8 | # Availability zones to use in our solution
9 | data "aws_availability_zones" "available" {
10 | state = "available"
11 | }
12 |
13 | ## data sources for VPN
14 | # Local ip address
15 | data "http" "local_ip_address" {
16 | url = "https://ifconfig.me/ip"
17 | }
18 |
19 | # Latest Amazon linux 2 AMI
20 | data "aws_ami" "amazon_linux_2" {
21 | owners = ["amazon"]
22 | most_recent = true
23 |
24 | filter {
25 | name = "name"
26 | values = ["amzn2-ami-hvm-*-x86_64-ebs"]
27 | }
28 | }
29 |
30 | data "aws_iam_role" "opensearch_service_linked_role" {
31 | name = "AWSServiceRoleForAmazonOpenSearchService"
32 | }
33 |
--------------------------------------------------------------------------------
/real-time-streaming-aws/exporter/athena.sql:
--------------------------------------------------------------------------------
1 | -- https://docs.aws.amazon.com/athena/latest/ug/partitions.html
2 | CREATE EXTERNAL TABLE taxi_rides (
3 | id STRING,
4 | vendor_id INT,
5 | pickup_datetime TIMESTAMP,
6 | dropoff_datetime TIMESTAMP,
7 | passenger_count INT,
8 | pickup_longitude STRING,
9 | pickup_latitude STRING,
10 | dropoff_longitude STRING,
11 | dropoff_latitude STRING,
12 | store_and_fwd_flag STRING,
13 | gc_distance INT,
14 | trip_duration INT,
15 | google_distance INT,
16 | google_duration INT
17 | )
18 | PARTITIONED BY (year STRING, month STRING, date STRING, hour STRING)
19 | STORED AS parquet
20 | LOCATION 's3://real-time-streaming-ap-southeast-2/taxi-rides/';
21 |
22 | MSCK REPAIR TABLE taxi_rides;
23 |
24 | SELECT * FROM taxi_rides WHERE year='2023';
--------------------------------------------------------------------------------
/real-time-streaming-aws/configs/sink.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "real-time-streaming-taxi-rides-sink",
3 | "config": {
4 | "connector.class": "org.apache.camel.kafkaconnector.awsddbsink.CamelAwsddbsinkSinkConnector",
5 | "tasks.max": "1",
6 | "key.converter": "org.apache.kafka.connect.storage.StringConverter",
7 | "key.converter.schemas.enable": false,
8 | "value.converter": "org.apache.kafka.connect.json.JsonConverter",
9 | "value.converter.schemas.enable": false,
10 | "topics": "taxi-rides",
11 |
12 | "camel.kamelet.aws-ddb-sink.table": "real-time-streaming-taxi-rides",
13 | "camel.kamelet.aws-ddb-sink.region": "ap-southeast-2",
14 | "camel.kamelet.aws-ddb-sink.operation": "PutItem",
15 | "camel.kamelet.aws-ddb-sink.writeCapacity": 1,
16 | "camel.kamelet.aws-ddb-sink.useDefaultCredentialsProvider": true,
17 | "camel.sink.unmarshal": "jackson"
18 | }
19 | }
20 |
--------------------------------------------------------------------------------
/real-time-streaming-aws/infra/s3.tf:
--------------------------------------------------------------------------------
1 | resource "aws_s3_bucket" "default_bucket" {
2 | bucket = local.default_bucket.name
3 |
4 | force_destroy = true
5 |
6 | tags = local.tags
7 | }
8 |
9 | resource "aws_s3_bucket_acl" "default_bucket" {
10 | count = local.default_bucket.to_set_acl ? 1 : 0
11 |
12 | bucket = aws_s3_bucket.default_bucket.id
13 | acl = "private"
14 | }
15 |
16 | resource "aws_s3_bucket_server_side_encryption_configuration" "default_bucket" {
17 | bucket = aws_s3_bucket.default_bucket.bucket
18 |
19 | rule {
20 | apply_server_side_encryption_by_default {
21 | sse_algorithm = "AES256"
22 | }
23 | }
24 | }
25 |
26 | resource "aws_s3_object" "kda_package" {
27 | bucket = aws_s3_bucket.default_bucket.id
28 | key = "taxi-csv/taxi-trips.csv"
29 | source = "${dirname(path.cwd)}/data/taxi-trips.csv"
30 |
31 | etag = filemd5("${dirname(path.cwd)}/data/taxi-trips.csv")
32 | }
33 |
--------------------------------------------------------------------------------
/real-time-streaming-aws/build.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | SCRIPT_DIR="$(cd $(dirname "$0"); pwd)"
3 | SRC_PATH=$SCRIPT_DIR/package
4 |
5 | # remove contents under $SRC_PATH (except for the folders beginning with lab)
6 | shopt -s extglob
7 | rm -rf $SRC_PATH/!(lab*)
8 |
9 | ## Generate Uber jar file for individual labs
10 | echo "generate Uber jar for PyFlink app..."
11 | mkdir $SRC_PATH/lib
12 | mvn clean install -f $SRC_PATH/lab2-pipeline/pom.xml \
13 | && mv $SRC_PATH/lab2-pipeline/target/lab2-pipeline-1.0.0.jar $SRC_PATH/lib \
14 | && rm -rf $SRC_PATH/lab2-pipeline/target
15 |
16 | mvn clean install -f $SRC_PATH/lab3-pipeline/pom.xml \
17 | && mv $SRC_PATH/lab3-pipeline/target/lab3-pipeline-1.0.0.jar $SRC_PATH/lib \
18 | && rm -rf $SRC_PATH/lab3-pipeline/target
19 |
20 | mvn clean install -f $SRC_PATH/lab4-pipeline/pom.xml \
21 | && mv $SRC_PATH/lab4-pipeline/target/lab4-pipeline-1.0.0.jar $SRC_PATH/lib \
22 | && rm -rf $SRC_PATH/lab4-pipeline/target
23 |
--------------------------------------------------------------------------------
/sql-training/client-image/java/sql-training-data-producer/src/main/java/com/ververica/sql_training/data_producer/records/TaxiRecord.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2019 Ververica GmbH
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package com.ververica.sql_training.data_producer.records;
18 |
19 | import java.util.Date;
20 |
21 | public interface TaxiRecord {
22 |
23 | Date getEventTime();
24 | }
25 |
--------------------------------------------------------------------------------
/pyflink-udemy/s3_02_python_source.py:
--------------------------------------------------------------------------------
1 | from pyflink.table import EnvironmentSettings, TableEnvironment, DataTypes
2 |
3 | tbl_env = TableEnvironment.create(EnvironmentSettings.in_batch_mode())
4 |
5 | products = [("Toothbrush", 3.99), ("Dental Floss", 1.99), ("Toothpaste", 4.99)]
6 |
7 | ## tbl1
8 | tbl1 = tbl_env.from_elements(products)
9 | print("\ntbl1 schema")
10 | tbl1.print_schema()
11 | print("\ntbl1 data")
12 | print(tbl1.to_pandas())
13 |
14 | ## tbl2
15 | col_names = ["product", "price"]
16 | tbl2 = tbl_env.from_elements(products, col_names)
17 | print("\ntbl2 schema")
18 | tbl2.print_schema()
19 | print("\ntbl2 data")
20 | print(tbl2.to_pandas())
21 |
22 | ## tbl3
23 | schema = DataTypes.ROW(
24 | [DataTypes.FIELD("product", DataTypes.STRING()), DataTypes.FIELD("price", DataTypes.DOUBLE())]
25 | )
26 | tbl3 = tbl_env.from_elements(products, schema)
27 | print("\ntbl3 schema")
28 | tbl3.print_schema()
29 | print("\ntbl3 data")
30 | print(tbl3.to_pandas())
31 |
--------------------------------------------------------------------------------
/pyflink-getting-started-on-aws/remote/build.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | SCRIPT_DIR="$(cd $(dirname "$0"); pwd)"
3 | SRC_PATH=$SCRIPT_DIR/package
4 |
5 | # remove contents under $SRC_PATH (except for uber-jar-for-pyflink) and kda-package.zip file
6 | shopt -s extglob
7 | rm -rf $SRC_PATH/!(uber-jar-for-pyflink) kda-package.zip
8 |
9 | ## Generate Uber Jar for PyFlink app for MSK cluster with IAM authN
10 | echo "generate Uber jar for PyFlink app..."
11 | mkdir $SRC_PATH/lib
12 | mvn clean install -f $SRC_PATH/uber-jar-for-pyflink/pom.xml \
13 | && mv $SRC_PATH/uber-jar-for-pyflink/target/pyflink-getting-started-1.0.0.jar $SRC_PATH/lib \
14 | && rm -rf $SRC_PATH/uber-jar-for-pyflink/target
15 |
16 | ## Install pip packages
17 | echo "install and zip pip packages..."
18 | pip install -r requirements.txt --target $SRC_PATH/site_packages
19 |
20 | ## Package pyflink app
21 | echo "package pyflink app"
22 | zip -r kda-package.zip processor.py package/lib package/site_packages
23 |
--------------------------------------------------------------------------------
/real-time-streaming-aws/download.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | SCRIPT_DIR="$(cd $(dirname "$0"); pwd)"
3 |
4 | SRC_PATH=${SCRIPT_DIR}/infra/connectors
5 | rm -rf ${SRC_PATH} && mkdir -p ${SRC_PATH}
6 |
7 | ## Download camel dynamodb sink connector
8 | echo "download camel dynamodb sink connector..."
9 | DOWNLOAD_URL=https://repo.maven.apache.org/maven2/org/apache/camel/kafkaconnector/camel-aws-ddb-sink-kafka-connector/3.20.3/camel-aws-ddb-sink-kafka-connector-3.20.3-package.tar.gz
10 |
11 | # decompress the archive and zip its contents to create a custom plugin for MSK Connect later
12 | curl -o ${SRC_PATH}/camel-aws-ddb-sink-kafka-connector.tar.gz ${DOWNLOAD_URL} \
13 | && tar -xvzf ${SRC_PATH}/camel-aws-ddb-sink-kafka-connector.tar.gz -C ${SRC_PATH} \
14 | && cd ${SRC_PATH}/camel-aws-ddb-sink-kafka-connector \
15 | && zip -r camel-aws-ddb-sink-kafka-connector.zip . \
16 | && mv camel-aws-ddb-sink-kafka-connector.zip ${SRC_PATH} \
17 | && rm ${SRC_PATH}/camel-aws-ddb-sink-kafka-connector.tar.gz
--------------------------------------------------------------------------------
/stream-processing-with-flink/src/main/kotlin/me/jaehyeon/sensor/SensorSplit.kt:
--------------------------------------------------------------------------------
1 | package me.jaehyeon.sensor
2 |
3 | import org.apache.flink.api.connector.source.SourceSplit
4 | import java.io.Serializable
5 |
6 | /**
7 | * Represents a split of work for the SensorSource.
8 | *
9 | * In the context of this custom source, a "split" is a logical unit of work assigned
10 | * to a single parallel SourceReader instance. Here, it simply wraps the subtask index
11 | * to ensure each parallel reader generates unique sensor IDs.
12 | * It must be Serializable to be sent from the SplitEnumerator (on the JobManager)
13 | * to the SourceReaders (on the TaskManagers).
14 | *
15 | * @property subtaskIndex The parallel instance index this split is for.
16 | */
17 | data class SensorSplit(
18 | val subtaskIndex: Int,
19 | ) : SourceSplit,
20 | Serializable {
21 | /**
22 | * Provides a unique identifier for this split.
23 | */
24 | override fun splitId(): String = "split-$subtaskIndex"
25 | }
26 |
--------------------------------------------------------------------------------
/pyflink-getting-started-on-aws/local/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM flink:1.15.2-scala_2.12
2 |
3 | ARG PYTHON_VERSION
4 | ENV PYTHON_VERSION=${PYTHON_VERSION:-3.8.10}
5 | ARG FLINK_VERSION
6 | ENV FLINK_VERSION=${FLINK_VERSION:-1.15.2}
7 |
8 | # Currently only Python 3.6, 3.7 and 3.8 are supported officially.
9 | RUN apt-get update -y && \
10 | apt-get install -y build-essential libssl-dev zlib1g-dev libbz2-dev libffi-dev && \
11 | wget https://www.python.org/ftp/python/${PYTHON_VERSION}/Python-${PYTHON_VERSION}.tgz && \
12 | tar -xvf Python-${PYTHON_VERSION}.tgz && \
13 | cd Python-${PYTHON_VERSION} && \
14 | ./configure --without-tests --enable-shared && \
15 | make -j6 && \
16 | make install && \
17 | ldconfig /usr/local/lib && \
18 | cd .. && rm -f Python-${PYTHON_VERSION}.tgz && rm -rf Python-${PYTHON_VERSION} && \
19 | ln -s /usr/local/bin/python3 /usr/local/bin/python && \
20 | apt-get clean && \
21 | rm -rf /var/lib/apt/lists/*
22 |
23 | # install PyFlink
24 | RUN pip3 install apache-flink==${FLINK_VERSION}
25 |
--------------------------------------------------------------------------------
/pyflink-getting-started-on-aws/remote/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM flink:1.15.4-scala_2.12
2 |
3 | ARG PYTHON_VERSION
4 | ENV PYTHON_VERSION=${PYTHON_VERSION:-3.8.10}
5 | ARG FLINK_VERSION
6 | ENV FLINK_VERSION=${FLINK_VERSION:-1.15.2}
7 |
8 | # Currently only Python 3.6, 3.7 and 3.8 are supported officially.
9 | RUN apt-get update -y && \
10 | apt-get install -y build-essential libssl-dev zlib1g-dev libbz2-dev libffi-dev && \
11 | wget https://www.python.org/ftp/python/${PYTHON_VERSION}/Python-${PYTHON_VERSION}.tgz && \
12 | tar -xvf Python-${PYTHON_VERSION}.tgz && \
13 | cd Python-${PYTHON_VERSION} && \
14 | ./configure --without-tests --enable-shared && \
15 | make -j6 && \
16 | make install && \
17 | ldconfig /usr/local/lib && \
18 | cd .. && rm -f Python-${PYTHON_VERSION}.tgz && rm -rf Python-${PYTHON_VERSION} && \
19 | ln -s /usr/local/bin/python3 /usr/local/bin/python && \
20 | apt-get clean && \
21 | rm -rf /var/lib/apt/lists/*
22 |
23 | # install PyFlink
24 | RUN pip3 install apache-flink==${FLINK_VERSION}
25 |
--------------------------------------------------------------------------------
/fraud-detection/local/configs/ddb.json:
--------------------------------------------------------------------------------
1 | {
2 | "TableName": "flagged-transactions",
3 | "KeySchema": [
4 | { "AttributeName": "transaction_id", "KeyType": "HASH" },
5 | { "AttributeName": "transaction_date", "KeyType": "RANGE" }
6 | ],
7 | "AttributeDefinitions": [
8 | { "AttributeName": "transaction_id", "AttributeType": "S" },
9 | { "AttributeName": "account_id", "AttributeType": "N" },
10 | { "AttributeName": "transaction_date", "AttributeType": "S" }
11 | ],
12 | "ProvisionedThroughput": {
13 | "ReadCapacityUnits": 2,
14 | "WriteCapacityUnits": 2
15 | },
16 | "GlobalSecondaryIndexes": [
17 | {
18 | "IndexName": "account",
19 | "KeySchema": [
20 | { "AttributeName": "account_id", "KeyType": "HASH" },
21 | { "AttributeName": "transaction_date", "KeyType": "RANGE" }
22 | ],
23 | "Projection": { "ProjectionType": "ALL" },
24 | "ProvisionedThroughput": {
25 | "ReadCapacityUnits": 2,
26 | "WriteCapacityUnits": 2
27 | }
28 | }
29 | ]
30 | }
31 |
--------------------------------------------------------------------------------
/confluent-flink-101/notes.md:
--------------------------------------------------------------------------------
1 | [Apache Flink® 101](https://developer.confluent.io/courses/apache-flink/intro/)
2 |
3 | ## Table of Contents
4 |
5 | 1. Introduction
6 | 2. Intro to Stream Processing with Apache Flink
7 | 3. Intro to Flink SQL
8 | 4. Batch and Stream Processing with Flink SQL (Exercise)
9 | 5. The Flink Runtime
10 | 6. Using the Flink Web UI (Exercise)
11 | 7. Using Kafka with Flink
12 | 8. Deploying an ETL Pipeline using Flink SQL (Exercise)
13 | 9. Stateful Stream Processing with Flink SQL
14 | 10. Streaming Analytics with Flink SQL (Exercise)
15 | 11. Event Time and Watermarks
16 | 12. Implementing and Troubleshooting Watermarks (Exercise)
17 | 13. Checkpoints and Recovery
18 | 14. Experiencing Failure Recovery (Exercise)
19 | 15. Conclusion
20 |
21 | ## 4 Big Ideas
22 |
23 | ### Streaming
24 |
25 | - Intro to Stream Processing with Apache Flink
26 | - Intro to Flink SQL
27 | - The Flink Runtime
28 | - Using Kafka with Flink
29 |
30 | ### State
31 |
32 | - Stateful Stream Processing with Flink SQL
33 |
34 | ### Time
35 |
36 | - Event Time and Watermarks
37 |
38 | ### Snapshot
39 |
40 | - Checkpoints and Recovery
41 |
--------------------------------------------------------------------------------
/fraud-detection/local/application_properties.json:
--------------------------------------------------------------------------------
1 | [
2 | {
3 | "PropertyGroupId": "kinesis.analytics.flink.run.options",
4 | "PropertyMap": {
5 | "python": "processor.py",
6 | "jarfile": "package/lib/flink-sql-connector-kinesis-1.15.2.jar",
7 | "pyFiles": "package/site_packages/"
8 | }
9 | },
10 | {
11 | "PropertyGroupId": "consumer.config.0",
12 | "PropertyMap": {
13 | "table.name": "flagged_accounts",
14 | "topic.name": "flagged-accounts",
15 | "bootstrap.servers": "localhost:29092",
16 | "startup.mode": "earliest-offset"
17 | }
18 | },
19 | {
20 | "PropertyGroupId": "consumer.config.1",
21 | "PropertyMap": {
22 | "table.name": "transactions",
23 | "topic.name": "transactions",
24 | "bootstrap.servers": "localhost:29092",
25 | "startup.mode": "earliest-offset"
26 | }
27 | },
28 | {
29 | "PropertyGroupId": "producer.config.0",
30 | "PropertyMap": {
31 | "table.name": "flagged_transactions",
32 | "topic.name": "flagged-transactions",
33 | "bootstrap.servers": "localhost:29092"
34 | }
35 | }
36 | ]
37 |
--------------------------------------------------------------------------------
/fraud-detection/remote/application_properties.json:
--------------------------------------------------------------------------------
1 | [
2 | {
3 | "PropertyGroupId": "kinesis.analytics.flink.run.options",
4 | "PropertyMap": {
5 | "python": "processor.py",
6 | "jarfile": "package/lib/flink-sql-connector-kinesis-1.15.2.jar",
7 | "pyFiles": "package/site_packages/"
8 | }
9 | },
10 | {
11 | "PropertyGroupId": "consumer.config.0",
12 | "PropertyMap": {
13 | "table.name": "flagged_accounts",
14 | "topic.name": "flagged-accounts",
15 | "bootstrap.servers": "localhost:29092",
16 | "startup.mode": "earliest-offset"
17 | }
18 | },
19 | {
20 | "PropertyGroupId": "consumer.config.1",
21 | "PropertyMap": {
22 | "table.name": "transactions",
23 | "topic.name": "transactions",
24 | "bootstrap.servers": "localhost:29092",
25 | "startup.mode": "earliest-offset"
26 | }
27 | },
28 | {
29 | "PropertyGroupId": "producer.config.0",
30 | "PropertyMap": {
31 | "table.name": "flagged_transactions",
32 | "topic.name": "flagged-transactions",
33 | "bootstrap.servers": "localhost:29092"
34 | }
35 | }
36 | ]
37 |
--------------------------------------------------------------------------------
/pyflink-udemy/s3_05_csv_sink.py:
--------------------------------------------------------------------------------
1 | from pyflink.table import (
2 | EnvironmentSettings,
3 | TableEnvironment,
4 | CsvTableSource,
5 | CsvTableSink,
6 | WriteMode,
7 | DataTypes,
8 | )
9 |
10 | env_settings = EnvironmentSettings.in_batch_mode()
11 | table_env = TableEnvironment.create(env_settings)
12 | table_env.get_config().set("parallelism.default", "1") # output to single file
13 |
14 | field_names = "seller_id,product,quantity,product_price,sales_date".split(",")
15 | field_types = [
16 | DataTypes.STRING(),
17 | DataTypes.STRING(),
18 | DataTypes.INT(),
19 | DataTypes.DOUBLE(),
20 | DataTypes.DATE(),
21 | ]
22 |
23 | # source table
24 | source = CsvTableSource("./csv-input", field_names, field_types, ignore_first_line=True)
25 | table_env.register_table_source("product_locale_sales", source)
26 | tbl = table_env.from_path("product_locale_sales")
27 |
28 | # sink table
29 | sink = CsvTableSink(
30 | field_names, field_types, "revenue.csv", num_files=1, write_mode=WriteMode.OVERWRITE
31 | )
32 | table_env.register_table_sink("locale_revenue", sink)
33 | tbl.execute_insert("locale_revenue").wait()
34 |
--------------------------------------------------------------------------------
/fraud-detection/remote/infra/scripts/bootstrap.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash -ex
2 |
3 | ## Allocate elastic IP and disable source/destination checks
4 | TOKEN=$(curl --silent --max-time 60 -X PUT http://169.254.169.254/latest/api/token -H "X-aws-ec2-metadata-token-ttl-seconds: 30")
5 | INSTANCEID=$(curl --silent --max-time 60 -H "X-aws-ec2-metadata-token: $TOKEN" http://169.254.169.254/latest/meta-data/instance-id)
6 | aws --region ${aws_region} ec2 associate-address --instance-id $INSTANCEID --allocation-id ${allocation_id}
7 | aws --region ${aws_region} ec2 modify-instance-attribute --instance-id $INSTANCEID --source-dest-check "{\"Value\": false}"
8 |
9 | ## Start SoftEther VPN server
10 | yum update -y && yum install docker -y
11 | systemctl enable docker.service && systemctl start docker.service
12 |
13 | docker pull siomiz/softethervpn:debian
14 | docker run -d \
15 | --cap-add NET_ADMIN \
16 | --name softethervpn \
17 | --restart unless-stopped \
18 | -p 500:500/udp -p 4500:4500/udp -p 1701:1701/tcp -p 1194:1194/udp -p 5555:5555/tcp -p 443:443/tcp \
19 | -e PSK=${vpn_psk} \
20 | -e SPW=${admin_password} \
21 | -e HPW=DEFAULT \
22 | siomiz/softethervpn:debian
--------------------------------------------------------------------------------
/real-time-streaming-aws/infra/scripts/bootstrap.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash -ex
2 |
3 | ## Allocate elastic IP and disable source/destination checks
4 | TOKEN=$(curl --silent --max-time 60 -X PUT http://169.254.169.254/latest/api/token -H "X-aws-ec2-metadata-token-ttl-seconds: 30")
5 | INSTANCEID=$(curl --silent --max-time 60 -H "X-aws-ec2-metadata-token: $TOKEN" http://169.254.169.254/latest/meta-data/instance-id)
6 | aws --region ${aws_region} ec2 associate-address --instance-id $INSTANCEID --allocation-id ${allocation_id}
7 | aws --region ${aws_region} ec2 modify-instance-attribute --instance-id $INSTANCEID --source-dest-check "{\"Value\": false}"
8 |
9 | ## Start SoftEther VPN server
10 | yum update -y && yum install docker -y
11 | systemctl enable docker.service && systemctl start docker.service
12 |
13 | docker pull siomiz/softethervpn:debian
14 | docker run -d \
15 | --cap-add NET_ADMIN \
16 | --name softethervpn \
17 | --restart unless-stopped \
18 | -p 500:500/udp -p 4500:4500/udp -p 1701:1701/tcp -p 1194:1194/udp -p 5555:5555/tcp -p 443:443/tcp \
19 | -e PSK=${vpn_psk} \
20 | -e SPW=${admin_password} \
21 | -e HPW=DEFAULT \
22 | siomiz/softethervpn:debian
--------------------------------------------------------------------------------
/stream-processing-with-flink/src/test/kotlin/me/jaehyeon/chapter1/AverageSensorReadingsTest.kt:
--------------------------------------------------------------------------------
1 | package me.jaehyeon.chapter1
2 |
3 | import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment
4 | import org.junit.jupiter.api.Assertions.assertNotNull
5 | import org.junit.jupiter.api.Test
6 |
7 | /**
8 | * A simple unit test to verify that Flink classes are available on the test classpath.
9 | */
10 | class AverageSensorReadingsTest {
11 | @Test
12 | fun `test that Flink StreamExecutionEnvironment can be created`() {
13 | // This is the core of the test.
14 | // We are trying to use a class from the 'flink-streaming-java' dependency,
15 | // which is marked as 'compileOnly' in our build.gradle.kts.
16 | // If the test classpath was not configured correctly, this line would fail
17 | // with a ClassNotFoundException.
18 | val env = StreamExecutionEnvironment.getExecutionEnvironment()
19 |
20 | // A simple assertion to confirm that the environment object was created successfully.
21 | assertNotNull(env, "The Flink execution environment should not be null.")
22 | }
23 | }
24 |
--------------------------------------------------------------------------------
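The comments in the test above describe a compileOnly Flink dependency that must still be resolvable on the test classpath, but the build script itself is not included in this dump. The fragment below is a hypothetical build.gradle.kts sketch of that arrangement (artifact coordinates and versions are assumptions, not taken from this repository): each compileOnly Flink dependency is re-declared as testImplementation so classes such as StreamExecutionEnvironment resolve when the test runs.

// Hypothetical build.gradle.kts fragment; illustrative only, not this repository's actual build file.
dependencies {
    // Available at compile time but not bundled into the job jar (provided by the Flink runtime).
    compileOnly("org.apache.flink:flink-streaming-java:1.18.1")
    // Re-declared for tests so the test classpath can load StreamExecutionEnvironment.
    testImplementation("org.apache.flink:flink-streaming-java:1.18.1")
    testImplementation("org.junit.jupiter:junit-jupiter:5.10.2")
}

tasks.test {
    // The test class uses JUnit 5 annotations, so run on the JUnit Platform.
    useJUnitPlatform()
}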
/pyflink-getting-started-on-aws/remote/infra/scripts/bootstrap.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash -ex
2 |
3 | ## Allocate elastic IP and disable source/destination checks
4 | TOKEN=$(curl --silent --max-time 60 -X PUT http://169.254.169.254/latest/api/token -H "X-aws-ec2-metadata-token-ttl-seconds: 30")
5 | INSTANCEID=$(curl --silent --max-time 60 -H "X-aws-ec2-metadata-token: $TOKEN" http://169.254.169.254/latest/meta-data/instance-id)
6 | aws --region ${aws_region} ec2 associate-address --instance-id $INSTANCEID --allocation-id ${allocation_id}
7 | aws --region ${aws_region} ec2 modify-instance-attribute --instance-id $INSTANCEID --source-dest-check "{\"Value\": false}"
8 |
9 | ## Start SoftEther VPN server
10 | yum update -y && yum install docker -y
11 | systemctl enable docker.service && systemctl start docker.service
12 |
13 | docker pull siomiz/softethervpn:debian
14 | docker run -d \
15 | --cap-add NET_ADMIN \
16 | --name softethervpn \
17 | --restart unless-stopped \
18 | -p 500:500/udp -p 4500:4500/udp -p 1701:1701/tcp -p 1194:1194/udp -p 5555:5555/tcp -p 443:443/tcp \
19 | -e PSK=${vpn_psk} \
20 | -e SPW=${admin_password} \
21 | -e HPW=DEFAULT \
22 | siomiz/softethervpn:debian
--------------------------------------------------------------------------------
/sql-training/client-image/java/sql-training-udfs/src/main/java/com/ververica/sql_training/udfs/IsInNYC.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2019 Ververica GmbH
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package com.ververica.sql_training.udfs;
18 |
19 | import org.apache.flink.table.functions.ScalarFunction;
20 |
21 | import static com.ververica.sql_training.udfs.util.GeoUtils.isInNYC;
22 |
23 | /**
24 | * Table API / SQL Scalar UDF to check if a coordinate is in NYC.
25 | */
26 | public class IsInNYC extends ScalarFunction {
27 |
28 | public boolean eval(Float lon, Float lat) {
29 | return isInNYC(lon, lat);
30 | }
31 | }
32 |
--------------------------------------------------------------------------------
/sql-training/client-image/java/sql-training-udfs/src/main/java/com/ververica/sql_training/udfs/ToAreaId.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2019 Ververica GmbH
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package com.ververica.sql_training.udfs;
18 |
19 | import org.apache.flink.table.functions.ScalarFunction;
20 |
21 | import com.ververica.sql_training.udfs.util.GeoUtils;
22 |
23 | /**
24 | * Table API / SQL Scalar UDF to convert a lon/lat pair into a cell ID.
25 | */
26 | public class ToAreaId extends ScalarFunction {
27 |
28 | public int eval(Float lon, Float lat) {
29 | return GeoUtils.mapToGridCell(lon, lat);
30 | }
31 | }
32 |
--------------------------------------------------------------------------------
/real-time-streaming-aws/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM flink:1.17.1
2 |
3 | ARG PYTHON_VERSION
4 | ENV PYTHON_VERSION=${PYTHON_VERSION:-3.8.10}
5 | ARG FLINK_VERSION
6 | ENV FLINK_VERSION=${FLINK_VERSION:-1.17.1}
7 |
8 | RUN mkdir ./plugins/s3-fs-hadoop \
9 | && cp ./opt/flink-s3-fs-hadoop-${FLINK_VERSION}.jar ./plugins/s3-fs-hadoop
10 |
11 | RUN apt-get update -y && \
12 | apt-get install -y build-essential libssl-dev zlib1g-dev libbz2-dev libffi-dev liblzma-dev && \
13 | wget https://www.python.org/ftp/python/${PYTHON_VERSION}/Python-${PYTHON_VERSION}.tgz && \
14 | tar -xvf Python-${PYTHON_VERSION}.tgz && \
15 | cd Python-${PYTHON_VERSION} && \
16 | ./configure --without-tests --enable-shared && \
17 | make -j6 && \
18 | make install && \
19 | ldconfig /usr/local/lib && \
20 | cd .. && rm -f Python-${PYTHON_VERSION}.tgz && rm -rf Python-${PYTHON_VERSION} && \
21 | ln -s /usr/local/bin/python3 /usr/local/bin/python && \
22 | apt-get clean && \
23 | rm -rf /var/lib/apt/lists/*
24 |
25 | # install PyFlink
26 | RUN pip3 install apache-flink==${FLINK_VERSION}
27 |
28 | # add the Kafka client JAR for the Flink SQL client; it will be added to the client manually
29 | RUN wget -P /etc/lib/ https://repo.maven.apache.org/maven2/org/apache/kafka/kafka-clients/3.2.3/kafka-clients-3.2.3.jar;
30 |
--------------------------------------------------------------------------------
/stream-processing-with-flink/src/main/kotlin/me/jaehyeon/connector/HttpSink.kt:
--------------------------------------------------------------------------------
1 | package me.jaehyeon.connector
2 |
3 | import org.apache.flink.api.connector.sink2.Sink
4 | import org.apache.flink.api.connector.sink2.SinkWriter
5 | import org.apache.flink.api.connector.sink2.WriterInitContext
6 |
7 | /**
8 | * The Sink class is the main entry point and a factory for the SinkWriter.
9 | *
10 | * @param url The target HTTP endpoint for all records.
11 | * @param httpMethodName The HTTP method name to use for the requests.
12 | */
13 | class HttpSink(
14 | private val url: String,
15 | private val httpMethodName: String,
16 | ) : Sink {
17 | @Deprecated("Overrides deprecated member in superclass.")
18 | override fun createWriter(context: Sink.InitContext): SinkWriter {
19 | val subtaskId =
20 | if (context is WriterInitContext) {
21 | // Modern, warning-free path
22 | context.subtaskId
23 | } else {
24 | // Fallback path with targeted warning suppression
25 | @Suppress("DEPRECATION")
26 | context.getSubtaskId()
27 | }
28 |
29 | return HttpSinkWriter(
30 | url,
31 | httpMethodName,
32 | subtaskId,
33 | )
34 | }
35 | }
36 |
--------------------------------------------------------------------------------
/stream-processing-with-flink/src/main/kotlin/me/jaehyeon/chapter8/CustomConnectors.kt:
--------------------------------------------------------------------------------
1 | package me.jaehyeon.chapter8
2 |
3 | import me.jaehyeon.connector.HttpSink
4 | import me.jaehyeon.connector.HttpSource
5 | import org.apache.flink.api.common.eventtime.WatermarkStrategy
6 | import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment
7 |
8 | object CustomConnectors {
9 | @JvmStatic
10 | fun main(args: Array<String>) {
11 | val env = StreamExecutionEnvironment.getExecutionEnvironment()
12 | env.parallelism = 2
13 |
14 | val httpSource =
15 | HttpSource(
16 | baseUrlPattern = "https://jsonplaceholder.typicode.com/posts/{id}",
17 | startId = 1,
18 | maxId = 100,
19 | )
20 |
21 | val sourceStream =
22 | env.fromSource(
23 | httpSource,
24 | WatermarkStrategy.noWatermarks(),
25 | "Cyclical HTTP Source",
26 | )
27 |
28 | val httpSink =
29 | HttpSink(
30 | url = "https://jsonplaceholder.typicode.com/posts",
31 | httpMethodName = "POST",
32 | )
33 |
34 | sourceStream.sinkTo(httpSink)
35 |
36 | env.execute("Custom HTTP Source and Sink Jobs")
37 | }
38 | }
39 |
--------------------------------------------------------------------------------
/stream-processing-with-pyflink/src/chapter7/checkpointed_function.py:
--------------------------------------------------------------------------------
1 | # Deprecation of ListCheckpointed interface #
2 | # FLINK-6258 #
3 | # The ListCheckpointed interface has been deprecated because it uses Java Serialization for checkpointing state which is problematic for savepoint compatibility. Use the CheckpointedFunction interface instead, which gives more control over state serialization.
4 |
5 | # Operator State
6 | # https://nightlies.apache.org/flink/flink-docs-release-1.18/docs/dev/datastream/fault-tolerance/state/
7 |
8 | # Operator State (or non-keyed state) is state that is bound to one parallel operator instance. The Kafka Connector is a good motivating example for the use of Operator State in Flink. Each parallel instance of the Kafka consumer maintains a map of topic partitions and offsets as its Operator State.
9 |
10 | # The Operator State interfaces support redistributing state among parallel operator instances when the parallelism is changed. There are different schemes for doing this redistribution.
11 |
12 | # In a typical stateful Flink application you don’t need operator state. It is mostly a special type of state that is used in source/sink implementations and in scenarios where you don’t have a key by which state can be partitioned.
13 |
14 | # Note: Operator state is still not supported in the Python DataStream API.
15 |
--------------------------------------------------------------------------------
/stream-processing-with-pyflink/src/chapter7/operator_list_state_function.py:
--------------------------------------------------------------------------------
1 | # Deprecation of ListCheckpointed interface #
2 | # FLINK-6258 #
3 | # The ListCheckpointed interface has been deprecated because it uses Java Serialization for checkpointing state which is problematic for savepoint compatibility. Use the CheckpointedFunction interface instead, which gives more control over state serialization.
4 |
5 | # Operator State
6 | # https://nightlies.apache.org/flink/flink-docs-release-1.18/docs/dev/datastream/fault-tolerance/state/
7 |
8 | # Operator State (or non-keyed state) is state that is bound to one parallel operator instance. The Kafka Connector is a good motivating example for the use of Operator State in Flink. Each parallel instance of the Kafka consumer maintains a map of topic partitions and offsets as its Operator State.
9 |
10 | # The Operator State interfaces support redistributing state among parallel operator instances when the parallelism is changed. There are different schemes for doing this redistribution.
11 |
12 | # In a typical stateful Flink application you don’t need operator state. It is mostly a special type of state that is used in source/sink implementations and in scenarios where you don’t have a key by which state can be partitioned.
13 |
14 | # Note: Operator state is still not supported in the Python DataStream API.
15 |
--------------------------------------------------------------------------------
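The two chapter7 stubs above explain operator (non-keyed) state and the CheckpointedFunction interface but contain no code, since PyFlink does not yet expose operator state. As a reference point, here is a minimal Kotlin sketch, loosely mirroring the BufferingSink example from the Flink documentation page linked in those comments; the class name, element type, and threshold are illustrative assumptions, and it is not part of this repository. It buffers elements, snapshots the buffer into operator list state on every checkpoint, and restores it (possibly redistributed) on recovery.

import org.apache.flink.api.common.state.ListState
import org.apache.flink.api.common.state.ListStateDescriptor
import org.apache.flink.runtime.state.FunctionInitializationContext
import org.apache.flink.runtime.state.FunctionSnapshotContext
import org.apache.flink.streaming.api.checkpoint.CheckpointedFunction
import org.apache.flink.streaming.api.functions.sink.SinkFunction

// Minimal sketch of a sink that keeps its pending buffer in operator list state.
class BufferingSink(
    private val threshold: Int,
) : SinkFunction<Int>, CheckpointedFunction {
    private lateinit var checkpointedState: ListState<Int>
    private val buffer = mutableListOf<Int>()

    override fun invoke(value: Int, context: SinkFunction.Context) {
        buffer += value
        if (buffer.size >= threshold) {
            // Send the buffered elements to the external system here, then clear the buffer.
            buffer.clear()
        }
    }

    // Called on every checkpoint: copy the in-memory buffer into operator list state.
    override fun snapshotState(context: FunctionSnapshotContext) {
        checkpointedState.update(buffer)
    }

    // Called on (re)start: register the state and, if restoring, re-populate the buffer
    // from the (possibly redistributed) operator state.
    override fun initializeState(context: FunctionInitializationContext) {
        val descriptor = ListStateDescriptor("buffered-elements", Int::class.javaObjectType)
        checkpointedState = context.operatorStateStore.getListState(descriptor)
        if (context.isRestored) {
            buffer.addAll(checkpointedState.get())
        }
    }
}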
/real-time-streaming-aws/package/lab2-pipeline/src/main/resources/log4j2.properties:
--------------------------------------------------------------------------------
1 | ################################################################################
2 | # Licensed to the Apache Software Foundation (ASF) under one
3 | # or more contributor license agreements. See the NOTICE file
4 | # distributed with this work for additional information
5 | # regarding copyright ownership. The ASF licenses this file
6 | # to you under the Apache License, Version 2.0 (the
7 | # "License"); you may not use this file except in compliance
8 | # with the License. You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 | ################################################################################
18 |
19 | rootLogger.level = INFO
20 | rootLogger.appenderRef.console.ref = ConsoleAppender
21 |
22 | appender.console.name = ConsoleAppender
23 | appender.console.type = CONSOLE
24 | appender.console.layout.type = PatternLayout
25 | appender.console.layout.pattern = %d{HH:mm:ss,SSS} %-5p %-60c %x - %m%n
26 |
--------------------------------------------------------------------------------
/real-time-streaming-aws/package/lab3-pipeline/src/main/resources/log4j2.properties:
--------------------------------------------------------------------------------
1 | ################################################################################
2 | # Licensed to the Apache Software Foundation (ASF) under one
3 | # or more contributor license agreements. See the NOTICE file
4 | # distributed with this work for additional information
5 | # regarding copyright ownership. The ASF licenses this file
6 | # to you under the Apache License, Version 2.0 (the
7 | # "License"); you may not use this file except in compliance
8 | # with the License. You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 | ################################################################################
18 |
19 | rootLogger.level = INFO
20 | rootLogger.appenderRef.console.ref = ConsoleAppender
21 |
22 | appender.console.name = ConsoleAppender
23 | appender.console.type = CONSOLE
24 | appender.console.layout.type = PatternLayout
25 | appender.console.layout.pattern = %d{HH:mm:ss,SSS} %-5p %-60c %x - %m%n
26 |
--------------------------------------------------------------------------------
/real-time-streaming-aws/package/lab4-pipeline/src/main/resources/log4j2.properties:
--------------------------------------------------------------------------------
1 | ################################################################################
2 | # Licensed to the Apache Software Foundation (ASF) under one
3 | # or more contributor license agreements. See the NOTICE file
4 | # distributed with this work for additional information
5 | # regarding copyright ownership. The ASF licenses this file
6 | # to you under the Apache License, Version 2.0 (the
7 | # "License"); you may not use this file except in compliance
8 | # with the License. You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 | ################################################################################
18 |
19 | rootLogger.level = INFO
20 | rootLogger.appenderRef.console.ref = ConsoleAppender
21 |
22 | appender.console.name = ConsoleAppender
23 | appender.console.type = CONSOLE
24 | appender.console.layout.type = PatternLayout
25 | appender.console.layout.pattern = %d{HH:mm:ss,SSS} %-5p %-60c %x - %m%n
26 |
--------------------------------------------------------------------------------
/fraud-detection/remote/package/uber-jar-for-pyflink/src/main/resources/log4j2.properties:
--------------------------------------------------------------------------------
1 | ################################################################################
2 | # Licensed to the Apache Software Foundation (ASF) under one
3 | # or more contributor license agreements. See the NOTICE file
4 | # distributed with this work for additional information
5 | # regarding copyright ownership. The ASF licenses this file
6 | # to you under the Apache License, Version 2.0 (the
7 | # "License"); you may not use this file except in compliance
8 | # with the License. You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 | ################################################################################
18 |
19 | rootLogger.level = INFO
20 | rootLogger.appenderRef.console.ref = ConsoleAppender
21 |
22 | appender.console.name = ConsoleAppender
23 | appender.console.type = CONSOLE
24 | appender.console.layout.type = PatternLayout
25 | appender.console.layout.pattern = %d{HH:mm:ss,SSS} %-5p %-60c %x - %m%n
26 |
--------------------------------------------------------------------------------
/sql-training/client-image/java/sql-training-udfs/src/main/java/com/ververica/sql_training/udfs/ToCoords.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2019 Ververica GmbH
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package com.ververica.sql_training.udfs;
18 |
19 | import org.apache.flink.table.annotation.DataTypeHint;
20 | import org.apache.flink.table.functions.ScalarFunction;
21 | import org.apache.flink.types.Row;
22 |
23 | import com.ververica.sql_training.udfs.util.GeoUtils;
24 |
25 | /**
26 | * Table API / SQL Scalar UDF to convert a cell ID into a lon/lat pair.
27 | */
28 | public class ToCoords extends ScalarFunction {
29 |
30 | @DataTypeHint("ROW<lon FLOAT, lat FLOAT>")
31 | public Row eval(Integer cellId) {
32 | return Row.of(
33 | GeoUtils.getGridCellCenterLon(cellId),
34 | GeoUtils.getGridCellCenterLat(cellId)
35 | );
36 | }
37 | }
38 |
--------------------------------------------------------------------------------
/pyflink-getting-started-on-aws/remote/package/uber-jar-for-pyflink/src/main/resources/log4j2.properties:
--------------------------------------------------------------------------------
1 | ################################################################################
2 | # Licensed to the Apache Software Foundation (ASF) under one
3 | # or more contributor license agreements. See the NOTICE file
4 | # distributed with this work for additional information
5 | # regarding copyright ownership. The ASF licenses this file
6 | # to you under the Apache License, Version 2.0 (the
7 | # "License"); you may not use this file except in compliance
8 | # with the License. You may obtain a copy of the License at
9 | #
10 | # http://www.apache.org/licenses/LICENSE-2.0
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 | ################################################################################
18 |
19 | rootLogger.level = INFO
20 | rootLogger.appenderRef.console.ref = ConsoleAppender
21 |
22 | appender.console.name = ConsoleAppender
23 | appender.console.type = CONSOLE
24 | appender.console.layout.type = PatternLayout
25 | appender.console.layout.pattern = %d{HH:mm:ss,SSS} %-5p %-60c %x - %m%n
26 |
--------------------------------------------------------------------------------
/confluent-flink-101/compose-flink-standalone.yml:
--------------------------------------------------------------------------------
1 | version: "3.5"
2 |
3 | services:
4 | jobmanager:
5 | image: confluent-flink-101:1.15.4
6 | command: jobmanager
7 | ports:
8 | - "8081:8081"
9 | environment:
10 | - |
11 | FLINK_PROPERTIES=
12 | jobmanager.rpc.address: jobmanager
13 | state.backend: filesystem
14 | state.checkpoints.dir: file:///tmp/flink-checkpoints
15 | heartbeat.interval: 1000
16 | heartbeat.timeout: 5000
17 | rest.flamegraph.enabled: true
18 | web.backpressure.refresh-interval: 10000
19 | taskmanager:
20 | image: confluent-flink-101:1.15.4
21 | command: taskmanager
22 | volumes:
23 | - flink_data:/tmp/
24 | environment:
25 | - |
26 | FLINK_PROPERTIES=
27 | jobmanager.rpc.address: jobmanager
28 | taskmanager.numberOfTaskSlots: 3
29 | state.backend: filesystem
30 | state.checkpoints.dir: file:///tmp/flink-checkpoints
31 | heartbeat.interval: 1000
32 | heartbeat.timeout: 5000
33 | depends_on:
34 | - jobmanager
35 | sql-client:
36 | image: confluent-flink-101:1.15.4
37 | command: bin/sql-client.sh
38 | depends_on:
39 | - jobmanager
40 | environment:
41 | - |
42 | FLINK_PROPERTIES=
43 | jobmanager.rpc.address: jobmanager
44 | rest.address: jobmanager
45 |
46 | volumes:
47 | flink_data:
48 |
--------------------------------------------------------------------------------
/stream-processing-with-pyflink/README.md:
--------------------------------------------------------------------------------
1 | # Stream Processing with Flink in Python
2 |
3 | This project contains Python implementations of examples for learning Apache Flink, inspired by the [`streaming-with-flink` Scala](https://github.com/streaming-with-flink/examples-scala) project.
4 |
5 | ## Getting Started
6 |
7 | First, clone the repository to your local machine:
8 |
9 | ```bash
10 | git clone https://github.com/jaehyeon-kim/flink-demos.git
11 | cd flink-demos/stream-processing-with-pyflink
12 | ```
13 |
14 | It is recommended to create a virtual environment to manage the project's dependencies.
15 |
16 | ```bash
17 | python -m venv venv
18 | source venv/bin/activate
19 | ```
20 |
21 | Next, install the required Python packages, which are listed in the `requirements-dev.txt` file.
22 |
23 | ```bash
24 | pip install -r requirements-dev.txt
25 | ```
26 |
27 | ## Running the Examples
28 |
29 | You can run the Flink jobs locally from your command line. Use the `python` command to execute the individual example scripts.
30 |
31 | Here are a few examples from different chapters. Please check the `src` directory for all available applications.
32 |
33 | ```bash
34 | # Run an example from Chapter 5
35 | python src/chapter5/basic_transformations.py
36 |
37 | # Run an example from Chapter 6
38 | python src/chapter6/process_function_timers.py
39 |
40 | # Run an example from Chapter 7
41 | python src/chapter7/keyed_state_function.py
42 | ```
43 |
--------------------------------------------------------------------------------
/sql-training/client-image/java/sql-training-data-producer/src/main/java/com/ververica/sql_training/data_producer/ConsolePrinter.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2019 Ververica GmbH
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package com.ververica.sql_training.data_producer;
18 |
19 | import com.ververica.sql_training.data_producer.json_serde.JsonSerializer;
20 | import com.ververica.sql_training.data_producer.records.TaxiRecord;
21 |
22 | import java.util.function.Consumer;
23 |
24 | /**
25 | * Prints TaxiRecords as JSON strings on the standard output.
26 | */
27 | public class ConsolePrinter implements Consumer<TaxiRecord> {
28 |
29 | private final JsonSerializer<TaxiRecord> serializer = new JsonSerializer<>();
30 |
31 | @Override
32 | public void accept(TaxiRecord record) {
33 | String jsonString = serializer.toJSONString(record);
34 | System.out.println(jsonString);
35 | }
36 | }
37 |
--------------------------------------------------------------------------------
/.vscode/settings.json:
--------------------------------------------------------------------------------
1 | {
2 | "files.watcherExclude": {
3 | "**/venv": true
4 | },
5 | "editor.formatOnSave": true,
6 | "editor.defaultFormatter": "esbenp.prettier-vscode",
7 | "editor.tabSize": 2,
8 | "[python]": {
9 | "editor.tabSize": 4,
10 | "editor.formatOnSave": true,
11 | "editor.defaultFormatter": "charliermarsh.ruff"
12 | },
13 | "[terraform]": {
14 | "editor.defaultFormatter": "hashicorp.terraform",
15 | "editor.formatOnSave": false,
16 | "editor.codeActionsOnSave": {
17 | "source.formatAll.terraform": "explicit"
18 | }
19 | },
20 | "[terraform-vars]": {
21 | "editor.defaultFormatter": "hashicorp.terraform",
22 | "editor.formatOnSave": false,
23 | "editor.codeActionsOnSave": {
24 | "source.formatAll.terraform": "explicit"
25 | }
26 | },
27 | "yaml.customTags": [
28 | "!Base64 scalar",
29 | "!Cidr scalar",
30 | "!And sequence",
31 | "!Equals sequence",
32 | "!If sequence",
33 | "!Not sequence",
34 | "!Or sequence",
35 | "!Condition scalar",
36 | "!FindInMap sequence",
37 | "!GetAtt scalar",
38 | "!GetAtt sequence",
39 | "!GetAZs scalar",
40 | "!ImportValue scalar",
41 | "!Join sequence",
42 | "!Select sequence",
43 | "!Split sequence",
44 | "!Sub scalar",
45 | "!Transform mapping",
46 | "!Ref scalar"
47 | ],
48 | "[terraform][terraform-vars]": {
49 | "editor.codeActionsOnSave": {
50 | "source.formatAll.terraform": "explicit"
51 | }
52 | }
53 | }
54 |
--------------------------------------------------------------------------------
/sql-training/client-image/java/sql-training-udfs/pom.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project xmlns="http://maven.apache.org/POM/4.0.0"
3 |          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
4 |          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
5 |     <modelVersion>4.0.0</modelVersion>
6 |
7 |     <groupId>com.ververica.sql-training</groupId>
8 |     <artifactId>sql-training-udfs</artifactId>
9 |     <version>2-FLINK-1.11_2.11</version>
10 |
11 |     <properties>
12 |         <flink.version>1.11.1</flink.version>
13 |     </properties>
14 |
15 |     <dependencies>
16 |         <dependency>
17 |             <groupId>org.apache.flink</groupId>
18 |             <artifactId>flink-table-common</artifactId>
19 |             <version>${flink.version}</version>
20 |         </dependency>
21 |         <dependency>
22 |             <groupId>org.apache.flink</groupId>
23 |             <artifactId>flink-streaming-java_2.11</artifactId>
24 |             <version>${flink.version}</version>
25 |         </dependency>
26 |     </dependencies>
27 |
28 |     <build>
29 |         <plugins>
30 |             <plugin>
31 |                 <groupId>org.apache.maven.plugins</groupId>
32 |                 <artifactId>maven-compiler-plugin</artifactId>
33 |                 <configuration>
34 |                     <source>8</source>
35 |                     <target>8</target>
36 |                 </configuration>
37 |             </plugin>
38 |         </plugins>
39 |     </build>
40 |
41 | </project>
42 |
--------------------------------------------------------------------------------
/stream-processing-with-pyflink/src/chapter1/utils/model.py:
--------------------------------------------------------------------------------
1 | import datetime
2 | import dataclasses
3 | from typing import Iterable, Tuple
4 |
5 | from pyflink.common import Row
6 | from pyflink.common.typeinfo import Types
7 |
8 | from .type_helper import TypeMapping, set_type_info
9 |
10 |
11 | @dataclasses.dataclass
12 | class SensorReading(TypeMapping):
13 | id: str
14 | timestamp: int
15 | num_records: int
16 | temperature: float
17 |
18 | def to_row(self):
19 | return Row(**dataclasses.asdict(self))
20 |
21 | @classmethod
22 | def from_row(cls, row: Row):
23 | return cls(**row.as_dict())
24 |
25 | @staticmethod
26 | def process_elements(elements: Iterable[Tuple[int, int, datetime.datetime]]):
27 | id, count, temperature = None, 0, 0
28 | for e in elements:
29 | next_id = f"sensor_{e[0]}"
30 | if id is not None:
31 | assert id == next_id
32 | id = next_id
33 | count += 1
34 | temperature += 65 + (e[1] / 100 * 20)
35 | return id, count, temperature
36 |
37 | @staticmethod
38 | def type_mapping():
39 | return {
40 | "id": Types.STRING(),
41 | "timestamp": Types.LONG(),
42 | "num_records": Types.INT(),
43 | "temperature": Types.DOUBLE(),
44 | }
45 |
46 | @staticmethod
47 | def set_key_type_info():
48 | return set_type_info(SensorReading.type_mapping(), selects=["id"])
49 |
50 | @staticmethod
51 | def set_value_type_info():
52 | return set_type_info(SensorReading.type_mapping())
53 |
--------------------------------------------------------------------------------
/sql-training/client-image/java/sql-training-data-producer/src/main/java/com/ververica/sql_training/data_producer/records/DriverChange.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2019 Ververica GmbH
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package com.ververica.sql_training.data_producer.records;
18 |
19 | import com.fasterxml.jackson.annotation.JsonFormat;
20 |
21 | import java.util.Date;
22 |
23 | /**
24 | * POJO for a DriverChange record.
25 | */
26 | public class DriverChange implements TaxiRecord {
27 |
28 | @JsonFormat(shape = JsonFormat.Shape.STRING, pattern = "yyyy-MM-dd'T'HH:mm:ss'Z'")
29 | private Date eventTime;
30 | @JsonFormat
31 | private long taxiId;
32 | @JsonFormat
33 | private long driverId;
34 |
35 | public DriverChange() {}
36 |
37 | public DriverChange(Date eventTime, long taxiId, long driverId) {
38 | this.eventTime = eventTime;
39 | this.taxiId = taxiId;
40 | this.driverId = driverId;
41 | }
42 |
43 | @Override
44 | public Date getEventTime() {
45 | return eventTime;
46 | }
47 |
48 | }
49 |
--------------------------------------------------------------------------------
/pyflink-getting-started-on-aws/local/compose-flink.yml:
--------------------------------------------------------------------------------
1 | version: "3.5"
2 |
3 | services:
4 | jobmanager:
5 | image: pyflink:1.15.2-scala_2.12
6 | container_name: jobmanager
7 | command: jobmanager
8 | ports:
9 | - "8081:8081"
10 | networks:
11 | - kafkanet
12 | environment:
13 | - |
14 | FLINK_PROPERTIES=
15 | jobmanager.rpc.address: jobmanager
16 | state.backend: filesystem
17 | state.checkpoints.dir: file:///tmp/flink-checkpoints
18 | heartbeat.interval: 1000
19 | heartbeat.timeout: 5000
20 | rest.flamegraph.enabled: true
21 | web.backpressure.refresh-interval: 10000
22 | - RUNTIME_ENV=DOCKER
23 | - BOOTSTRAP_SERVERS=kafka-0:9092
24 | volumes:
25 | - $PWD:/etc/flink
26 | taskmanager:
27 | image: pyflink:1.15.2-scala_2.12
28 | container_name: taskmanager
29 | command: taskmanager
30 | networks:
31 | - kafkanet
32 | volumes:
33 | - flink_data:/tmp/
34 | - $PWD:/etc/flink
35 | environment:
36 | - |
37 | FLINK_PROPERTIES=
38 | jobmanager.rpc.address: jobmanager
39 | taskmanager.numberOfTaskSlots: 3
40 | state.backend: filesystem
41 | state.checkpoints.dir: file:///tmp/flink-checkpoints
42 | heartbeat.interval: 1000
43 | heartbeat.timeout: 5000
44 | - RUNTIME_ENV=DOCKER
45 | - BOOTSTRAP_SERVERS=kafka-0:9092
46 | depends_on:
47 | - jobmanager
48 |
49 | networks:
50 | kafkanet:
51 | external: true
52 | name: kafka-network
53 |
54 | volumes:
55 | flink_data:
56 | driver: local
57 | name: flink_data
58 |
--------------------------------------------------------------------------------
/pyflink-getting-started-on-aws/local/compose-kafka.yml:
--------------------------------------------------------------------------------
1 | version: "3.5"
2 |
3 | services:
4 | zookeeper:
5 | image: bitnami/zookeeper:3.5
6 | container_name: zookeeper
7 | ports:
8 | - "2181"
9 | networks:
10 | - kafkanet
11 | environment:
12 | - ALLOW_ANONYMOUS_LOGIN=yes
13 | volumes:
14 | - zookeeper_data:/bitnami/zookeeper
15 | kafka-0:
16 | image: bitnami/kafka:2.8.1
17 | container_name: kafka-0
18 | expose:
19 | - 9092
20 | ports:
21 | - "29092:29092"
22 | networks:
23 | - kafkanet
24 | environment:
25 | - ALLOW_PLAINTEXT_LISTENER=yes
26 | - KAFKA_CFG_ZOOKEEPER_CONNECT=zookeeper:2181
27 | - KAFKA_CFG_BROKER_ID=0
28 | - KAFKA_CFG_LISTENER_SECURITY_PROTOCOL_MAP=INTERNAL:PLAINTEXT,EXTERNAL:PLAINTEXT
29 | - KAFKA_CFG_LISTENERS=INTERNAL://:9092,EXTERNAL://:29092
30 | - KAFKA_CFG_ADVERTISED_LISTENERS=INTERNAL://kafka-0:9092,EXTERNAL://localhost:29092
31 | - KAFKA_CFG_INTER_BROKER_LISTENER_NAME=INTERNAL
32 | - KAFKA_CFG_NUM_PARTITIONS=2
33 | volumes:
34 | - kafka_0_data:/bitnami/kafka
35 | depends_on:
36 | - zookeeper
37 | kpow:
38 | image: factorhouse/kpow-ce:91.2.1
39 | container_name: kpow
40 | ports:
41 | - "3000:3000"
42 | networks:
43 | - kafkanet
44 | environment:
45 | BOOTSTRAP: kafka-0:9092
46 | depends_on:
47 | - zookeeper
48 | - kafka-0
49 |
50 | networks:
51 | kafkanet:
52 | name: kafka-network
53 |
54 | volumes:
55 | zookeeper_data:
56 | driver: local
57 | name: zookeeper_data
58 | kafka_0_data:
59 | driver: local
60 | name: kafka_0_data
61 |
--------------------------------------------------------------------------------
/stream-processing-with-flink/src/main/kotlin/me/jaehyeon/connector/HttpSplit.kt:
--------------------------------------------------------------------------------
1 | package me.jaehyeon.connector
2 |
3 | import org.apache.flink.api.connector.source.SourceSplit
4 | import org.apache.flink.core.io.SimpleVersionedSerializer
5 | import java.io.ByteArrayInputStream
6 | import java.io.ByteArrayOutputStream
7 | import java.io.DataInputStream
8 | import java.io.DataOutputStream
9 | import java.io.IOException
10 |
11 | /**
12 | * Represents one unit of work: a single URL to be fetched by a SourceReader.
13 | */
14 | data class HttpSplit(
15 | val url: String,
16 | ) : SourceSplit {
17 | override fun splitId(): String = url
18 | }
19 |
20 | /**
21 | * Serializer for sending HttpSplit objects from the JobManager (Enumerator)
22 | * to the TaskManagers (Readers).
23 | */
24 | class HttpSplitSerializer : SimpleVersionedSerializer<HttpSplit> {
25 | companion object {
26 | private const val VERSION = 1
27 | }
28 |
29 | override fun getVersion(): Int = VERSION
30 |
31 | override fun serialize(split: HttpSplit): ByteArray =
32 | ByteArrayOutputStream().use { baos ->
33 | DataOutputStream(baos).use { out ->
34 | out.writeUTF(split.url)
35 | baos.toByteArray()
36 | }
37 | }
38 |
39 | override fun deserialize(
40 | version: Int,
41 | serialized: ByteArray,
42 | ): HttpSplit {
43 | if (version != VERSION) throw IOException("Unknown version: $version")
44 | return ByteArrayInputStream(serialized).use { bais ->
45 | DataInputStream(bais).use { inp ->
46 | HttpSplit(inp.readUTF())
47 | }
48 | }
49 | }
50 | }
51 |
--------------------------------------------------------------------------------
/sql-training/client-image/java/sql-training-data-producer/src/main/java/com/ververica/sql_training/data_producer/json_serde/JsonDeserializer.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2019 Ververica GmbH
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package com.ververica.sql_training.data_producer.json_serde;
18 |
19 | import com.fasterxml.jackson.databind.ObjectMapper;
20 |
21 | import java.io.IOException;
22 |
23 | /**
24 | * Deserializes a record from a JSON string.
25 | *
26 | * @param <T> The type of the deserialized record.
27 | */
28 | public class JsonDeserializer<T> {
29 |
30 | private final Class<T> recordClazz;
31 | private final ObjectMapper jsonMapper;
32 |
33 | public JsonDeserializer(Class<T> recordClazz) {
34 | this.recordClazz = recordClazz;
35 | this.jsonMapper = new ObjectMapper();
36 | }
37 |
38 | public T parseFromString(String line) {
39 | try {
40 | return jsonMapper.readValue(line, this.recordClazz);
41 | } catch (IOException e) {
42 | throw new IllegalArgumentException("Could not deserialize record: " + line + " as class " + recordClazz, e);
43 | }
44 | }
45 | }
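// Illustrative usage sketch (not part of the original class); Ride is one of the record POJOs
// shipped with this data producer, and jsonLine is a placeholder variable name:
//   Ride ride = new JsonDeserializer<>(Ride.class).parseFromString(jsonLine);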
46 |
--------------------------------------------------------------------------------
/datorios/docker-compose.yml:
--------------------------------------------------------------------------------
1 | version: '3.7'
2 | services:
3 | jobmanager:
4 | image: localhost/metro-flink:1.17.2
5 | pull_policy: never
6 | command: jobmanager
7 | container_name: datorios-${CLUSTER_NAME}-jobmanager
8 | volumes:
9 | - ${MOUNT_SRC_PATH}:${MOUNT_DST_PATH}
10 | ports:
11 | - "${CLUSTER_JOB_MANAGER_PORT}:8081"
12 | environment:
13 | - |
14 | FLINK_PROPERTIES=
15 | jobmanager.rpc.address: jobmanager
16 | taskmanager:
17 | image: localhost/metro-flink:1.17.2
18 | pull_policy: never
19 | depends_on:
20 | - jobmanager
21 | command: taskmanager
22 | volumes:
23 | - ${MOUNT_SRC_PATH}:${MOUNT_DST_PATH}
24 | environment:
25 | - |
26 | FLINK_PROPERTIES=
27 | jobmanager.rpc.address: jobmanager
28 | taskmanager.memory.task.off-heap.size: 128mb
29 | runner:
30 | image: localhost/metro-flink-runner:1.17.2
31 | pull_policy: never
32 | depends_on:
33 | - jobmanager
34 | - taskmanager
35 | - fluent-bit
36 | tty: true
37 | container_name: datorios-${CLUSTER_NAME}-runner
38 | volumes:
39 | - ${MOUNT_SRC_PATH}:${MOUNT_DST_PATH}
40 | env_file:
41 | - .env
42 | environment:
43 | - CLUSTER_NAME=${CLUSTER_NAME}
44 | fluent-bit:
45 | image: localhost/metro-fluent-bit:2.2.2
46 | pull_policy: never
47 | depends_on:
48 | - jobmanager
49 | - taskmanager
50 | command: /opt/fluent-bit/bin/fluent-bit -c /fluent-bit/config.conf
51 | container_name: datorios-${CLUSTER_NAME}-fluent-bit
52 | volumes:
53 | - /var/run/docker.sock:/var/run/docker.sock:ro
54 | env_file:
55 | - .env
56 | environment:
57 | - CLUSTER_NAME=${CLUSTER_NAME}
58 |
--------------------------------------------------------------------------------
/real-time-streaming-aws/compose-extra.yml:
--------------------------------------------------------------------------------
1 | version: "3.5"
2 |
3 | services:
4 | opensearch:
5 | image: opensearchproject/opensearch:2.7.0
6 | container_name: opensearch
7 | environment:
8 | - discovery.type=single-node
9 | - node.name=opensearch
10 | - DISABLE_SECURITY_PLUGIN=true
11 | - "OPENSEARCH_JAVA_OPTS=-Xms512m -Xmx512m"
12 | volumes:
13 | - opensearch_data:/usr/share/opensearch/data
14 | ports:
15 | - 9200:9200
16 | - 9600:9600
17 | networks:
18 | - appnet
19 | opensearch-dashboards:
20 | image: opensearchproject/opensearch-dashboards:2.7.0
21 | container_name: opensearch-dashboards
22 | ports:
23 | - 5601:5601
24 | expose:
25 | - "5601"
26 | environment:
27 | OPENSEARCH_HOSTS: '["http://opensearch:9200"]'
28 | DISABLE_SECURITY_DASHBOARDS_PLUGIN: "true"
29 | networks:
30 | - appnet
31 | kafka-connect:
32 | image: bitnami/kafka:2.8.1
33 | container_name: connect
34 | command: >
35 | /opt/bitnami/kafka/bin/connect-distributed.sh
36 | /opt/bitnami/kafka/config/connect-distributed.properties
37 | ports:
38 | - "8083:8083"
39 | networks:
40 | - appnet
41 | environment:
42 | AWS_ACCESS_KEY_ID: $AWS_ACCESS_KEY_ID
43 | AWS_SECRET_ACCESS_KEY: $AWS_SECRET_ACCESS_KEY
44 | volumes:
45 | - "./configs/connect-distributed.properties:/opt/bitnami/kafka/config/connect-distributed.properties"
46 | - "./infra/connectors/camel-aws-ddb-sink-kafka-connector:/opt/connectors/camel-aws-ddb-sink-kafka-connector"
47 |
48 | networks:
49 | appnet:
50 | external: true
51 | name: app-network
52 |
53 | volumes:
54 | opensearch_data:
55 | driver: local
56 | name: opensearch_data
57 |
--------------------------------------------------------------------------------
/sql-training/client-image/java/sql-training-data-producer/src/main/java/com/ververica/sql_training/data_producer/json_serde/JsonSerializer.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2019 Ververica GmbH
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package com.ververica.sql_training.data_producer.json_serde;
18 |
19 | import com.fasterxml.jackson.core.JsonProcessingException;
20 | import com.fasterxml.jackson.databind.ObjectMapper;
21 |
22 | /**
23 | * Serializes a record as JSON string.
24 | *
25 | * @param <T> The type for the records to serialize.
26 | */
27 | public class JsonSerializer<T> {
28 |
29 | private final ObjectMapper jsonMapper = new ObjectMapper();
30 |
31 | public String toJSONString(T r) {
32 | try {
33 | return jsonMapper.writeValueAsString(r);
34 | } catch (JsonProcessingException e) {
35 | throw new IllegalArgumentException("Could not serialize record: " + r, e);
36 | }
37 | }
38 |
39 | public byte[] toJSONBytes(T r) {
40 | try {
41 | return jsonMapper.writeValueAsBytes(r);
42 | } catch (JsonProcessingException e) {
43 | throw new IllegalArgumentException("Could not serialize record: " + r, e);
44 | }
45 | }
46 | }
47 |
--------------------------------------------------------------------------------
/sql-cookbook/compose-kafka.yml:
--------------------------------------------------------------------------------
1 | version: "3.5"
2 |
3 | services:
4 | zookeeper:
5 | image: bitnami/zookeeper:3.5
6 | container_name: zookeeper
7 | ports:
8 | - "2181"
9 | networks:
10 | - appnet
11 | environment:
12 | - ALLOW_ANONYMOUS_LOGIN=yes
13 | volumes:
14 | - zookeeper_data:/bitnami/zookeeper
15 | kafka-0:
16 | image: bitnami/kafka:2.8.1
17 | container_name: kafka-0
18 | expose:
19 | - 9092
20 | ports:
21 | - "29092:29092"
22 | networks:
23 | - appnet
24 | environment:
25 | - ALLOW_PLAINTEXT_LISTENER=yes
26 | - KAFKA_CFG_ZOOKEEPER_CONNECT=zookeeper:2181
27 | - KAFKA_CFG_BROKER_ID=0
28 | - KAFKA_CFG_LISTENER_SECURITY_PROTOCOL_MAP=INTERNAL:PLAINTEXT,EXTERNAL:PLAINTEXT
29 | - KAFKA_CFG_LISTENERS=INTERNAL://:9092,EXTERNAL://:29092
30 | - KAFKA_CFG_ADVERTISED_LISTENERS=INTERNAL://kafka-0:9092,EXTERNAL://localhost:29092
31 | - KAFKA_CFG_INTER_BROKER_LISTENER_NAME=INTERNAL
32 | # - KAFKA_CFG_NUM_PARTITIONS=3
33 | # - KAFKA_CFG_DEFAULT_REPLICATION_FACTOR=1
34 | volumes:
35 | - kafka_0_data:/bitnami/kafka
36 | depends_on:
37 | - zookeeper
38 | kafka-ui:
39 | image: provectuslabs/kafka-ui:master
40 | container_name: kafka-ui
41 | ports:
42 | - "8080:8080"
43 | networks:
44 | - appnet
45 | environment:
46 | KAFKA_CLUSTERS_0_NAME: local
47 | KAFKA_CLUSTERS_0_BOOTSTRAPSERVERS: kafka-0:9092
48 | KAFKA_CLUSTERS_0_ZOOKEEPER: zookeeper:2181
49 | depends_on:
50 | - zookeeper
51 | - kafka-0
52 |
53 | networks:
54 | appnet:
55 | external: true
56 | name: app-network
57 |
58 | volumes:
59 | zookeeper_data:
60 | driver: local
61 | name: zookeeper_data
62 | kafka_0_data:
63 | driver: local
64 | name: kafka_0_data
65 |
--------------------------------------------------------------------------------
/pyflink-udemy/s4_13_row_operations.py:
--------------------------------------------------------------------------------
1 | from statistics import stdev, mean
2 |
3 | from pyflink.common import Row
4 | from pyflink.table import EnvironmentSettings, TableEnvironment, DataTypes, CsvTableSource
5 | from pyflink.table.udf import udf
6 |
7 | tbl_env = TableEnvironment.create(EnvironmentSettings.in_batch_mode())
8 |
9 | field_names = "seller_id,q1,q2,q3,q4".split(",")
10 | field_types = [
11 | DataTypes.STRING(),
12 | DataTypes.INT(),
13 | DataTypes.INT(),
14 | DataTypes.INT(),
15 | DataTypes.INT(),
16 | ]
17 | source = CsvTableSource("./quarterly-sales-input", field_names, field_types, ignore_first_line=True)
18 | tbl_env.register_table_source("quarterly_sales", source)
19 |
20 | tbl = tbl_env.from_path("quarterly_sales")
21 | print("\nQuarterly Sales Schema")
22 | tbl.print_schema()
23 |
24 | print("\nQuarterly Sales Data")
25 | tbl.execute().print()
26 |
27 |
28 | @udf(
29 | result_type=DataTypes.ROW(
30 | [
31 | DataTypes.FIELD("seller_id", DataTypes.STRING()),
32 | DataTypes.FIELD("sales_total", DataTypes.INT()),
33 | DataTypes.FIELD("qtr_avg", DataTypes.DOUBLE()),
34 | DataTypes.FIELD("qtr_stdev", DataTypes.DOUBLE()),
35 | ]
36 | )
37 | )
38 | def sales_summary_stats(seller_sales: Row) -> Row:
39 | seller_id, q1, q2, q3, q4 = seller_sales
40 | sales = (q1, q2, q3, q4)
41 | total_sales = sum(sales)
42 | qtr_avg = round(mean(sales), 2)
43 | qtr_stdev = round(stdev(sales), 2)
44 | return Row(seller_id, total_sales, qtr_avg, qtr_stdev)
45 |
46 |
47 | sales_stats = tbl.map(sales_summary_stats).alias(
48 | "seller_id", "total_sales", "quarterly_avg", "quarterly_stdev"
49 | )
50 |
51 | print("\nSales Summary Stats schema")
52 | sales_stats.print_schema()
53 |
54 | print("\nSales Summary Stats data")
55 | sales_stats.execute().print()
56 |
--------------------------------------------------------------------------------
/fraud-detection/local/build.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | PKG_ALL="${PKG_ALL:-no}"
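## Set PKG_ALL=yes to also bundle processor.py, package/lib and package/site_packages into
## kda-package.zip (see the packaging step below); packaging is skipped by default.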
3 |
4 | SCRIPT_DIR="$(cd $(dirname "$0"); pwd)"
5 |
6 | #### Steps to package the flink app
7 | SRC_PATH=$SCRIPT_DIR/package
8 | rm -rf $SRC_PATH && mkdir -p $SRC_PATH/lib
9 |
10 | ## Download flink sql connector kafka
11 | echo "download flink sql connector kafka..."
12 | VERSION=1.15.2
13 | FILE_NAME=flink-sql-connector-kafka-$VERSION
14 | FLINK_SRC_DOWNLOAD_URL=https://repo.maven.apache.org/maven2/org/apache/flink/flink-sql-connector-kafka/$VERSION/flink-sql-connector-kafka-$VERSION.jar
15 | curl -L -o $SRC_PATH/lib/$FILE_NAME.jar ${FLINK_SRC_DOWNLOAD_URL}
16 |
17 | ## Install pip packages
18 | echo "install and zip pip packages..."
19 | pip3 install -r requirements.txt --target $SRC_PATH/site_packages
20 |
21 | if [ $PKG_ALL == "yes" ]; then
22 | ## Package pyflink app
23 | echo "package pyflink app"
24 | zip -r kda-package.zip processor.py package/lib package/site_packages
25 | fi
26 |
27 | #### Steps to create the sink connector
28 | CONN_PATH=$SCRIPT_DIR/connectors
29 | rm -rf $CONN_PATH && mkdir $CONN_PATH
30 |
31 | ## Download camel dynamodb sink connector
32 | echo "download camel dynamodb sink connector..."
33 | CONNECTOR_SRC_DOWNLOAD_URL=https://repo.maven.apache.org/maven2/org/apache/camel/kafkaconnector/camel-aws-ddb-sink-kafka-connector/3.20.3/camel-aws-ddb-sink-kafka-connector-3.20.3-package.tar.gz
34 |
35 | ## decompress and zip contents to create custom plugin of msk connect later
36 | curl -o $CONN_PATH/camel-aws-ddb-sink-kafka-connector.tar.gz $CONNECTOR_SRC_DOWNLOAD_URL \
37 | && tar -xvzf $CONN_PATH/camel-aws-ddb-sink-kafka-connector.tar.gz -C $CONN_PATH \
38 | && cd $CONN_PATH/camel-aws-ddb-sink-kafka-connector \
39 | && zip -r camel-aws-ddb-sink-kafka-connector.zip . \
40 | && mv camel-aws-ddb-sink-kafka-connector.zip $CONN_PATH \
41 | && rm $CONN_PATH/camel-aws-ddb-sink-kafka-connector.tar.gz
--------------------------------------------------------------------------------
/pyflink-doc/data.py:
--------------------------------------------------------------------------------
1 | word_count_data = [
2 | "To be, or not to be,--that is the question:--",
3 | "Whether 'tis nobler in the mind to suffer",
4 | "The slings and arrows of outrageous fortune",
5 | "Or to take arms against a sea of troubles,",
6 | "And by opposing end them?--To die,--to sleep,--",
7 | "No more; and by a sleep to say we end",
8 | "The heartache, and the thousand natural shocks",
9 | "That flesh is heir to,--'tis a consummation",
10 | "Devoutly to be wish'd. To die,--to sleep;--",
11 | "To sleep! perchance to dream:--ay, there's the rub;",
12 | "For in that sleep of death what dreams may come,",
13 | "When we have shuffled off this mortal coil,",
14 | "Must give us pause: there's the respect",
15 | "That makes calamity of so long life;",
16 | "For who would bear the whips and scorns of time,",
17 | "The oppressor's wrong, the proud man's contumely,",
18 | "The pangs of despis'd love, the law's delay,",
19 | "The insolence of office, and the spurns",
20 | "That patient merit of the unworthy takes,",
21 | "When he himself might his quietus make",
22 | "With a bare bodkin? who would these fardels bear,",
23 | "To grunt and sweat under a weary life,",
24 | "But that the dread of something after death,--",
25 | "The undiscover'd country, from whose bourn",
26 | "No traveller returns,--puzzles the will,",
27 | "And makes us rather bear those ills we have",
28 | "Than fly to others that we know not of?",
29 | "Thus conscience does make cowards of us all;",
30 | "And thus the native hue of resolution",
31 | "Is sicklied o'er with the pale cast of thought;",
32 | "And enterprises of great pith and moment,",
33 | "With this regard, their currents turn awry,",
34 | "And lose the name of action.--Soft you now!",
35 | "The fair Ophelia!--Nymph, in thy orisons",
36 | "Be all my sins remember'd.",
37 | ]
38 |
--------------------------------------------------------------------------------
/pyflink-getting-started-on-aws/remote/infra/variables.tf:
--------------------------------------------------------------------------------
1 | variable "vpn_to_create" {
2 | description = "Flag to indicate whether to create VPN"
3 | type = bool
4 | default = true
5 | }
6 |
7 | variable "vpn_to_use_spot" {
8 | description = "Flag to indicate whether to use a spot instance for VPN"
9 | type = bool
10 | default = false
11 | }
12 |
13 | variable "vpn_to_limit_vpn_ingress" {
14 | description = "Flag to indicate whether to limit ingress from the current machine's IP address"
15 | type = bool
16 | default = true
17 | }
18 |
19 | locals {
20 | name = "kda-getting-started"
21 | region = data.aws_region.current.name
22 | environment = "dev"
23 |
24 | vpc = {
25 | cidr = "10.0.0.0/16"
26 | azs = slice(data.aws_availability_zones.available.names, 0, 3)
27 | }
28 |
29 | default_bucket = {
30 | name = "${local.name}-${data.aws_caller_identity.current.account_id}-${local.region}"
31 | to_set_acl = false
32 | }
33 |
34 | vpn = {
35 | to_create = var.vpn_to_create
36 | to_use_spot = var.vpn_to_use_spot
37 | ingress_cidr = var.vpn_to_limit_vpn_ingress ? "${data.http.local_ip_address.response_body}/32" : "0.0.0.0/0"
38 | spot_override = [
39 | { instance_type : "t3.small" },
40 | { instance_type : "t3a.small" },
41 | ]
42 | }
43 |
44 | msk = {
45 | version = "2.8.1"
46 | instance_size = "kafka.m5.large"
47 | ebs_volume_size = 20
48 | log_retention_ms = 604800000 # 7 days
49 | number_of_broker_nodes = 2
50 | num_partitions = 2
51 | default_replication_factor = 2
52 | }
53 |
54 | kda = {
55 | to_create = false
56 | runtime_env = "FLINK-1_15"
57 | package_name = "kda-package.zip"
58 | }
59 |
60 | tags = {
61 | Name = local.name
62 | Environment = local.environment
63 | }
64 | }
65 |
--------------------------------------------------------------------------------
/real-time-streaming-aws/forwarder/flinksql.sql:
--------------------------------------------------------------------------------
1 | -- docker exec -it jobmanager ./bin/sql-client.sh
2 |
3 | SET 'state.checkpoints.dir' = 'file:///tmp/checkpoints/';
4 | SET 'execution.checkpointing.interval' = '5000';
5 |
6 | ADD JAR '/etc/flink/package/lib/lab4-pipeline-1.0.0.jar';
7 |
8 | CREATE TABLE taxi_rides_src (
9 | id VARCHAR,
10 | vendor_id INT,
11 | pickup_date VARCHAR,
12 | dropoff_date VARCHAR,
13 | passenger_count INT,
14 | pickup_longitude VARCHAR,
15 | pickup_latitude VARCHAR,
16 | dropoff_longitude VARCHAR,
17 | dropoff_latitude VARCHAR,
18 | store_and_fwd_flag VARCHAR,
19 | gc_distance INT,
20 | trip_duration INT,
21 | google_distance INT,
22 | google_duration INT,
23 | process_time AS PROCTIME()
24 | ) WITH (
25 | 'connector' = 'kafka',
26 | 'topic' = 'taxi-rides',
27 | 'properties.bootstrap.servers' = 'kafka-0:9092',
28 | 'properties.group.id' = 'source-group',
29 | 'format' = 'json',
30 | 'scan.startup.mode' = 'latest-offset'
31 | );
32 |
33 | CREATE TABLE taxi_rides_sink (
34 | vendor_id VARCHAR,
35 | trip_count BIGINT NOT NULL,
36 | passenger_count INT,
37 | trip_duration INT,
38 | window_start TIMESTAMP(3) NOT NULL,
39 | window_end TIMESTAMP(3) NOT NULL
40 | ) WITH (
41 | 'connector' = 'opensearch',
42 | 'hosts' = 'http://opensearch:9200',
43 | 'index' = 'trip_stats'
44 | );
45 |
46 | INSERT INTO taxi_rides_sink
47 | SELECT
48 | CAST(vendor_id AS STRING) AS vendor_id,
49 | COUNT(id) AS trip_count,
50 | SUM(passenger_count) AS passenger_count,
51 | SUM(trip_duration) AS trip_duration,
52 | window_start,
53 | window_end
54 | FROM TABLE(
55 | TUMBLE(TABLE taxi_rides_src, DESCRIPTOR(process_time), INTERVAL '5' SECONDS))
56 | GROUP BY vendor_id, window_start, window_end;
--------------------------------------------------------------------------------
/pyflink-getting-started-on-aws/remote/compose-flink.yml:
--------------------------------------------------------------------------------
1 | version: "3.5"
2 |
3 | services:
4 | jobmanager:
5 | image: pyflink:1.15.2-scala_2.12
6 | container_name: jobmanager
7 | command: jobmanager
8 | ports:
9 | - "8081:8081"
10 | networks:
11 | - flinknet
12 | environment:
13 | - |
14 | FLINK_PROPERTIES=
15 | jobmanager.rpc.address: jobmanager
16 | state.backend: filesystem
17 | state.checkpoints.dir: file:///tmp/flink-checkpoints
18 | heartbeat.interval: 1000
19 | heartbeat.timeout: 5000
20 | rest.flamegraph.enabled: true
21 | web.backpressure.refresh-interval: 10000
22 | - RUNTIME_ENV=DOCKER
23 | - BOOTSTRAP_SERVERS=$BOOTSTRAP_SERVERS
24 | - AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID
25 | - AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY
26 | - AWS_SESSION_TOKEN=$AWS_SESSION_TOKEN
27 | volumes:
28 | - $PWD:/etc/flink
29 | taskmanager:
30 | image: pyflink:1.15.2-scala_2.12
31 | container_name: taskmanager
32 | command: taskmanager
33 | networks:
34 | - flinknet
35 | volumes:
36 | - flink_data:/tmp/
37 | - $PWD:/etc/flink
38 | environment:
39 | - |
40 | FLINK_PROPERTIES=
41 | jobmanager.rpc.address: jobmanager
42 | taskmanager.numberOfTaskSlots: 3
43 | state.backend: filesystem
44 | state.checkpoints.dir: file:///tmp/flink-checkpoints
45 | heartbeat.interval: 1000
46 | heartbeat.timeout: 5000
47 | - RUNTIME_ENV=DOCKER
48 | - BOOTSTRAP_SERVERS=$BOOTSTRAP_SERVERS
49 | - AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID
50 | - AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY
51 | - AWS_SESSION_TOKEN=$AWS_SESSION_TOKEN
52 | depends_on:
53 | - jobmanager
54 |
55 | networks:
56 | flinknet:
57 | name: flink-network
58 |
59 | volumes:
60 | flink_data:
61 | driver: local
62 | name: flink_data
63 |
--------------------------------------------------------------------------------
/pyflink-udemy/s4_04_aggregations.py:
--------------------------------------------------------------------------------
1 | from pyflink.table import EnvironmentSettings, TableEnvironment, DataTypes, CsvTableSource
2 | from pyflink.table.expressions import col
3 |
4 | tbl_env = TableEnvironment.create(EnvironmentSettings.in_batch_mode())
5 |
6 | field_names = "seller_id,product,quantity,product_price,sales_date".split(",")
7 | field_types = [
8 | DataTypes.STRING(),
9 | DataTypes.STRING(),
10 | DataTypes.INT(),
11 | DataTypes.DOUBLE(),
12 | DataTypes.DATE(),
13 | ]
14 | source = CsvTableSource("./csv-input", field_names, field_types, ignore_first_line=True)
15 |
16 | tbl_env.register_table_source("product_locale_sales", source)
17 | tbl = tbl_env.from_path("product_locale_sales")
18 |
19 | tbl.order_by(col("quantity").asc).execute().print()
20 | tbl.order_by(col("quantity").asc).offset(1).fetch(2).execute().print()
21 | tbl.order_by(col("quantity").asc).fetch(8).execute().print()
22 |
23 | avg_price = (
24 | tbl.select(col("product_price")).distinct().select(col("product_price").avg.alias("avg_price"))
25 | )
26 | print("\navg_price data")
27 | avg_price.execute().print()
28 |
29 | avg_price2 = tbl_env.sql_query(
30 | """
31 | SELECT avg(product_price) AS avg_price
32 | FROM product_locale_sales
33 | """
34 | )
35 | print("\navg_price2 data")
36 | avg_price2.execute().print()
37 |
38 | seller_revenue = (
39 | tbl.select(
40 | col("seller_id"), col("product"), (col("product_price") * col("quantity")).alias("sales")
41 | )
42 | .group_by(col("seller_id"))
43 | .select(col("seller_id"), col("sales").sum.alias("seller_revenue"))
44 | )
45 | print("\nseller_revenue data")
46 | seller_revenue.execute().print()
47 |
48 | seller_revenue2 = tbl_env.sql_query(
49 | """
50 | SELECT seller_id, sum(product_price * quantity) AS seller_revenue
51 | FROM product_locale_sales
52 | GROUP BY seller_id
53 | """
54 | )
55 | print("\nseller_revenue2 data")
56 | seller_revenue2.execute().print()
57 |
--------------------------------------------------------------------------------
/sql-training/client-image/java/sql-training-data-producer/src/main/java/com/ververica/sql_training/data_producer/records/Ride.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2019 Ververica GmbH
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package com.ververica.sql_training.data_producer.records;
18 |
19 | import com.fasterxml.jackson.annotation.JsonFormat;
20 |
21 | import java.util.Date;
22 |
23 | /**
24 | * POJO for a Ride record.
25 | */
26 | public class Ride implements TaxiRecord {
27 |
28 | @JsonFormat
29 | private long rideId;
30 | @JsonFormat
31 | private boolean isStart;
32 | @JsonFormat
33 | private long taxiId;
34 | @JsonFormat(shape = JsonFormat.Shape.STRING, pattern = "yyyy-MM-dd'T'HH:mm:ss'Z'")
35 | private Date eventTime;
36 | @JsonFormat
37 | private double lon;
38 | @JsonFormat
39 | private double lat;
40 | @JsonFormat
41 | private byte psgCnt;
42 |
43 | public Ride() {}
44 |
45 | public Ride(long rideId, boolean isStart, long taxiId, Date eventTime, double lon, double lat, byte psgCnt) {
46 | this.rideId = rideId;
47 | this.isStart = isStart;
48 | this.taxiId = taxiId;
49 | this.eventTime = eventTime;
50 | this.lon = lon;
51 | this.lat = lat;
52 | this.psgCnt = psgCnt;
53 | }
54 |
55 | @Override
56 | public Date getEventTime() {
57 | return eventTime;
58 | }
59 |
60 | }
61 |
--------------------------------------------------------------------------------
/stream-processing-with-flink/src/main/kotlin/me/jaehyeon/chapter5/RollingSum.kt:
--------------------------------------------------------------------------------
1 | package me.jaehyeon.chapter5
2 |
3 | import org.apache.flink.api.java.tuple.Tuple3
4 | import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment
5 |
6 | /**
7 | * This Flink job demonstrates how to compute a "rolling sum" using the `.sum()`
8 | * aggregation function on a KeyedStream.
9 | *
10 | * A rolling sum is a continuous aggregation that is updated for every input event,
11 | * as opposed to a windowed aggregation which only emits a result at the end of a window.
12 | *
13 | * The pipeline works as follows:
14 | * 1. **Source**: A simple, static stream of `Tuple3` is created, where the first
15 | * element is the key and the second is the value to be summed.
16 | * 2. **KeyBy**: The stream is partitioned by the first field (`f0`) using a type-safe lambda.
17 | * 3. **Sum**: The `.sum(1)` operator maintains a running sum of the second field (at index 1)
18 | * for each key. It is a concise way to perform this specific aggregation.
19 | * 4. **Sink**: The resulting stream of continuous, updated sums is printed to the console.
20 | *
21 | * **Note on Best Practices:** While `.sum()` is convenient for Tuples, the modern,
22 | * recommended approach for most streaming applications is to use the more flexible and
23 | * fully type-safe `.reduce()` operator, especially when working with custom data classes.
24 | */
25 | object RollingSum {
26 | @JvmStatic
27 | fun main(args: Array<String>) {
28 | val env = StreamExecutionEnvironment.getExecutionEnvironment()
29 |
30 | val inputStream =
31 | env.fromData(
32 | Tuple3(1, 2, 2),
33 | Tuple3(2, 3, 1),
34 | Tuple3(2, 2, 4),
35 | Tuple3(1, 5, 3),
36 | )
37 |
38 | val resultStream = inputStream.keyBy { it.f0 }.sum(1)
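// A minimal sketch of the reduce-based alternative mentioned in the KDoc above; this is
// illustrative only and assumes the same Tuple3 input stream (the third field is simply
// carried along from the first element seen for each key):
//   val reducedStream = inputStream
//       .keyBy { it.f0 }
//       .reduce { t1, t2 -> Tuple3(t1.f0, t1.f1 + t2.f1, t1.f2) }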
39 |
40 | resultStream.print()
41 |
42 | env.execute("Rolling Sum Example")
43 | }
44 | }
45 |
--------------------------------------------------------------------------------
/stream-processing-with-flink/src/main/kotlin/me/jaehyeon/misc/ControlStreamGenerator.kt:
--------------------------------------------------------------------------------
1 | package me.jaehyeon.misc
2 |
3 | import org.apache.flink.api.common.eventtime.WatermarkStrategy
4 | import org.apache.flink.api.common.typeinfo.TypeInformation
5 | import org.apache.flink.api.connector.source.util.ratelimit.RateLimiterStrategy
6 | import org.apache.flink.connector.datagen.source.DataGeneratorSource
7 | import org.apache.flink.streaming.api.datastream.DataStreamSource
8 | import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment
9 |
10 | /**
11 | * A generic data generator for creating mock control streams in Flink.
12 | * This object provides a single, reusable method to generate a DataStreamSource
13 | * from a provided list of data, emitting one element per second.
14 | */
15 | object ControlStreamGenerator {
16 | /**
17 | * Creates a Flink DataStreamSource from a list of elements.
18 | *
19 | * @param T The type of elements in the stream.
20 | * @param env The Flink StreamExecutionEnvironment.
21 | * @param sourceName A descriptive name for the Flink source.
22 | * @param data The list of data to be emitted by the source.
23 | * @param typeInfo The Flink TypeInformation for the data type T.
24 | * @return A DataStreamSource that will emit the elements from the data list.
25 | */
26 | fun <T> createSource(
27 | env: StreamExecutionEnvironment,
28 | sourceName: String,
29 | data: List<T>,
30 | typeInfo: TypeInformation<T>,
31 | ): DataStreamSource<T> {
32 | val generatorSource =
33 | DataGeneratorSource(
34 | { index -> data[index.toInt()] },
35 | data.size.toLong(),
36 | RateLimiterStrategy.perSecond(1.0),
37 | typeInfo,
38 | )
39 |
40 | return env.fromSource(
41 | generatorSource,
42 | WatermarkStrategy.noWatermarks(),
43 | sourceName,
44 | )
45 | }
46 | }
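// Hedged usage sketch (the source name and element values below are illustrative, not taken
// from the original jobs; Types.STRING is org.apache.flink.api.common.typeinfo.Types.STRING):
//   val control: DataStreamSource<String> =
//       ControlStreamGenerator.createSource(env, "switch-control", listOf("sensor_1", "sensor_7"), Types.STRING)
//   // elements are emitted roughly one per second via RateLimiterStrategy.perSecond(1.0)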
47 |
--------------------------------------------------------------------------------
/fraud-detection/remote/build.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | shopt -s extglob
3 |
4 | PKG_ALL="${PKG_ALL:-yes}"
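## PKG_ALL=yes (the default here) bundles processor.py, package/lib and package/site_packages
## into kda-package.zip after the build steps below; set PKG_ALL=no to skip packaging.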
5 | SCRIPT_DIR="$(cd $(dirname "$0"); pwd)"
6 |
7 | #### Steps to package the flink app
8 | # remove contents under $SRC_PATH (except for uber-jar-for-pyflink) and kda-package.zip file
9 | SRC_PATH=$SCRIPT_DIR/package
10 | rm -rf $SRC_PATH/!(uber-jar-for-pyflink) kda-package.zip
11 |
12 | ## Generate Uber Jar for PyFlink app for MSK cluster with IAM authN
13 | echo "generate Uber jar for PyFlink app..."
14 | mkdir $SRC_PATH/lib
15 | mvn clean install -f $SRC_PATH/uber-jar-for-pyflink/pom.xml \
16 | && mv $SRC_PATH/uber-jar-for-pyflink/target/pyflink-getting-started-1.0.0.jar $SRC_PATH/lib \
17 | && rm -rf $SRC_PATH/uber-jar-for-pyflink/target
18 |
19 | ## Install pip packages
20 | echo "install and zip pip packages..."
21 | pip install -r requirements.txt --target $SRC_PATH/site_packages
22 |
23 | if [ $PKG_ALL == "yes" ]; then
24 | ## Package pyflink app
25 | echo "package pyflink app"
26 | zip -r kda-package.zip processor.py package/lib package/site_packages
27 | fi
28 |
29 | #### Steps to create the sink connector
30 | CONN_PATH=$SCRIPT_DIR/connectors
31 | rm -rf $CONN_PATH && mkdir $CONN_PATH
32 |
33 | ## Download camel dynamodb sink connector
34 | echo "download camel dynamodb sink connector..."
35 | CONNECTOR_SRC_DOWNLOAD_URL=https://repo.maven.apache.org/maven2/org/apache/camel/kafkaconnector/camel-aws-ddb-sink-kafka-connector/3.20.3/camel-aws-ddb-sink-kafka-connector-3.20.3-package.tar.gz
36 |
37 | ## decompress and zip contents to create custom plugin of msk connect later
38 | curl -o $CONN_PATH/camel-aws-ddb-sink-kafka-connector.tar.gz $CONNECTOR_SRC_DOWNLOAD_URL \
39 | && tar -xvzf $CONN_PATH/camel-aws-ddb-sink-kafka-connector.tar.gz -C $CONN_PATH \
40 | && cd $CONN_PATH/camel-aws-ddb-sink-kafka-connector \
41 | && zip -r camel-aws-ddb-sink-kafka-connector.zip . \
42 | && mv camel-aws-ddb-sink-kafka-connector.zip $CONN_PATH \
43 | && rm $CONN_PATH/camel-aws-ddb-sink-kafka-connector.tar.gz
--------------------------------------------------------------------------------
/pyflink-udemy/s4_01_projections.py:
--------------------------------------------------------------------------------
1 | from pyflink.table import EnvironmentSettings, TableEnvironment, DataTypes, CsvTableSource
2 | from pyflink.table.expressions import col
3 |
4 | tbl_env = TableEnvironment.create(EnvironmentSettings.in_batch_mode())
5 |
6 | field_names = "seller_id,product,quantity,product_price,sales_date".split(",")
7 | field_types = [
8 | DataTypes.STRING(),
9 | DataTypes.STRING(),
10 | DataTypes.INT(),
11 | DataTypes.DOUBLE(),
12 | DataTypes.DATE(),
13 | ]
14 | source = CsvTableSource("./csv-input", field_names, field_types, ignore_first_line=True)
15 |
16 | tbl_env.register_table_source("product_locale_sales", source)
17 | tbl = tbl_env.from_path("product_locale_sales")
18 |
19 | redundant_prices = tbl.select(col("product"), col("product_price").alias("price"))
20 | print("\nredundant_prices data")
21 | redundant_prices.execute().print()
22 |
23 | redundant_prices2 = tbl_env.sql_query(
24 | f"SELECT product, product_price As price FROM product_locale_sales"
25 | )
26 | print("\nredundant_prices2 data")
27 | redundant_prices2.execute().print()
28 |
29 | distinct_prices = tbl.select(col("product"), col("product_price").alias("price")).distinct()
30 | print("\ndistinct_prices data")
31 | distinct_prices.execute().print()
32 |
33 | distinct_prices2 = tbl_env.sql_query(
34 | "SELECT DISTINCT product, product_price AS price FROM product_locale_sales"
35 | )
36 | print("\ndistinct_prices2 data")
37 | distinct_prices2.execute().print()
38 |
39 | product_sales = tbl.select(
40 | col("sales_date"),
41 | col("seller_id"),
42 | col("product"),
43 | (col("product_price") * col("quantity")).alias("sales"),
44 | ).distinct()
45 | print("\nproduct_sales data")
46 | product_sales.execute().print()
47 |
48 | product_sales2 = tbl_env.sql_query(
49 | """
50 | SELECT DISTINCT
51 | sales_date, seller_id, product, product_price * quantity AS sales
52 | FROM product_locale_sales
53 | """
54 | )
55 | print("\nproduct_sales2 data")
56 | product_sales2.execute().print()
57 |
--------------------------------------------------------------------------------
/pyflink-udemy/s3_04_kafka_source.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | from pyflink.table import EnvironmentSettings, TableEnvironment
4 |
5 | BOOTSTRAP_SERVERS = os.getenv("BOOTSTRAP_SERVERS", "localhost:29092")
6 | # https://nightlies.apache.org/flink/flink-docs-release-1.16/docs/connectors/table/kafka/
7 | version_map = {"15": "1.15.4", "16": "1.16.0"}
8 | FLINK_VERSION = version_map[os.getenv("MINOR_VERSION", "15")]
9 | FLINK_SQL_CONNECTOR_KAFKA = f"flink-sql-connector-kafka-{FLINK_VERSION}.jar"
10 |
11 | env_settings = EnvironmentSettings.in_streaming_mode()
12 | table_env = TableEnvironment.create(env_settings)
13 | # https://nightlies.apache.org/flink/flink-docs-release-1.16/docs/dev/python/dependency_management/
14 | kafka_jar = os.path.join(os.path.abspath(os.path.dirname(__file__)), FLINK_SQL_CONNECTOR_KAFKA)
15 | table_env.get_config().set("pipeline.jars", f"file://{kafka_jar}")
16 |
17 | ## create kafka source table
18 | table_env.execute_sql(
19 | f"""
20 | CREATE TABLE product_sales (
21 | `seller_id` VARCHAR,
22 | `product` VARCHAR,
23 | `quantity` INT,
24 | `product_price` DOUBLE,
25 | `sales_date` VARCHAR
26 | ) WITH (
27 | 'connector' = 'kafka',
28 | 'topic' = 'product_sales',
29 | 'properties.bootstrap.servers' = '{BOOTSTRAP_SERVERS}',
30 | 'properties.group.id' = 'source-demo',
31 | 'format' = 'json',
32 | 'scan.startup.mode' = 'earliest-offset',
33 | 'json.fail-on-missing-field' = 'false',
34 | 'json.ignore-parse-errors' = 'true'
35 | )
36 | """
37 | )
38 |
39 | ## create print sink table
40 | table_env.execute_sql(
41 | f"""
42 | CREATE TABLE print (
43 | `seller_id` VARCHAR,
44 | `product` VARCHAR,
45 | `quantity` INT,
46 | `product_price` DOUBLE,
47 | `sales_date` VARCHAR
48 | ) WITH (
49 | 'connector' = 'print'
50 | )
51 | """
52 | )
53 |
54 | ## insert into sink table
55 | tbl = table_env.from_path("product_sales")
56 | tbl.execute_insert("print").wait()
57 |
--------------------------------------------------------------------------------
/sql-training/client-image/java/sql-training-data-producer/src/main/java/com/ververica/sql_training/data_producer/records/Fare.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2019 Ververica GmbH
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package com.ververica.sql_training.data_producer.records;
18 |
19 | import com.fasterxml.jackson.annotation.JsonFormat;
20 |
21 | import java.util.Date;
22 |
23 | /**
24 | * POJO for a Fare record.
25 | */
26 | public class Fare implements TaxiRecord {
27 |
28 | @JsonFormat
29 | private long rideId;
30 | @JsonFormat(shape = JsonFormat.Shape.STRING, pattern = "yyyy-MM-dd'T'HH:mm:ss'Z'")
31 | private Date eventTime;
32 | @JsonFormat(shape = JsonFormat.Shape.STRING)
33 | private PayMethod payMethod;
34 | @JsonFormat
35 | private double fare;
36 | @JsonFormat
37 | private double toll;
38 | @JsonFormat
39 | private double tip;
40 |
41 | public Fare() {}
42 |
43 | public Fare(long rideId, Date eventTime, PayMethod payMethod, double fare, double toll, double tip) {
44 | this.rideId = rideId;
45 | this.eventTime = eventTime;
46 | this.payMethod = payMethod;
47 | this.fare = fare;
48 | this.toll = toll;
49 | this.tip = tip;
50 | }
51 |
52 | @Override
53 | public Date getEventTime() {
54 | return eventTime;
55 | }
56 |
57 | public static enum PayMethod {
58 | CSH,
59 | CRD,
60 | DIS,
61 | NOC,
62 | UNK
63 | }
64 | }
65 |
--------------------------------------------------------------------------------
/sql-training/receipts.md:
--------------------------------------------------------------------------------
1 | ## Apache Flink® SQL Training
2 |
3 | - [GitHub](https://github.com/ververica/sql-training/tree/master)
4 |
5 | ### Sessions
6 |
7 | ```sql
8 | CREATE TABLE rides (
9 | rideId INT,
10 | taxiId INT,
11 | isStart BOOLEAN,
12 | lon FLOAT,
13 | lat FLOAT,
14 | psgCnt INT,
15 | eventTime STRING,
16 | rideTime AS TO_TIMESTAMP(eventTime, 'yyyy-MM-dd''T''HH:mm:ss''Z'''),
17 | WATERMARK FOR rideTime AS rideTime - INTERVAL '60' SECOND
18 | )
19 | WITH (
20 | 'connector' = 'kafka',
21 | 'topic' = 'Rides',
22 | 'properties.bootstrap.servers' = 'kafka-0:9092',
23 | 'properties.group.id' = 'rides',
24 | 'scan.startup.mode' = 'earliest-offset',
25 | 'format' = 'json'
26 | );
27 |
28 | CREATE TABLE fairs (
29 | rideId INT,
30 | payMethod STRING,
31 | tip FLOAT,
32 | toll FLOAT,
33 | fare FLOAT,
34 | eventTime STRING,
35 | payTime AS TO_TIMESTAMP(eventTime, 'yyyy-MM-dd''T''HH:mm:ss''Z'''),
36 | WATERMARK FOR payTime AS payTime - INTERVAL '60' SECOND
37 | )
38 | WITH (
39 | 'connector' = 'kafka',
40 | 'topic' = 'Fares',
41 | 'properties.bootstrap.servers' = 'kafka-0:9092',
42 | 'properties.group.id' = 'fares',
43 | 'scan.startup.mode' = 'earliest-offset',
44 | 'format' = 'json'
45 | );
46 |
47 | CREATE TABLE driver_changes (
48 | taxiId INT,
49 | driverId INT,
50 | eventTime STRING,
51 | usageStartTime AS TO_TIMESTAMP(eventTime, 'yyyy-MM-dd''T''HH:mm:ss''Z'''),
52 | WATERMARK FOR usageStartTime AS usageStartTime - INTERVAL '60' SECOND
53 | )
54 | WITH (
55 | 'connector' = 'kafka',
56 | 'topic' = 'DriverChanges',
57 | 'properties.bootstrap.servers' = 'kafka-0:9092',
58 | 'properties.group.id' = 'driver-changes',
59 | 'scan.startup.mode' = 'earliest-offset',
60 | 'format' = 'json'
61 | );
62 | ```
63 |
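For reference, a continuous query over these tables might look like the one below; it is an illustrative sketch rather than one of the training solutions.

```sql
-- Started rides per passenger count over hourly tumbling windows (illustrative)
SELECT
  psgCnt,
  TUMBLE_START(rideTime, INTERVAL '1' HOUR) AS windowStart,
  COUNT(*) AS rideCnt
FROM rides
WHERE isStart
GROUP BY psgCnt, TUMBLE(rideTime, INTERVAL '1' HOUR);
```
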
64 | #### Introduction to SQL on Flink
65 |
66 | #### Querying Dynamic Tables with SQL
67 |
68 | #### Queries and Time
69 |
70 | #### Joining Dynamic Tables
71 |
72 | #### Pattern Matching with MATCH_RECOGNIZE
73 |
74 | #### Creating Tables & Writing Query Results to External Systems
75 |
--------------------------------------------------------------------------------
/stream-processing-with-pyflink/src/chapter6/utils/model.py:
--------------------------------------------------------------------------------
1 | import datetime
2 | import dataclasses
3 | from typing import Iterable, Tuple
4 |
5 | from pyflink.common import Row
6 | from pyflink.common.typeinfo import Types
7 |
8 | from .type_helper import TypeMapping, set_type_info
9 |
10 |
11 | @dataclasses.dataclass
12 | class SensorReading(TypeMapping):
13 | id: str
14 | timestamp: int
15 | num_records: int
16 | temperature: float
17 |
18 | def to_row(self):
19 | return Row(**dataclasses.asdict(self))
20 |
21 | @classmethod
22 | def from_row(cls, row: Row):
23 | return cls(**row.as_dict())
24 |
25 | @classmethod
26 | def from_tuple(cls, tup: Tuple[int, int, datetime.datetime]):
27 | return cls(
28 | id=f"sensor_{tup[0]}",
29 | timestamp=int(tup[2].strftime("%s")) * 1000,
30 | num_records=1,
31 | temperature=65 + (tup[1] / 100 * 20),
32 | )
33 |
34 | @staticmethod
35 | def process_elements(elements: Iterable[Tuple[int, int, datetime.datetime]]):
36 | id, count, temperature = None, 0, 0
37 | for e in elements:
38 | next_id = f"sensor_{e[0]}"
39 | if id is not None:
40 | assert id == next_id
41 | id = next_id
42 | count += 1
43 | temperature += 65 + (e[1] / 100 * 20)
44 | return id, count, temperature
45 |
46 | @staticmethod
47 | def type_mapping():
48 | return {
49 | "id": Types.STRING(),
50 | "timestamp": Types.LONG(),
51 | "num_records": Types.INT(),
52 | "temperature": Types.DOUBLE(),
53 | }
54 |
55 | @staticmethod
56 | def set_key_type_info():
57 | return set_type_info(SensorReading.type_mapping(), selects=["id"])
58 |
59 | @staticmethod
60 | def set_value_type_info():
61 | return set_type_info(SensorReading.type_mapping())
62 |
63 |
64 | @dataclasses.dataclass
65 | class MinMaxTemp:
66 | id: str
67 | min_temp: float
68 | max_temp: float
69 | num_records: int
70 | timestamp: int
71 |
--------------------------------------------------------------------------------
/stream-processing-with-pyflink/src/chapter7/utils/model.py:
--------------------------------------------------------------------------------
1 | import datetime
2 | import dataclasses
3 | from typing import Iterable, Tuple
4 |
5 | from pyflink.common import Row
6 | from pyflink.common.typeinfo import Types
7 |
8 | from .type_helper import TypeMapping, set_type_info
9 |
10 |
11 | @dataclasses.dataclass
12 | class SensorReading(TypeMapping):
13 | id: str
14 | timestamp: int
15 | num_records: int
16 | temperature: float
17 |
18 | def to_row(self):
19 | return Row(**dataclasses.asdict(self))
20 |
21 | @classmethod
22 | def from_row(cls, row: Row):
23 | return cls(**row.as_dict())
24 |
25 | @classmethod
26 | def from_tuple(cls, tup: Tuple[int, int, datetime.datetime]):
27 | return cls(
28 | id=f"sensor_{tup[0]}",
29 | timestamp=int(tup[2].strftime("%s")) * 1000,
30 | num_records=1,
31 | temperature=65 + (tup[1] / 100 * 20),
32 | )
33 |
34 | @staticmethod
35 | def process_elements(elements: Iterable[Tuple[int, int, datetime.datetime]]):
36 | id, count, temperature = None, 0, 0
37 | for e in elements:
38 | next_id = f"sensor_{e[0]}"
39 | if id is not None:
40 | assert id == next_id
41 | id = next_id
42 | count += 1
43 | temperature += 65 + (e[1] / 100 * 20)
44 | return id, count, temperature
45 |
46 | @staticmethod
47 | def type_mapping():
48 | return {
49 | "id": Types.STRING(),
50 | "timestamp": Types.LONG(),
51 | "num_records": Types.INT(),
52 | "temperature": Types.DOUBLE(),
53 | }
54 |
55 | @staticmethod
56 | def set_key_type_info():
57 | return set_type_info(SensorReading.type_mapping(), selects=["id"])
58 |
59 | @staticmethod
60 | def set_value_type_info():
61 | return set_type_info(SensorReading.type_mapping())
62 |
63 |
64 | @dataclasses.dataclass
65 | class MinMaxTemp:
66 | id: str
67 | min_temp: float
68 | max_temp: float
69 | num_records: int
70 | timestamp: int
71 |
--------------------------------------------------------------------------------
/stream-processing-with-flink/src/main/kotlin/me/jaehyeon/chapter5/KeyedTransformations.kt:
--------------------------------------------------------------------------------
1 | package me.jaehyeon.chapter5
2 |
3 | import me.jaehyeon.sensor.SensorReading
4 | import me.jaehyeon.sensor.SensorSource
5 | import org.apache.flink.api.common.eventtime.WatermarkStrategy
6 | import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment
7 | import java.time.Duration
8 |
9 | /**
10 | * This Flink job demonstrates transformations on a `KeyedStream`.
11 | *
12 | * It showcases the `reduce` operator, a powerful tool for maintaining running aggregates
13 | * for each key in a stream.
14 | *
15 | * The pipeline is as follows:
16 | * 1. **Source**: Ingests a stream of `SensorReading` events.
17 | * 2. **KeyBy**: Partitions the stream by the `id` of each sensor. All subsequent
18 | * operations will run independently for each sensor.
19 | * 3. **Reduce**: For each key, this operator maintains a running state of the `SensorReading`
20 | * with the maximum temperature seen so far. For every new reading that arrives, it
21 | * compares it to the current maximum and emits the new maximum downstream.
22 | * 4. **Sink**: Prints the continuous stream of running maximums for each sensor to the console.
23 | */
24 | object KeyedTransformations {
25 | @JvmStatic
26 | fun main(args: Array<String>) {
27 | val env = StreamExecutionEnvironment.getExecutionEnvironment()
28 |
29 | val readings =
30 | env.fromSource(
31 | SensorSource(),
32 | WatermarkStrategy
33 | .forBoundedOutOfOrderness(Duration.ofSeconds(5))
34 | .withTimestampAssigner { reading, _ ->
35 | reading.timestamp
36 | },
37 | "Sensor Source",
38 | )
39 |
40 | val keyed = readings.keyBy { it.id }
41 | val maxTempPerSensor =
42 | keyed.reduce { r1, r2 ->
43 | if (r1.temperature > r2.temperature) r1 else r2
44 | }
45 |
46 | maxTempPerSensor.print()
47 |
48 | env.execute("Keyed Transformations Example")
49 | }
50 | }
51 |
--------------------------------------------------------------------------------
/pyflink-getting-started-on-aws/remote/infra/outputs.tf:
--------------------------------------------------------------------------------
1 | # VPC
2 | output "vpc_id" {
3 | description = "The ID of the VPC"
4 | value = module.vpc.vpc_id
5 | }
6 |
7 | output "vpc_cidr_block" {
8 | description = "The CIDR block of the VPC"
9 | value = module.vpc.vpc_cidr_block
10 | }
11 |
12 | output "private_subnets" {
13 | description = "List of IDs of private subnets"
14 | value = module.vpc.private_subnets
15 | }
16 |
17 | output "public_subnets" {
18 | description = "List of IDs of public subnets"
19 | value = module.vpc.public_subnets
20 | }
21 |
22 | output "nat_public_ips" {
23 | description = "List of public Elastic IPs created for AWS NAT Gateway"
24 | value = module.vpc.nat_public_ips
25 | }
26 |
27 | output "azs" {
28 | description = "A list of availability zones specified as argument to this module"
29 | value = module.vpc.azs
30 | }
31 |
32 | # Default bucket
33 | output "default_bucket_name" {
34 | description = "Default bucket name"
35 | value = aws_s3_bucket.default_bucket.id
36 | }
37 |
38 | # VPN
39 | output "vpn_launch_template_arn" {
40 | description = "The ARN of the VPN launch template"
41 | value = {
42 | for k, v in module.vpn : k => v.launch_template_arn
43 | }
44 | }
45 |
46 | output "vpn_autoscaling_group_id" {
47 | description = "VPN autoscaling group id"
48 | value = {
49 | for k, v in module.vpn : k => v.autoscaling_group_id
50 | }
51 | }
52 |
53 | output "vpn_autoscaling_group_name" {
54 | description = "VPN autoscaling group name"
55 | value = {
56 | for k, v in module.vpn : k => v.autoscaling_group_name
57 | }
58 | }
59 |
60 | # MSK
61 | output "msk_arn" {
62 | description = "Amazon Resource Name (ARN) of the MSK cluster"
63 | value = aws_msk_cluster.msk_data_cluster.arn
64 | }
65 |
66 | output "msk_bootstrap_brokers_sasl_iam" {
67 | description = "One or more DNS names (or IP addresses) and SASL IAM port pairs"
68 | value = aws_msk_cluster.msk_data_cluster.bootstrap_brokers_sasl_iam
69 | }
70 |
71 | # KDA
72 | output "kda_app_arn" {
73 | description = "Kinesis Application ARN"
74 | value = local.kda.to_create ? aws_kinesisanalyticsv2_application.kda_app[0].arn : "NA"
75 | }
76 |
--------------------------------------------------------------------------------
/pyflink-udemy/s4_05_producer.py:
--------------------------------------------------------------------------------
1 | import os
2 | import datetime
3 | import time
4 | import json
5 | import random
6 |
7 | from kafka import KafkaProducer
8 |
9 |
10 | class Sales:
11 | def __init__(self):
12 | self.products = [
13 | {"product": "Toothpaste", "product_price": 4.99},
14 | {"product": "Toothbrush", "product_price": 3.99},
15 | {"product": "Dental Floss", "product_price": 1.99},
16 | ]
17 | self.sellers = ["LNK", "OMA", "KC", "DEN"]
18 |
19 | def make_sales_item(self):
20 | return {
21 | **{
22 | "seller_id": random.choice(self.sellers),
23 | "quantity": random.randint(1, 5),
24 | "sale_ts": int(time.time() * 1000),
25 | },
26 | **random.choice(self.products),
27 | }
28 |
29 | def create(self, num: int):
30 | return [self.make_sales_item() for _ in range(num)]
31 |
32 |
33 | class Producer:
34 | def __init__(self, bootstrap_servers: list, topic: str):
35 | self.bootstrap_servers = bootstrap_servers
36 | self.topic = topic
37 | self.producer = self.create()
38 |
39 | def create(self):
40 | return KafkaProducer(
41 | bootstrap_servers=self.bootstrap_servers,
42 | value_serializer=lambda v: json.dumps(v, default=self.serialize).encode("utf-8"),
43 | )
44 |
45 | def send(self, sales_items: list):
46 | for item in sales_items:
47 | self.producer.send(self.topic, value=item)
48 | self.producer.flush()
49 |
50 | def serialize(self, obj):
51 | if isinstance(obj, datetime.datetime):
52 | return obj.isoformat()
53 | if isinstance(obj, datetime.date):
54 | return str(obj)
55 | return obj
56 |
57 |
58 | if __name__ == "__main__":
59 | producer = Producer(
60 | bootstrap_servers=os.getenv("BOOTSTRAP_SERVERS", "localhost:29092").split(","),
61 | topic=os.getenv("TOPIC_NAME", "sales_items"),
62 | )
63 |
64 | while True:
65 | sales_items = Sales().create(10)
66 | producer.send(sales_items)
67 | secs = random.randint(5, 10)
68 | print(f"messages sent... wait {secs} seconds")
69 | time.sleep(secs)
70 |
--------------------------------------------------------------------------------
/stream-processing-with-flink/src/main/kotlin/me/jaehyeon/chapter5/BasicTransformations.kt:
--------------------------------------------------------------------------------
1 | package me.jaehyeon.chapter5
2 |
3 | import me.jaehyeon.sensor.SensorReading
4 | import me.jaehyeon.sensor.SensorSource
5 | import org.apache.flink.api.common.eventtime.WatermarkStrategy
6 | import org.apache.flink.api.common.typeinfo.Types
7 | import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment
8 | import org.apache.flink.util.Collector
9 | import java.time.Duration
10 |
11 | /**
12 | * This Flink job demonstrates basic, non-keyed transformations on a DataStream.
13 | *
14 | * It showcases a simple pipeline:
15 | * 1. **Source**: Ingests a stream of `SensorReading` events from a custom source.
16 | * 2. **Filter**: Discards readings with a temperature below 25.
17 | * 3. **Map**: Transforms the remaining `SensorReading` objects into just their String IDs.
18 | * 4. **FlatMap**: Splits each String ID into its constituent parts (e.g., "sensor_1" -> "sensor", "1").
19 | * 5. **Sink**: Prints the final stream of ID parts to the console.
20 | *
21 | * This example highlights stateless, one-to-one (map, filter) and one-to-many (flatMap) transformations.
22 | */
23 | object BasicTransformations {
24 | @JvmStatic
25 | fun main(args: Array<String>) {
26 | val env = StreamExecutionEnvironment.getExecutionEnvironment()
27 |
28 | val readings =
29 | env.fromSource(
30 | SensorSource(),
31 | WatermarkStrategy
32 | .forBoundedOutOfOrderness(Duration.ofSeconds(5))
33 | .withTimestampAssigner { reading, _ ->
34 | reading.timestamp
35 | },
36 | "Sensor Source",
37 | )
38 |
39 | val filteredSensors = readings.filter { r -> r.temperature >= 25 }
40 | val sensorIds = filteredSensors.map { r -> r.id }
41 | val splitIds =
42 | sensorIds
43 | .flatMap { id, out: Collector<String> ->
44 | id.split("_").forEach { part ->
45 | out.collect(part)
46 | }
47 | }.returns(Types.STRING)
48 |
49 | splitIds.print()
50 |
51 | env.execute("Basic Transformations Example")
52 | }
53 | }
54 |
--------------------------------------------------------------------------------
/pyflink-udemy/s4_02_filtering.py:
--------------------------------------------------------------------------------
1 | from pyflink.table import EnvironmentSettings, TableEnvironment, DataTypes, CsvTableSource
2 | from pyflink.table.expressions import col, lit, and_
3 |
4 | tbl_env = TableEnvironment.create(EnvironmentSettings.in_batch_mode())
5 |
6 | field_names = "seller_id,product,quantity,product_price,sales_date".split(",")
7 | field_types = [
8 | DataTypes.STRING(),
9 | DataTypes.STRING(),
10 | DataTypes.INT(),
11 | DataTypes.DOUBLE(),
12 | DataTypes.DATE(),
13 | ]
14 | source = CsvTableSource("./csv-input", field_names, field_types, ignore_first_line=True)
15 |
16 | tbl_env.register_table_source("product_locale_sales", source)
17 | tbl = tbl_env.from_path("product_locale_sales")
18 |
19 | high_sales = (
20 | tbl.select(
21 | col("sales_date"),
22 | col("seller_id"),
23 | col("product"),
24 | (col("product_price") * col("quantity")).alias("sales"),
25 | )
26 | .distinct()
27 | .where(col("sales") >= 80)
28 | )
29 | print("\nhigh_sales data")
30 | high_sales.execute().print()
31 |
32 | high_sales2 = tbl_env.sql_query(
33 | """
34 | WITH distinct_sales AS (
35 | SELECT DISTINCT
36 | sales_date, seller_id, product, product_price * quantity AS sales
37 | FROM product_locale_sales
38 | )
39 | SELECT *
40 | FROM distinct_sales
41 | WHERE sales >= 80
42 | """
43 | )
44 | print("\nhigh_sales2 data")
45 | high_sales2.execute().print()
46 |
47 | july1_high_sales = (
48 | tbl.select(
49 | col("sales_date"),
50 | col("seller_id"),
51 | col("product"),
52 | (col("product_price") * col("quantity")).alias("sales"),
53 | )
54 | .distinct()
55 | .where(and_(col("sales") >= 80, col("sales_date") == lit("2021-07-01").to_date))
56 | )
57 | print("\njuly1_high_sales data")
58 | july1_high_sales.execute().print()
59 |
60 | july1_high_sales2 = tbl_env.sql_query(
61 | """
62 | WITH distinct_sales AS (
63 | SELECT DISTINCT
64 | sales_date, seller_id, product, product_price * quantity AS sales
65 | FROM product_locale_sales
66 | )
67 | SELECT *
68 | FROM distinct_sales
69 | WHERE sales >= 80 and sales_date = '2021-07-01'
70 | """
71 | )
72 | print("\njuly1_high_sales2 data")
73 | july1_high_sales2.execute().print()
74 |
--------------------------------------------------------------------------------
/fraud-detection/remote/infra/variables.tf:
--------------------------------------------------------------------------------
1 | variable "vpn_to_create" {
2 | description = "Flag to indicate whether to create VPN"
3 | type = bool
4 | default = true
5 | }
6 |
7 | variable "vpn_to_use_spot" {
8 | description = "Flag to indicate whether to use a spot instance for VPN"
9 | type = bool
10 | default = false
11 | }
12 |
13 | variable "vpn_to_limit_vpn_ingress" {
14 | description = "Flag to indicate whether to limit ingress from the current machine's IP address"
15 | type = bool
16 | default = true
17 | }
18 |
19 | locals {
20 | name = "fraud-detection"
21 | region = data.aws_region.current.name
22 | environment = "dev"
23 |
24 | vpc = {
25 | cidr = "10.0.0.0/16"
26 | azs = slice(data.aws_availability_zones.available.names, 0, 3)
27 | }
28 |
29 | default_bucket = {
30 | name = "${local.name}-${data.aws_caller_identity.current.account_id}-${local.region}"
31 | to_set_acl = false
32 | }
33 |
34 | vpn = {
35 | to_create = var.vpn_to_create
36 | to_use_spot = var.vpn_to_use_spot
37 | ingress_cidr = var.vpn_to_limit_vpn_ingress ? "${data.http.local_ip_address.response_body}/32" : "0.0.0.0/0"
38 | spot_override = [
39 | { instance_type : "t3.small" },
40 | { instance_type : "t3a.small" },
41 | ]
42 | }
43 |
44 | msk = {
45 | version = "2.8.1"
46 | instance_size = "kafka.m5.large"
47 | ebs_volume_size = 20
48 | log_retention_ms = 604800000 # 7 days
49 | number_of_broker_nodes = 2
50 | num_partitions = 2
51 | default_replication_factor = 2
52 | }
53 |
54 | msk_connect = {
55 | package_name = "camel-aws-ddb-sink-kafka-connector.zip"
56 | }
57 |
58 | kda = {
59 | runtime_env = "FLINK-1_15"
60 | package_name = "kda-package.zip"
61 | consumer_0 = {
62 | table_name = "flagged_accounts"
63 | topic_name = "flagged-accounts"
64 | }
65 | consumer_1 = {
66 | table_name = "transactions"
67 | topic_name = "transactions"
68 | }
69 | producer_0 = {
70 | table_name = "flagged_transactions"
71 | topic_name = "flagged-transactions"
72 | }
73 | }
74 |
75 | tags = {
76 | Name = local.name
77 | Environment = local.environment
78 | }
79 | }
80 |
--------------------------------------------------------------------------------
/sql-cookbook/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM flink:1.17.1
2 |
3 | ARG PYTHON_VERSION
4 | ENV PYTHON_VERSION=${PYTHON_VERSION:-3.8.10}
5 | ARG FLINK_VERSION
6 | ENV FLINK_VERSION=${FLINK_VERSION:-1.17.1}
7 |
8 | RUN wget -P /opt/flink/lib/ https://repo.maven.apache.org/maven2/org/apache/flink/flink-connector-kafka/$FLINK_VERSION/flink-connector-kafka-$FLINK_VERSION.jar; \
9 | wget -P /opt/flink/lib/ https://repo.maven.apache.org/maven2/org/apache/kafka/kafka-clients/3.2.3/kafka-clients-3.2.3.jar; \
10 | wget -P /opt/flink/lib/ https://repo.maven.apache.org/maven2/org/apache/flink/flink-sql-connector-kafka/$FLINK_VERSION/flink-sql-connector-kafka-$FLINK_VERSION.jar; \
11 | wget -P /opt/flink/lib/ https://github.com/knaufk/flink-faker/releases/download/v0.5.3/flink-faker-0.5.3.jar;
12 |
13 | ## A Python version (3.7, 3.8, 3.9 or 3.10) is required; it is built from source below rather than installed from the apt repo.
14 |
15 | # Python 3.3 and later versions provide the lzma module.
16 | # However, if Python is installed using the source code and the lzma-dev package is not installed in the system,
17 | # the lzma module will not be installed.
18 | # https://support.huawei.com/enterprise/en/doc/EDOC1100289998/db0db8f0/modulenotfounderror-no-module-named-_lzma-
19 | # INFO:root:Starting up Python harness in a standalone process.
20 | # Traceback (most recent call last):
21 | # File "/usr/local/lib/python3.8/site-packages/fastavro/read.py", line 2, in
22 | # from . import _read
23 | # File "fastavro/_read.pyx", line 11, in init fastavro._read
24 | # File "/usr/local/lib/python3.8/lzma.py", line 27, in
25 | # from _lzma import *
26 | # ModuleNotFoundError: No module named '_lzma'
27 |
28 | RUN apt-get update -y && \
29 | apt-get install -y build-essential libssl-dev zlib1g-dev libbz2-dev libffi-dev liblzma-dev && \
30 | wget https://www.python.org/ftp/python/${PYTHON_VERSION}/Python-${PYTHON_VERSION}.tgz && \
31 | tar -xvf Python-${PYTHON_VERSION}.tgz && \
32 | cd Python-${PYTHON_VERSION} && \
33 | ./configure --without-tests --enable-shared && \
34 | make -j6 && \
35 | make install && \
36 | ldconfig /usr/local/lib && \
37 | cd .. && rm -f Python-${PYTHON_VERSION}.tgz && rm -rf Python-${PYTHON_VERSION} && \
38 | ln -s /usr/local/bin/python3 /usr/local/bin/python && \
39 | apt-get clean && \
40 | rm -rf /var/lib/apt/lists/*
41 |
42 | # install PyFlink
43 | RUN pip3 install apache-flink==${FLINK_VERSION}
--------------------------------------------------------------------------------
/stream-processing-with-flink/src/main/kotlin/me/jaehyeon/connector/HttpSplitEnumerator.kt:
--------------------------------------------------------------------------------
1 | package me.jaehyeon.connector
2 |
3 | import org.apache.flink.api.connector.source.SplitEnumerator
4 | import org.apache.flink.api.connector.source.SplitEnumeratorContext
5 |
6 | /**
7 | * The coordinator for the source. It runs on the JobManager.
8 | * Its job is to generate splits on demand from a configurable numeric range
9 | * and assign them to readers. It checkpoints its position in the cycle (`lastSeenId`).
10 | */
11 | class HttpSplitEnumerator(
12 |     private val context: SplitEnumeratorContext<HttpSplit>,
13 | private val baseUrlPattern: String,
14 | private val startId: Long,
15 | private val maxId: Long,
16 | restoredLastSeenId: Long?,
17 | ) : SplitEnumerator<HttpSplit, Long> {
18 | private var lastSeenId: Long = restoredLastSeenId ?: (startId - 1)
19 |
20 | override fun start() {}
21 |
22 | override fun handleSplitRequest(
23 | subtaskId: Int,
24 | requesterHostname: String?,
25 | ) {
26 | // 1. Calculate the next ID in the cycle.
27 | var nextId = lastSeenId + 1
28 | // 2. Apply the configurable wrap-around rule.
29 | if (nextId > maxId) {
30 | nextId = startId
31 | }
32 |
33 | // 3. Generate the URL and create the split.
34 | val url = baseUrlPattern.replace("{id}", nextId.toString())
35 | val split = HttpSplit(url)
36 |
37 | // 4. Assign the split to the requesting reader.
38 | context.assignSplit(split, subtaskId)
39 |
40 | // 5. CRITICAL: Update the state for the next request.
41 | this.lastSeenId = nextId
42 | }
43 |
44 | override fun addSplitsBack(
45 |         splits: MutableList<HttpSplit>,
46 | subtaskId: Int,
47 | ) {
48 | // This source is cyclical and state-based, not queue-based. If a reader
49 | // fails, we don't need to re-add its specific splits. The cyclical logic
50 | // will naturally re-assign a split for that ID when its turn comes again.
51 | }
52 |
53 | override fun addReader(subtaskId: Int) {}
54 |
55 | // --- Checkpointing ---
56 | override fun snapshotState(checkpointId: Long): Long {
57 | // On checkpoint, save the last ID that was successfully assigned.
58 | return lastSeenId
59 | }
60 |
61 | override fun close() {}
62 | }
63 |
--------------------------------------------------------------------------------
/pyflink-udemy/s4_03_joining.py:
--------------------------------------------------------------------------------
1 | from pyflink.table import EnvironmentSettings, TableEnvironment, DataTypes, CsvTableSource
2 | from pyflink.table.expressions import col
3 |
4 | tbl_env = TableEnvironment.create(EnvironmentSettings.in_batch_mode())
5 |
6 | # sales source
7 | sales_field_names = "seller_id,product,quantity,product_price,sales_date".split(",")
8 | sales_field_types = [
9 | DataTypes.STRING(),
10 | DataTypes.STRING(),
11 | DataTypes.INT(),
12 | DataTypes.DOUBLE(),
13 | DataTypes.DATE(),
14 | ]
15 | sales_source = CsvTableSource(
16 | "./csv-input", sales_field_names, sales_field_types, ignore_first_line=True
17 | )
18 | tbl_env.register_table_source("product_locale_sales", sales_source)
19 |
20 | # sellers source
21 | sellers_field_names = "id,city,state".split(",")
22 | sellers_field_types = [DataTypes.STRING(), DataTypes.STRING(), DataTypes.STRING()]
23 | sellers_source = CsvTableSource(
24 | "./seller-input", sellers_field_names, sellers_field_types, ignore_first_line=True
25 | )
26 | tbl_env.register_table_source("seller_locales", sellers_source)
27 |
28 | sales_tbl = tbl_env.from_path("product_locale_sales")
29 | sellers_tbl = tbl_env.from_path("seller_locales")
30 |
31 | seller_products = (
32 | sales_tbl.join(sellers_tbl, col("seller_id") == col("id"))
33 | .select(col("city"), col("state"), col("product"), col("product_price"))
34 | .distinct()
35 | )
36 | print("\nseller_products data")
37 | seller_products.execute().print()
38 |
39 | seller_products2 = tbl_env.sql_query(
40 | """
41 | SELECT DISTINCT city, state, product, product_price
42 | FROM product_locale_sales l
43 | JOIN seller_locales r ON l.seller_id = r.id
44 | """
45 | )
46 | print("\nseller_products2 data")
47 | seller_products2.execute().print()
48 |
49 | sellers_no_sales = (
50 | sales_tbl.right_outer_join(sellers_tbl, col("seller_id") == col("id"))
51 | .where(col("product").is_null)
52 | .select(col("city"), col("state"), col("product"))
53 | .distinct()
54 | )
55 | print("\nsellers_no_sales data")
56 | sellers_no_sales.execute().print()
57 |
58 | sellers_no_sales2 = tbl_env.sql_query(
59 | """
60 | SELECT city, state, product
61 | FROM product_locale_sales l
62 | RIGHT JOIN seller_locales r ON l.seller_id = r.id
63 | WHERE product IS NULL
64 | """
65 | )
66 | print("\nsellers_no_sales2 data")
67 | sellers_no_sales2.execute().print()
68 |
--------------------------------------------------------------------------------
/stream-processing-with-flink/src/main/kotlin/me/jaehyeon/smoke/SmokeLevelSourceReader.kt:
--------------------------------------------------------------------------------
1 | package me.jaehyeon.smoke
2 |
3 | import org.apache.flink.api.connector.source.ReaderOutput
4 | import org.apache.flink.api.connector.source.SourceReader
5 | import org.apache.flink.api.connector.source.SourceReaderContext
6 | import org.apache.flink.core.io.InputStatus
7 | import org.apache.flink.util.concurrent.FutureUtils
8 | import java.util.Random
9 | import java.util.concurrent.CompletableFuture
10 | import java.util.concurrent.LinkedBlockingQueue
11 |
12 | /**
13 | * The SourceReader for the SmokeLevelSource. It runs on TaskManagers and
14 | * generates the stream of SmokeLevel events in a background thread.
15 | */
16 | class SmokeLevelSourceReader(
17 | private val readerContext: SourceReaderContext,
18 | ) : SourceReader {
19 |     private val buffer = LinkedBlockingQueue<SmokeLevel>(1)
20 |
21 | @Volatile
22 | private var running = false
23 | private var generatorThread: Thread? = null
24 |
25 | override fun start() {
26 | running = true
27 | }
28 |
29 | override fun addSplits(splits: List) {
30 | generatorThread =
31 | Thread {
32 | val rand = Random()
33 | try {
34 | while (running) {
35 | val smokeLevel = if (rand.nextGaussian() > 0.8) SmokeLevel.High else SmokeLevel.Low
36 | buffer.put(smokeLevel)
37 | Thread.sleep(1000)
38 | }
39 | } catch (e: InterruptedException) {
40 | // Thread interrupted, exit
41 | }
42 | }
43 | generatorThread?.start()
44 | }
45 |
46 |     override fun pollNext(output: ReaderOutput<SmokeLevel>): InputStatus? {
47 | val level = buffer.poll()
48 | return if (level != null) {
49 | output.collect(level)
50 | InputStatus.MORE_AVAILABLE
51 | } else {
52 | InputStatus.NOTHING_AVAILABLE
53 | }
54 | }
55 |
56 |     override fun isAvailable(): CompletableFuture<Void> = FutureUtils.completedVoidFuture()
57 |
58 | override fun snapshotState(checkpointId: Long): List = mutableListOf()
59 |
60 | override fun notifyNoMoreSplits() {}
61 |
62 | override fun close() {
63 |         running = false
64 | generatorThread?.interrupt()
65 | generatorThread?.join()
66 | }
67 | }
68 |
--------------------------------------------------------------------------------
/fraud-detection/local/docker-compose.yml:
--------------------------------------------------------------------------------
1 | version: "3.5"
2 |
3 | services:
4 | zookeeper:
5 | image: bitnami/zookeeper:3.5
6 | container_name: zookeeper
7 | ports:
8 | - "2181"
9 | networks:
10 | - kafkanet
11 | environment:
12 | - ALLOW_ANONYMOUS_LOGIN=yes
13 | volumes:
14 | - zookeeper_data:/bitnami/zookeeper
15 | kafka-0:
16 | image: bitnami/kafka:2.8.1
17 | container_name: kafka-0
18 | expose:
19 | - 9092
20 | ports:
21 | - "29092:29092"
22 | networks:
23 | - kafkanet
24 | environment:
25 | - ALLOW_PLAINTEXT_LISTENER=yes
26 | - KAFKA_CFG_ZOOKEEPER_CONNECT=zookeeper:2181
27 | - KAFKA_CFG_BROKER_ID=0
28 | - KAFKA_CFG_LISTENER_SECURITY_PROTOCOL_MAP=INTERNAL:PLAINTEXT,EXTERNAL:PLAINTEXT
29 | - KAFKA_CFG_LISTENERS=INTERNAL://:9092,EXTERNAL://:29092
30 | - KAFKA_CFG_ADVERTISED_LISTENERS=INTERNAL://kafka-0:9092,EXTERNAL://localhost:29092
31 | - KAFKA_CFG_INTER_BROKER_LISTENER_NAME=INTERNAL
32 | - KAFKA_CFG_NUM_PARTITIONS=2
33 | volumes:
34 | - kafka_0_data:/bitnami/kafka
35 | depends_on:
36 | - zookeeper
37 | kafka-connect:
38 | image: bitnami/kafka:2.8.1
39 | container_name: connect
40 | command: >
41 | /opt/bitnami/kafka/bin/connect-distributed.sh
42 | /opt/bitnami/kafka/config/connect-distributed.properties
43 | ports:
44 | - "8083:8083"
45 | networks:
46 | - kafkanet
47 | environment:
48 | AWS_ACCESS_KEY_ID: $AWS_ACCESS_KEY_ID
49 | AWS_SECRET_ACCESS_KEY: $AWS_SECRET_ACCESS_KEY
50 | AWS_SESSION_TOKEN: $AWS_SESSION_TOKEN
51 | volumes:
52 | - "./configs/connect-distributed.properties:/opt/bitnami/kafka/config/connect-distributed.properties"
53 | - "./connectors/camel-aws-ddb-sink-kafka-connector:/opt/connectors/camel-aws-ddb-sink-kafka-connector"
54 | depends_on:
55 | - zookeeper
56 | - kafka-0
57 | kpow:
58 | image: factorhouse/kpow-ce:91.2.1
59 | container_name: kpow
60 | ports:
61 | - "3000:3000"
62 | networks:
63 | - kafkanet
64 | environment:
65 | BOOTSTRAP: kafka-0:9092
66 | CONNECT_REST_URL: http://kafka-connect:8083
67 | depends_on:
68 | - zookeeper
69 | - kafka-0
70 | - kafka-connect
71 |
72 | networks:
73 | kafkanet:
74 | name: kafka-network
75 |
76 | volumes:
77 | zookeeper_data:
78 | driver: local
79 | name: zookeeper_data
80 | kafka_0_data:
81 | driver: local
82 | name: kafka_0_data
83 |
--------------------------------------------------------------------------------
/sql-training/client-image/java/sql-training-data-producer/src/main/java/com/ververica/sql_training/data_producer/FileReader.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2019 Ververica GmbH
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package com.ververica.sql_training.data_producer;
18 |
19 | import com.ververica.sql_training.data_producer.json_serde.JsonDeserializer;
20 | import com.ververica.sql_training.data_producer.records.TaxiRecord;
21 |
22 | import java.io.*;
23 | import java.nio.charset.StandardCharsets;
24 | import java.util.Iterator;
25 | import java.util.NoSuchElementException;
26 | import java.util.function.Supplier;
27 | import java.util.stream.Stream;
28 | import java.util.zip.GZIPInputStream;
29 |
30 | /**
31 | * Reads JSON-encoded TaxiRecords from a gzipped text file.
32 | */
33 | public class FileReader implements Supplier<TaxiRecord> {
34 |
35 |     private final Iterator<TaxiRecord> records;
36 | private final String filePath;
37 |
38 |     public FileReader(String filePath, Class<? extends TaxiRecord> recordClazz) throws IOException {
39 |
40 | this.filePath = filePath;
41 |         JsonDeserializer<? extends TaxiRecord> deserializer = new JsonDeserializer<>(recordClazz);
42 | try {
43 |
44 | BufferedReader reader = new BufferedReader(
45 | new InputStreamReader(new GZIPInputStream(new FileInputStream(filePath)), StandardCharsets.UTF_8));
46 |
47 |             Stream<String> lines = reader.lines().sequential();
48 | records = lines.map(l -> (TaxiRecord) deserializer.parseFromString(l)).iterator();
49 |
50 | } catch (IOException e) {
51 | throw new IOException("Error reading TaxiRecords from file: " + filePath, e);
52 | }
53 | }
54 |
55 | @Override
56 | public TaxiRecord get() {
57 |
58 | if (records.hasNext()) {
59 | return records.next();
60 | } else {
61 | throw new NoSuchElementException("All records read from " + filePath);
62 | }
63 | }
64 | }
65 |
--------------------------------------------------------------------------------
/stream-processing-with-flink/src/main/kotlin/me/jaehyeon/chapter6/MarkerBasedWatermarkGeneration.kt:
--------------------------------------------------------------------------------
1 | package me.jaehyeon.chapter6
2 |
3 | import me.jaehyeon.sensor.SensorReading
4 | import me.jaehyeon.sensor.SensorSource
5 | import org.apache.flink.api.common.eventtime.Watermark
6 | import org.apache.flink.api.common.eventtime.WatermarkGenerator
7 | import org.apache.flink.api.common.eventtime.WatermarkOutput
8 | import org.apache.flink.api.common.eventtime.WatermarkStrategy
9 | import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment
10 |
11 | /**
12 | * This Flink job demonstrates a custom, event-driven watermark generation strategy.
13 | *
14 | * This pattern, formerly known as "punctuated", emits a watermark on-the-fly
15 | * whenever it sees a specific "marker" event in the stream, rather than on a
16 | * periodic interval.
17 | */
18 | object MarkerBasedWatermarkGeneration {
19 | @JvmStatic
20 |     fun main(args: Array<String>) {
21 | val env = StreamExecutionEnvironment.getExecutionEnvironment()
22 |
23 | val readings =
24 | env.fromSource(
25 | SensorSource(),
26 | WatermarkStrategy
27 | .forGenerator { ctx -> MarkerBasedWatermarkGenerator() }
28 | .withTimestampAssigner { reading, _ -> reading.timestamp },
29 | "Sensor Source",
30 | )
31 |
32 | readings.print()
33 | env.execute("Marker-Based Watermark Generation")
34 | }
35 | }
36 |
37 | /**
38 | * A custom WatermarkGenerator that emits a new watermark every time it sees a
39 | * specific marker event (in this case, a reading from "sensor_1").
40 | */
41 | class MarkerBasedWatermarkGenerator : WatermarkGenerator<SensorReading> {
42 | /**
43 | * This method is called for every event. We inspect the event and decide whether to emit a watermark.
44 | */
45 | override fun onEvent(
46 | event: SensorReading,
47 | eventTimestamp: Long,
48 | output: WatermarkOutput,
49 | ) {
50 | // Emit a new watermark if the event is from our marker, "sensor_1".
51 | if (event.id == "sensor_1") {
52 | output.emitWatermark(Watermark(eventTimestamp))
53 | }
54 | }
55 |
56 | /**
57 | * This method is called periodically. Since our logic is purely event-driven,
58 | * we don't need to do anything here.
59 | */
60 | override fun onPeriodicEmit(output: WatermarkOutput) {
61 | // This is not a periodic generator, so we do nothing here.
62 | }
63 | }
64 |
--------------------------------------------------------------------------------
/fraud-detection/remote/infra/outputs.tf:
--------------------------------------------------------------------------------
1 | # VPC
2 | output "vpc_id" {
3 | description = "The ID of the VPC"
4 | value = module.vpc.vpc_id
5 | }
6 |
7 | output "vpc_cidr_block" {
8 | description = "The CIDR block of the VPC"
9 | value = module.vpc.vpc_cidr_block
10 | }
11 |
12 | output "private_subnets" {
13 | description = "List of IDs of private subnets"
14 | value = module.vpc.private_subnets
15 | }
16 |
17 | output "public_subnets" {
18 | description = "List of IDs of public subnets"
19 | value = module.vpc.public_subnets
20 | }
21 |
22 | output "nat_public_ips" {
23 | description = "List of public Elastic IPs created for AWS NAT Gateway"
24 | value = module.vpc.nat_public_ips
25 | }
26 |
27 | output "azs" {
28 | description = "A list of availability zones specified as argument to this module"
29 | value = module.vpc.azs
30 | }
31 |
32 | # Default bucket
33 | output "default_bucket_name" {
34 | description = "Default bucket name"
35 | value = aws_s3_bucket.default_bucket.id
36 | }
37 |
38 | # VPN
39 | output "vpn_launch_template_arn" {
40 | description = "The ARN of the VPN launch template"
41 | value = {
42 | for k, v in module.vpn : k => v.launch_template_arn
43 | }
44 | }
45 |
46 | output "vpn_autoscaling_group_id" {
47 | description = "VPN autoscaling group id"
48 | value = {
49 | for k, v in module.vpn : k => v.autoscaling_group_id
50 | }
51 | }
52 |
53 | output "vpn_autoscaling_group_name" {
54 | description = "VPN autoscaling group name"
55 | value = {
56 | for k, v in module.vpn : k => v.autoscaling_group_name
57 | }
58 | }
59 |
60 | # MSK
61 | output "msk_arn" {
62 | description = "Amazon Resource Name (ARN) of the MSK cluster"
63 | value = aws_msk_cluster.msk_data_cluster.arn
64 | }
65 |
66 | output "msk_bootstrap_brokers_sasl_iam" {
67 | description = "One or more DNS names (or IP addresses) and SASL IAM port pairs"
68 | value = aws_msk_cluster.msk_data_cluster.bootstrap_brokers_sasl_iam
69 | }
70 |
71 | # MSK Connect
72 | output "ddb_sink_arn" {
73 |   description = "Amazon Resource Name (ARN) of the Camel DynamoDB Sink connector"
74 | value = aws_mskconnect_connector.camel_ddb_sink.arn
75 | }
76 |
77 | output "ddb_sink_version" {
78 |   description = "Current version of the Camel DynamoDB Sink connector"
79 | value = aws_mskconnect_connector.camel_ddb_sink.version
80 | }
81 |
82 | # KDA
83 | output "kda_app_arn" {
84 | description = "Kinesis Application ARN"
85 | value = aws_kinesisanalyticsv2_application.kda_app.arn
86 | }
87 |
--------------------------------------------------------------------------------
/stream-processing-with-pyflink/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM flink:1.17.1
2 |
3 | ARG PYTHON_VERSION
4 | ENV PYTHON_VERSION=${PYTHON_VERSION:-3.8.10}
5 | ARG FLINK_VERSION
6 | ENV FLINK_VERSION=${FLINK_VERSION:-1.17.1}
7 |
8 | RUN wget -P /opt/flink/lib/ https://repo.maven.apache.org/maven2/org/apache/flink/flink-connector-kafka/$FLINK_VERSION/flink-connector-kafka-$FLINK_VERSION.jar; \
9 | wget -P /opt/flink/lib/ https://repo.maven.apache.org/maven2/org/apache/kafka/kafka-clients/3.2.3/kafka-clients-3.2.3.jar; \
10 | wget -P /opt/flink/lib/ https://repo.maven.apache.org/maven2/org/apache/flink/flink-sql-connector-kafka/$FLINK_VERSION/flink-sql-connector-kafka-$FLINK_VERSION.jar; \
11 | wget -P /opt/flink/lib/ https://repo.maven.apache.org/maven2/org/apache/flink/flink-connector-jdbc/3.1.0-1.17/flink-connector-jdbc-3.1.0-1.17.jar; \
12 | wget -P /opt/flink/lib/ https://jdbc.postgresql.org/download/postgresql-42.6.0.jar; \
13 | wget -P /opt/flink/lib/ https://github.com/knaufk/flink-faker/releases/download/v0.5.3/flink-faker-0.5.3.jar;
14 |
15 | ## A Python version (3.7, 3.8, 3.9 or 3.10) is required; it is built from source below rather than installed from the apt repo.
16 | # Python 3.3 and later versions provide the lzma module.
17 | # However, if Python is installed using the source code and the lzma-dev package is not installed in the system,
18 | # the lzma module will not be installed.
19 | # https://support.huawei.com/enterprise/en/doc/EDOC1100289998/db0db8f0/modulenotfounderror-no-module-named-_lzma-
20 | # INFO:root:Starting up Python harness in a standalone process.
21 | # Traceback (most recent call last):
22 | # File "/usr/local/lib/python3.8/site-packages/fastavro/read.py", line 2, in
23 | # from . import _read
24 | # File "fastavro/_read.pyx", line 11, in init fastavro._read
25 | # File "/usr/local/lib/python3.8/lzma.py", line 27, in
26 | # from _lzma import *
27 | # ModuleNotFoundError: No module named '_lzma'
28 |
29 | RUN apt-get update -y && \
30 | apt-get install -y build-essential libssl-dev zlib1g-dev libbz2-dev libffi-dev liblzma-dev && \
31 | wget https://www.python.org/ftp/python/${PYTHON_VERSION}/Python-${PYTHON_VERSION}.tgz && \
32 | tar -xvf Python-${PYTHON_VERSION}.tgz && \
33 | cd Python-${PYTHON_VERSION} && \
34 | ./configure --without-tests --enable-shared && \
35 | make -j6 && \
36 | make install && \
37 | ldconfig /usr/local/lib && \
38 | cd .. && rm -f Python-${PYTHON_VERSION}.tgz && rm -rf Python-${PYTHON_VERSION} && \
39 | ln -s /usr/local/bin/python3 /usr/local/bin/python && \
40 | apt-get clean && \
41 | rm -rf /var/lib/apt/lists/*
42 |
43 | # install PyFlink
44 | RUN pip3 install apache-flink==${FLINK_VERSION}
--------------------------------------------------------------------------------
/stream-processing-with-pyflink/src/manage_topics.py:
--------------------------------------------------------------------------------
1 | import os
2 | import argparse
3 | import typing
4 | import logging
5 |
6 | from kafka import KafkaAdminClient
7 | from kafka.admin import NewTopic
8 | from kafka.errors import KafkaError, UnknownTopicOrPartitionError
9 |
10 |
11 | class KafkaClient:
12 | def __init__(self, bootstrap_servers: str) -> None:
13 | self.bootstrap_servers = bootstrap_servers
14 | self.admin_client = self.create_admin()
15 |
16 | def create_admin(self):
17 | return KafkaAdminClient(bootstrap_servers=self.bootstrap_servers)
18 |
19 | def delete_topics(self, topic_names: typing.List[str]):
20 | for name in topic_names:
21 | try:
22 | self.admin_client.delete_topics([name])
23 | except UnknownTopicOrPartitionError:
24 | pass
25 | except Exception as err:
26 | raise RuntimeError(f"fails to delete topic - {name}") from err
27 |
28 | def create_topics(self, topics: typing.List[NewTopic], to_recreate: bool = True):
29 | if to_recreate:
30 | self.delete_topics([t.name for t in topics])
31 | for topic in topics:
32 | try:
33 | resp = self.admin_client.create_topics([topic])
34 | name, error_code, error_message = resp.topic_errors[0]
35 | logging.info(
36 | f"topic created, name - {name}, error code - {error_code}, error message - {error_message}"
37 | )
38 | except KafkaError as err:
39 | raise RuntimeError(
40 | f"fails to create topics - {', '.join(t.name for t in topics)}"
41 | ) from err
42 | logging.info(f"topics created successfully - {', '.join([t.name for t in topics])}")
43 |
44 |
45 | if __name__ == "__main__":
46 | logging.basicConfig(
47 | level=logging.INFO,
48 | format="%(asctime)s.%(msecs)03d:%(levelname)s:%(name)s:%(message)s",
49 | datefmt="%Y-%m-%d %H:%M:%S",
50 | )
51 |
52 | parser = argparse.ArgumentParser()
53 | parser.add_argument("--delete", action="store_true")
54 | parser.set_defaults(delete=False)
55 | parser.add_argument("--create", action="store_true")
56 | parser.set_defaults(create=False)
57 | args = parser.parse_args()
58 |
59 | client = KafkaClient(os.getenv("BOOTSTRAP_SERVERS", "localhost:29092"))
60 |
61 | topics = [NewTopic(name="sensor-reading", num_partitions=3, replication_factor=1)]
62 |
63 | if args.delete:
64 |         client.delete_topics([t.name for t in topics])
65 | if args.create:
66 | client.create_topics(topics, to_recreate=True)
67 |
--------------------------------------------------------------------------------
/stream-processing-with-flink/build.gradle.kts:
--------------------------------------------------------------------------------
1 | import com.github.jengelman.gradle.plugins.shadow.tasks.ShadowJar
2 | import org.gradle.api.tasks.JavaExec
3 | import org.gradle.api.tasks.testing.Test
4 |
5 | plugins {
6 | kotlin("jvm") version "2.2.20"
7 | application
8 | id("com.github.johnrengelman.shadow") version "8.1.1"
9 | kotlin("plugin.serialization") version "2.2.20"
10 | }
11 |
12 | group = "me.jaehyeon"
13 | version = "1.0-SNAPSHOT"
14 |
15 | val localRunClasspath by configurations.creating {
16 | extendsFrom(configurations.implementation.get(), configurations.compileOnly.get(), configurations.runtimeOnly.get())
17 | }
18 |
19 | repositories {
20 | mavenCentral()
21 | }
22 |
23 | val flinkVersion = "1.20.1"
24 | val ktorVersion = "3.3.3"
25 |
26 | dependencies {
27 | // Flink Dependencies
28 | compileOnly("org.apache.flink:flink-streaming-java:$flinkVersion")
29 | compileOnly("org.apache.flink:flink-clients:$flinkVersion")
30 | compileOnly("org.apache.flink:flink-connector-base:$flinkVersion")
31 | // 'testImplementation' makes Flink available for test source compilation and execution.
32 | testImplementation("org.apache.flink:flink-streaming-java:$flinkVersion")
33 | testImplementation("org.apache.flink:flink-clients:$flinkVersion")
34 | testImplementation("org.apache.flink:flink-connector-base:$flinkVersion")
35 | // Ktor
36 | implementation("io.ktor:ktor-client-core:$ktorVersion")
37 | implementation("io.ktor:ktor-client-cio:$ktorVersion")
38 | implementation("io.ktor:ktor-client-content-negotiation:$ktorVersion")
39 | implementation("io.ktor:ktor-serialization-kotlinx-json:$ktorVersion")
40 | // Logging
41 | implementation("org.slf4j:slf4j-simple:2.0.17")
42 | // Testing
43 | testImplementation(kotlin("test"))
44 | testImplementation("org.junit.jupiter:junit-jupiter-api:5.14.1")
45 | testRuntimeOnly("org.junit.jupiter:junit-jupiter-engine:5.14.1")
46 | }
47 |
48 | kotlin {
49 | jvmToolchain(11)
50 | }
51 |
52 | application {
53 | mainClass.set(
54 | project.findProperty("mainClass")?.toString()
55 | ?: "me.jaehyeon.chapter1.AverageSensorReadings",
56 | )
57 | }
58 |
59 | tasks.named<JavaExec>("run") {
60 | // Classpath = All library dependencies + The application's compiled code.
61 | classpath = localRunClasspath + sourceSets.main.get().output
62 | }
63 |
64 | tasks.withType<Test> {
65 | useJUnitPlatform()
66 | }
67 |
68 | tasks.withType<ShadowJar> {
69 | archiveBaseName.set(rootProject.name)
70 | archiveClassifier.set("")
71 | archiveVersion.set("1.0")
72 | mergeServiceFiles()
73 | }
74 |
75 | tasks.named("build") {
76 | dependsOn("shadowJar")
77 | }
78 |
--------------------------------------------------------------------------------
/flink-sql-cookbook/README.md:
--------------------------------------------------------------------------------
1 | ## Flink SQL Cookbook on Docker
2 |
3 | A Flink cluster that can be used to run queries of the [Apache Flink SQL Cookbook](https://github.com/ververica/flink-sql-cookbook/tree/main) repo from Ververica.
4 |
5 | The Flink Docker image is updated with the [Flink SQL Faker Connector](https://github.com/knaufk/flink-faker) for fake data generation. Note that the example SQL queries are based on an old version of the connector, and some of them have to be modified.
6 |
7 | ### Flink Cluster on Docker
8 |
9 | The cookbook generates sample records using the [Flink SQL Faker Connector](https://github.com/knaufk/flink-faker), and we use a custom Docker image that downloads the connector JAR into the `/opt/flink/lib/` folder. In this way, we don't have to add the connector JAR manually whenever we start the [SQL client](https://nightlies.apache.org/flink/flink-docs-master/docs/dev/table/sqlclient/).
10 |
11 | ```Dockerfile
12 | FROM flink:1.20.1
13 |
14 | # add faker connector
15 | RUN wget -P /opt/flink/lib/ \
16 | https://github.com/knaufk/flink-faker/releases/download/v0.5.3/flink-faker-0.5.3.jar
17 | ```
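
The image can also be built manually before starting the cluster; a minimal sketch, assuming the `flink-sql-cookbook` image name used in the docker-compose.yml:

```bash
# build the custom image with the faker connector baked in
docker build -t flink-sql-cookbook .
```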
18 |
19 | A local Apache Flink cluster can be deployed using Docker Compose.
20 |
21 | ```bash
22 | # start containers
23 | $ docker compose up -d
24 |
25 | # list containers
26 | $ docker-compose ps
27 | # NAME COMMAND SERVICE STATUS PORTS
28 | # jobmanager "/docker-entrypoint.…" jobmanager running (healthy) 6123/tcp, 0.0.0.0:8081->8081/tcp, :::8081->8081/tcp
29 | # taskmanager-1 "/docker-entrypoint.…" taskmanager-1 running 6123/tcp, 8081/tcp
30 | # taskmanager-2 "/docker-entrypoint.…" taskmanager-2 running 6123/tcp, 8081/tcp
31 | # taskmanager-3 "/docker-entrypoint.…" taskmanager-3 running 6123/tcp, 8081/tcp
32 | ```
33 |
34 | ### Flink SQL Client
35 |
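The SQL client runs inside the JobManager container; assuming the `jobmanager` container name from the accompanying docker-compose.yml, it can be opened as follows.

```bash
# open the Flink SQL client in the jobmanager container
docker exec -it jobmanager ./bin/sql-client.sh
```
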
36 | ```sql
37 | -- create a temporary table
38 | CREATE TEMPORARY TABLE heros (
39 | `name` STRING,
40 | `power` STRING,
41 | `age` INT
42 | ) WITH (
43 | 'connector' = 'faker',
44 | 'fields.name.expression' = '#{superhero.name}',
45 | 'fields.power.expression' = '#{superhero.power}',
46 | 'fields.power.null-rate' = '0.05',
47 | 'fields.age.expression' = '#{number.numberBetween ''0'',''1000''}'
48 | );
49 | -- [INFO] Execute statement succeeded.
50 |
51 | -- list tables
52 | SHOW TABLES;
53 | -- +------------+
54 | -- | table name |
55 | -- +------------+
56 | -- | heros |
57 | -- +------------+
58 | -- 1 row in set
59 |
60 | -- query records from the heros table
61 | -- hit 'q' to exit the record view
62 | SELECT * FROM heros;
63 |
64 | -- quit sql shell
65 | quit;
66 | ```
67 |
68 | 
69 |
--------------------------------------------------------------------------------
/datorios/README.md:
--------------------------------------------------------------------------------
1 | [](https://datorios.com "See The Data Behind Your Data - Data Observability for Apache Flink")
2 |
3 |
4 |
5 | # See The Data Behind Your Data - Data Observability for Apache Flink ®
6 |
7 | ### **Unified Investigation Platform:**
8 | When all of your pipeline data is available in one place, you no longer need to waste time flipping between different tools to access source data, application logging, and pipeline metrics. The time saved can instead be spent bringing your pipelines back to a healthy state.
9 |
10 | ### **Effortless Debugging of Operator Functionality & State:**
11 | See every record that passed through each of your operators, unlocking the ability to see how the record altered state while it was being processed.
12 |
13 | ### **Better Integration Testing:**
14 | A lot is left to be desired when testing your pipelines. Unit testing only gives us half the story; by seeing how all of your operators work together when the data flows, you can be confident in the results provided by your pipelines.
15 |
16 | ### **Peace of Mind in Production Monitoring:**
17 | Access to your mission-critical performance metrics gives you the peace of mind knowing your jobs are resourced correctly and won't fall over when more data starts flowing in.
18 |
19 | ### **Breeze Through Window Investigation:**
20 | With the Window Investigation tool you can magnify the problems you are encountering with windowing. Problems can include late events, incorrect watermark settings, or even your aggregation functions.
21 |
22 | # **High level Architecture:**
23 | Datorios consists of two components:
24 |
25 |
26 | - Datorios client running on Docker Compose - The client installs the Apache Flink engine on your local/cloud machine, where your jobs will be deployed (embedding Datorios into your existing Flink cluster is coming soon).
27 | - A cloud observability service is used for deep investigation and debugging.
28 |
29 | 
30 |
31 | [Sign up](https://app.datorios.com/signup) to download the Datorios cluster - you can use your own Flink jobs or the demo jobs in [this repository](https://github.com/metrolinkai/Datorios/tree/main/flink-examples) for a test run.
32 |
33 | # **K8S Architecture:**
34 |
35 | 
36 |
37 | [SaaS side](https://github.com/metrolinkai/Datorios/blob/main/resources/SAAS.drawio.png)
38 |
39 | [Making your Flink transparent](https://datorios.com)
40 |
--------------------------------------------------------------------------------
/pyflink-udemy/s3_06_kafka_sink.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | from pyflink.table import EnvironmentSettings, TableEnvironment
4 |
5 | BOOTSTRAP_SERVERS = os.getenv("BOOTSTRAP_SERVERS", "localhost:29092")
6 | # https://nightlies.apache.org/flink/flink-docs-release-1.16/docs/connectors/table/kafka/
7 | version_map = {"15": "1.15.4", "16": "1.16.0"}
8 | FLINK_VERSION = version_map[os.getenv("MINOR_VERSION", "15")]
9 | FLINK_SQL_CONNECTOR_KAFKA = f"flink-sql-connector-kafka-{FLINK_VERSION}.jar"
10 |
11 | env_settings = EnvironmentSettings.in_streaming_mode()
12 | table_env = TableEnvironment.create(env_settings)
13 | # https://nightlies.apache.org/flink/flink-docs-release-1.16/docs/dev/python/dependency_management/
14 | kafka_jar = os.path.join(os.path.abspath(os.path.dirname(__file__)), FLINK_SQL_CONNECTOR_KAFKA)
15 | table_env.get_config().set("pipeline.jars", f"file://{kafka_jar}")
16 |
17 | ## create kafka source table
18 | table_env.execute_sql(
19 | f"""
20 | CREATE TABLE product_sales (
21 | `seller_id` VARCHAR,
22 | `product` VARCHAR,
23 | `quantity` INT,
24 | `product_price` DOUBLE,
25 | `sales_date` VARCHAR
26 | ) WITH (
27 | 'connector' = 'kafka',
28 | 'topic' = 'product_sales',
29 | 'properties.bootstrap.servers' = '{BOOTSTRAP_SERVERS}',
30 | 'properties.group.id' = 'source-demo',
31 | 'format' = 'json',
32 | 'scan.startup.mode' = 'earliest-offset',
33 | 'json.fail-on-missing-field' = 'false',
34 | 'json.ignore-parse-errors' = 'true'
35 | )
36 | """
37 | )
38 |
39 | ## create print sink table
40 | table_env.execute_sql(
41 | f"""
42 | CREATE TABLE print (
43 | `seller_id` VARCHAR,
44 | `product` VARCHAR,
45 | `quantity` INT,
46 | `product_price` DOUBLE,
47 | `sales_date` VARCHAR
48 | ) WITH (
49 | 'connector' = 'print'
50 | )
51 | """
52 | )
53 |
54 | ## create kafka sink table
55 | table_env.execute_sql(
56 | f"""
57 | CREATE TABLE product_sales_sink (
58 | `seller_id` VARCHAR,
59 | `product` VARCHAR,
60 | `quantity` INT,
61 | `product_price` DOUBLE,
62 | `sales_date` VARCHAR
63 | ) WITH (
64 | 'connector' = 'kafka',
65 | 'topic' = 'product_sales_sink',
66 | 'properties.bootstrap.servers' = '{BOOTSTRAP_SERVERS}',
67 | 'format' = 'json',
68 | 'json.fail-on-missing-field' = 'false',
69 | 'json.ignore-parse-errors' = 'true'
70 | )
71 | """
72 | )
73 |
74 | ## insert into sink tables
75 | tbl = table_env.from_path("product_sales")
76 | statement_set = table_env.create_statement_set()
77 | statement_set.add_insert("print", tbl)
78 | statement_set.add_insert("product_sales_sink", tbl)
79 | statement_set.execute().wait()
80 |
--------------------------------------------------------------------------------
/real-time-streaming-aws/compose-msk.yml:
--------------------------------------------------------------------------------
1 | version: "3.5"
2 |
3 | services:
4 | jobmanager:
5 | image: real-time-streaming-aws:1.17.1
6 | command: jobmanager
7 | container_name: jobmanager
8 | ports:
9 | - "8081:8081"
10 | networks:
11 | - appnet
12 | volumes:
13 | - ./:/etc/flink
14 | environment:
15 | - BOOTSTRAP_SERVERS=${BOOTSTRAP_SERVERS:-not_set}
16 | - OPENSEARCH_HOSTS=${OPENSEARCH_HOSTS:-not_set}
17 | - RUNTIME_ENV=DOCKER
18 | - AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID
19 | - AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY
20 | # - AWS_SESSION_TOKEN=$AWS_SESSION_TOKEN
21 | - |
22 | FLINK_PROPERTIES=
23 | jobmanager.rpc.address: jobmanager
24 | state.backend: filesystem
25 | state.checkpoints.dir: file:///tmp/flink-checkpoints
26 | heartbeat.interval: 1000
27 | heartbeat.timeout: 5000
28 | rest.flamegraph.enabled: true
29 | web.backpressure.refresh-interval: 10000
30 | taskmanager:
31 | image: real-time-streaming-aws:1.17.1
32 | command: taskmanager
33 | container_name: taskmanager
34 | networks:
35 | - appnet
36 | volumes:
37 | - flink_data:/tmp/
38 | - ./:/etc/flink
39 | environment:
40 | - BOOTSTRAP_SERVERS=${BOOTSTRAP_SERVERS:-not_set}
41 | - OPENSEARCH_HOSTS=${OPENSEARCH_HOSTS:-not_set}
42 | - RUNTIME_ENV=DOCKER
43 | - AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID
44 | - AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY
45 | # - AWS_SESSION_TOKEN=$AWS_SESSION_TOKEN
46 | - |
47 | FLINK_PROPERTIES=
48 | jobmanager.rpc.address: jobmanager
49 | taskmanager.numberOfTaskSlots: 5
50 | state.backend: filesystem
51 | state.checkpoints.dir: file:///tmp/flink-checkpoints
52 | heartbeat.interval: 1000
53 | heartbeat.timeout: 5000
54 | depends_on:
55 | - jobmanager
56 | kpow:
57 | image: factorhouse/kpow-ce:91.5.1
58 | container_name: kpow
59 | ports:
60 | - "3000:3000"
61 | networks:
62 | - appnet
63 | environment:
64 | AWS_ACCESS_KEY_ID: $AWS_ACCESS_KEY_ID
65 | AWS_SECRET_ACCESS_KEY: $AWS_SECRET_ACCESS_KEY
66 | # AWS_SESSION_TOKEN: $AWS_SESSION_TOKEN
67 | BOOTSTRAP: ${BOOTSTRAP_SERVERS:-not_set}
68 | SECURITY_PROTOCOL: SASL_SSL
69 | SASL_MECHANISM: AWS_MSK_IAM
70 | SASL_JAAS_CONFIG: software.amazon.msk.auth.iam.IAMLoginModule required;
71 | SASL_CLIENT_CALLBACK_HANDLER_CLASS: software.amazon.msk.auth.iam.IAMClientCallbackHandler
72 | env_file: # https://kpow.io/get-started/#individual
73 | - ./kpow.env
74 |
75 | networks:
76 | appnet:
77 | name: app-network
78 |
79 | volumes:
80 | flink_data:
81 | driver: local
82 | name: flink_data
83 |
--------------------------------------------------------------------------------
/stream-processing-with-pyflink/src/chapter5/utils/model.py:
--------------------------------------------------------------------------------
1 | import datetime
2 | import dataclasses
3 | from typing import Iterable, Tuple
4 |
5 | from pyflink.common import Row
6 | from pyflink.common.typeinfo import Types
7 |
8 | from .type_helper import TypeMapping, set_type_info
9 |
10 |
11 | @dataclasses.dataclass
12 | class SensorReading(TypeMapping):
13 | id: str
14 | timestamp: int
15 | num_records: int
16 | temperature: float
17 |
18 | def to_row(self):
19 | return Row(**dataclasses.asdict(self))
20 |
21 | @classmethod
22 | def from_row(cls, row: Row):
23 | return cls(**row.as_dict())
24 |
25 | @classmethod
26 | def from_tuple(cls, tup: Tuple[int, int, datetime.datetime]):
27 | return cls(
28 | id=f"sensor_{tup[0]}",
29 | timestamp=int(tup[2].strftime("%s")) * 1000,
30 | num_records=1,
31 | temperature=65 + (tup[1] / 100 * 20),
32 | )
33 |
34 | @staticmethod
35 | def process_elements(elements: Iterable[Tuple[int, int, datetime.datetime]]):
36 | id, count, temperature = None, 0, 0
37 | for e in elements:
38 | next_id = f"sensor_{e[0]}"
39 | if id is not None:
40 | assert id == next_id
41 | id = next_id
42 | count += 1
43 | temperature += 65 + (e[1] / 100 * 20)
44 | return id, count, temperature
45 |
46 | @staticmethod
47 | def type_mapping():
48 | return {
49 | "id": Types.STRING(),
50 | "timestamp": Types.LONG(),
51 | "num_records": Types.INT(),
52 | "temperature": Types.DOUBLE(),
53 | }
54 |
55 | @staticmethod
56 | def set_key_type_info():
57 | return set_type_info(SensorReading.type_mapping(), selects=["id"])
58 |
59 | @staticmethod
60 | def set_value_type_info():
61 | return set_type_info(SensorReading.type_mapping())
62 |
63 |
64 | @dataclasses.dataclass
65 | class SmokeLevel(TypeMapping):
66 | value: str
67 |
68 | @classmethod
69 | def from_tuple(cls, tup: Tuple[int]):
70 | return cls(value="High" if tup[0] / 100 > 0.8 else "Low")
71 |
72 | @staticmethod
73 | def type_mapping():
74 | return {"value": Types.STRING()}
75 |
76 | @staticmethod
77 | def set_value_type_info():
78 | return set_type_info(SmokeLevel.type_mapping())
79 |
80 |
81 | @dataclasses.dataclass
82 | class Alert(TypeMapping):
83 | message: str
84 | timestamp: int
85 | temperature: float
86 |
87 | @staticmethod
88 | def type_mapping():
89 | return {"message": Types.STRING(), "timestamp": Types.LONG(), "temperature": Types.DOUBLE()}
90 |
91 | @staticmethod
92 | def set_value_type_info():
93 | return set_type_info(Alert.type_mapping())
94 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Flink Demos
2 |
3 | This repository contains the source of the following posts, along with additional learning resources.
4 |
5 | - [Getting Started With Pyflink on AWS](https://jaehyeon.me/blog/2023-08-17-getting-started-with-pyflink-on-aws-part-1/)
6 |   - Apache Flink is widely used for building real-time stream processing applications. On AWS, Amazon Managed Service for Apache Flink is the easiest option to develop a Flink app as it provides the underlying infrastructure. Updating a guide from AWS, this series of posts discusses how to develop and deploy a Flink (Pyflink) application on AWS where the data source and sink are Kafka topics.
7 | - [Kafka, Flink and DynamoDB for Real Time Fraud Detection](https://jaehyeon.me/blog/2023-08-10-fraud-detection-part-1/)
8 |   - Re-implementing a solution from an AWS workshop, this series of posts discusses how to develop and deploy a fraud detection app using Kafka, Flink and DynamoDB. Part 1 covers local development using Docker, while deployment on AWS will be discussed in part 2.
9 | - [Building Apache Flink Applications in Python](https://jaehyeon.me/blog/2023-10-19-build-pyflink-apps/)
10 |   - Building Apache Flink Applications in Java by Confluent is a course that introduces Apache Flink through a series of hands-on exercises. Utilising the Flink DataStream API, the course develops three Flink applications, from ingesting source data to calculating usage statistics. As part of learning the Flink DataStream API in Pyflink, I converted the Java apps into their Python equivalents while performing the course exercises in Pyflink. This post summarises the progress of the conversion and shows the final output.
11 | - [Run Flink SQL Cookbook in Docker](https://jaehyeon.me/blog/2025-04-15-sql-cookbook/)
12 | - The [Flink SQL Cookbook](https://github.com/ververica/flink-sql-cookbook) is a practical guide packed with self-contained examples for learning [Apache Flink SQL](https://nightlies.apache.org/flink/flink-docs-master/docs/dev/table/sql/overview/), covering everything from basic queries to advanced stream processing patterns. Since it's designed for the Ververica Platform and lacks cluster setup instructions, this post shows how to run the examples locally using Docker Compose for a smooth, hands-on experience.
13 | - [Stream Processing With Flink in Kotlin](https://jaehyeon.me/blog/2025-12-10-streaming-processing-with-flink-in-kotlin/)
14 | - A couple of years ago, I read [Stream Processing with Apache Flink](https://www.oreilly.com/library/view/stream-processing-with/9781491974285/) and worked through the examples using PyFlink. While the book offered a solid introduction to Flink, I frequently hit limitations with the Python API, as many features from the book weren't supported. This time, I decided to revisit the material, but using Kotlin. The experience has been much more rewarding and fun.
15 |
--------------------------------------------------------------------------------
/stream-processing-with-pyflink/src/chapter5/basic_transformations.py:
--------------------------------------------------------------------------------
1 | import os
2 | import datetime
3 | from typing import Tuple
4 |
5 | from pyflink.common import WatermarkStrategy
6 | from pyflink.common.typeinfo import Types
7 | from pyflink.common.watermark_strategy import TimestampAssigner, Duration
8 | from pyflink.datastream import StreamExecutionEnvironment, RuntimeExecutionMode
9 | from pyflink.table import StreamTableEnvironment
10 |
11 | from utils.model import SensorReading
12 |
13 |
14 | if __name__ == "__main__":
15 | """
16 | ## local execution
17 | python src/chapter5/basic_transformations.py
18 | """
19 |
20 | RUNTIME_ENV = os.getenv("RUNTIME_ENV", "local")
21 |
22 | env = StreamExecutionEnvironment.get_execution_environment()
23 | env.set_runtime_mode(RuntimeExecutionMode.STREAMING)
24 | if RUNTIME_ENV == "local":
25 | SRC_DIR = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
26 | jar_files = ["flink-faker-0.5.3.jar"]
27 | jar_paths = tuple([f"file://{os.path.join(SRC_DIR, 'jars', name)}" for name in jar_files])
28 | print(jar_paths)
29 | env.add_jars(*jar_paths)
30 |
31 | t_env = StreamTableEnvironment.create(stream_execution_environment=env)
32 | t_env.get_config().set_local_timezone("Australia/Sydney")
33 | t_env.execute_sql(
34 | """
35 | CREATE TABLE sensor_source (
36 | `id` INT,
37 | `rn` INT,
38 | `log_time` TIMESTAMP_LTZ(3)
39 | )
40 | WITH (
41 | 'connector' = 'faker',
42 | 'rows-per-second' = '1',
43 | 'fields.id.expression' = '#{number.numberBetween ''0'',''20''}',
44 | 'fields.rn.expression' = '#{number.numberBetween ''0'',''100''}',
45 | 'fields.log_time.expression' = '#{date.past ''10'',''5'',''SECONDS''}'
46 | );
47 | """
48 | )
49 |
50 | class SourceTimestampAssigner(TimestampAssigner):
51 | def extract_timestamp(
52 | self, value: Tuple[int, int, datetime.datetime], record_timestamp: int
53 | ):
54 | return int(value[2].strftime("%s")) * 1000
55 |
56 | source_stream = t_env.to_append_stream(
57 | t_env.from_path("sensor_source"),
58 | Types.TUPLE([Types.INT(), Types.INT(), Types.SQL_TIMESTAMP()]),
59 | ).assign_timestamps_and_watermarks(
60 | WatermarkStrategy.for_bounded_out_of_orderness(
61 | Duration.of_seconds(5)
62 | ).with_timestamp_assigner(SourceTimestampAssigner())
63 | )
64 |
65 | filtered_sensors = source_stream.map(SensorReading.from_tuple).filter(
66 | lambda e: e.temperature >= 25
67 | )
68 |
69 | split_ids = filtered_sensors.flat_map(lambda e: e.id.split("_"))
70 |
71 | split_ids.print()
72 |
73 | env.execute("Basic Transformations Example")
74 |
--------------------------------------------------------------------------------
/stream-processing-with-pyflink/src/chapter6/test_window_functions_reduce.py:
--------------------------------------------------------------------------------
1 | import datetime
2 | from typing import Tuple
3 |
4 | import pytest
5 | from pyflink.common import WatermarkStrategy
6 | from pyflink.common.watermark_strategy import TimestampAssigner, Duration
7 | from pyflink.datastream import DataStream, StreamExecutionEnvironment
8 |
9 | from window_functions_reduce import define_workflow
10 |
11 |
12 | @pytest.fixture(scope="module")
13 | def env():
14 | env = StreamExecutionEnvironment.get_execution_environment()
15 | yield env
16 |
17 |
18 | def test_define_workflow_should_return_records_having_minimum_temperature_by_id(env):
19 | source_1 = (1, 0, datetime.datetime.now())
20 | source_2 = (1, 50, datetime.datetime.now())
21 | source_3 = (2, 20, datetime.datetime.now())
22 | source_4 = (2, 100, datetime.datetime.now())
23 |
24 | class SourceTimestampAssigner(TimestampAssigner):
25 | def extract_timestamp(
26 | self, value: Tuple[int, int, datetime.datetime], record_timestamp: int
27 | ):
28 | return int(value[2].strftime("%s")) * 1000
29 |
30 | source_stream: DataStream = env.from_collection(
31 | collection=[source_1, source_2, source_3, source_4]
32 | ).assign_timestamps_and_watermarks(
33 | WatermarkStrategy.for_bounded_out_of_orderness(
34 | Duration.of_seconds(5)
35 | ).with_timestamp_assigner(SourceTimestampAssigner())
36 | )
37 |
38 | elements = list(define_workflow(source_stream).execute_and_collect())
39 | assert len(elements) == 2
40 | for e in elements:
41 | if e.id == "sensor_1":
42 | assert e.temperature == 65
43 | else:
44 | assert e.temperature == 69
45 |
46 |
47 | def test_define_workflow_should_return_records_having_minimum_temperature_within_window(env):
48 | source_1 = (1, 0, datetime.datetime.now())
49 | source_2 = (1, 50, datetime.datetime.now() + datetime.timedelta(milliseconds=100))
50 | source_3 = (1, 100, datetime.datetime.now() + datetime.timedelta(milliseconds=1000))
51 |
52 | class SourceTimestampAssigner(TimestampAssigner):
53 | def extract_timestamp(
54 | self, value: Tuple[int, int, datetime.datetime], record_timestamp: int
55 | ):
56 | return int(value[2].strftime("%s")) * 1000
57 |
58 | source_stream: DataStream = env.from_collection(
59 | collection=[source_1, source_2, source_3]
60 | ).assign_timestamps_and_watermarks(
61 | WatermarkStrategy.for_bounded_out_of_orderness(
62 | Duration.of_seconds(5)
63 | ).with_timestamp_assigner(SourceTimestampAssigner())
64 | )
65 |
66 | elements = list(define_workflow(source_stream).execute_and_collect())
67 | assert len(elements) == 2
68 | assert elements[0].temperature == 65
69 | assert elements[1].temperature == 85
70 |
--------------------------------------------------------------------------------
/real-time-streaming-aws/exporter/flinksql.sql:
--------------------------------------------------------------------------------
1 | -- docker exec -it jobmanager ./bin/sql-client.sh
2 |
3 | SET 'state.checkpoints.dir' = 'file:///tmp/checkpoints/';
4 | SET 'execution.checkpointing.interval' = '60000';
5 |
6 | ADD JAR '/etc/lib/kafka-clients-3.2.3.jar';
7 | ADD JAR '/etc/flink/package/lib/lab3-pipeline-1.0.0.jar';
8 |
9 | CREATE TABLE taxi_rides_src (
10 | id VARCHAR,
11 | vendor_id INT,
12 | pickup_date VARCHAR,
13 | pickup_datetime AS TO_TIMESTAMP(REPLACE(pickup_date, 'T', ' ')),
14 | dropoff_date VARCHAR,
15 | dropoff_datetime AS TO_TIMESTAMP(REPLACE(dropoff_date, 'T', ' ')),
16 | passenger_count INT,
17 | pickup_longitude VARCHAR,
18 | pickup_latitude VARCHAR,
19 | dropoff_longitude VARCHAR,
20 | dropoff_latitude VARCHAR,
21 | store_and_fwd_flag VARCHAR,
22 | gc_distance INT,
23 | trip_duration INT,
24 | google_distance INT,
25 | google_duration INT
26 | ) WITH (
27 | 'connector' = 'kafka',
28 | 'topic' = 'taxi-rides',
29 | 'properties.bootstrap.servers' = 'kafka-0:9092',
30 |   'properties.group.id' = 'source-group',
31 | 'format' = 'json',
32 | 'scan.startup.mode' = 'latest-offset'
33 | );
34 |
35 | CREATE TABLE taxi_rides_sink (
36 | id VARCHAR,
37 | vendor_id INT,
38 | pickup_datetime TIMESTAMP,
39 | dropoff_datetime TIMESTAMP,
40 | passenger_count INT,
41 | pickup_longitude VARCHAR,
42 | pickup_latitude VARCHAR,
43 | dropoff_longitude VARCHAR,
44 | dropoff_latitude VARCHAR,
45 | store_and_fwd_flag VARCHAR,
46 | gc_distance INT,
47 | trip_duration INT,
48 | google_distance INT,
49 | google_duration INT,
50 | `year` VARCHAR,
51 | `month` VARCHAR,
52 | `date` VARCHAR,
53 | `hour` VARCHAR
54 | ) PARTITIONED BY (`year`, `month`, `date`, `hour`) WITH (
55 | 'connector' = 'filesystem',
56 | 'path' = 's3://real-time-streaming-ap-southeast-2/taxi-rides/',
57 | 'format' = 'parquet',
58 | 'sink.partition-commit.delay'='1 h',
59 | 'sink.partition-commit.policy.kind'='success-file'
60 | );
61 |
62 | -- 'path' = '/tmp/taxi_rides',
63 |
64 | INSERT INTO taxi_rides_sink
65 | SELECT
66 | id,
67 | vendor_id,
68 | pickup_datetime,
69 | dropoff_datetime,
70 | passenger_count,
71 | pickup_longitude,
72 | pickup_latitude,
73 | dropoff_longitude,
74 | dropoff_latitude,
75 | store_and_fwd_flag,
76 | gc_distance,
77 | trip_duration,
78 | google_distance,
79 | google_duration,
80 | DATE_FORMAT(pickup_datetime, 'yyyy') AS `year`,
81 | DATE_FORMAT(pickup_datetime, 'MM') AS `month`,
82 | DATE_FORMAT(pickup_datetime, 'dd') AS `date`,
83 | DATE_FORMAT(pickup_datetime, 'HH') AS `hour`
84 | FROM taxi_rides_src;
--------------------------------------------------------------------------------
/stream-processing-with-pyflink/src/chapter5/keyed_transformations.py:
--------------------------------------------------------------------------------
1 | import os
2 | import datetime
3 | from typing import Tuple
4 |
5 | from pyflink.common import WatermarkStrategy
6 | from pyflink.common.typeinfo import Types
7 | from pyflink.common.watermark_strategy import TimestampAssigner, Duration
8 | from pyflink.datastream import StreamExecutionEnvironment, RuntimeExecutionMode
9 | from pyflink.table import StreamTableEnvironment
10 |
11 | from utils.model import SensorReading
12 |
13 |
14 | if __name__ == "__main__":
15 | """
16 | ## local execution
17 | python src/chapter5/keyed_transformations.py
18 | """
19 |
20 | RUNTIME_ENV = os.getenv("RUNTIME_ENV", "local")
21 |
22 | env = StreamExecutionEnvironment.get_execution_environment()
23 | env.set_runtime_mode(RuntimeExecutionMode.STREAMING)
24 | if RUNTIME_ENV == "local":
25 | SRC_DIR = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
26 | jar_files = ["flink-faker-0.5.3.jar"]
27 | jar_paths = tuple([f"file://{os.path.join(SRC_DIR, 'jars', name)}" for name in jar_files])
28 | print(jar_paths)
29 | env.add_jars(*jar_paths)
30 |
31 | t_env = StreamTableEnvironment.create(stream_execution_environment=env)
32 | t_env.get_config().set_local_timezone("Australia/Sydney")
33 | t_env.execute_sql(
34 | """
35 | CREATE TABLE sensor_source (
36 | `id` INT,
37 | `rn` INT,
38 | `log_time` TIMESTAMP_LTZ(3)
39 | )
40 | WITH (
41 | 'connector' = 'faker',
42 | 'rows-per-second' = '1',
43 | 'fields.id.expression' = '#{number.numberBetween ''0'',''20''}',
44 | 'fields.rn.expression' = '#{number.numberBetween ''0'',''100''}',
45 | 'fields.log_time.expression' = '#{date.past ''10'',''5'',''SECONDS''}'
46 | );
47 | """
48 | )
49 |
50 | class SourceTimestampAssigner(TimestampAssigner):
51 | def extract_timestamp(
52 | self, value: Tuple[int, int, datetime.datetime], record_timestamp: int
53 | ):
54 | return int(value[2].strftime("%s")) * 1000
55 |
56 | source_stream = t_env.to_append_stream(
57 | t_env.from_path("sensor_source"),
58 | Types.TUPLE([Types.INT(), Types.INT(), Types.SQL_TIMESTAMP()]),
59 | ).assign_timestamps_and_watermarks(
60 | WatermarkStrategy.for_bounded_out_of_orderness(
61 | Duration.of_seconds(5)
62 | ).with_timestamp_assigner(SourceTimestampAssigner())
63 | )
64 |
65 | keyed = source_stream.map(SensorReading.from_tuple).key_by(lambda e: e.id)
66 |
67 | max_temp_per_sensor = keyed.reduce(lambda r1, r2: r1 if r1.temperature > r2.temperature else r2)
68 |
69 | # max_temp_per_sensor.filter(lambda e: e.id == "sensor_1").print()
70 | max_temp_per_sensor.print()
71 |
72 | env.execute("Keyed Transformations Example")
73 |
--------------------------------------------------------------------------------
/flink-sql-cookbook/docker-compose.yml:
--------------------------------------------------------------------------------
1 | version: "3"
2 |
3 | services:
4 | jobmanager:
5 | image: flink-sql-cookbook
6 | build: .
7 | command: jobmanager
8 | container_name: jobmanager
9 | ports:
10 | - "8081:8081"
11 | networks:
12 | - cookbook
13 | environment:
14 | - |
15 | FLINK_PROPERTIES=
16 | jobmanager.rpc.address: jobmanager
17 | state.backend: filesystem
18 | state.checkpoints.dir: file:///tmp/flink-checkpoints
19 | state.savepoints.dir: file:///tmp/flink-savepoints
20 | heartbeat.interval: 1000
21 | heartbeat.timeout: 5000
22 | rest.flamegraph.enabled: true
23 | web.backpressure.refresh-interval: 10000
24 | healthcheck:
25 | test: ["CMD", "curl", "-f", "http://localhost:8081/config"]
26 | interval: 5s
27 | timeout: 5s
28 | retries: 5
29 |
30 | taskmanager-1:
31 | image: flink-sql-cookbook
32 | build: .
33 | command: taskmanager
34 | container_name: taskmanager-1
35 | networks:
36 | - cookbook
37 | depends_on:
38 | jobmanager:
39 | condition: service_healthy
40 | environment:
41 | - |
42 | FLINK_PROPERTIES=
43 | jobmanager.rpc.address: jobmanager
44 | taskmanager.numberOfTaskSlots: 10
45 | state.backend: filesystem
46 | state.checkpoints.dir: file:///tmp/flink-checkpoints
47 | state.savepoints.dir: file:///tmp/flink-savepoints
48 | heartbeat.interval: 1000
49 | heartbeat.timeout: 5000
50 |
51 | taskmanager-2:
52 | image: flink-sql-cookbook
53 | build: .
54 | command: taskmanager
55 | container_name: taskmanager-2
56 | networks:
57 | - cookbook
58 | depends_on:
59 | jobmanager:
60 | condition: service_healthy
61 | environment:
62 | - |
63 | FLINK_PROPERTIES=
64 | jobmanager.rpc.address: jobmanager
65 | taskmanager.numberOfTaskSlots: 10
66 | state.backend: filesystem
67 | state.checkpoints.dir: file:///tmp/flink-checkpoints
68 | state.savepoints.dir: file:///tmp/flink-savepoints
69 | heartbeat.interval: 1000
70 | heartbeat.timeout: 5000
71 |
72 | taskmanager-3:
73 | image: flink-sql-cookbook
74 | build: .
75 | command: taskmanager
76 | container_name: taskmanager-3
77 | networks:
78 | - cookbook
79 | depends_on:
80 | jobmanager:
81 | condition: service_healthy
82 | environment:
83 | - |
84 | FLINK_PROPERTIES=
85 | jobmanager.rpc.address: jobmanager
86 | taskmanager.numberOfTaskSlots: 10
87 | state.backend: filesystem
88 | state.checkpoints.dir: file:///tmp/flink-checkpoints
89 | state.savepoints.dir: file:///tmp/flink-savepoints
90 | heartbeat.interval: 1000
91 | heartbeat.timeout: 5000
92 |
93 | networks:
94 | cookbook:
95 | name: flink-sql-cookbook
96 |
--------------------------------------------------------------------------------