├── pyflink-doc ├── requirements.txt ├── requirements-dev.txt └── data.py ├── learning-materials ├── cep │ └── .gitkeep └── table_sql │ └── .gitkeep ├── sql-cookbook ├── requirements.txt ├── requirements-dev.txt ├── note.sh ├── src │ └── python_udf.py ├── compose-kafka.yml └── Dockerfile ├── .vscode ├── ltex.dictionary.en-US.txt └── settings.json ├── sql-training ├── client-image │ ├── VERSION │ ├── sql-client │ │ └── sql-client.sh │ ├── conf │ │ └── flink-conf.yaml │ └── java │ │ ├── sql-training-data-producer │ │ └── src │ │ │ └── main │ │ │ └── java │ │ │ └── com │ │ │ └── ververica │ │ │ └── sql_training │ │ │ └── data_producer │ │ │ ├── records │ │ │ ├── TaxiRecord.java │ │ │ ├── DriverChange.java │ │ │ ├── Ride.java │ │ │ └── Fare.java │ │ │ ├── ConsolePrinter.java │ │ │ ├── json_serde │ │ │ ├── JsonDeserializer.java │ │ │ └── JsonSerializer.java │ │ │ └── FileReader.java │ │ └── sql-training-udfs │ │ ├── src │ │ └── main │ │ │ └── java │ │ │ └── com │ │ │ └── ververica │ │ │ └── sql_training │ │ │ └── udfs │ │ │ ├── IsInNYC.java │ │ │ ├── ToAreaId.java │ │ │ └── ToCoords.java │ │ └── pom.xml ├── mysql │ └── create_tables.sql ├── minio │ └── data │ │ └── .minio.sys │ │ ├── pool.bin │ │ └── xl.meta │ │ ├── config │ │ ├── config.json │ │ │ └── xl.meta │ │ └── iam │ │ │ └── format.json │ │ │ └── xl.meta │ │ ├── buckets │ │ ├── .usage.json │ │ │ └── xl.meta │ │ └── .bloomcycle.bin │ │ │ └── xl.meta │ │ └── format.json └── receipts.md ├── fraud-detection ├── remote │ ├── infra │ │ ├── key-pair │ │ │ └── .gitkeep │ │ ├── providers.tf │ │ ├── s3.tf │ │ ├── vpc.tf │ │ ├── data.tf │ │ ├── ddb.tf │ │ ├── scripts │ │ │ └── bootstrap.sh │ │ ├── variables.tf │ │ └── outputs.tf │ ├── requirements.txt │ ├── requirements-dev.txt │ ├── docker-compose.yml │ ├── application_properties.json │ ├── package │ │ └── uber-jar-for-pyflink │ │ │ └── src │ │ │ └── main │ │ │ └── resources │ │ │ └── log4j2.properties │ └── build.sh └── local │ ├── requirements.txt │ ├── requirements-dev.txt │ ├── configs │ ├── sink.json │ └── ddb.json │ ├── compose-connect.yml │ ├── application_properties.json │ ├── build.sh │ └── docker-compose.yml ├── real-time-streaming-aws ├── infra │ ├── key-pair │ │ └── .gitkeep │ ├── providers.tf │ ├── vpc.tf │ ├── data.tf │ ├── s3.tf │ └── scripts │ │ └── bootstrap.sh ├── requirements-dev.txt ├── producer │ └── requirements.txt ├── configs │ ├── ddb.json │ ├── note.sh │ └── sink.json ├── loader │ └── application_properties.json ├── exporter │ ├── application_properties.json │ ├── athena.sql │ └── flinksql.sql ├── forwarder │ ├── application_properties.json │ └── flinksql.sql ├── compose-ui.yml ├── build.sh ├── download.sh ├── Dockerfile ├── package │ ├── lab2-pipeline │ │ └── src │ │ │ └── main │ │ │ └── resources │ │ │ └── log4j2.properties │ ├── lab3-pipeline │ │ └── src │ │ │ └── main │ │ │ └── resources │ │ │ └── log4j2.properties │ └── lab4-pipeline │ │ └── src │ │ └── main │ │ └── resources │ │ └── log4j2.properties ├── compose-extra.yml └── compose-msk.yml ├── stream-processing-with-pyflink ├── src │ ├── jars │ │ └── .gitkeep │ ├── chapter7 │ │ ├── queryable_state.py │ │ ├── utils │ │ │ ├── type_helper.py │ │ │ └── model.py │ │ ├── checkpointed_function.py │ │ └── operator_list_state_function.py │ ├── chapter1 │ │ └── utils │ │ │ ├── type_helper.py │ │ │ └── model.py │ ├── chapter5 │ │ ├── utils │ │ │ ├── type_helper.py │ │ │ └── model.py │ │ ├── rolling_sum.py │ │ ├── basic_transformations.py │ │ └── keyed_transformations.py │ ├── chapter6 │ │ ├── utils │ │ │ ├── type_helper.py │ 
│ │ └── model.py │ │ └── test_window_functions_reduce.py │ └── manage_topics.py ├── requirements.txt ├── requirements-dev.txt ├── README.md └── Dockerfile ├── pyflink-getting-started-on-aws ├── remote │ ├── infra │ │ ├── key-pair │ │ │ └── .gitkeep │ │ ├── providers.tf │ │ ├── s3.tf │ │ ├── vpc.tf │ │ ├── data.tf │ │ ├── scripts │ │ │ └── bootstrap.sh │ │ ├── variables.tf │ │ └── outputs.tf │ ├── requirements-dev.txt │ ├── requirements.txt │ ├── compose-ui.yml │ ├── application_properties.json │ ├── build.sh │ ├── Dockerfile │ ├── package │ │ └── uber-jar-for-pyflink │ │ │ └── src │ │ │ └── main │ │ │ └── resources │ │ │ └── log4j2.properties │ └── compose-flink.yml └── local │ ├── requirements.txt │ ├── requirements-dev.txt │ ├── build.sh │ ├── application_properties.json │ ├── Dockerfile │ ├── compose-flink.yml │ └── compose-kafka.yml ├── stream-processing-with-flink ├── gradle.properties ├── .idea │ ├── .gitignore │ ├── dictionaries │ │ └── project.xml │ ├── kotlinc.xml │ ├── vcs.xml │ ├── ktlint-plugin.xml │ ├── misc.xml │ └── gradle.xml ├── gradle │ └── wrapper │ │ ├── gradle-wrapper.jar │ │ └── gradle-wrapper.properties ├── settings.gradle.kts ├── src │ ├── main │ │ ├── kotlin │ │ │ └── me │ │ │ │ └── jaehyeon │ │ │ │ ├── smoke │ │ │ │ ├── SmokeLevel.kt │ │ │ │ ├── SmokeLevelSplit.kt │ │ │ │ ├── Alert.kt │ │ │ │ └── SmokeLevelSourceReader.kt │ │ │ │ ├── connector │ │ │ │ ├── Post.kt │ │ │ │ ├── HttpSink.kt │ │ │ │ ├── HttpSplit.kt │ │ │ │ └── HttpSplitEnumerator.kt │ │ │ │ ├── sensor │ │ │ │ ├── SensorReading.kt │ │ │ │ └── SensorSplit.kt │ │ │ │ ├── chapter8 │ │ │ │ └── CustomConnectors.kt │ │ │ │ ├── chapter5 │ │ │ │ ├── RollingSum.kt │ │ │ │ ├── KeyedTransformations.kt │ │ │ │ └── BasicTransformations.kt │ │ │ │ ├── misc │ │ │ │ └── ControlStreamGenerator.kt │ │ │ │ └── chapter6 │ │ │ │ └── MarkerBasedWatermarkGeneration.kt │ │ └── resources │ │ │ └── simplelogger.properties │ └── test │ │ └── kotlin │ │ └── me │ │ └── jaehyeon │ │ └── chapter1 │ │ └── AverageSensorReadingsTest.kt ├── .gitignore └── build.gradle.kts ├── pyflink-udemy ├── requirements-16.txt ├── requirements-16-dev.txt ├── requirements-15-dev.txt ├── seller-input │ └── sellers.csv ├── quarterly-sales-input │ └── quarterly_sales.csv ├── requirements-15.txt ├── s3_01_tbl_env.py ├── csv-input │ └── locale-sales.csv ├── s3_03_csv_source.py ├── s3_02_python_source.py ├── s3_05_csv_sink.py ├── s4_13_row_operations.py ├── s4_04_aggregations.py ├── s4_01_projections.py ├── s3_04_kafka_source.py ├── s4_05_producer.py ├── s4_02_filtering.py ├── s4_03_joining.py └── s3_06_kafka_sink.py ├── flink-sql-cookbook ├── img │ └── sql-client.gif ├── Dockerfile ├── README.md └── docker-compose.yml ├── datorios ├── note.sh ├── docker-compose.yml └── README.md ├── confluent-flink-101 ├── notes.sh ├── Dockerfile ├── notes.md └── compose-flink-standalone.yml └── README.md /pyflink-doc/requirements.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /learning-materials/cep/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /sql-cookbook/requirements.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /learning-materials/table_sql/.gitkeep: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.vscode/ltex.dictionary.en-US.txt: -------------------------------------------------------------------------------- 1 | Flink 2 | -------------------------------------------------------------------------------- /sql-training/client-image/VERSION: -------------------------------------------------------------------------------- 1 | 1.0 2 | -------------------------------------------------------------------------------- /fraud-detection/remote/infra/key-pair/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /real-time-streaming-aws/infra/key-pair/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /stream-processing-with-pyflink/src/jars/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /fraud-detection/local/requirements.txt: -------------------------------------------------------------------------------- 1 | kafka-python==2.0.2 2 | -------------------------------------------------------------------------------- /fraud-detection/remote/requirements.txt: -------------------------------------------------------------------------------- 1 | kafka-python==2.0.2 2 | -------------------------------------------------------------------------------- /pyflink-getting-started-on-aws/remote/infra/key-pair/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /stream-processing-with-pyflink/requirements.txt: -------------------------------------------------------------------------------- 1 | kafka-python==2.0.2 -------------------------------------------------------------------------------- /pyflink-getting-started-on-aws/local/requirements.txt: -------------------------------------------------------------------------------- 1 | kafka-python==2.0.2 -------------------------------------------------------------------------------- /stream-processing-with-flink/gradle.properties: -------------------------------------------------------------------------------- 1 | kotlin.code.style=official 2 | -------------------------------------------------------------------------------- /pyflink-udemy/requirements-16.txt: -------------------------------------------------------------------------------- 1 | apache-flink==1.16.1 2 | kafka-python==2.0.2 3 | -------------------------------------------------------------------------------- /pyflink-udemy/requirements-16-dev.txt: -------------------------------------------------------------------------------- 1 | -r requirements-16.txt 2 | black 3 | pytest 4 | pytest-cov -------------------------------------------------------------------------------- /pyflink-udemy/requirements-15-dev.txt: -------------------------------------------------------------------------------- 1 | -r requirements-15.txt 2 | black==19.10b0 3 | pytest 4 | pytest-cov -------------------------------------------------------------------------------- /stream-processing-with-flink/.idea/.gitignore: 
-------------------------------------------------------------------------------- 1 | # Default ignored files 2 | /shelf/ 3 | /workspace.xml 4 | -------------------------------------------------------------------------------- /pyflink-doc/requirements-dev.txt: -------------------------------------------------------------------------------- 1 | -r requirements.txt 2 | apache-flink==1.17.1 3 | black 4 | pytest 5 | pytest-cov -------------------------------------------------------------------------------- /sql-cookbook/requirements-dev.txt: -------------------------------------------------------------------------------- 1 | -r requirements.txt 2 | apache-flink==1.17.1 3 | black 4 | pytest 5 | pytest-cov -------------------------------------------------------------------------------- /sql-cookbook/note.sh: -------------------------------------------------------------------------------- 1 | docker build -t flink-sql-cookbook:1.17.1-scala_2.12 . 2 | 3 | docker-compose run sql-client 4 | 5 | -------------------------------------------------------------------------------- /sql-training/mysql/create_tables.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE AreaCnts (areaId INT NOT NULL, cnt BIGINT NOT NULL, PRIMARY KEY (areaId)); 2 | -------------------------------------------------------------------------------- /fraud-detection/local/requirements-dev.txt: -------------------------------------------------------------------------------- 1 | -r requirements.txt 2 | apache-flink==1.15.2 3 | black==19.10b0 4 | pytest 5 | pytest-cov -------------------------------------------------------------------------------- /fraud-detection/remote/requirements-dev.txt: -------------------------------------------------------------------------------- 1 | -r requirements.txt 2 | apache-flink==1.15.2 3 | black==19.10b0 4 | pytest 5 | pytest-cov -------------------------------------------------------------------------------- /flink-sql-cookbook/img/sql-client.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jaehyeon-kim/flink-demos/HEAD/flink-sql-cookbook/img/sql-client.gif -------------------------------------------------------------------------------- /pyflink-getting-started-on-aws/local/requirements-dev.txt: -------------------------------------------------------------------------------- 1 | -r requirements.txt 2 | apache-flink==1.15.2 3 | black==19.10b0 4 | pytest 5 | pytest-cov -------------------------------------------------------------------------------- /stream-processing-with-pyflink/requirements-dev.txt: -------------------------------------------------------------------------------- 1 | -r requirements.txt 2 | apache-flink==1.17.1 3 | black 4 | pytest 5 | pytest-cov 6 | ipdb 7 | -------------------------------------------------------------------------------- /pyflink-udemy/seller-input/sellers.csv: -------------------------------------------------------------------------------- 1 | id,city,state 2 | LNK,Lincoln,Nebraska 3 | OMA,Omaha,Nebraska 4 | KC,Kansas City,Missouri 5 | DEN,Denver,Colorado -------------------------------------------------------------------------------- /real-time-streaming-aws/requirements-dev.txt: -------------------------------------------------------------------------------- 1 | -r producer/requirements.txt 2 | apache-flink==1.17.1 3 | black 4 | boto3 5 | pytest 6 | pytest-cov 7 | ipdb 
-------------------------------------------------------------------------------- /pyflink-getting-started-on-aws/remote/requirements-dev.txt: -------------------------------------------------------------------------------- 1 | -r requirements.txt 2 | apache-flink==1.15.2 3 | black==19.10b0 4 | boto3 5 | pytest 6 | pytest-cov -------------------------------------------------------------------------------- /sql-training/minio/data/.minio.sys/pool.bin/xl.meta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jaehyeon-kim/flink-demos/HEAD/sql-training/minio/data/.minio.sys/pool.bin/xl.meta -------------------------------------------------------------------------------- /sql-training/client-image/sql-client/sql-client.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ${FLINK_HOME}/bin/sql-client.sh embedded -d ${FLINK_HOME}/conf/sql-client-conf.yaml -l ${SQL_CLIENT_HOME}/lib -------------------------------------------------------------------------------- /sql-training/minio/data/.minio.sys/config/config.json/xl.meta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jaehyeon-kim/flink-demos/HEAD/sql-training/minio/data/.minio.sys/config/config.json/xl.meta -------------------------------------------------------------------------------- /sql-training/minio/data/.minio.sys/buckets/.usage.json/xl.meta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jaehyeon-kim/flink-demos/HEAD/sql-training/minio/data/.minio.sys/buckets/.usage.json/xl.meta -------------------------------------------------------------------------------- /stream-processing-with-flink/gradle/wrapper/gradle-wrapper.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jaehyeon-kim/flink-demos/HEAD/stream-processing-with-flink/gradle/wrapper/gradle-wrapper.jar -------------------------------------------------------------------------------- /sql-training/minio/data/.minio.sys/buckets/.bloomcycle.bin/xl.meta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jaehyeon-kim/flink-demos/HEAD/sql-training/minio/data/.minio.sys/buckets/.bloomcycle.bin/xl.meta -------------------------------------------------------------------------------- /sql-training/minio/data/.minio.sys/config/iam/format.json/xl.meta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jaehyeon-kim/flink-demos/HEAD/sql-training/minio/data/.minio.sys/config/iam/format.json/xl.meta -------------------------------------------------------------------------------- /flink-sql-cookbook/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM flink:1.20.1 2 | 3 | # add faker connector 4 | RUN wget -P /opt/flink/lib/ \ 5 | https://github.com/knaufk/flink-faker/releases/download/v0.5.3/flink-faker-0.5.3.jar -------------------------------------------------------------------------------- /pyflink-udemy/quarterly-sales-input/quarterly_sales.csv: -------------------------------------------------------------------------------- 1 | seller_id,q1,q2,q3,q4 2 | LNK,10000,12300,9600,13200 3 | OMA,18100,17600,11800,15000 4 | KC,19700,18600,21800,17300 5 | DEN,18500,19600,17200,22800 
-------------------------------------------------------------------------------- /stream-processing-with-flink/settings.gradle.kts: -------------------------------------------------------------------------------- 1 | plugins { 2 | id("org.gradle.toolchains.foojay-resolver-convention") version "0.8.0" 3 | } 4 | rootProject.name = "stream-processing-with-flink" 5 | -------------------------------------------------------------------------------- /stream-processing-with-flink/.idea/dictionaries/project.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | sideoutput 5 | 6 | 7 | -------------------------------------------------------------------------------- /real-time-streaming-aws/producer/requirements.txt: -------------------------------------------------------------------------------- 1 | # kafka-python with IAM auth support - https://github.com/dpkp/kafka-python/pull/2255 2 | https://github.com/mattoberle/kafka-python/archive/7ff323727d99e0c33a68423300e7f88a9cf3f830.tar.gz -------------------------------------------------------------------------------- /pyflink-getting-started-on-aws/remote/requirements.txt: -------------------------------------------------------------------------------- 1 | # kafka-python with IAM auth support - https://github.com/dpkp/kafka-python/pull/2255 2 | https://github.com/mattoberle/kafka-python/archive/7ff323727d99e0c33a68423300e7f88a9cf3f830.tar.gz -------------------------------------------------------------------------------- /stream-processing-with-flink/.idea/kotlinc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | -------------------------------------------------------------------------------- /stream-processing-with-flink/.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /pyflink-udemy/requirements-15.txt: -------------------------------------------------------------------------------- 1 | apache-flink==1.15.4 2 | # kafka-python with IAM auth support - https://github.com/dpkp/kafka-python/pull/2255 3 | https://github.com/mattoberle/kafka-python/archive/7ff323727d99e0c33a68423300e7f88a9cf3f830.tar.gz 4 | -------------------------------------------------------------------------------- /stream-processing-with-pyflink/src/chapter7/queryable_state.py: -------------------------------------------------------------------------------- 1 | # https://flink.apache.org/2023/10/24/announcing-the-release-of-apache-flink-1.18/#important-deprecations 2 | # Queryable State is now officially deprecated and will be dropped in Flink 2.0. 
3 | -------------------------------------------------------------------------------- /sql-training/minio/data/.minio.sys/format.json: -------------------------------------------------------------------------------- 1 | {"version":"1","format":"xl-single","id":"e28d208f-a22f-4eb4-933b-2de241ef0141","xl":{"version":"3","this":"d3684101-1f69-493b-90c9-56c4e2763ebd","sets":[["d3684101-1f69-493b-90c9-56c4e2763ebd"]],"distributionAlgo":"SIPMOD+PARITY"}} -------------------------------------------------------------------------------- /stream-processing-with-flink/.idea/ktlint-plugin.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | DISTRACT_FREE 5 | DEFAULT 6 | 7 | -------------------------------------------------------------------------------- /stream-processing-with-flink/gradle/wrapper/gradle-wrapper.properties: -------------------------------------------------------------------------------- 1 | #Fri Nov 21 17:12:54 AEDT 2025 2 | distributionBase=GRADLE_USER_HOME 3 | distributionPath=wrapper/dists 4 | distributionUrl=https\://services.gradle.org/distributions/gradle-8.14-bin.zip 5 | zipStoreBase=GRADLE_USER_HOME 6 | zipStorePath=wrapper/dists 7 | -------------------------------------------------------------------------------- /fraud-detection/remote/infra/providers.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_version = ">= 1.0.1" 3 | 4 | required_providers { 5 | aws = { 6 | source = "hashicorp/aws" 7 | version = ">= 3.72, < 5.0" 8 | } 9 | 10 | random = { 11 | source = "hashicorp/random" 12 | version = ">= 3.0.1" 13 | } 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /real-time-streaming-aws/infra/providers.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_version = ">= 1.0.1" 3 | 4 | required_providers { 5 | aws = { 6 | source = "hashicorp/aws" 7 | version = ">= 3.72, < 5.0" 8 | } 9 | 10 | random = { 11 | source = "hashicorp/random" 12 | version = ">= 3.0.1" 13 | } 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /real-time-streaming-aws/configs/ddb.json: -------------------------------------------------------------------------------- 1 | { 2 | "TableName": "real-time-streaming-taxi-rides", 3 | "KeySchema": [{ "AttributeName": "id", "KeyType": "HASH" }], 4 | "AttributeDefinitions": [{ "AttributeName": "id", "AttributeType": "S" }], 5 | "ProvisionedThroughput": { 6 | "ReadCapacityUnits": 1, 7 | "WriteCapacityUnits": 1 8 | } 9 | } 10 | -------------------------------------------------------------------------------- /pyflink-getting-started-on-aws/remote/infra/providers.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_version = ">= 1.0.1" 3 | 4 | required_providers { 5 | aws = { 6 | source = "hashicorp/aws" 7 | version = ">= 3.72, < 5.0" 8 | } 9 | 10 | random = { 11 | source = "hashicorp/random" 12 | version = ">= 3.0.1" 13 | } 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /stream-processing-with-flink/src/main/kotlin/me/jaehyeon/smoke/SmokeLevel.kt: -------------------------------------------------------------------------------- 1 | package me.jaehyeon.smoke 2 | 3 | /** 4 | * Represents the two possible levels of smoke detection. 
5 | * This is the standard, type-safe, and idiomatic way to define 6 | * a fixed set of constants in modern Kotlin. 7 | */ 8 | enum class SmokeLevel { 9 | High, 10 | Low, 11 | } 12 | -------------------------------------------------------------------------------- /sql-cookbook/src/python_udf.py: -------------------------------------------------------------------------------- 1 | from pyflink.table import DataTypes 2 | from pyflink.table.udf import udf 3 | 4 | us_cities = {"Chicago", "Portland", "Seattle", "New York"} 5 | 6 | 7 | @udf(input_types=[DataTypes.STRING(), DataTypes.FLOAT()], result_type=DataTypes.FLOAT()) 8 | def to_fahr(city, temperature): 9 | return temperature if city not in us_cities else (temperature * 9.0 / 5.0) + 32.0 10 | -------------------------------------------------------------------------------- /stream-processing-with-flink/.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | -------------------------------------------------------------------------------- /datorios/note.sh: -------------------------------------------------------------------------------- 1 | /opt/flink/examples/python/datastream/datorios 2 | cat /opt/flink/examples/python/datastream/datorios/tumbling_count_window.py 3 | 4 | 5 | docker exec 6 | 7 | ./datorios.sh my-cluster start 8 | ./datorios.sh list 9 | ./datorios.sh my-cluster flink run /flink_jobs/CarData.jar 10 | ./datorios.sh my-cluster stop 11 | 12 | ./datorios.sh my-cluster flink run \ 13 | -py /opt/flink/apps/tumbling_count_window.py 14 | -------------------------------------------------------------------------------- /real-time-streaming-aws/configs/note.sh: -------------------------------------------------------------------------------- 1 | curl -i -X POST -H "Accept:application/json" -H "Content-Type:application/json" \ 2 | http://localhost:8083/connectors/ -d @configs/sink.json 3 | 4 | curl http://localhost:8083/connectors/real-time-streaming-taxi-rides-sink/status 5 | 6 | curl -X DELETE http://localhost:8083/connectors/real-time-streaming-taxi-rides-sink 7 | 8 | aws dynamodb create-table --cli-input-json file://configs/ddb.json 9 | aws dynamodb delete-table --table-name real-time-streaming-taxi-rides -------------------------------------------------------------------------------- /stream-processing-with-flink/src/main/resources/simplelogger.properties: -------------------------------------------------------------------------------- 1 | # Default log level for all loggers 2 | org.slf4j.simpleLogger.defaultLogLevel=warn 3 | 4 | # Log level for a specific package 5 | org.slf4j.simpleLogger.log.me.jaehyeon=info 6 | 7 | # Show date and time 8 | org.slf4j.simpleLogger.showDateTime=true 9 | 10 | # Format for date and time 11 | org.slf4j.simpleLogger.dateTimeFormat=yyyy-MM-dd HH:mm:ss.SSS 12 | 13 | # Show the thread name 14 | org.slf4j.simpleLogger.showThreadName=false -------------------------------------------------------------------------------- /confluent-flink-101/notes.sh: -------------------------------------------------------------------------------- 1 | # build docker image 2 | docker build -t=confluent-flink-101:1.15.4 . 
3 | 4 | # start docker compose services and run sql client 5 | docker-compose -f compose-kafka.yml up -d 6 | docker-compose -f compose-flink-linked.yml up -d 7 | docker-compose -f compose-flink-linked.yml run sql-client 8 | 9 | 10 | docker run --rm -it --network=kafka-network bitnami/kafka:2.8.1 \ 11 | /opt/bitnami/kafka/bin/kafka-topics.sh \ 12 | --bootstrap-server kafka-0:9092 \ 13 | --create --topic pageviews 14 | -------------------------------------------------------------------------------- /stream-processing-with-flink/src/main/kotlin/me/jaehyeon/smoke/SmokeLevelSplit.kt: -------------------------------------------------------------------------------- 1 | package me.jaehyeon.smoke 2 | 3 | import org.apache.flink.api.connector.source.SourceSplit 4 | import java.io.Serializable 5 | 6 | /** 7 | * Represents a split for the SmokeLevelSource. 8 | * 9 | * Since each reader instance behaves identically, we only need a single split type. 10 | */ 11 | data class SmokeLevelSplit( 12 | private val id: String = "smoke-level-split", 13 | ) : SourceSplit, 14 | Serializable { 15 | override fun splitId(): String = id 16 | } 17 | -------------------------------------------------------------------------------- /pyflink-udemy/s3_01_tbl_env.py: -------------------------------------------------------------------------------- 1 | # batch/stream table env 2 | from pyflink.table import EnvironmentSettings, TableEnvironment 3 | 4 | batch_tbl_env = TableEnvironment.create(EnvironmentSettings.in_batch_mode()) 5 | stream_tbl_env = TableEnvironment.create(EnvironmentSettings.in_streaming_mode()) 6 | 7 | # from DataStream environment 8 | from pyflink.datastream import StreamExecutionEnvironment 9 | from pyflink.table import StreamTableEnvironment 10 | 11 | ds_env = StreamExecutionEnvironment.get_execution_environment() 12 | tbl_env = StreamTableEnvironment.create(ds_env) 13 | -------------------------------------------------------------------------------- /stream-processing-with-flink/src/main/kotlin/me/jaehyeon/smoke/Alert.kt: -------------------------------------------------------------------------------- 1 | package me.jaehyeon.smoke 2 | 3 | import java.io.Serializable 4 | 5 | /** 6 | * Represents a smoke level alert. 7 | * A `data class` is the idiomatic Kotlin way to create a class 8 | * whose primary purpose is to hold data. The compiler automatically 9 | * generates `equals()`, `hashCode()`, `toString()`, and `copy()` methods, 10 | * which is crucial for Flink operations.
11 | */ 12 | data class Alert( 13 | val sensorId: String, 14 | val timestamp: Long, 15 | val message: String, 16 | ) : Serializable 17 | -------------------------------------------------------------------------------- /confluent-flink-101/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM flink:1.15.4-scala_2.12 2 | 3 | RUN wget -P /opt/flink/lib/ https://repo.maven.apache.org/maven2/org/apache/flink/flink-connector-kafka/1.15.4/flink-connector-kafka-1.15.4.jar; \ 4 | wget -P /opt/flink/lib/ https://repo.maven.apache.org/maven2/org/apache/kafka/kafka-clients/2.8.1/kafka-clients-2.8.1.jar; \ 5 | wget -P /opt/flink/lib/ https://repo.maven.apache.org/maven2/org/apache/flink/flink-sql-connector-kafka/1.15.4/flink-sql-connector-kafka-1.15.4.jar; \ 6 | wget -P /opt/flink/lib/ https://github.com/knaufk/flink-faker/releases/download/v0.5.0/flink-faker-0.5.0.jar; 7 | -------------------------------------------------------------------------------- /fraud-detection/remote/infra/s3.tf: -------------------------------------------------------------------------------- 1 | resource "aws_s3_bucket" "default_bucket" { 2 | bucket = local.default_bucket.name 3 | 4 | force_destroy = true 5 | 6 | tags = local.tags 7 | } 8 | 9 | resource "aws_s3_bucket_acl" "default_bucket" { 10 | count = local.default_bucket.to_set_acl ? 1 : 0 11 | 12 | bucket = aws_s3_bucket.default_bucket.id 13 | acl = "private" 14 | } 15 | 16 | resource "aws_s3_bucket_server_side_encryption_configuration" "default_bucket" { 17 | bucket = aws_s3_bucket.default_bucket.bucket 18 | 19 | rule { 20 | apply_server_side_encryption_by_default { 21 | sse_algorithm = "AES256" 22 | } 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /fraud-detection/remote/infra/vpc.tf: -------------------------------------------------------------------------------- 1 | module "vpc" { 2 | source = "terraform-aws-modules/vpc/aws" 3 | version = "~> 3.14" 4 | 5 | name = "${local.name}-vpc" 6 | cidr = local.vpc.cidr 7 | 8 | azs = local.vpc.azs 9 | public_subnets = [for k, v in local.vpc.azs : cidrsubnet(local.vpc.cidr, 3, k)] 10 | private_subnets = [for k, v in local.vpc.azs : cidrsubnet(local.vpc.cidr, 3, k + 3)] 11 | 12 | enable_nat_gateway = true 13 | create_igw = true 14 | enable_dns_hostnames = true 15 | single_nat_gateway = true 16 | 17 | private_subnet_tags = { 18 | "Tier" = "Private" 19 | } 20 | 21 | tags = local.tags 22 | } 23 | -------------------------------------------------------------------------------- /real-time-streaming-aws/infra/vpc.tf: -------------------------------------------------------------------------------- 1 | module "vpc" { 2 | source = "terraform-aws-modules/vpc/aws" 3 | version = "~> 3.14" 4 | 5 | name = "${local.name}-vpc" 6 | cidr = local.vpc.cidr 7 | 8 | azs = local.vpc.azs 9 | public_subnets = [for k, v in local.vpc.azs : cidrsubnet(local.vpc.cidr, 3, k)] 10 | private_subnets = [for k, v in local.vpc.azs : cidrsubnet(local.vpc.cidr, 3, k + 3)] 11 | 12 | enable_nat_gateway = true 13 | create_igw = true 14 | enable_dns_hostnames = true 15 | single_nat_gateway = true 16 | 17 | private_subnet_tags = { 18 | "Tier" = "Private" 19 | } 20 | 21 | tags = local.tags 22 | } 23 | -------------------------------------------------------------------------------- /stream-processing-with-flink/src/main/kotlin/me/jaehyeon/connector/Post.kt: -------------------------------------------------------------------------------- 1 | package me.jaehyeon.connector 2 | 
3 | import kotlinx.serialization.Serializable 4 | import java.io.Serializable as JavaSerializable 5 | 6 | /** 7 | * Data class representing a Post, fetched from the external API. 8 | * 9 | * - @Serializable: For Ktor to deserialize JSON into this object. 10 | * - JavaSerializable: For Flink to send objects between TaskManagers. 11 | * An alias is used to avoid a name clash. 12 | */ 13 | @Serializable 14 | data class Post( 15 | val userId: Int, 16 | val id: Int, 17 | val title: String, 18 | val body: String, 19 | ) : JavaSerializable 20 | -------------------------------------------------------------------------------- /pyflink-getting-started-on-aws/remote/infra/s3.tf: -------------------------------------------------------------------------------- 1 | resource "aws_s3_bucket" "default_bucket" { 2 | bucket = local.default_bucket.name 3 | 4 | force_destroy = true 5 | 6 | tags = local.tags 7 | } 8 | 9 | resource "aws_s3_bucket_acl" "default_bucket" { 10 | count = local.default_bucket.to_set_acl ? 1 : 0 11 | 12 | bucket = aws_s3_bucket.default_bucket.id 13 | acl = "private" 14 | } 15 | 16 | resource "aws_s3_bucket_server_side_encryption_configuration" "default_bucket" { 17 | bucket = aws_s3_bucket.default_bucket.bucket 18 | 19 | rule { 20 | apply_server_side_encryption_by_default { 21 | sse_algorithm = "AES256" 22 | } 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /pyflink-getting-started-on-aws/remote/infra/vpc.tf: -------------------------------------------------------------------------------- 1 | module "vpc" { 2 | source = "terraform-aws-modules/vpc/aws" 3 | version = "~> 3.14" 4 | 5 | name = "${local.name}-vpc" 6 | cidr = local.vpc.cidr 7 | 8 | azs = local.vpc.azs 9 | public_subnets = [for k, v in local.vpc.azs : cidrsubnet(local.vpc.cidr, 3, k)] 10 | private_subnets = [for k, v in local.vpc.azs : cidrsubnet(local.vpc.cidr, 3, k + 3)] 11 | 12 | enable_nat_gateway = true 13 | create_igw = true 14 | enable_dns_hostnames = true 15 | single_nat_gateway = true 16 | 17 | private_subnet_tags = { 18 | "Tier" = "Private" 19 | } 20 | 21 | tags = local.tags 22 | } 23 | -------------------------------------------------------------------------------- /stream-processing-with-pyflink/src/chapter1/utils/type_helper.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractstaticmethod 2 | from typing import List, Dict 3 | 4 | from pyflink.common.typeinfo import Types, TypeInformation 5 | 6 | 7 | class TypeMapping(ABC): 8 | @abstractstaticmethod 9 | def type_mapping(): 10 | pass 11 | 12 | 13 | def set_type_info(type_mapping: Dict[str, TypeInformation], selects: List[str] = []): 14 | names, types = [], [] 15 | for key in type_mapping.keys(): 16 | if not selects or key in selects: 17 | names.append(key) 18 | types.append(type_mapping[key]) 19 | return Types.ROW_NAMED(field_names=names, field_types=types) 20 | -------------------------------------------------------------------------------- /stream-processing-with-pyflink/src/chapter5/utils/type_helper.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractstaticmethod 2 | from typing import List, Dict 3 | 4 | from pyflink.common.typeinfo import Types, TypeInformation 5 | 6 | 7 | class TypeMapping(ABC): 8 | @abstractstaticmethod 9 | def type_mapping(): 10 | pass 11 | 12 | 13 | def set_type_info(type_mapping: Dict[str, TypeInformation], selects: List[str] = []): 14 | names, types = [], 
[] 15 | for key in type_mapping.keys(): 16 | if not selects or key in selects: 17 | names.append(key) 18 | types.append(type_mapping[key]) 19 | return Types.ROW_NAMED(field_names=names, field_types=types) 20 | -------------------------------------------------------------------------------- /stream-processing-with-pyflink/src/chapter6/utils/type_helper.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractstaticmethod 2 | from typing import List, Dict 3 | 4 | from pyflink.common.typeinfo import Types, TypeInformation 5 | 6 | 7 | class TypeMapping(ABC): 8 | @abstractstaticmethod 9 | def type_mapping(): 10 | pass 11 | 12 | 13 | def set_type_info(type_mapping: Dict[str, TypeInformation], selects: List[str] = []): 14 | names, types = [], [] 15 | for key in type_mapping.keys(): 16 | if not selects or key in selects: 17 | names.append(key) 18 | types.append(type_mapping[key]) 19 | return Types.ROW_NAMED(field_names=names, field_types=types) 20 | -------------------------------------------------------------------------------- /stream-processing-with-pyflink/src/chapter7/utils/type_helper.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractstaticmethod 2 | from typing import List, Dict 3 | 4 | from pyflink.common.typeinfo import Types, TypeInformation 5 | 6 | 7 | class TypeMapping(ABC): 8 | @abstractstaticmethod 9 | def type_mapping(): 10 | pass 11 | 12 | 13 | def set_type_info(type_mapping: Dict[str, TypeInformation], selects: List[str] = []): 14 | names, types = [], [] 15 | for key in type_mapping.keys(): 16 | if not selects or key in selects: 17 | names.append(key) 18 | types.append(type_mapping[key]) 19 | return Types.ROW_NAMED(field_names=names, field_types=types) 20 | -------------------------------------------------------------------------------- /real-time-streaming-aws/loader/application_properties.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "PropertyGroupId": "kinesis.analytics.flink.run.options", 4 | "PropertyMap": { 5 | "python": "processor.py", 6 | "jarfile": "package/lib/lab2-pipeline-1.0.0.jar" 7 | } 8 | }, 9 | { 10 | "PropertyGroupId": "source.config.0", 11 | "PropertyMap": { 12 | "table.name": "taxi_trip_source", 13 | "file.path": "s3://real-time-streaming-ap-southeast-2/taxi-csv/" 14 | } 15 | }, 16 | { 17 | "PropertyGroupId": "sink.config.0", 18 | "PropertyMap": { 19 | "table.name": "taxi_trip_sink", 20 | "topic.name": "taxi-trip", 21 | "bootstrap.servers": "localhost:29092" 22 | } 23 | } 24 | ] 25 | -------------------------------------------------------------------------------- /real-time-streaming-aws/exporter/application_properties.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "PropertyGroupId": "kinesis.analytics.flink.run.options", 4 | "PropertyMap": { 5 | "python": "processor.py", 6 | "jarfile": "package/lib/lab3-pipeline-1.0.0.jar" 7 | } 8 | }, 9 | { 10 | "PropertyGroupId": "source.config.0", 11 | "PropertyMap": { 12 | "table.name": "taxi_rides_src", 13 | "topic.name": "taxi-rides", 14 | "bootstrap.servers": "localhost:29092" 15 | } 16 | }, 17 | { 18 | "PropertyGroupId": "sink.config.0", 19 | "PropertyMap": { 20 | "table.name": "taxi_rides_sink", 21 | "file.path": "s3://real-time-streaming-ap-southeast-2/taxi-rides/" 22 | } 23 | } 24 | ] 25 | -------------------------------------------------------------------------------- 
/stream-processing-with-flink/src/main/kotlin/me/jaehyeon/sensor/SensorReading.kt: -------------------------------------------------------------------------------- 1 | package me.jaehyeon.sensor 2 | 3 | import java.io.Serializable 4 | 5 | /** 6 | * Represents a single sensor reading. 7 | * 8 | * This data class is the fundamental event type used throughout the Flink job. 9 | * It must be Serializable to be sent across the Flink cluster. 10 | * 11 | * @property id The unique identifier of the sensor. 12 | * @property timestamp The timestamp of the reading, in milliseconds since the epoch. 13 | * @property temperature The temperature value of the reading. 14 | */ 15 | data class SensorReading( 16 | val id: String, 17 | val timestamp: Long, 18 | val temperature: Double, 19 | ) : Serializable 20 | -------------------------------------------------------------------------------- /real-time-streaming-aws/forwarder/application_properties.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "PropertyGroupId": "kinesis.analytics.flink.run.options", 4 | "PropertyMap": { 5 | "python": "processor.py", 6 | "jarfile": "package/lib/lab4-pipeline-1.0.0.jar" 7 | } 8 | }, 9 | { 10 | "PropertyGroupId": "source.config.0", 11 | "PropertyMap": { 12 | "table.name": "taxi_rides_src", 13 | "topic.name": "taxi-rides", 14 | "bootstrap.servers": "localhost:29092" 15 | } 16 | }, 17 | { 18 | "PropertyGroupId": "sink.config.0", 19 | "PropertyMap": { 20 | "table.name": "trip_stats_sink", 21 | "os_hosts": "http://opensearch:9200", 22 | "os_index": "trip_stats" 23 | } 24 | } 25 | ] 26 | -------------------------------------------------------------------------------- /stream-processing-with-pyflink/src/chapter5/rolling_sum.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pyflink.datastream import StreamExecutionEnvironment, RuntimeExecutionMode 4 | 5 | if __name__ == "__main__": 6 | """ 7 | ## local execution 8 | python src/chapter5/rolling_sum.py 9 | """ 10 | 11 | RUNTIME_ENV = os.getenv("RUNTIME_ENV", "local") 12 | 13 | env = StreamExecutionEnvironment.get_execution_environment() 14 | env.set_runtime_mode(RuntimeExecutionMode.STREAMING) 15 | 16 | input_stream = env.from_collection(collection=[(1, 2, 2), (2, 3, 1), (2, 2, 4), (1, 5, 3)]) 17 | 18 | result_stream = input_stream.key_by(lambda e: e[0]).sum(1) 19 | 20 | result_stream.print() 21 | 22 | env.execute("Rolling Sum Example") 23 | -------------------------------------------------------------------------------- /fraud-detection/remote/infra/data.tf: -------------------------------------------------------------------------------- 1 | ## data sources for general resources 2 | # Find the user currently in use by AWS 3 | data "aws_caller_identity" "current" {} 4 | 5 | # Region in which to deploy the solution 6 | data "aws_region" "current" {} 7 | 8 | # Availability zones to use in our solution 9 | data "aws_availability_zones" "available" { 10 | state = "available" 11 | } 12 | 13 | ## data sources for VPN 14 | # Local ip address 15 | data "http" "local_ip_address" { 16 | url = "https://ifconfig.me/ip" 17 | } 18 | 19 | # Latest Amazon linux 2 AMI 20 | data "aws_ami" "amazon_linux_2" { 21 | owners = ["amazon"] 22 | most_recent = true 23 | 24 | filter { 25 | name = "name" 26 | values = ["amzn2-ami-hvm-*-x86_64-ebs"] 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /stream-processing-with-flink/.idea/gradle.xml:
-------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 17 | 18 | -------------------------------------------------------------------------------- /pyflink-getting-started-on-aws/remote/infra/data.tf: -------------------------------------------------------------------------------- 1 | ## data sources for general resources 2 | # Find the user currently in use by AWS 3 | data "aws_caller_identity" "current" {} 4 | 5 | # Region in which to deploy the solution 6 | data "aws_region" "current" {} 7 | 8 | # Availability zones to use in our solution 9 | data "aws_availability_zones" "available" { 10 | state = "available" 11 | } 12 | 13 | ## data sources for VPN 14 | # Local ip address 15 | data "http" "local_ip_address" { 16 | url = "https://ifconfig.me/ip" 17 | } 18 | 19 | # Latest Amazon linux 2 AMI 20 | data "aws_ami" "amazon_linux_2" { 21 | owners = ["amazon"] 22 | most_recent = true 23 | 24 | filter { 25 | name = "name" 26 | values = ["amzn2-ami-hvm-*-x86_64-ebs"] 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /pyflink-udemy/csv-input/locale-sales.csv: -------------------------------------------------------------------------------- 1 | seller_id,product,quantity,product_price,sales_date 2 | LNK,Toothbrush,22,3.99,2021-07-01 3 | LNK,Dental Floss,17,1.99,2021-07-01 4 | LNK,Toothpaste,8,4.99,2021-07-01 5 | OMA,Toothbrush,29,3.99,2021-07-01 6 | OMA,Toothpaste,9,4.99,2021-07-01 7 | OMA,Dental Floss,23,1.99,2021-07-01 8 | LNK,Toothbrush,25,3.99,2021-07-02 9 | LNK,Dental Floss,16,1.99,2021-07-02 10 | LNK,Toothpaste,9,4.99,2021-07-02 11 | OMA,Toothbrush,32,3.99,2021-07-02 12 | OMA,Toothpaste,13,4.99,2021-07-02 13 | OMA,Dental Floss,18,1.99,2021-07-02 14 | LNK,Toothbrush,20,3.99,2021-07-03 15 | LNK,Dental Floss,15,1.99,2021-07-03 16 | LNK,Toothpaste,11,4.99,2021-07-03 17 | OMA,Toothbrush,31,3.99,2021-07-03 18 | OMA,Toothpaste,10,4.99,2021-07-03 19 | OMA,Dental Floss,21,1.99,2021-07-03 -------------------------------------------------------------------------------- /pyflink-udemy/s3_03_csv_source.py: -------------------------------------------------------------------------------- 1 | from pyflink.table import EnvironmentSettings, TableEnvironment, DataTypes, CsvTableSource 2 | 3 | tbl_env = TableEnvironment.create(EnvironmentSettings.in_batch_mode()) 4 | 5 | field_names = "seller_id,product,quantity,product_price,sales_date".split(",") 6 | field_types = [ 7 | DataTypes.STRING(), 8 | DataTypes.STRING(), 9 | DataTypes.INT(), 10 | DataTypes.DOUBLE(), 11 | DataTypes.DATE(), 12 | ] 13 | source = CsvTableSource("./csv-input", field_names, field_types, ignore_first_line=True) 14 | tbl_env.register_table_source("product_locale_sales", source) 15 | tbl = tbl_env.from_path("product_locale_sales") 16 | print("\nProduct Sales Schema") 17 | tbl.print_schema() 18 | print("\nProduct Sales data") 19 | print(tbl.to_pandas()) 20 | -------------------------------------------------------------------------------- /stream-processing-with-flink/.gitignore: -------------------------------------------------------------------------------- 1 | .gradle 2 | build/ 3 | !gradle/wrapper/gradle-wrapper.jar 4 | !**/src/main/**/build/ 5 | !**/src/test/**/build/ 6 | 7 | ### IntelliJ IDEA ### 8 | .idea/modules.xml 9 | .idea/jarRepositories.xml 10 | .idea/compiler.xml 11 | .idea/libraries/ 12 | *.iws 13 | *.iml 14 | *.ipr 15 | out/ 16 | !**/src/main/**/out/ 17 | !**/src/test/**/out/ 18 | 19 | ### Kotlin ### 20 | .kotlin 21 | 22 | ### Eclipse
### 23 | .apt_generated 24 | .classpath 25 | .factorypath 26 | .project 27 | .settings 28 | .springBeans 29 | .sts4-cache 30 | bin/ 31 | !**/src/main/**/bin/ 32 | !**/src/test/**/bin/ 33 | 34 | ### NetBeans ### 35 | /nbproject/private/ 36 | /nbbuild/ 37 | /dist/ 38 | /nbdist/ 39 | /.nb-gradle/ 40 | 41 | ### VS Code ### 42 | .vscode/ 43 | 44 | ### Mac OS ### 45 | .DS_Store -------------------------------------------------------------------------------- /pyflink-getting-started-on-aws/remote/compose-ui.yml: -------------------------------------------------------------------------------- 1 | version: "3" 2 | 3 | services: 4 | kpow: 5 | image: factorhouse/kpow-ce:91.2.1 6 | container_name: kpow 7 | ports: 8 | - "3000:3000" 9 | networks: 10 | - appnet 11 | environment: 12 | AWS_ACCESS_KEY_ID: $AWS_ACCESS_KEY_ID 13 | AWS_SECRET_ACCESS_KEY: $AWS_SECRET_ACCESS_KEY 14 | AWS_SESSION_TOKEN: $AWS_SESSION_TOKEN 15 | # kafka cluster 16 | BOOTSTRAP: $BOOTSTRAP_SERVERS 17 | SECURITY_PROTOCOL: SASL_SSL 18 | SASL_MECHANISM: AWS_MSK_IAM 19 | SASL_CLIENT_CALLBACK_HANDLER_CLASS: software.amazon.msk.auth.iam.IAMClientCallbackHandler 20 | SASL_JAAS_CONFIG: software.amazon.msk.auth.iam.IAMLoginModule required; 21 | 22 | networks: 23 | appnet: 24 | name: app-network 25 | -------------------------------------------------------------------------------- /pyflink-getting-started-on-aws/local/build.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | SCRIPT_DIR="$(cd $(dirname "$0"); pwd)" 3 | SRC_PATH=$SCRIPT_DIR/package 4 | rm -rf $SRC_PATH && mkdir -p $SRC_PATH/lib 5 | 6 | ## Download flink sql connector kafka 7 | echo "download flink sql connector kafka..." 8 | VERSION=1.15.2 9 | FILE_NAME=flink-sql-connector-kafka-$VERSION 10 | DOWNLOAD_URL=https://repo.maven.apache.org/maven2/org/apache/flink/flink-sql-connector-kafka/$VERSION/flink-sql-connector-kafka-$VERSION.jar 11 | curl -L -o $SRC_PATH/lib/$FILE_NAME.jar ${DOWNLOAD_URL} 12 | 13 | ## Install pip packages 14 | echo "install and zip pip packages..." 
15 | pip install -r requirements.txt --target $SRC_PATH/site_packages 16 | 17 | ## Package pyflink app 18 | echo "package pyflink app" 19 | zip -r kda-package.zip processor.py package/lib package/site_packages 20 | -------------------------------------------------------------------------------- /pyflink-getting-started-on-aws/remote/application_properties.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "PropertyGroupId": "kinesis.analytics.flink.run.options", 4 | "PropertyMap": { 5 | "python": "processor.py", 6 | "jarfile": "package/lib/pyflink-getting-started-1.0.0.jar", 7 | "pyFiles": "package/site_packages/" 8 | } 9 | }, 10 | { 11 | "PropertyGroupId": "consumer.config.0", 12 | "PropertyMap": { 13 | "table.name": "source_table", 14 | "topic.name": "stocks-in", 15 | "bootstrap.servers": "localhost:29092", 16 | "startup.mode": "earliest-offset" 17 | } 18 | }, 19 | { 20 | "PropertyGroupId": "producer.config.0", 21 | "PropertyMap": { 22 | "table.name": "sink_table", 23 | "topic.name": "stocks-out", 24 | "bootstrap.servers": "localhost:29092" 25 | } 26 | } 27 | ] 28 | -------------------------------------------------------------------------------- /pyflink-getting-started-on-aws/local/application_properties.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "PropertyGroupId": "kinesis.analytics.flink.run.options", 4 | "PropertyMap": { 5 | "python": "processor.py", 6 | "jarfile": "package/lib/flink-sql-connector-kinesis-1.15.2.jar", 7 | "pyFiles": "package/site_packages/" 8 | } 9 | }, 10 | { 11 | "PropertyGroupId": "consumer.config.0", 12 | "PropertyMap": { 13 | "table.name": "source_table", 14 | "topic.name": "stocks-in", 15 | "bootstrap.servers": "localhost:29092", 16 | "startup.mode": "earliest-offset" 17 | } 18 | }, 19 | { 20 | "PropertyGroupId": "producer.config.0", 21 | "PropertyMap": { 22 | "table.name": "sink_table", 23 | "topic.name": "stocks-out", 24 | "bootstrap.servers": "localhost:29092" 25 | } 26 | } 27 | ] 28 | -------------------------------------------------------------------------------- /real-time-streaming-aws/compose-ui.yml: -------------------------------------------------------------------------------- 1 | version: "3" 2 | 3 | services: 4 | kpow: 5 | image: factorhouse/kpow-ce:91.5.1 6 | container_name: kpow 7 | ports: 8 | - "3000:3000" 9 | networks: 10 | - kafkanet 11 | environment: 12 | AWS_ACCESS_KEY_ID: $AWS_ACCESS_KEY_ID 13 | AWS_SECRET_ACCESS_KEY: $AWS_SECRET_ACCESS_KEY 14 | # AWS_SESSION_TOKEN: $AWS_SESSION_TOKEN 15 | BOOTSTRAP: $BOOTSTRAP_SERVERS 16 | SECURITY_PROTOCOL: SASL_SSL 17 | SASL_MECHANISM: AWS_MSK_IAM 18 | SASL_JAAS_CONFIG: software.amazon.msk.auth.iam.IAMLoginModule required; 19 | SASL_CLIENT_CALLBACK_HANDLER_CLASS: software.amazon.msk.auth.iam.IAMClientCallbackHandler 20 | env_file: # https://kpow.io/get-started/#individual 21 | - ./kpow.env 22 | 23 | networks: 24 | kafkanet: 25 | name: kafka-network 26 | -------------------------------------------------------------------------------- /fraud-detection/remote/infra/ddb.tf: -------------------------------------------------------------------------------- 1 | resource "aws_dynamodb_table" "transactions_table" { 2 | name = "${local.name}-flagged-transactions" 3 | billing_mode = "PROVISIONED" 4 | read_capacity = 2 5 | write_capacity = 2 6 | hash_key = "transaction_id" 7 | range_key = "transaction_date" 8 | 9 | attribute { 10 | name = "transaction_id" 11 | type = "S" 12 | } 13 | 14 | attribute { 15 | 
name = "account_id" 16 | type = "N" 17 | } 18 | 19 | attribute { 20 | name = "transaction_date" 21 | type = "S" 22 | } 23 | 24 | global_secondary_index { 25 | name = "account" 26 | hash_key = "account_id" 27 | range_key = "transaction_date" 28 | write_capacity = 2 29 | read_capacity = 2 30 | projection_type = "ALL" 31 | } 32 | 33 | tags = local.tags 34 | } 35 | -------------------------------------------------------------------------------- /fraud-detection/remote/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3" 2 | 3 | services: 4 | kpow: 5 | image: factorhouse/kpow-ce:91.2.1 6 | container_name: kpow 7 | ports: 8 | - "3000:3000" 9 | networks: 10 | - appnet 11 | environment: 12 | AWS_ACCESS_KEY_ID: $AWS_ACCESS_KEY_ID 13 | AWS_SECRET_ACCESS_KEY: $AWS_SECRET_ACCESS_KEY 14 | AWS_SESSION_TOKEN: $AWS_SESSION_TOKEN 15 | # MSK cluster 16 | BOOTSTRAP: $BOOTSTRAP_SERVERS 17 | SECURITY_PROTOCOL: SASL_SSL 18 | SASL_MECHANISM: AWS_MSK_IAM 19 | SASL_CLIENT_CALLBACK_HANDLER_CLASS: software.amazon.msk.auth.iam.IAMClientCallbackHandler 20 | SASL_JAAS_CONFIG: software.amazon.msk.auth.iam.IAMLoginModule required; 21 | # MSK connect 22 | CONNECT_AWS_REGION: $AWS_DEFAULT_REGION 23 | 24 | networks: 25 | appnet: 26 | name: app-network 27 | -------------------------------------------------------------------------------- /sql-training/client-image/conf/flink-conf.yaml: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # Copyright 2019 Ververica GmbH 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | ################################################################################ 16 | 17 | jobmanager.rpc.address: jobmanager 18 | -------------------------------------------------------------------------------- /fraud-detection/local/configs/sink.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "transactions-sink", 3 | "config": { 4 | "connector.class": "org.apache.camel.kafkaconnector.awsddbsink.CamelAwsddbsinkSinkConnector", 5 | "tasks.max": "2", 6 | "key.converter": "org.apache.kafka.connect.json.JsonConverter", 7 | "key.converter.schemas.enable": false, 8 | "value.converter": "org.apache.kafka.connect.json.JsonConverter", 9 | "value.converter.schemas.enable": false, 10 | "topics": "flagged-transactions", 11 | 12 | "camel.kamelet.aws-ddb-sink.table": "flagged-transactions", 13 | "camel.kamelet.aws-ddb-sink.region": "ap-southeast-2", 14 | "camel.kamelet.aws-ddb-sink.operation": "PutItem", 15 | "camel.kamelet.aws-ddb-sink.writeCapacity": 1, 16 | "camel.kamelet.aws-ddb-sink.useDefaultCredentialsProvider": true, 17 | "camel.sink.unmarshal": "jackson" 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /fraud-detection/local/compose-connect.yml: -------------------------------------------------------------------------------- 1 | version: "3.5" 2 | 3 | services: 4 | kafka-connect: 5 | image: bitnami/kafka:2.8.1 6 | container_name: connect 7 | command: > 8 | /opt/bitnami/kafka/bin/connect-distributed.sh 9 | /opt/bitnami/kafka/config/connect-distributed.properties 10 | ports: 11 | - "8083:8083" 12 | networks: 13 | - kafkanet 14 | environment: 15 | AWS_ACCESS_KEY_ID: $AWS_ACCESS_KEY_ID 16 | AWS_SECRET_ACCESS_KEY: $AWS_SECRET_ACCESS_KEY 17 | AWS_SESSION_TOKEN: $AWS_SESSION_TOKEN 18 | volumes: 19 | - "./configs/connect-distributed.properties:/opt/bitnami/kafka/config/connect-distributed.properties" 20 | - "./connectors/camel-aws-ddb-sink-kafka-connector:/opt/connectors/camel-aws-ddb-sink-kafka-connector" 21 | 22 | networks: 23 | kafkanet: 24 | external: true 25 | name: kafka-network 26 | -------------------------------------------------------------------------------- /real-time-streaming-aws/infra/data.tf: -------------------------------------------------------------------------------- 1 | ## data sources for general resources 2 | # Find the user currently in use by AWS 3 | data "aws_caller_identity" "current" {} 4 | 5 | # Region in which to deploy the solution 6 | data "aws_region" "current" {} 7 | 8 | # Availability zones to use in our solution 9 | data "aws_availability_zones" "available" { 10 | state = "available" 11 | } 12 | 13 | ## data sources for VPN 14 | # Local ip address 15 | data "http" "local_ip_address" { 16 | url = "https://ifconfig.me/ip" 17 | } 18 | 19 | # Latest Amazon linux 2 AMI 20 | data "aws_ami" "amazon_linux_2" { 21 | owners = ["amazon"] 22 | most_recent = true 23 | 24 | filter { 25 | name = "name" 26 | values = ["amzn2-ami-hvm-*-x86_64-ebs"] 27 | } 28 | } 29 | 30 | data "aws_iam_role" "opensearch_service_linked_role" { 31 | name = "AWSServiceRoleForAmazonOpenSearchService" 32 | } 33 | -------------------------------------------------------------------------------- /real-time-streaming-aws/exporter/athena.sql: -------------------------------------------------------------------------------- 1 | -- https://docs.aws.amazon.com/athena/latest/ug/partitions.html 2 | CREATE EXTERNAL TABLE taxi_rides ( 3 | id STRING, 4 | vendor_id INT, 5 | pickup_datetime TIMESTAMP, 6 |
dropoff_datetime TIMESTAMP, 7 | passenger_count INT, 8 | pickup_longitude STRING, 9 | pickup_latitude STRING, 10 | dropoff_longitude STRING, 11 | dropoff_latitude STRING, 12 | store_and_fwd_flag STRING, 13 | gc_distance INT, 14 | trip_duration INT, 15 | google_distance INT, 16 | google_duration INT 17 | ) 18 | PARTITIONED BY (year STRING, month STRING, date STRING, hour STRING) 19 | STORED AS parquet 20 | LOCATION 's3://real-time-streaming-ap-southeast-2/taxi-rides/'; 21 | 22 | MSCK REPAIR TABLE taxi_rides; 23 | 24 | SELECT * FROM taxi_rides WHERE year='2023'; -------------------------------------------------------------------------------- /real-time-streaming-aws/configs/sink.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "real-time-streaming-taxi-rides-sink", 3 | "config": { 4 | "connector.class": "org.apache.camel.kafkaconnector.awsddbsink.CamelAwsddbsinkSinkConnector", 5 | "tasks.max": "1", 6 | "key.converter": "org.apache.kafka.connect.storage.StringConverter", 7 | "key.converter.schemas.enable": false, 8 | "value.converter": "org.apache.kafka.connect.json.JsonConverter", 9 | "value.converter.schemas.enable": false, 10 | "topics": "taxi-rides", 11 | 12 | "camel.kamelet.aws-ddb-sink.table": "real-time-streaming-taxi-rides", 13 | "camel.kamelet.aws-ddb-sink.region": "ap-southeast-2", 14 | "camel.kamelet.aws-ddb-sink.operation": "PutItem", 15 | "camel.kamelet.aws-ddb-sink.writeCapacity": 1, 16 | "camel.kamelet.aws-ddb-sink.useDefaultCredentialsProvider": true, 17 | "camel.sink.unmarshal": "jackson" 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /real-time-streaming-aws/infra/s3.tf: -------------------------------------------------------------------------------- 1 | resource "aws_s3_bucket" "default_bucket" { 2 | bucket = local.default_bucket.name 3 | 4 | force_destroy = true 5 | 6 | tags = local.tags 7 | } 8 | 9 | resource "aws_s3_bucket_acl" "default_bucket" { 10 | count = local.default_bucket.to_set_acl ? 1 : 0 11 | 12 | bucket = aws_s3_bucket.default_bucket.id 13 | acl = "private" 14 | } 15 | 16 | resource "aws_s3_bucket_server_side_encryption_configuration" "default_bucket" { 17 | bucket = aws_s3_bucket.default_bucket.bucket 18 | 19 | rule { 20 | apply_server_side_encryption_by_default { 21 | sse_algorithm = "AES256" 22 | } 23 | } 24 | } 25 | 26 | resource "aws_s3_object" "kda_package" { 27 | bucket = aws_s3_bucket.default_bucket.id 28 | key = "taxi-csv/taxi-trips.csv" 29 | source = "${dirname(path.cwd)}/data/taxi-trips.csv" 30 | 31 | etag = filemd5("${dirname(path.cwd)}/data/taxi-trips.csv") 32 | } 33 | -------------------------------------------------------------------------------- /real-time-streaming-aws/build.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | SCRIPT_DIR="$(cd $(dirname "$0"); pwd)" 3 | SRC_PATH=$SCRIPT_DIR/package 4 | 5 | # remove contents under $SRC_PATH (except for the folders beginning with lab) 6 | shopt -s extglob 7 | rm -rf $SRC_PATH/!(lab*) 8 | 9 | ## Generate Uber jar file for individual labs 10 | echo "generate Uber jar for PyFlink app..."
11 | mkdir $SRC_PATH/lib 12 | mvn clean install -f $SRC_PATH/lab2-pipeline/pom.xml \ 13 | && mv $SRC_PATH/lab2-pipeline/target/lab2-pipeline-1.0.0.jar $SRC_PATH/lib \ 14 | && rm -rf $SRC_PATH/lab2-pipeline/target 15 | 16 | mvn clean install -f $SRC_PATH/lab3-pipeline/pom.xml \ 17 | && mv $SRC_PATH/lab3-pipeline/target/lab3-pipeline-1.0.0.jar $SRC_PATH/lib \ 18 | && rm -rf $SRC_PATH/lab3-pipeline/target 19 | 20 | mvn clean install -f $SRC_PATH/lab4-pipeline/pom.xml \ 21 | && mv $SRC_PATH/lab4-pipeline/target/lab4-pipeline-1.0.0.jar $SRC_PATH/lib \ 22 | && rm -rf $SRC_PATH/lab4-pipeline/target 23 | -------------------------------------------------------------------------------- /sql-training/client-image/java/sql-training-data-producer/src/main/java/com/ververica/sql_training/data_producer/records/TaxiRecord.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Ververica GmbH 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.ververica.sql_training.data_producer.records; 18 | 19 | import java.util.Date; 20 | 21 | public interface TaxiRecord { 22 | 23 | Date getEventTime(); 24 | } 25 | -------------------------------------------------------------------------------- /pyflink-udemy/s3_02_python_source.py: -------------------------------------------------------------------------------- 1 | from pyflink.table import EnvironmentSettings, TableEnvironment, DataTypes 2 | 3 | tbl_env = TableEnvironment.create(EnvironmentSettings.in_batch_mode()) 4 | 5 | products = [("Toothbrush", 3.99), ("Dental Floss", 1.99), ("Toothpaste", 4.99)] 6 | 7 | ## tbl1 8 | tbl1 = tbl_env.from_elements(products) 9 | print("\ntbl1 schema") 10 | tbl1.print_schema() 11 | print("\ntbl1 data") 12 | print(tbl1.to_pandas()) 13 | 14 | ## tbl2 15 | col_names = ["product", "price"] 16 | tbl2 = tbl_env.from_elements(products, col_names) 17 | print("\ntbl2 schema") 18 | tbl2.print_schema() 19 | print("\ntbl2 data") 20 | print(tbl2.to_pandas()) 21 | 22 | ## tbl3 23 | schema = DataTypes.ROW( 24 | [DataTypes.FIELD("product", DataTypes.STRING()), DataTypes.FIELD("price", DataTypes.DOUBLE())] 25 | ) 26 | tbl3 = tbl_env.from_elements(products, schema) 27 | print("\ntbl3 schema") 28 | tbl3.print_schema() 29 | print("\ntbl3 data") 30 | print(tbl3.to_pandas()) 31 | -------------------------------------------------------------------------------- /pyflink-getting-started-on-aws/remote/build.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | SCRIPT_DIR="$(cd $(dirname "$0"); pwd)" 3 | SRC_PATH=$SCRIPT_DIR/package 4 | 5 | # remove contents under $SRC_PATH (except for uber-jar-for-pyflink) and kda-package.zip file 6 | shopt -s extglob 7 | rm -rf $SRC_PATH/!(uber-jar-for-pyflink) kda-package.zip 8 | 9 | ## Generate Uber Jar for PyFlink app for MSK cluster with IAM authN 10 | echo "generate Uber jar for PyFlink app..." 
11 | mkdir $SRC_PATH/lib 12 | mvn clean install -f $SRC_PATH/uber-jar-for-pyflink/pom.xml \ 13 | && mv $SRC_PATH/uber-jar-for-pyflink/target/pyflink-getting-started-1.0.0.jar $SRC_PATH/lib \ 14 | && rm -rf $SRC_PATH/uber-jar-for-pyflink/target 15 | 16 | ## Install pip packages 17 | echo "install and zip pip packages..." 18 | pip install -r requirements.txt --target $SRC_PATH/site_packages 19 | 20 | ## Package pyflink app 21 | echo "package pyflink app" 22 | zip -r kda-package.zip processor.py package/lib package/site_packages 23 | -------------------------------------------------------------------------------- /real-time-streaming-aws/download.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | SCRIPT_DIR="$(cd $(dirname "$0"); pwd)" 3 | 4 | SRC_PATH=${SCRIPT_DIR}/infra/connectors 5 | rm -rf ${SRC_PATH} && mkdir -p ${SRC_PATH} 6 | 7 | ## Download camel dynamodb sink connector 8 | echo "download camel dynamodb sink connector..." 9 | DOWNLOAD_URL=https://repo.maven.apache.org/maven2/org/apache/camel/kafkaconnector/camel-aws-ddb-sink-kafka-connector/3.20.3/camel-aws-ddb-sink-kafka-connector-3.20.3-package.tar.gz 10 | 11 | # decompress and zip contents to create custom plugin of msk connect later 12 | curl -o ${SRC_PATH}/camel-aws-ddb-sink-kafka-connector.tar.gz ${DOWNLOAD_URL} \ 13 | && tar -xvzf ${SRC_PATH}/camel-aws-ddb-sink-kafka-connector.tar.gz -C ${SRC_PATH} \ 14 | && cd ${SRC_PATH}/camel-aws-ddb-sink-kafka-connector \ 15 | && zip -r camel-aws-ddb-sink-kafka-connector.zip . \ 16 | && mv camel-aws-ddb-sink-kafka-connector.zip ${SRC_PATH} \ 17 | && rm ${SRC_PATH}/camel-aws-ddb-sink-kafka-connector.tar.gz -------------------------------------------------------------------------------- /stream-processing-with-flink/src/main/kotlin/me/jaehyeon/sensor/SensorSplit.kt: -------------------------------------------------------------------------------- 1 | package me.jaehyeon.sensor 2 | 3 | import org.apache.flink.api.connector.source.SourceSplit 4 | import java.io.Serializable 5 | 6 | /** 7 | * Represents a split of work for the SensorSource. 8 | * 9 | * In the context of this custom source, a "split" is a logical unit of work assigned 10 | * to a single parallel SourceReader instance. Here, it simply wraps the subtask index 11 | * to ensure each parallel reader generates unique sensor IDs. 12 | * It must be Serializable to be sent from the SplitEnumerator (on the JobManager) 13 | * to the SourceReaders (on the TaskManagers). 14 | * 15 | * @property subtaskIndex The parallel instance index this split is for. 16 | */ 17 | data class SensorSplit( 18 | val subtaskIndex: Int, 19 | ) : SourceSplit, 20 | Serializable { 21 | /** 22 | * Provides a unique identifier for this split. 23 | */ 24 | override fun splitId(): String = "split-$subtaskIndex" 25 | } 26 | -------------------------------------------------------------------------------- /pyflink-getting-started-on-aws/local/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM flink:1.15.2-scala_2.12 2 | 3 | ARG PYTHON_VERSION 4 | ENV PYTHON_VERSION=${PYTHON_VERSION:-3.8.10} 5 | ARG FLINK_VERSION 6 | ENV FLINK_VERSION=${FLINK_VERSION:-1.15.2} 7 | 8 | # Currently only Python 3.6, 3.7 and 3.8 are supported officially. 
9 | RUN apt-get update -y && \ 10 | apt-get install -y build-essential libssl-dev zlib1g-dev libbz2-dev libffi-dev && \ 11 | wget https://www.python.org/ftp/python/${PYTHON_VERSION}/Python-${PYTHON_VERSION}.tgz && \ 12 | tar -xvf Python-${PYTHON_VERSION}.tgz && \ 13 | cd Python-${PYTHON_VERSION} && \ 14 | ./configure --without-tests --enable-shared && \ 15 | make -j6 && \ 16 | make install && \ 17 | ldconfig /usr/local/lib && \ 18 | cd .. && rm -f Python-${PYTHON_VERSION}.tgz && rm -rf Python-${PYTHON_VERSION} && \ 19 | ln -s /usr/local/bin/python3 /usr/local/bin/python && \ 20 | apt-get clean && \ 21 | rm -rf /var/lib/apt/lists/* 22 | 23 | # install PyFlink 24 | RUN pip3 install apache-flink==${FLINK_VERSION} 25 | -------------------------------------------------------------------------------- /pyflink-getting-started-on-aws/remote/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM flink:1.15.4-scala_2.12 2 | 3 | ARG PYTHON_VERSION 4 | ENV PYTHON_VERSION=${PYTHON_VERSION:-3.8.10} 5 | ARG FLINK_VERSION 6 | ENV FLINK_VERSION=${FLINK_VERSION:-1.15.2} 7 | 8 | # Currently only Python 3.6, 3.7 and 3.8 are supported officially. 9 | RUN apt-get update -y && \ 10 | apt-get install -y build-essential libssl-dev zlib1g-dev libbz2-dev libffi-dev && \ 11 | wget https://www.python.org/ftp/python/${PYTHON_VERSION}/Python-${PYTHON_VERSION}.tgz && \ 12 | tar -xvf Python-${PYTHON_VERSION}.tgz && \ 13 | cd Python-${PYTHON_VERSION} && \ 14 | ./configure --without-tests --enable-shared && \ 15 | make -j6 && \ 16 | make install && \ 17 | ldconfig /usr/local/lib && \ 18 | cd .. && rm -f Python-${PYTHON_VERSION}.tgz && rm -rf Python-${PYTHON_VERSION} && \ 19 | ln -s /usr/local/bin/python3 /usr/local/bin/python && \ 20 | apt-get clean && \ 21 | rm -rf /var/lib/apt/lists/* 22 | 23 | # install PyFlink 24 | RUN pip3 install apache-flink==${FLINK_VERSION} 25 | -------------------------------------------------------------------------------- /fraud-detection/local/configs/ddb.json: -------------------------------------------------------------------------------- 1 | { 2 | "TableName": "flagged-transactions", 3 | "KeySchema": [ 4 | { "AttributeName": "transaction_id", "KeyType": "HASH" }, 5 | { "AttributeName": "transaction_date", "KeyType": "RANGE" } 6 | ], 7 | "AttributeDefinitions": [ 8 | { "AttributeName": "transaction_id", "AttributeType": "S" }, 9 | { "AttributeName": "account_id", "AttributeType": "N" }, 10 | { "AttributeName": "transaction_date", "AttributeType": "S" } 11 | ], 12 | "ProvisionedThroughput": { 13 | "ReadCapacityUnits": 2, 14 | "WriteCapacityUnits": 2 15 | }, 16 | "GlobalSecondaryIndexes": [ 17 | { 18 | "IndexName": "account", 19 | "KeySchema": [ 20 | { "AttributeName": "account_id", "KeyType": "HASH" }, 21 | { "AttributeName": "transaction_date", "KeyType": "RANGE" } 22 | ], 23 | "Projection": { "ProjectionType": "ALL" }, 24 | "ProvisionedThroughput": { 25 | "ReadCapacityUnits": 2, 26 | "WriteCapacityUnits": 2 27 | } 28 | } 29 | ] 30 | } 31 | -------------------------------------------------------------------------------- /confluent-flink-101/notes.md: -------------------------------------------------------------------------------- 1 | [Apache Flink® 101](https://developer.confluent.io/courses/apache-flink/intro/) 2 | 3 | ## Table of Contents 4 | 5 | 1. Introduction 6 | 2. Intro to Stream Processing with Apache Flink 7 | 3. Intro to Flink SQL 8 | 4. Batch and Stream Processing with Flink SQL (Exercise) 9 | 5. The Flink Runtime 10 | 6. 
Using the Flink Web UI (Exercise) 11 | 7. Using Kafka with Flink 12 | 8. Deploying an ETL Pipeline using Flink SQL (Exercise) 13 | 9. Stateful Stream Processing with Flink SQL 14 | 10. Streaming Analytics with Flink SQL (Exercise) 15 | 11. Event Time and Watermarks 16 | 12. Implementing and Troubleshooting Watermarks (Exercise) 17 | 13. Checkpoints and Recovery 18 | 14. Experiencing Failure Recovery (Exercise) 19 | 15. Conclusion 20 | 21 | ## 4 Big Ideas 22 | 23 | ### Streaming 24 | 25 | - Intro to Stream Processing with Apache Flink 26 | - Intro to Flink SQL 27 | - The Flink Runtime 28 | - Using Kafka with Flink 29 | 30 | ### State 31 | 32 | - Stateful Stream Processing with Flink SQL 33 | 34 | ### Time 35 | 36 | - Event Time and Watermarks 37 | 38 | ### Snapshot 39 | 40 | - Checkpoints and Recovery 41 | -------------------------------------------------------------------------------- /fraud-detection/local/application_properties.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "PropertyGroupId": "kinesis.analytics.flink.run.options", 4 | "PropertyMap": { 5 | "python": "processor.py", 6 | "jarfile": "package/lib/flink-sql-connector-kinesis-1.15.2.jar", 7 | "pyFiles": "package/site_packages/" 8 | } 9 | }, 10 | { 11 | "PropertyGroupId": "consumer.config.0", 12 | "PropertyMap": { 13 | "table.name": "flagged_accounts", 14 | "topic.name": "flagged-accounts", 15 | "bootstrap.servers": "localhost:29092", 16 | "startup.mode": "earliest-offset" 17 | } 18 | }, 19 | { 20 | "PropertyGroupId": "consumer.config.1", 21 | "PropertyMap": { 22 | "table.name": "transactions", 23 | "topic.name": "transactions", 24 | "bootstrap.servers": "localhost:29092", 25 | "startup.mode": "earliest-offset" 26 | } 27 | }, 28 | { 29 | "PropertyGroupId": "producer.config.0", 30 | "PropertyMap": { 31 | "table.name": "flagged_transactions", 32 | "topic.name": "flagged-transactions", 33 | "bootstrap.servers": "localhost:29092" 34 | } 35 | } 36 | ] 37 | -------------------------------------------------------------------------------- /fraud-detection/remote/application_properties.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "PropertyGroupId": "kinesis.analytics.flink.run.options", 4 | "PropertyMap": { 5 | "python": "processor.py", 6 | "jarfile": "package/lib/flink-sql-connector-kinesis-1.15.2.jar", 7 | "pyFiles": "package/site_packages/" 8 | } 9 | }, 10 | { 11 | "PropertyGroupId": "consumer.config.0", 12 | "PropertyMap": { 13 | "table.name": "flagged_accounts", 14 | "topic.name": "flagged-accounts", 15 | "bootstrap.servers": "localhost:29092", 16 | "startup.mode": "earliest-offset" 17 | } 18 | }, 19 | { 20 | "PropertyGroupId": "consumer.config.1", 21 | "PropertyMap": { 22 | "table.name": "transactions", 23 | "topic.name": "transactions", 24 | "bootstrap.servers": "localhost:29092", 25 | "startup.mode": "earliest-offset" 26 | } 27 | }, 28 | { 29 | "PropertyGroupId": "producer.config.0", 30 | "PropertyMap": { 31 | "table.name": "flagged_transactions", 32 | "topic.name": "flagged-transactions", 33 | "bootstrap.servers": "localhost:29092" 34 | } 35 | } 36 | ] 37 | -------------------------------------------------------------------------------- /pyflink-udemy/s3_05_csv_sink.py: -------------------------------------------------------------------------------- 1 | from pyflink.table import ( 2 | EnvironmentSettings, 3 | TableEnvironment, 4 | CsvTableSource, 5 | CsvTableSink, 6 | WriteMode, 7 | DataTypes, 8 | ) 9 | 10 | 
env_settings = EnvironmentSettings.in_batch_mode() 11 | table_env = TableEnvironment.create(env_settings) 12 | table_env.get_config().set("parallelism.default", "1") # output to single file 13 | 14 | field_names = "seller_id,product,quantity,product_price,sales_date".split(",") 15 | field_types = [ 16 | DataTypes.STRING(), 17 | DataTypes.STRING(), 18 | DataTypes.INT(), 19 | DataTypes.DOUBLE(), 20 | DataTypes.DATE(), 21 | ] 22 | 23 | # source table 24 | source = CsvTableSource("./csv-input", field_names, field_types, ignore_first_line=True) 25 | table_env.register_table_source("product_locale_sales", source) 26 | tbl = table_env.from_path("product_locale_sales") 27 | 28 | # sink table 29 | sink = CsvTableSink( 30 | field_names, field_types, "revenue.csv", num_files=1, write_mode=WriteMode.OVERWRITE 31 | ) 32 | table_env.register_table_sink("locale_revenue", sink) 33 | tbl.execute_insert("locale_revenue").wait() 34 | -------------------------------------------------------------------------------- /fraud-detection/remote/infra/scripts/bootstrap.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -ex 2 | 3 | ## Allocate elastic IP and disable source/destination checks 4 | TOKEN=$(curl --silent --max-time 60 -X PUT http://169.254.169.254/latest/api/token -H "X-aws-ec2-metadata-token-ttl-seconds: 30") 5 | INSTANCEID=$(curl --silent --max-time 60 -H "X-aws-ec2-metadata-token: $TOKEN" http://169.254.169.254/latest/meta-data/instance-id) 6 | aws --region ${aws_region} ec2 associate-address --instance-id $INSTANCEID --allocation-id ${allocation_id} 7 | aws --region ${aws_region} ec2 modify-instance-attribute --instance-id $INSTANCEID --source-dest-check "{\"Value\": false}" 8 | 9 | ## Start SoftEther VPN server 10 | yum update -y && yum install docker -y 11 | systemctl enable docker.service && systemctl start docker.service 12 | 13 | docker pull siomiz/softethervpn:debian 14 | docker run -d \ 15 | --cap-add NET_ADMIN \ 16 | --name softethervpn \ 17 | --restart unless-stopped \ 18 | -p 500:500/udp -p 4500:4500/udp -p 1701:1701/tcp -p 1194:1194/udp -p 5555:5555/tcp -p 443:443/tcp \ 19 | -e PSK=${vpn_psk} \ 20 | -e SPW=${admin_password} \ 21 | -e HPW=DEFAULT \ 22 | siomiz/softethervpn:debian -------------------------------------------------------------------------------- /real-time-streaming-aws/infra/scripts/bootstrap.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -ex 2 | 3 | ## Allocate elastic IP and disable source/destination checks 4 | TOKEN=$(curl --silent --max-time 60 -X PUT http://169.254.169.254/latest/api/token -H "X-aws-ec2-metadata-token-ttl-seconds: 30") 5 | INSTANCEID=$(curl --silent --max-time 60 -H "X-aws-ec2-metadata-token: $TOKEN" http://169.254.169.254/latest/meta-data/instance-id) 6 | aws --region ${aws_region} ec2 associate-address --instance-id $INSTANCEID --allocation-id ${allocation_id} 7 | aws --region ${aws_region} ec2 modify-instance-attribute --instance-id $INSTANCEID --source-dest-check "{\"Value\": false}" 8 | 9 | ## Start SoftEther VPN server 10 | yum update -y && yum install docker -y 11 | systemctl enable docker.service && systemctl start docker.service 12 | 13 | docker pull siomiz/softethervpn:debian 14 | docker run -d \ 15 | --cap-add NET_ADMIN \ 16 | --name softethervpn \ 17 | --restart unless-stopped \ 18 | -p 500:500/udp -p 4500:4500/udp -p 1701:1701/tcp -p 1194:1194/udp -p 5555:5555/tcp -p 443:443/tcp \ 19 | -e PSK=${vpn_psk} \ 20 | -e SPW=${admin_password} \ 21 
| -e HPW=DEFAULT \ 22 | siomiz/softethervpn:debian -------------------------------------------------------------------------------- /stream-processing-with-flink/src/test/kotlin/me/jaehyeon/chapter1/AverageSensorReadingsTest.kt: -------------------------------------------------------------------------------- 1 | package me.jaehyeon.chapter1 2 | 3 | import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment 4 | import org.junit.jupiter.api.Assertions.assertNotNull 5 | import org.junit.jupiter.api.Test 6 | 7 | /** 8 | * A simple unit test to verify that Flink classes are available on the test classpath. 9 | */ 10 | class AverageSensorReadingsTest { 11 | @Test 12 | fun `test that Flink StreamExecutionEnvironment can be created`() { 13 | // This is the core of the test. 14 | // We are trying to use a class from the 'flink-streaming-java' dependency, 15 | // which is marked as 'compileOnly' in our build.gradle.kts. 16 | // If the test classpath was not configured correctly, this line would fail 17 | // with a ClassNotFoundException. 18 | val env = StreamExecutionEnvironment.getExecutionEnvironment() 19 | 20 | // A simple assertion to confirm that the environment object was created successfully. 21 | assertNotNull(env, "The Flink execution environment should not be null.") 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /pyflink-getting-started-on-aws/remote/infra/scripts/bootstrap.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -ex 2 | 3 | ## Allocate elastic IP and disable source/destination checks 4 | TOKEN=$(curl --silent --max-time 60 -X PUT http://169.254.169.254/latest/api/token -H "X-aws-ec2-metadata-token-ttl-seconds: 30") 5 | INSTANCEID=$(curl --silent --max-time 60 -H "X-aws-ec2-metadata-token: $TOKEN" http://169.254.169.254/latest/meta-data/instance-id) 6 | aws --region ${aws_region} ec2 associate-address --instance-id $INSTANCEID --allocation-id ${allocation_id} 7 | aws --region ${aws_region} ec2 modify-instance-attribute --instance-id $INSTANCEID --source-dest-check "{\"Value\": false}" 8 | 9 | ## Start SoftEther VPN server 10 | yum update -y && yum install docker -y 11 | systemctl enable docker.service && systemctl start docker.service 12 | 13 | docker pull siomiz/softethervpn:debian 14 | docker run -d \ 15 | --cap-add NET_ADMIN \ 16 | --name softethervpn \ 17 | --restart unless-stopped \ 18 | -p 500:500/udp -p 4500:4500/udp -p 1701:1701/tcp -p 1194:1194/udp -p 5555:5555/tcp -p 443:443/tcp \ 19 | -e PSK=${vpn_psk} \ 20 | -e SPW=${admin_password} \ 21 | -e HPW=DEFAULT \ 22 | siomiz/softethervpn:debian -------------------------------------------------------------------------------- /sql-training/client-image/java/sql-training-udfs/src/main/java/com/ververica/sql_training/udfs/IsInNYC.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Ververica GmbH 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.ververica.sql_training.udfs; 18 | 19 | import org.apache.flink.table.functions.ScalarFunction; 20 | 21 | import static com.ververica.sql_training.udfs.util.GeoUtils.isInNYC; 22 | 23 | /** 24 | * Table API / SQL Scalar UDF to check if a coordinate is in NYC. 25 | */ 26 | public class IsInNYC extends ScalarFunction { 27 | 28 | public boolean eval(Float lon, Float lat) { 29 | return isInNYC(lon, lat); 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /sql-training/client-image/java/sql-training-udfs/src/main/java/com/ververica/sql_training/udfs/ToAreaId.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Ververica GmbH 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.ververica.sql_training.udfs; 18 | 19 | import org.apache.flink.table.functions.ScalarFunction; 20 | 21 | import com.ververica.sql_training.udfs.util.GeoUtils; 22 | 23 | /** 24 | * Table API / SQL Scalar UDF to convert a lon/lat pair into a cell ID. 25 | */ 26 | public class ToAreaId extends ScalarFunction { 27 | 28 | public int eval(Float lon, Float lat) { 29 | return GeoUtils.mapToGridCell(lon, lat); 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /real-time-streaming-aws/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM flink:1.17.1 2 | 3 | ARG PYTHON_VERSION 4 | ENV PYTHON_VERSION=${PYTHON_VERSION:-3.8.10} 5 | ARG FLINK_VERSION 6 | ENV FLINK_VERSION=${FLINK_VERSION:-1.17.1} 7 | 8 | RUN mkdir ./plugins/s3-fs-hadoop \ 9 | && cp ./opt/flink-s3-fs-hadoop-${FLINK_VERSION}.jar ./plugins/s3-fs-hadoop 10 | 11 | RUN apt-get update -y && \ 12 | apt-get install -y build-essential libssl-dev zlib1g-dev libbz2-dev libffi-dev liblzma-dev && \ 13 | wget https://www.python.org/ftp/python/${PYTHON_VERSION}/Python-${PYTHON_VERSION}.tgz && \ 14 | tar -xvf Python-${PYTHON_VERSION}.tgz && \ 15 | cd Python-${PYTHON_VERSION} && \ 16 | ./configure --without-tests --enable-shared && \ 17 | make -j6 && \ 18 | make install && \ 19 | ldconfig /usr/local/lib && \ 20 | cd .. 
&& rm -f Python-${PYTHON_VERSION}.tgz && rm -rf Python-${PYTHON_VERSION} && \ 21 | ln -s /usr/local/bin/python3 /usr/local/bin/python && \ 22 | apt-get clean && \ 23 | rm -rf /var/lib/apt/lists/* 24 | 25 | # install PyFlink 26 | RUN pip3 install apache-flink==${FLINK_VERSION} 27 | 28 | # add kafka client for Flink SQL client, will be added manually 29 | RUN wget -P /etc/lib/ https://repo.maven.apache.org/maven2/org/apache/kafka/kafka-clients/3.2.3/kafka-clients-3.2.3.jar; 30 | -------------------------------------------------------------------------------- /stream-processing-with-flink/src/main/kotlin/me/jaehyeon/connector/HttpSink.kt: -------------------------------------------------------------------------------- 1 | package me.jaehyeon.connector 2 | 3 | import org.apache.flink.api.connector.sink2.Sink 4 | import org.apache.flink.api.connector.sink2.SinkWriter 5 | import org.apache.flink.api.connector.sink2.WriterInitContext 6 | 7 | /** 8 | * The Sink class is the main entry point and a factory for the SinkWriter. 9 | * 10 | * @param url The target HTTP endpoint for all records. 11 | * @param httpMethodName The HTTP method name to use for the requests. 12 | */ 13 | class HttpSink( 14 | private val url: String, 15 | private val httpMethodName: String, 16 | ) : Sink { 17 | @Deprecated("Overrides deprecated member in superclass.") 18 | override fun createWriter(context: Sink.InitContext): SinkWriter { 19 | val subtaskId = 20 | if (context is WriterInitContext) { 21 | // Modern, warning-free path 22 | context.subtaskId 23 | } else { 24 | // Fallback path with targeted warning suppression 25 | @Suppress("DEPRECATION") 26 | context.getSubtaskId() 27 | } 28 | 29 | return HttpSinkWriter( 30 | url, 31 | httpMethodName, 32 | subtaskId, 33 | ) 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /stream-processing-with-flink/src/main/kotlin/me/jaehyeon/chapter8/CustomConnectors.kt: -------------------------------------------------------------------------------- 1 | package me.jaehyeon.chapter8 2 | 3 | import me.jaehyeon.connector.HttpSink 4 | import me.jaehyeon.connector.HttpSource 5 | import org.apache.flink.api.common.eventtime.WatermarkStrategy 6 | import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment 7 | 8 | object CustomConnectors { 9 | @JvmStatic 10 | fun main(args: Array) { 11 | val env = StreamExecutionEnvironment.getExecutionEnvironment() 12 | env.parallelism = 2 13 | 14 | val httpSource = 15 | HttpSource( 16 | baseUrlPattern = "https://jsonplaceholder.typicode.com/posts/{id}", 17 | startId = 1, 18 | maxId = 100, 19 | ) 20 | 21 | val sourceStream = 22 | env.fromSource( 23 | httpSource, 24 | WatermarkStrategy.noWatermarks(), 25 | "Cyclical HTTP Source", 26 | ) 27 | 28 | val httpSink = 29 | HttpSink( 30 | url = "https://jsonplaceholder.typicode.com/posts", 31 | httpMethodName = "POST", 32 | ) 33 | 34 | sourceStream.sinkTo(httpSink) 35 | 36 | env.execute("Custom HTTP Source and Sink Jobs") 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /stream-processing-with-pyflink/src/chapter7/checkpointed_function.py: -------------------------------------------------------------------------------- 1 | # Deprecation of ListCheckpointed interface # 2 | # FLINK-6258 # 3 | # The ListCheckpointed interface has been deprecated because it uses Java Serialization for checkpointing state which is problematic for savepoint compatibility. 
Use the CheckpointedFunction interface instead, which gives more control over state serialization. 4 | 5 | # Operator State 6 | # https://nightlies.apache.org/flink/flink-docs-release-1.18/docs/dev/datastream/fault-tolerance/state/ 7 | 8 | # Operator State (or non-keyed state) is state that is bound to one parallel operator instance. The Kafka Connector is a good motivating example for the use of Operator State in Flink. Each parallel instance of the Kafka consumer maintains a map of topic partitions and offsets as its Operator State. 9 | 10 | # The Operator State interfaces support redistributing state among parallel operator instances when the parallelism is changed. There are different schemes for doing this redistribution. 11 | 12 | # In a typical stateful Flink Application you don’t need operators state. It is mostly a special type of state that is used in source/sink implementations and scenarios where you don’t have a key by which state can be partitioned. 13 | 14 | # Notes: Operator state is still not supported in Python DataStream API. 15 | -------------------------------------------------------------------------------- /stream-processing-with-pyflink/src/chapter7/operator_list_state_function.py: -------------------------------------------------------------------------------- 1 | # Deprecation of ListCheckpointed interface # 2 | # FLINK-6258 # 3 | # The ListCheckpointed interface has been deprecated because it uses Java Serialization for checkpointing state which is problematic for savepoint compatibility. Use the CheckpointedFunction interface instead, which gives more control over state serialization. 4 | 5 | # Operator State 6 | # https://nightlies.apache.org/flink/flink-docs-release-1.18/docs/dev/datastream/fault-tolerance/state/ 7 | 8 | # Operator State (or non-keyed state) is state that is bound to one parallel operator instance. The Kafka Connector is a good motivating example for the use of Operator State in Flink. Each parallel instance of the Kafka consumer maintains a map of topic partitions and offsets as its Operator State. 9 | 10 | # The Operator State interfaces support redistributing state among parallel operator instances when the parallelism is changed. There are different schemes for doing this redistribution. 11 | 12 | # In a typical stateful Flink Application you don’t need operators state. It is mostly a special type of state that is used in source/sink implementations and scenarios where you don’t have a key by which state can be partitioned. 13 | 14 | # Notes: Operator state is still not supported in Python DataStream API. 15 | -------------------------------------------------------------------------------- /real-time-streaming-aws/package/lab2-pipeline/src/main/resources/log4j2.properties: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # Licensed to the Apache Software Foundation (ASF) under one 3 | # or more contributor license agreements. See the NOTICE file 4 | # distributed with this work for additional information 5 | # regarding copyright ownership. The ASF licenses this file 6 | # to you under the Apache License, Version 2.0 (the 7 | # "License"); you may not use this file except in compliance 8 | # with the License. 
You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | ################################################################################ 18 | 19 | rootLogger.level = INFO 20 | rootLogger.appenderRef.console.ref = ConsoleAppender 21 | 22 | appender.console.name = ConsoleAppender 23 | appender.console.type = CONSOLE 24 | appender.console.layout.type = PatternLayout 25 | appender.console.layout.pattern = %d{HH:mm:ss,SSS} %-5p %-60c %x - %m%n 26 | -------------------------------------------------------------------------------- /real-time-streaming-aws/package/lab3-pipeline/src/main/resources/log4j2.properties: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # Licensed to the Apache Software Foundation (ASF) under one 3 | # or more contributor license agreements. See the NOTICE file 4 | # distributed with this work for additional information 5 | # regarding copyright ownership. The ASF licenses this file 6 | # to you under the Apache License, Version 2.0 (the 7 | # "License"); you may not use this file except in compliance 8 | # with the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | ################################################################################ 18 | 19 | rootLogger.level = INFO 20 | rootLogger.appenderRef.console.ref = ConsoleAppender 21 | 22 | appender.console.name = ConsoleAppender 23 | appender.console.type = CONSOLE 24 | appender.console.layout.type = PatternLayout 25 | appender.console.layout.pattern = %d{HH:mm:ss,SSS} %-5p %-60c %x - %m%n 26 | -------------------------------------------------------------------------------- /real-time-streaming-aws/package/lab4-pipeline/src/main/resources/log4j2.properties: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # Licensed to the Apache Software Foundation (ASF) under one 3 | # or more contributor license agreements. See the NOTICE file 4 | # distributed with this work for additional information 5 | # regarding copyright ownership. The ASF licenses this file 6 | # to you under the Apache License, Version 2.0 (the 7 | # "License"); you may not use this file except in compliance 8 | # with the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | ################################################################################ 18 | 19 | rootLogger.level = INFO 20 | rootLogger.appenderRef.console.ref = ConsoleAppender 21 | 22 | appender.console.name = ConsoleAppender 23 | appender.console.type = CONSOLE 24 | appender.console.layout.type = PatternLayout 25 | appender.console.layout.pattern = %d{HH:mm:ss,SSS} %-5p %-60c %x - %m%n 26 | -------------------------------------------------------------------------------- /fraud-detection/remote/package/uber-jar-for-pyflink/src/main/resources/log4j2.properties: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # Licensed to the Apache Software Foundation (ASF) under one 3 | # or more contributor license agreements. See the NOTICE file 4 | # distributed with this work for additional information 5 | # regarding copyright ownership. The ASF licenses this file 6 | # to you under the Apache License, Version 2.0 (the 7 | # "License"); you may not use this file except in compliance 8 | # with the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | ################################################################################ 18 | 19 | rootLogger.level = INFO 20 | rootLogger.appenderRef.console.ref = ConsoleAppender 21 | 22 | appender.console.name = ConsoleAppender 23 | appender.console.type = CONSOLE 24 | appender.console.layout.type = PatternLayout 25 | appender.console.layout.pattern = %d{HH:mm:ss,SSS} %-5p %-60c %x - %m%n 26 | -------------------------------------------------------------------------------- /sql-training/client-image/java/sql-training-udfs/src/main/java/com/ververica/sql_training/udfs/ToCoords.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Ververica GmbH 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.ververica.sql_training.udfs; 18 | 19 | import org.apache.flink.table.annotation.DataTypeHint; 20 | import org.apache.flink.table.functions.ScalarFunction; 21 | import org.apache.flink.types.Row; 22 | 23 | import com.ververica.sql_training.udfs.util.GeoUtils; 24 | 25 | /** 26 | * Table API / SQL Scalar UDF to convert a cell ID into a lon/lat pair. 
27 | */ 28 | public class ToCoords extends ScalarFunction { 29 | 30 | @DataTypeHint("ROW") 31 | public Row eval(Integer cellId) { 32 | return Row.of( 33 | GeoUtils.getGridCellCenterLon(cellId), 34 | GeoUtils.getGridCellCenterLat(cellId) 35 | ); 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /pyflink-getting-started-on-aws/remote/package/uber-jar-for-pyflink/src/main/resources/log4j2.properties: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # Licensed to the Apache Software Foundation (ASF) under one 3 | # or more contributor license agreements. See the NOTICE file 4 | # distributed with this work for additional information 5 | # regarding copyright ownership. The ASF licenses this file 6 | # to you under the Apache License, Version 2.0 (the 7 | # "License"); you may not use this file except in compliance 8 | # with the License. You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | ################################################################################ 18 | 19 | rootLogger.level = INFO 20 | rootLogger.appenderRef.console.ref = ConsoleAppender 21 | 22 | appender.console.name = ConsoleAppender 23 | appender.console.type = CONSOLE 24 | appender.console.layout.type = PatternLayout 25 | appender.console.layout.pattern = %d{HH:mm:ss,SSS} %-5p %-60c %x - %m%n 26 | -------------------------------------------------------------------------------- /confluent-flink-101/compose-flink-standalone.yml: -------------------------------------------------------------------------------- 1 | version: "3.5" 2 | 3 | services: 4 | jobmanager: 5 | image: confluent-flink-101:1.15.4 6 | command: jobmanager 7 | ports: 8 | - "8081:8081" 9 | environment: 10 | - | 11 | FLINK_PROPERTIES= 12 | jobmanager.rpc.address: jobmanager 13 | state.backend: filesystem 14 | state.checkpoints.dir: file:///tmp/flink-checkpoints 15 | heartbeat.interval: 1000 16 | heartbeat.timeout: 5000 17 | rest.flamegraph.enabled: true 18 | web.backpressure.refresh-interval: 10000 19 | taskmanager: 20 | image: confluent-flink-101:1.15.4 21 | command: taskmanager 22 | volumes: 23 | - flink_data:/tmp/ 24 | environment: 25 | - | 26 | FLINK_PROPERTIES= 27 | jobmanager.rpc.address: jobmanager 28 | taskmanager.numberOfTaskSlots: 3 29 | state.backend: filesystem 30 | state.checkpoints.dir: file:///tmp/flink-checkpoints 31 | heartbeat.interval: 1000 32 | heartbeat.timeout: 5000 33 | depends_on: 34 | - jobmanager 35 | sql-client: 36 | image: confluent-flink-101:1.15.4 37 | command: bin/sql-client.sh 38 | depends_on: 39 | - jobmanager 40 | environment: 41 | - | 42 | FLINK_PROPERTIES= 43 | jobmanager.rpc.address: jobmanager 44 | rest.address: jobmanager 45 | 46 | volumes: 47 | flink_data: 48 | -------------------------------------------------------------------------------- /stream-processing-with-pyflink/README.md: -------------------------------------------------------------------------------- 1 | # Stream Processing with Flink in Python 2 | 3 | This project contains Python implementations of 
examples for learning Apache Flink, inspired by the Scala examples in the [`streaming-with-flink/examples-scala`](https://github.com/streaming-with-flink/examples-scala) project. 4 | 5 | ## Getting Started 6 | 7 | First, clone the repository to your local machine: 8 | 9 | ```bash 10 | git clone https://github.com/jaehyeon-kim/flink-demos.git 11 | cd flink-demos/stream-processing-with-pyflink 12 | ``` 13 | 14 | It is recommended to create a virtual environment to manage the project's dependencies. 15 | 16 | ```bash 17 | python -m venv venv 18 | source venv/bin/activate 19 | ``` 20 | 21 | Next, install the required Python packages, which are listed in the `requirements-dev.txt` file. 22 | 23 | ```bash 24 | pip install -r requirements-dev.txt 25 | ``` 26 | 27 | ## Running the Examples 28 | 29 | You can run the Flink jobs locally from your command line. Use the `python` command to execute the individual example scripts. 30 | 31 | Here are a few examples from different chapters. Please check the `src` directory for all available applications. 32 | 33 | ```bash 34 | # Run an example from Chapter 5 35 | python src/chapter5/basic_transformations.py 36 | 37 | # Run an example from Chapter 6 38 | python src/chapter6/process_function_timers.py 39 | 40 | # Run an example from Chapter 7 41 | python src/chapter7/keyed_state_function.py 42 | ``` 43 | -------------------------------------------------------------------------------- /sql-training/client-image/java/sql-training-data-producer/src/main/java/com/ververica/sql_training/data_producer/ConsolePrinter.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Ververica GmbH 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.ververica.sql_training.data_producer; 18 | 19 | import com.ververica.sql_training.data_producer.json_serde.JsonSerializer; 20 | import com.ververica.sql_training.data_producer.records.TaxiRecord; 21 | 22 | import java.util.function.Consumer; 23 | 24 | /** 25 | * Prints TaxiRecords as JSON strings on the standard output.
26 | */ 27 | public class ConsolePrinter implements Consumer<TaxiRecord> { 28 | 29 | private final JsonSerializer<TaxiRecord> serializer = new JsonSerializer<>(); 30 | 31 | @Override 32 | public void accept(TaxiRecord record) { 33 | String jsonString = serializer.toJSONString(record); 34 | System.out.println(jsonString); 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "files.watcherExclude": { 3 | "**/venv": true 4 | }, 5 | "editor.formatOnSave": true, 6 | "editor.defaultFormatter": "esbenp.prettier-vscode", 7 | "editor.tabSize": 2, 8 | "[python]": { 9 | "editor.tabSize": 4, 10 | "editor.formatOnSave": true, 11 | "editor.defaultFormatter": "charliermarsh.ruff" 12 | }, 13 | "[terraform]": { 14 | "editor.defaultFormatter": "hashicorp.terraform", 15 | "editor.formatOnSave": false, 16 | "editor.codeActionsOnSave": { 17 | "source.formatAll.terraform": "explicit" 18 | } 19 | }, 20 | "[terraform-vars]": { 21 | "editor.defaultFormatter": "hashicorp.terraform", 22 | "editor.formatOnSave": false, 23 | "editor.codeActionsOnSave": { 24 | "source.formatAll.terraform": "explicit" 25 | } 26 | }, 27 | "yaml.customTags": [ 28 | "!Base64 scalar", 29 | "!Cidr scalar", 30 | "!And sequence", 31 | "!Equals sequence", 32 | "!If sequence", 33 | "!Not sequence", 34 | "!Or sequence", 35 | "!Condition scalar", 36 | "!FindInMap sequence", 37 | "!GetAtt scalar", 38 | "!GetAtt sequence", 39 | "!GetAZs scalar", 40 | "!ImportValue scalar", 41 | "!Join sequence", 42 | "!Select sequence", 43 | "!Split sequence", 44 | "!Sub scalar", 45 | "!Transform mapping", 46 | "!Ref scalar" 47 | ], 48 | "[terraform][terraform-vars]": { 49 | "editor.codeActionsOnSave": { 50 | "source.formatAll.terraform": "explicit" 51 | } 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /sql-training/client-image/java/sql-training-udfs/pom.xml: -------------------------------------------------------------------------------- 1 | <?xml version="1.0" encoding="UTF-8"?> 2 | <project xmlns="http://maven.apache.org/POM/4.0.0" 3 | xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" 4 | xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> 5 | <modelVersion>4.0.0</modelVersion> 6 | 7 | <groupId>com.ververica.sql-training</groupId> 8 | <artifactId>sql-training-udfs</artifactId> 9 | <version>2-FLINK-1.11_2.11</version> 10 | 11 | <properties> 12 | <flink.version>1.11.1</flink.version> 13 | </properties> 14 | 15 | <dependencies> 16 | <dependency> 17 | <groupId>org.apache.flink</groupId> 18 | <artifactId>flink-table-common</artifactId> 19 | <version>${flink.version}</version> 20 | </dependency> 21 | <dependency> 22 | <groupId>org.apache.flink</groupId> 23 | <artifactId>flink-streaming-java_2.11</artifactId> 24 | <version>${flink.version}</version> 25 | </dependency> 26 | </dependencies> 27 | 28 | <build> 29 | <plugins> 30 | <plugin> 31 | <groupId>org.apache.maven.plugins</groupId> 32 | <artifactId>maven-compiler-plugin</artifactId> 33 | <configuration> 34 | <source>8</source> 35 | <target>8</target> 36 | </configuration> 37 | </plugin> 38 | </plugins> 39 | </build> 40 | 41 | </project> 42 | -------------------------------------------------------------------------------- /stream-processing-with-pyflink/src/chapter1/utils/model.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import dataclasses 3 | from typing import Iterable, Tuple 4 | 5 | from pyflink.common import Row 6 | from pyflink.common.typeinfo import Types 7 | 8 | from .type_helper import TypeMapping, set_type_info 9 | 10 | 11 | @dataclasses.dataclass 12 | class SensorReading(TypeMapping): 13 | id: str 14 | timestamp: int 15 | num_records: int 16 | temperature: float 17 | 18 | def to_row(self): 19 | return Row(**dataclasses.asdict(self)) 20 | 21 | @classmethod 22 | def from_row(cls, row: Row): 23 | return cls(**row.as_dict()) 24 | 25 | @staticmethod 26 | def process_elements(elements: Iterable[Tuple[int, int, datetime.datetime]]): 27 | id, count, temperature = None, 0, 0 28 | for e in elements: 29 | next_id = f"sensor_{e[0]}" 30 | if id is not None: 31 | assert id == next_id 32 | id = 
next_id 33 | count += 1 34 | temperature += 65 + (e[1] / 100 * 20) 35 | return id, count, temperature 36 | 37 | @staticmethod 38 | def type_mapping(): 39 | return { 40 | "id": Types.STRING(), 41 | "timestamp": Types.LONG(), 42 | "num_records": Types.INT(), 43 | "temperature": Types.DOUBLE(), 44 | } 45 | 46 | @staticmethod 47 | def set_key_type_info(): 48 | return set_type_info(SensorReading.type_mapping(), selects=["id"]) 49 | 50 | @staticmethod 51 | def set_value_type_info(): 52 | return set_type_info(SensorReading.type_mapping()) 53 | -------------------------------------------------------------------------------- /sql-training/client-image/java/sql-training-data-producer/src/main/java/com/ververica/sql_training/data_producer/records/DriverChange.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Ververica GmbH 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.ververica.sql_training.data_producer.records; 18 | 19 | import com.fasterxml.jackson.annotation.JsonFormat; 20 | 21 | import java.util.Date; 22 | 23 | /** 24 | * POJO for a DriverChange record. 25 | */ 26 | public class DriverChange implements TaxiRecord { 27 | 28 | @JsonFormat(shape = JsonFormat.Shape.STRING, pattern = "yyyy-MM-dd'T'HH:mm:ss'Z'") 29 | private Date eventTime; 30 | @JsonFormat 31 | private long taxiId; 32 | @JsonFormat 33 | private long driverId; 34 | 35 | public DriverChange() {} 36 | 37 | public DriverChange(Date eventTime, long taxiId, long driverId) { 38 | this.eventTime = eventTime; 39 | this.taxiId = taxiId; 40 | this.driverId = driverId; 41 | } 42 | 43 | @Override 44 | public Date getEventTime() { 45 | return eventTime; 46 | } 47 | 48 | } 49 | -------------------------------------------------------------------------------- /pyflink-getting-started-on-aws/local/compose-flink.yml: -------------------------------------------------------------------------------- 1 | version: "3.5" 2 | 3 | services: 4 | jobmanager: 5 | image: pyflink:1.15.2-scala_2.12 6 | container_name: jobmanager 7 | command: jobmanager 8 | ports: 9 | - "8081:8081" 10 | networks: 11 | - kafkanet 12 | environment: 13 | - | 14 | FLINK_PROPERTIES= 15 | jobmanager.rpc.address: jobmanager 16 | state.backend: filesystem 17 | state.checkpoints.dir: file:///tmp/flink-checkpoints 18 | heartbeat.interval: 1000 19 | heartbeat.timeout: 5000 20 | rest.flamegraph.enabled: true 21 | web.backpressure.refresh-interval: 10000 22 | - RUNTIME_ENV=DOCKER 23 | - BOOTSTRAP_SERVERS=kafka-0:9092 24 | volumes: 25 | - $PWD:/etc/flink 26 | taskmanager: 27 | image: pyflink:1.15.2-scala_2.12 28 | container_name: taskmanager 29 | command: taskmanager 30 | networks: 31 | - kafkanet 32 | volumes: 33 | - flink_data:/tmp/ 34 | - $PWD:/etc/flink 35 | environment: 36 | - | 37 | FLINK_PROPERTIES= 38 | jobmanager.rpc.address: jobmanager 39 | taskmanager.numberOfTaskSlots: 3 40 | state.backend: filesystem 41 | state.checkpoints.dir: 
file:///tmp/flink-checkpoints 42 | heartbeat.interval: 1000 43 | heartbeat.timeout: 5000 44 | - RUNTIME_ENV=DOCKER 45 | - BOOTSTRAP_SERVERS=kafka-0:9092 46 | depends_on: 47 | - jobmanager 48 | 49 | networks: 50 | kafkanet: 51 | external: true 52 | name: kafka-network 53 | 54 | volumes: 55 | flink_data: 56 | driver: local 57 | name: flink_data 58 | -------------------------------------------------------------------------------- /pyflink-getting-started-on-aws/local/compose-kafka.yml: -------------------------------------------------------------------------------- 1 | version: "3.5" 2 | 3 | services: 4 | zookeeper: 5 | image: bitnami/zookeeper:3.5 6 | container_name: zookeeper 7 | ports: 8 | - "2181" 9 | networks: 10 | - kafkanet 11 | environment: 12 | - ALLOW_ANONYMOUS_LOGIN=yes 13 | volumes: 14 | - zookeeper_data:/bitnami/zookeeper 15 | kafka-0: 16 | image: bitnami/kafka:2.8.1 17 | container_name: kafka-0 18 | expose: 19 | - 9092 20 | ports: 21 | - "29092:29092" 22 | networks: 23 | - kafkanet 24 | environment: 25 | - ALLOW_PLAINTEXT_LISTENER=yes 26 | - KAFKA_CFG_ZOOKEEPER_CONNECT=zookeeper:2181 27 | - KAFKA_CFG_BROKER_ID=0 28 | - KAFKA_CFG_LISTENER_SECURITY_PROTOCOL_MAP=INTERNAL:PLAINTEXT,EXTERNAL:PLAINTEXT 29 | - KAFKA_CFG_LISTENERS=INTERNAL://:9092,EXTERNAL://:29092 30 | - KAFKA_CFG_ADVERTISED_LISTENERS=INTERNAL://kafka-0:9092,EXTERNAL://localhost:29092 31 | - KAFKA_CFG_INTER_BROKER_LISTENER_NAME=INTERNAL 32 | - KAFKA_CFG_NUM_PARTITIONS=2 33 | volumes: 34 | - kafka_0_data:/bitnami/kafka 35 | depends_on: 36 | - zookeeper 37 | kpow: 38 | image: factorhouse/kpow-ce:91.2.1 39 | container_name: kpow 40 | ports: 41 | - "3000:3000" 42 | networks: 43 | - kafkanet 44 | environment: 45 | BOOTSTRAP: kafka-0:9092 46 | depends_on: 47 | - zookeeper 48 | - kafka-0 49 | 50 | networks: 51 | kafkanet: 52 | name: kafka-network 53 | 54 | volumes: 55 | zookeeper_data: 56 | driver: local 57 | name: zookeeper_data 58 | kafka_0_data: 59 | driver: local 60 | name: kafka_0_data 61 | -------------------------------------------------------------------------------- /stream-processing-with-flink/src/main/kotlin/me/jaehyeon/connector/HttpSplit.kt: -------------------------------------------------------------------------------- 1 | package me.jaehyeon.connector 2 | 3 | import org.apache.flink.api.connector.source.SourceSplit 4 | import org.apache.flink.core.io.SimpleVersionedSerializer 5 | import java.io.ByteArrayInputStream 6 | import java.io.ByteArrayOutputStream 7 | import java.io.DataInputStream 8 | import java.io.DataOutputStream 9 | import java.io.IOException 10 | 11 | /** 12 | * Represents one unit of work: a single URL to be fetched by a SourceReader. 13 | */ 14 | data class HttpSplit( 15 | val url: String, 16 | ) : SourceSplit { 17 | override fun splitId(): String = url 18 | } 19 | 20 | /** 21 | * Serializer for sending HttpSplit objects from the JobManager (Enumerator) 22 | * to the TaskManagers (Readers). 
23 | */ 24 | class HttpSplitSerializer : SimpleVersionedSerializer<HttpSplit> { 25 | companion object { 26 | private const val VERSION = 1 27 | } 28 | 29 | override fun getVersion(): Int = VERSION 30 | 31 | override fun serialize(split: HttpSplit): ByteArray = 32 | ByteArrayOutputStream().use { baos -> 33 | DataOutputStream(baos).use { out -> 34 | out.writeUTF(split.url) 35 | baos.toByteArray() 36 | } 37 | } 38 | 39 | override fun deserialize( 40 | version: Int, 41 | serialized: ByteArray, 42 | ): HttpSplit { 43 | if (version != VERSION) throw IOException("Unknown version: $version") 44 | return ByteArrayInputStream(serialized).use { bais -> 45 | DataInputStream(bais).use { inp -> 46 | HttpSplit(inp.readUTF()) 47 | } 48 | } 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /sql-training/client-image/java/sql-training-data-producer/src/main/java/com/ververica/sql_training/data_producer/json_serde/JsonDeserializer.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Ververica GmbH 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.ververica.sql_training.data_producer.json_serde; 18 | 19 | import com.fasterxml.jackson.databind.ObjectMapper; 20 | 21 | import java.io.IOException; 22 | 23 | /** 24 | * Deserializes a record from a JSON string. 25 | * 26 | * @param <T> The type of the deserialized record.
27 | */ 28 | public class JsonDeserializer<T> { 29 | 30 | private final Class<T> recordClazz; 31 | private final ObjectMapper jsonMapper; 32 | 33 | public JsonDeserializer(Class<T> recordClazz) { 34 | this.recordClazz = recordClazz; 35 | this.jsonMapper = new ObjectMapper(); 36 | } 37 | 38 | public T parseFromString(String line) { 39 | try { 40 | return jsonMapper.readValue(line, this.recordClazz); 41 | } catch (IOException e) { 42 | throw new IllegalArgumentException("Could not deserialize record: " + line + " as class " + recordClazz, e); 43 | } 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /datorios/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3.7' 2 | services: 3 | jobmanager: 4 | image: localhost/metro-flink:1.17.2 5 | pull_policy: never 6 | command: jobmanager 7 | container_name: datorios-${CLUSTER_NAME}-jobmanager 8 | volumes: 9 | - ${MOUNT_SRC_PATH}:${MOUNT_DST_PATH} 10 | ports: 11 | - "${CLUSTER_JOB_MANAGER_PORT}:8081" 12 | environment: 13 | - | 14 | FLINK_PROPERTIES= 15 | jobmanager.rpc.address: jobmanager 16 | taskmanager: 17 | image: localhost/metro-flink:1.17.2 18 | pull_policy: never 19 | depends_on: 20 | - jobmanager 21 | command: taskmanager 22 | volumes: 23 | - ${MOUNT_SRC_PATH}:${MOUNT_DST_PATH} 24 | environment: 25 | - | 26 | FLINK_PROPERTIES= 27 | jobmanager.rpc.address: jobmanager 28 | taskmanager.memory.task.off-heap.size: 128mb 29 | runner: 30 | image: localhost/metro-flink-runner:1.17.2 31 | pull_policy: never 32 | depends_on: 33 | - jobmanager 34 | - taskmanager 35 | - fluent-bit 36 | tty: true 37 | container_name: datorios-${CLUSTER_NAME}-runner 38 | volumes: 39 | - ${MOUNT_SRC_PATH}:${MOUNT_DST_PATH} 40 | env_file: 41 | - .env 42 | environment: 43 | - CLUSTER_NAME=${CLUSTER_NAME} 44 | fluent-bit: 45 | image: localhost/metro-fluent-bit:2.2.2 46 | pull_policy: never 47 | depends_on: 48 | - jobmanager 49 | - taskmanager 50 | command: /opt/fluent-bit/bin/fluent-bit -c /fluent-bit/config.conf 51 | container_name: datorios-${CLUSTER_NAME}-fluent-bit 52 | volumes: 53 | - /var/run/docker.sock:/var/run/docker.sock:ro 54 | env_file: 55 | - .env 56 | environment: 57 | - CLUSTER_NAME=${CLUSTER_NAME} 58 | -------------------------------------------------------------------------------- /real-time-streaming-aws/compose-extra.yml: -------------------------------------------------------------------------------- 1 | version: "3.5" 2 | 3 | services: 4 | opensearch: 5 | image: opensearchproject/opensearch:2.7.0 6 | container_name: opensearch 7 | environment: 8 | - discovery.type=single-node 9 | - node.name=opensearch 10 | - DISABLE_SECURITY_PLUGIN=true 11 | - "OPENSEARCH_JAVA_OPTS=-Xms512m -Xmx512m" 12 | volumes: 13 | - opensearch_data:/usr/share/opensearch/data 14 | ports: 15 | - 9200:9200 16 | - 9600:9600 17 | networks: 18 | - appnet 19 | opensearch-dashboards: 20 | image: opensearchproject/opensearch-dashboards:2.7.0 21 | container_name: opensearch-dashboards 22 | ports: 23 | - 5601:5601 24 | expose: 25 | - "5601" 26 | environment: 27 | OPENSEARCH_HOSTS: '["http://opensearch:9200"]' 28 | DISABLE_SECURITY_DASHBOARDS_PLUGIN: true 29 | networks: 30 | - appnet 31 | kafka-connect: 32 | image: bitnami/kafka:2.8.1 33 | container_name: connect 34 | command: > 35 | /opt/bitnami/kafka/bin/connect-distributed.sh 36 | /opt/bitnami/kafka/config/connect-distributed.properties 37 | ports: 38 | - "8083:8083" 39 | networks: 40 | - appnet 41 | environment: 42 | AWS_ACCESS_KEY_ID:
$AWS_ACCESS_KEY_ID 43 | AWS_SECRET_ACCESS_KEY: $AWS_SECRET_ACCESS_KEY 44 | volumes: 45 | - "./configs/connect-distributed.properties:/opt/bitnami/kafka/config/connect-distributed.properties" 46 | - "./infra/connectors/camel-aws-ddb-sink-kafka-connector:/opt/connectors/camel-aws-ddb-sink-kafka-connector" 47 | 48 | networks: 49 | appnet: 50 | external: true 51 | name: app-network 52 | 53 | volumes: 54 | opensearch_data: 55 | driver: local 56 | name: opensearch_data 57 | -------------------------------------------------------------------------------- /sql-training/client-image/java/sql-training-data-producer/src/main/java/com/ververica/sql_training/data_producer/json_serde/JsonSerializer.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Ververica GmbH 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.ververica.sql_training.data_producer.json_serde; 18 | 19 | import com.fasterxml.jackson.core.JsonProcessingException; 20 | import com.fasterxml.jackson.databind.ObjectMapper; 21 | 22 | /** 23 | * Serializes a record as JSON string. 24 | * 25 | * @param <T> The type for the records to serialize. 26 | */ 27 | public class JsonSerializer<T> { 28 | 29 | private final ObjectMapper jsonMapper = new ObjectMapper(); 30 | 31 | public String toJSONString(T r) { 32 | try { 33 | return jsonMapper.writeValueAsString(r); 34 | } catch (JsonProcessingException e) { 35 | throw new IllegalArgumentException("Could not serialize record: " + r, e); 36 | } 37 | } 38 | 39 | public byte[] toJSONBytes(T r) { 40 | try { 41 | return jsonMapper.writeValueAsBytes(r); 42 | } catch (JsonProcessingException e) { 43 | throw new IllegalArgumentException("Could not serialize record: " + r, e); 44 | } 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /sql-cookbook/compose-kafka.yml: -------------------------------------------------------------------------------- 1 | version: "3.5" 2 | 3 | services: 4 | zookeeper: 5 | image: bitnami/zookeeper:3.5 6 | container_name: zookeeper 7 | ports: 8 | - "2181" 9 | networks: 10 | - appnet 11 | environment: 12 | - ALLOW_ANONYMOUS_LOGIN=yes 13 | volumes: 14 | - zookeeper_data:/bitnami/zookeeper 15 | kafka-0: 16 | image: bitnami/kafka:2.8.1 17 | container_name: kafka-0 18 | expose: 19 | - 9092 20 | ports: 21 | - "29092:29092" 22 | networks: 23 | - appnet 24 | environment: 25 | - ALLOW_PLAINTEXT_LISTENER=yes 26 | - KAFKA_CFG_ZOOKEEPER_CONNECT=zookeeper:2181 27 | - KAFKA_CFG_BROKER_ID=0 28 | - KAFKA_CFG_LISTENER_SECURITY_PROTOCOL_MAP=INTERNAL:PLAINTEXT,EXTERNAL:PLAINTEXT 29 | - KAFKA_CFG_LISTENERS=INTERNAL://:9092,EXTERNAL://:29092 30 | - KAFKA_CFG_ADVERTISED_LISTENERS=INTERNAL://kafka-0:9092,EXTERNAL://localhost:29092 31 | - KAFKA_CFG_INTER_BROKER_LISTENER_NAME=INTERNAL 32 | # - KAFKA_CFG_NUM_PARTITIONS=3 33 | # - KAFKA_CFG_DEFAULT_REPLICATION_FACTOR=1 34 | volumes: 35 | - kafka_0_data:/bitnami/kafka 36 | depends_on: 37 | -
zookeeper 38 | kafka-ui: 39 | image: provectuslabs/kafka-ui:master 40 | container_name: kafka-ui 41 | ports: 42 | - "8080:8080" 43 | networks: 44 | - appnet 45 | environment: 46 | KAFKA_CLUSTERS_0_NAME: local 47 | KAFKA_CLUSTERS_0_BOOTSTRAPSERVERS: kafka-0:9092 48 | KAFKA_CLUSTERS_0_ZOOKEEPER: zookeeper:2181 49 | depends_on: 50 | - zookeeper 51 | - kafka-0 52 | 53 | networks: 54 | appnet: 55 | external: true 56 | name: app-network 57 | 58 | volumes: 59 | zookeeper_data: 60 | driver: local 61 | name: zookeeper_data 62 | kafka_0_data: 63 | driver: local 64 | name: kafka_0_data 65 | -------------------------------------------------------------------------------- /pyflink-udemy/s4_13_row_operations.py: -------------------------------------------------------------------------------- 1 | from statistics import stdev, mean 2 | 3 | from pyflink.common import Row 4 | from pyflink.table import EnvironmentSettings, TableEnvironment, DataTypes, CsvTableSource 5 | from pyflink.table.udf import udf 6 | 7 | tbl_env = TableEnvironment.create(EnvironmentSettings.in_batch_mode()) 8 | 9 | field_names = "seller_id,q1,q2,q3,q4".split(",") 10 | field_types = [ 11 | DataTypes.STRING(), 12 | DataTypes.INT(), 13 | DataTypes.INT(), 14 | DataTypes.INT(), 15 | DataTypes.INT(), 16 | ] 17 | source = CsvTableSource("./quarterly-sales-input", field_names, field_types, ignore_first_line=True) 18 | tbl_env.register_table_source("quarterly_sales", source) 19 | 20 | tbl = tbl_env.from_path("quarterly_sales") 21 | print("\nQuarterly Sales Schema") 22 | tbl.print_schema() 23 | 24 | print("\nQuarterly Sales Data") 25 | tbl.execute().print() 26 | 27 | 28 | @udf( 29 | result_type=DataTypes.ROW( 30 | [ 31 | DataTypes.FIELD("seller_id", DataTypes.STRING()), 32 | DataTypes.FIELD("sales_total", DataTypes.INT()), 33 | DataTypes.FIELD("qtr_avg", DataTypes.DOUBLE()), 34 | DataTypes.FIELD("qtr_stdev", DataTypes.DOUBLE()), 35 | ] 36 | ) 37 | ) 38 | def sales_summary_stats(seller_sales: Row) -> Row: 39 | seller_id, q1, q2, q3, q4 = seller_sales 40 | sales = (q1, q2, q3, q4) 41 | total_sales = sum(sales) 42 | qtr_avg = round(mean(sales), 2) 43 | qtr_stdev = round(stdev(sales), 2) 44 | return Row(seller_id, total_sales, qtr_avg, qtr_stdev) 45 | 46 | 47 | sales_stats = tbl.map(sales_summary_stats).alias( 48 | "seller_id", "total_sales", "quarterly_avg", "quarterly_stdev" 49 | ) 50 | 51 | print("\nSales Summary Stats schema") 52 | sales_stats.print_schema() 53 | 54 | print("\nSales Summary Stats data") 55 | sales_stats.execute().print() 56 | -------------------------------------------------------------------------------- /fraud-detection/local/build.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | PKG_ALL="${PKG_ALL:-no}" 3 | 4 | SCRIPT_DIR="$(cd $(dirname "$0"); pwd)" 5 | 6 | #### Steps to package the flink app 7 | SRC_PATH=$SCRIPT_DIR/package 8 | rm -rf $SRC_PATH && mkdir -p $SRC_PATH/lib 9 | 10 | ## Download flink sql connector kafka 11 | echo "download flink sql connector kafka..." 12 | VERSION=1.15.2 13 | FILE_NAME=flink-sql-connector-kafka-$VERSION 14 | FLINK_SRC_DOWNLOAD_URL=https://repo.maven.apache.org/maven2/org/apache/flink/flink-sql-connector-kafka/$VERSION/flink-sql-connector-kafka-$VERSION.jar 15 | curl -L -o $SRC_PATH/lib/$FILE_NAME.jar ${FLINK_SRC_DOWNLOAD_URL} 16 | 17 | ## Install pip packages 18 | echo "install and zip pip packages..." 
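# (Added note) The --target flag below drops the dependencies into a self-contained
# site_packages folder; the optional zip step later in this script bundles that folder,
# processor.py and the connector jar under package/lib into kda-package.zip. This assumes
# the local pip3/Python minor version matches the Python runtime used by the Flink image.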
19 | pip3 install -r requirements.txt --target $SRC_PATH/site_packages 20 | 21 | if [ $PKG_ALL == "yes" ]; then 22 | ## Package pyflink app 23 | echo "package pyflink app" 24 | zip -r kda-package.zip processor.py package/lib package/site_packages 25 | fi 26 | 27 | #### Steps to create the sink connector 28 | CONN_PATH=$SCRIPT_DIR/connectors 29 | rm -rf $CONN_PATH && mkdir $CONN_PATH 30 | 31 | ## Download camel dynamodb sink connector 32 | echo "download camel dynamodb sink connector..." 33 | CONNECTOR_SRC_DOWNLOAD_URL=https://repo.maven.apache.org/maven2/org/apache/camel/kafkaconnector/camel-aws-ddb-sink-kafka-connector/3.20.3/camel-aws-ddb-sink-kafka-connector-3.20.3-package.tar.gz 34 | 35 | ## decompress and zip contents to create custom plugin of msk connect later 36 | curl -o $CONN_PATH/camel-aws-ddb-sink-kafka-connector.tar.gz $CONNECTOR_SRC_DOWNLOAD_URL \ 37 | && tar -xvzf $CONN_PATH/camel-aws-ddb-sink-kafka-connector.tar.gz -C $CONN_PATH \ 38 | && cd $CONN_PATH/camel-aws-ddb-sink-kafka-connector \ 39 | && zip -r camel-aws-ddb-sink-kafka-connector.zip . \ 40 | && mv camel-aws-ddb-sink-kafka-connector.zip $CONN_PATH \ 41 | && rm $CONN_PATH/camel-aws-ddb-sink-kafka-connector.tar.gz -------------------------------------------------------------------------------- /pyflink-doc/data.py: -------------------------------------------------------------------------------- 1 | word_count_data = [ 2 | "To be, or not to be,--that is the question:--", 3 | "Whether 'tis nobler in the mind to suffer", 4 | "The slings and arrows of outrageous fortune", 5 | "Or to take arms against a sea of troubles,", 6 | "And by opposing end them?--To die,--to sleep,--", 7 | "No more; and by a sleep to say we end", 8 | "The heartache, and the thousand natural shocks", 9 | "That flesh is heir to,--'tis a consummation", 10 | "Devoutly to be wish'd. To die,--to sleep;--", 11 | "To sleep! perchance to dream:--ay, there's the rub;", 12 | "For in that sleep of death what dreams may come,", 13 | "When we have shuffled off this mortal coil,", 14 | "Must give us pause: there's the respect", 15 | "That makes calamity of so long life;", 16 | "For who would bear the whips and scorns of time,", 17 | "The oppressor's wrong, the proud man's contumely,", 18 | "The pangs of despis'd love, the law's delay,", 19 | "The insolence of office, and the spurns", 20 | "That patient merit of the unworthy takes,", 21 | "When he himself might his quietus make", 22 | "With a bare bodkin? 
who would these fardels bear,", 23 | "To grunt and sweat under a weary life,", 24 | "But that the dread of something after death,--", 25 | "The undiscover'd country, from whose bourn", 26 | "No traveller returns,--puzzles the will,", 27 | "And makes us rather bear those ills we have", 28 | "Than fly to others that we know not of?", 29 | "Thus conscience does make cowards of us all;", 30 | "And thus the native hue of resolution", 31 | "Is sicklied o'er with the pale cast of thought;", 32 | "And enterprises of great pith and moment,", 33 | "With this regard, their currents turn awry,", 34 | "And lose the name of action.--Soft you now!", 35 | "The fair Ophelia!--Nymph, in thy orisons", 36 | "Be all my sins remember'd.", 37 | ] 38 | -------------------------------------------------------------------------------- /pyflink-getting-started-on-aws/remote/infra/variables.tf: -------------------------------------------------------------------------------- 1 | variable "vpn_to_create" { 2 | description = "Flag to indicate whether to create VPN" 3 | type = bool 4 | default = true 5 | } 6 | 7 | variable "vpn_to_use_spot" { 8 | description = "Flag to indicate whether to use a spot instance for VPN" 9 | type = bool 10 | default = false 11 | } 12 | 13 | variable "vpn_to_limit_vpn_ingress" { 14 | description = "Flag to indicate whether to limit ingress from the current machine's IP address" 15 | type = bool 16 | default = true 17 | } 18 | 19 | locals { 20 | name = "kda-getting-started" 21 | region = data.aws_region.current.name 22 | environment = "dev" 23 | 24 | vpc = { 25 | cidr = "10.0.0.0/16" 26 | azs = slice(data.aws_availability_zones.available.names, 0, 3) 27 | } 28 | 29 | default_bucket = { 30 | name = "${local.name}-${data.aws_caller_identity.current.account_id}-${local.region}" 31 | to_set_acl = false 32 | } 33 | 34 | vpn = { 35 | to_create = var.vpn_to_create 36 | to_use_spot = var.vpn_to_use_spot 37 | ingress_cidr = var.vpn_to_limit_vpn_ingress ? 
"${data.http.local_ip_address.response_body}/32" : "0.0.0.0/0" 38 | spot_override = [ 39 | { instance_type : "t3.small" }, 40 | { instance_type : "t3a.small" }, 41 | ] 42 | } 43 | 44 | msk = { 45 | version = "2.8.1" 46 | instance_size = "kafka.m5.large" 47 | ebs_volume_size = 20 48 | log_retention_ms = 604800000 # 7 days 49 | number_of_broker_nodes = 2 50 | num_partitions = 2 51 | default_replication_factor = 2 52 | } 53 | 54 | kda = { 55 | to_create = false 56 | runtime_env = "FLINK-1_15" 57 | package_name = "kda-package.zip" 58 | } 59 | 60 | tags = { 61 | Name = local.name 62 | Environment = local.environment 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /real-time-streaming-aws/forwarder/flinksql.sql: -------------------------------------------------------------------------------- 1 | -- docker exec -it jobmanager ./bin/sql-client.sh 2 | 3 | SET 'state.checkpoints.dir' = 'file:///tmp/checkpoints/'; 4 | SET 'execution.checkpointing.interval' = '5000'; 5 | 6 | ADD JAR '/etc/flink/package/lib/lab4-pipeline-1.0.0.jar'; 7 | 8 | CREATE TABLE taxi_rides_src ( 9 | id VARCHAR, 10 | vendor_id INT, 11 | pickup_date VARCHAR, 12 | dropoff_date VARCHAR, 13 | passenger_count INT, 14 | pickup_longitude VARCHAR, 15 | pickup_latitude VARCHAR, 16 | dropoff_longitude VARCHAR, 17 | dropoff_latitude VARCHAR, 18 | store_and_fwd_flag VARCHAR, 19 | gc_distance INT, 20 | trip_duration INT, 21 | google_distance INT, 22 | google_duration INT, 23 | process_time AS PROCTIME() 24 | ) WITH ( 25 | 'connector' = 'kafka', 26 | 'topic' = 'taxi-rides', 27 | 'properties.bootstrap.servers' = 'kafka-0:9092', 28 | 'properties.group.id' = 'soruce-group', 29 | 'format' = 'json', 30 | 'scan.startup.mode' = 'latest-offset' 31 | ); 32 | 33 | CREATE TABLE taxi_rides_sink ( 34 | vendor_id VARCHAR, 35 | trip_count BIGINT NOT NULL, 36 | passenger_count INT, 37 | trip_duration INT, 38 | window_start TIMESTAMP(3) NOT NULL, 39 | window_end TIMESTAMP(3) NOT NULL 40 | ) WITH ( 41 | 'connector' = 'opensearch', 42 | 'hosts' = 'http://opensearch:9200', 43 | 'index' = 'trip_stats' 44 | ); 45 | 46 | INSERT INTO taxi_rides_sink 47 | SELECT 48 | CAST(vendor_id AS STRING) AS vendor_id, 49 | COUNT(id) AS trip_count, 50 | SUM(passenger_count) AS passenger_count, 51 | SUM(trip_duration) AS trip_duration, 52 | window_start, 53 | window_end 54 | FROM TABLE( 55 | TUMBLE(TABLE taxi_rides_src, DESCRIPTOR(process_time), INTERVAL '5' SECONDS)) 56 | GROUP BY vendor_id, window_start, window_end; -------------------------------------------------------------------------------- /pyflink-getting-started-on-aws/remote/compose-flink.yml: -------------------------------------------------------------------------------- 1 | version: "3.5" 2 | 3 | services: 4 | jobmanager: 5 | image: pyflink:1.15.2-scala_2.12 6 | container_name: jobmanager 7 | command: jobmanager 8 | ports: 9 | - "8081:8081" 10 | networks: 11 | - flinknet 12 | environment: 13 | - | 14 | FLINK_PROPERTIES= 15 | jobmanager.rpc.address: jobmanager 16 | state.backend: filesystem 17 | state.checkpoints.dir: file:///tmp/flink-checkpoints 18 | heartbeat.interval: 1000 19 | heartbeat.timeout: 5000 20 | rest.flamegraph.enabled: true 21 | web.backpressure.refresh-interval: 10000 22 | - RUNTIME_ENV=DOCKER 23 | - BOOTSTRAP_SERVERS=$BOOTSTRAP_SERVERS 24 | - AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID 25 | - AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY 26 | - AWS_SESSION_TOKEN=$AWS_SESSION_TOKEN 27 | volumes: 28 | - $PWD:/etc/flink 29 | taskmanager: 30 | image: 
pyflink:1.15.2-scala_2.12 31 | container_name: taskmanager 32 | command: taskmanager 33 | networks: 34 | - flinknet 35 | volumes: 36 | - flink_data:/tmp/ 37 | - $PWD:/etc/flink 38 | environment: 39 | - | 40 | FLINK_PROPERTIES= 41 | jobmanager.rpc.address: jobmanager 42 | taskmanager.numberOfTaskSlots: 3 43 | state.backend: filesystem 44 | state.checkpoints.dir: file:///tmp/flink-checkpoints 45 | heartbeat.interval: 1000 46 | heartbeat.timeout: 5000 47 | - RUNTIME_ENV=DOCKER 48 | - BOOTSTRAP_SERVERS=$BOOTSTRAP_SERVERS 49 | - AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID 50 | - AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY 51 | - AWS_SESSION_TOKEN=$AWS_SESSION_TOKEN 52 | depends_on: 53 | - jobmanager 54 | 55 | networks: 56 | flinknet: 57 | name: flink-network 58 | 59 | volumes: 60 | flink_data: 61 | driver: local 62 | name: flink_data 63 | -------------------------------------------------------------------------------- /pyflink-udemy/s4_04_aggregations.py: -------------------------------------------------------------------------------- 1 | from pyflink.table import EnvironmentSettings, TableEnvironment, DataTypes, CsvTableSource 2 | from pyflink.table.expressions import col 3 | 4 | tbl_env = TableEnvironment.create(EnvironmentSettings.in_batch_mode()) 5 | 6 | field_names = "seller_id,product,quantity,product_price,sales_date".split(",") 7 | field_types = [ 8 | DataTypes.STRING(), 9 | DataTypes.STRING(), 10 | DataTypes.INT(), 11 | DataTypes.DOUBLE(), 12 | DataTypes.DATE(), 13 | ] 14 | source = CsvTableSource("./csv-input", field_names, field_types, ignore_first_line=True) 15 | 16 | tbl_env.register_table_source("product_locale_sales", source) 17 | tbl = tbl_env.from_path("product_locale_sales") 18 | 19 | tbl.order_by(col("quantity").asc).execute().print() 20 | tbl.order_by(col("quantity").asc).offset(1).fetch(2).execute().print() 21 | tbl.order_by(col("quantity").asc).fetch(8).execute().print() 22 | 23 | avg_price = ( 24 | tbl.select(col("product_price")).distinct().select(col("product_price").avg.alias("avg_price")) 25 | ) 26 | print("\navg_price data") 27 | avg_price.execute().print() 28 | 29 | avg_price2 = tbl_env.sql_query( 30 | """ 31 | SELECT avg(product_price) AS avg_price 32 | FROM product_locale_sales 33 | """ 34 | ) 35 | print("\navg_price2 data") 36 | avg_price2.execute().print() 37 | 38 | seller_revenue = ( 39 | tbl.select( 40 | col("seller_id"), col("product"), (col("product_price") * col("quantity")).alias("sales") 41 | ) 42 | .group_by(col("seller_id")) 43 | .select(col("seller_id"), col("sales").sum.alias("seller_revenue")) 44 | ) 45 | print("\nseller_revenue data") 46 | seller_revenue.execute().print() 47 | 48 | seller_revenue2 = tbl_env.sql_query( 49 | """ 50 | SELECT seller_id, sum(product_price * quantity) AS seller_revenue 51 | FROM product_locale_sales 52 | GROUP BY seller_id 53 | """ 54 | ) 55 | print("\nseller_revenue2 data") 56 | seller_revenue2.execute().print() 57 | -------------------------------------------------------------------------------- /sql-training/client-image/java/sql-training-data-producer/src/main/java/com/ververica/sql_training/data_producer/records/Ride.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Ververica GmbH 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.ververica.sql_training.data_producer.records; 18 | 19 | import com.fasterxml.jackson.annotation.JsonFormat; 20 | 21 | import java.util.Date; 22 | 23 | /** 24 | * POJO for a Ride record. 25 | */ 26 | public class Ride implements TaxiRecord { 27 | 28 | @JsonFormat 29 | private long rideId; 30 | @JsonFormat 31 | private boolean isStart; 32 | @JsonFormat 33 | private long taxiId; 34 | @JsonFormat(shape = JsonFormat.Shape.STRING, pattern = "yyyy-MM-dd'T'HH:mm:ss'Z'") 35 | private Date eventTime; 36 | @JsonFormat 37 | private double lon; 38 | @JsonFormat 39 | private double lat; 40 | @JsonFormat 41 | private byte psgCnt; 42 | 43 | public Ride() {} 44 | 45 | public Ride(long rideId, boolean isStart, long taxiId, Date eventTime, double lon, double lat, byte psgCnt) { 46 | this.rideId = rideId; 47 | this.isStart = isStart; 48 | this.taxiId = taxiId; 49 | this.eventTime = eventTime; 50 | this.lon = lon; 51 | this.lat = lat; 52 | this.psgCnt = psgCnt; 53 | } 54 | 55 | @Override 56 | public Date getEventTime() { 57 | return eventTime; 58 | } 59 | 60 | } 61 | -------------------------------------------------------------------------------- /stream-processing-with-flink/src/main/kotlin/me/jaehyeon/chapter5/RollingSum.kt: -------------------------------------------------------------------------------- 1 | package me.jaehyeon.chapter5 2 | 3 | import org.apache.flink.api.java.tuple.Tuple3 4 | import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment 5 | 6 | /** 7 | * This Flink job demonstrates how to compute a "rolling sum" using the `.sum()` 8 | * aggregation function on a KeyedStream. 9 | * 10 | * A rolling sum is a continuous aggregation that is updated for every input event, 11 | * as opposed to a windowed aggregation which only emits a result at the end of a window. 12 | * 13 | * The pipeline works as follows: 14 | * 1. **Source**: A simple, static stream of `Tuple3` is created, where the first 15 | * element is the key and the second is the value to be summed. 16 | * 2. **KeyBy**: The stream is partitioned by the first field (`f0`) using a type-safe lambda. 17 | * 3. **Sum**: The `.sum(1)` operator maintains a running sum of the second field (at index 1) 18 | * for each key. It is a concise way to perform this specific aggregation. 19 | * 4. **Sink**: The resulting stream of continuous, updated sums is printed to the console. 20 | * 21 | * **Note on Best Practices:** While `.sum()` is convenient for Tuples, the modern, 22 | * recommended approach for most streaming applications is to use the more flexible and 23 | * fully type-safe `.reduce()` operator, especially when working with custom data classes. 
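 * As an illustration only (not from the original file), a hedged sketch of the same
 * rolling sum expressed with `reduce`, assuming only the second field needs to be
 * accumulated while the key is carried through:
 *
 *   inputStream
 *       .keyBy { it.f0 }
 *       .reduce { acc, cur -> Tuple3(acc.f0, acc.f1 + cur.f1, cur.f2) }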
24 | */ 25 | object RollingSum { 26 | @JvmStatic 27 | fun main(args: Array<String>) { 28 | val env = StreamExecutionEnvironment.getExecutionEnvironment() 29 | 30 | val inputStream = 31 | env.fromData( 32 | Tuple3(1, 2, 2), 33 | Tuple3(2, 3, 1), 34 | Tuple3(2, 2, 4), 35 | Tuple3(1, 5, 3), 36 | ) 37 | 38 | val resultStream = inputStream.keyBy { it.f0 }.sum(1) 39 | 40 | resultStream.print() 41 | 42 | env.execute("Rolling Sum Example") 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /stream-processing-with-flink/src/main/kotlin/me/jaehyeon/misc/ControlStreamGenerator.kt: -------------------------------------------------------------------------------- 1 | package me.jaehyeon.misc 2 | 3 | import org.apache.flink.api.common.eventtime.WatermarkStrategy 4 | import org.apache.flink.api.common.typeinfo.TypeInformation 5 | import org.apache.flink.api.connector.source.util.ratelimit.RateLimiterStrategy 6 | import org.apache.flink.connector.datagen.source.DataGeneratorSource 7 | import org.apache.flink.streaming.api.datastream.DataStreamSource 8 | import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment 9 | 10 | /** 11 | * A generic data generator for creating mock control streams in Flink. 12 | * This object provides a single, reusable method to generate a DataStreamSource 13 | * from a provided list of data, emitting one element per second. 14 | */ 15 | object ControlStreamGenerator { 16 | /** 17 | * Creates a Flink DataStreamSource from a list of elements. 18 | * 19 | * @param T The type of elements in the stream. 20 | * @param env The Flink StreamExecutionEnvironment. 21 | * @param sourceName A descriptive name for the Flink source. 22 | * @param data The list of data to be emitted by the source. 23 | * @param typeInfo The Flink TypeInformation for the data type T. 24 | * @return A DataStreamSource that will emit the elements from the data list. 25 | */ 26 | fun <T> createSource( 27 | env: StreamExecutionEnvironment, 28 | sourceName: String, 29 | data: List<T>, 30 | typeInfo: TypeInformation<T>, 31 | ): DataStreamSource<T> { 32 | val generatorSource = 33 | DataGeneratorSource( 34 | { index -> data[index.toInt()] }, 35 | data.size.toLong(), 36 | RateLimiterStrategy.perSecond(1.0), 37 | typeInfo, 38 | ) 39 | 40 | return env.fromSource( 41 | generatorSource, 42 | WatermarkStrategy.noWatermarks(), 43 | sourceName, 44 | ) 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /fraud-detection/remote/build.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | shopt -s extglob 3 | 4 | PKG_ALL="${PKG_ALL:-yes}" 5 | SCRIPT_DIR="$(cd $(dirname "$0"); pwd)" 6 | 7 | #### Steps to package the flink app 8 | # remove contents under $SRC_PATH (except for uber-jar-for-pyflink) and kda-package.zip file 9 | SRC_PATH=$SCRIPT_DIR/package 10 | rm -rf $SRC_PATH/!(uber-jar-for-pyflink) kda-package.zip 11 | 12 | ## Generate Uber Jar for PyFlink app for MSK cluster with IAM authN 13 | echo "generate Uber jar for PyFlink app..." 14 | mkdir $SRC_PATH/lib 15 | mvn clean install -f $SRC_PATH/uber-jar-for-pyflink/pom.xml \ 16 | && mv $SRC_PATH/uber-jar-for-pyflink/target/pyflink-getting-started-1.0.0.jar $SRC_PATH/lib \ 17 | && rm -rf $SRC_PATH/uber-jar-for-pyflink/target 18 | 19 | ## Install pip packages 20 | echo "install and zip pip packages..."
21 | pip install -r requirements.txt --target $SRC_PATH/site_packages 22 | 23 | if [ $PKG_ALL == "yes" ]; then 24 | ## Package pyflink app 25 | echo "package pyflink app" 26 | zip -r kda-package.zip processor.py package/lib package/site_packages 27 | fi 28 | 29 | #### Steps to create the sink connector 30 | CONN_PATH=$SCRIPT_DIR/connectors 31 | rm -rf $CONN_PATH && mkdir $CONN_PATH 32 | 33 | ## Download camel dynamodb sink connector 34 | echo "download camel dynamodb sink connector..." 35 | CONNECTOR_SRC_DOWNLOAD_URL=https://repo.maven.apache.org/maven2/org/apache/camel/kafkaconnector/camel-aws-ddb-sink-kafka-connector/3.20.3/camel-aws-ddb-sink-kafka-connector-3.20.3-package.tar.gz 36 | 37 | ## decompress and zip contents to create custom plugin of msk connect later 38 | curl -o $CONN_PATH/camel-aws-ddb-sink-kafka-connector.tar.gz $CONNECTOR_SRC_DOWNLOAD_URL \ 39 | && tar -xvzf $CONN_PATH/camel-aws-ddb-sink-kafka-connector.tar.gz -C $CONN_PATH \ 40 | && cd $CONN_PATH/camel-aws-ddb-sink-kafka-connector \ 41 | && zip -r camel-aws-ddb-sink-kafka-connector.zip . \ 42 | && mv camel-aws-ddb-sink-kafka-connector.zip $CONN_PATH \ 43 | && rm $CONN_PATH/camel-aws-ddb-sink-kafka-connector.tar.gz -------------------------------------------------------------------------------- /pyflink-udemy/s4_01_projections.py: -------------------------------------------------------------------------------- 1 | from pyflink.table import EnvironmentSettings, TableEnvironment, DataTypes, CsvTableSource 2 | from pyflink.table.expressions import col 3 | 4 | tbl_env = TableEnvironment.create(EnvironmentSettings.in_batch_mode()) 5 | 6 | field_names = "seller_id,product,quantity,product_price,sales_date".split(",") 7 | field_types = [ 8 | DataTypes.STRING(), 9 | DataTypes.STRING(), 10 | DataTypes.INT(), 11 | DataTypes.DOUBLE(), 12 | DataTypes.DATE(), 13 | ] 14 | source = CsvTableSource("./csv-input", field_names, field_types, ignore_first_line=True) 15 | 16 | tbl_env.register_table_source("product_locale_sales", source) 17 | tbl = tbl_env.from_path("product_locale_sales") 18 | 19 | redundant_prices = tbl.select(col("product"), col("product_price").alias("price")) 20 | print("\nredundant_prices data") 21 | redundant_prices.execute().print() 22 | 23 | redundant_prices2 = tbl_env.sql_query( 24 | f"SELECT product, product_price As price FROM product_locale_sales" 25 | ) 26 | print("\nredundant_prices2 data") 27 | redundant_prices2.execute().print() 28 | 29 | distinct_prices = tbl.select(col("product"), col("product_price").alias("price")).distinct() 30 | print("\ndistinct_prices data") 31 | distinct_prices.execute().print() 32 | 33 | distinct_prices2 = tbl_env.sql_query( 34 | "SELECT DISTINCT product, product_price AS price FROM product_locale_sales" 35 | ) 36 | print("\ndistinct_prices2 data") 37 | distinct_prices2.execute().print() 38 | 39 | product_sales = tbl.select( 40 | col("sales_date"), 41 | col("seller_id"), 42 | col("product"), 43 | (col("product_price") * col("quantity")).alias("sales"), 44 | ).distinct() 45 | print("\nproduct_sales data") 46 | product_sales.execute().print() 47 | 48 | product_sales2 = tbl_env.sql_query( 49 | """ 50 | SELECT DISTINCT 51 | sales_date, seller_id, product, product_price * quantity AS sales 52 | FROM product_locale_sales 53 | """ 54 | ) 55 | print("\nproduct_sales2 data") 56 | product_sales2.execute().print() 57 | -------------------------------------------------------------------------------- /pyflink-udemy/s3_04_kafka_source.py: 
-------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pyflink.table import EnvironmentSettings, TableEnvironment 4 | 5 | BOOTSTRAP_SERVERS = os.getenv("BOOTSTRAP_SERVERS", "localhost:29092") 6 | # https://nightlies.apache.org/flink/flink-docs-release-1.16/docs/connectors/table/kafka/ 7 | version_map = {"15": "1.15.4", "16": "1.16.0"} 8 | FLINK_VERSION = version_map[os.getenv("MINOR_VERSION", "15")] 9 | FLINK_SQL_CONNECTOR_KAFKA = f"flink-sql-connector-kafka-{FLINK_VERSION}.jar" 10 | 11 | env_settings = EnvironmentSettings.in_streaming_mode() 12 | table_env = TableEnvironment.create(env_settings) 13 | # https://nightlies.apache.org/flink/flink-docs-release-1.16/docs/dev/python/dependency_management/ 14 | kafka_jar = os.path.join(os.path.abspath(os.path.dirname(__file__)), FLINK_SQL_CONNECTOR_KAFKA) 15 | table_env.get_config().set("pipeline.jars", f"file://{kafka_jar}") 16 | 17 | ## create kafka source table 18 | table_env.execute_sql( 19 | f""" 20 | CREATE TABLE product_sales ( 21 | `seller_id` VARCHAR, 22 | `product` VARCHAR, 23 | `quantity` INT, 24 | `product_price` DOUBLE, 25 | `sales_date` VARCHAR 26 | ) WITH ( 27 | 'connector' = 'kafka', 28 | 'topic' = 'product_sales', 29 | 'properties.bootstrap.servers' = '{BOOTSTRAP_SERVERS}', 30 | 'properties.group.id' = 'source-demo', 31 | 'format' = 'json', 32 | 'scan.startup.mode' = 'earliest-offset', 33 | 'json.fail-on-missing-field' = 'false', 34 | 'json.ignore-parse-errors' = 'true' 35 | ) 36 | """ 37 | ) 38 | 39 | ## create print sink table 40 | table_env.execute_sql( 41 | f""" 42 | CREATE TABLE print ( 43 | `seller_id` VARCHAR, 44 | `product` VARCHAR, 45 | `quantity` INT, 46 | `product_price` DOUBLE, 47 | `sales_date` VARCHAR 48 | ) WITH ( 49 | 'connector' = 'print' 50 | ) 51 | """ 52 | ) 53 | 54 | ## insert into sink table 55 | tbl = table_env.from_path("product_sales") 56 | tbl.execute_insert("print").wait() 57 | -------------------------------------------------------------------------------- /sql-training/client-image/java/sql-training-data-producer/src/main/java/com/ververica/sql_training/data_producer/records/Fare.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Ververica GmbH 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.ververica.sql_training.data_producer.records; 18 | 19 | import com.fasterxml.jackson.annotation.JsonFormat; 20 | 21 | import java.util.Date; 22 | 23 | /** 24 | * POJO for a Fare record. 
25 | */ 26 | public class Fare implements TaxiRecord { 27 | 28 | @JsonFormat 29 | private long rideId; 30 | @JsonFormat(shape = JsonFormat.Shape.STRING, pattern = "yyyy-MM-dd'T'HH:mm:ss'Z'") 31 | private Date eventTime; 32 | @JsonFormat(shape = JsonFormat.Shape.STRING) 33 | private PayMethod payMethod; 34 | @JsonFormat 35 | private double fare; 36 | @JsonFormat 37 | private double toll; 38 | @JsonFormat 39 | private double tip; 40 | 41 | public Fare() {} 42 | 43 | public Fare(long rideId, Date eventTime, PayMethod payMethod, double fare, double toll, double tip) { 44 | this.rideId = rideId; 45 | this.eventTime = eventTime; 46 | this.payMethod = payMethod; 47 | this.fare = fare; 48 | this.toll = toll; 49 | this.tip = tip; 50 | } 51 | 52 | @Override 53 | public Date getEventTime() { 54 | return eventTime; 55 | } 56 | 57 | public static enum PayMethod { 58 | CSH, 59 | CRD, 60 | DIS, 61 | NOC, 62 | UNK 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /sql-training/receipts.md: -------------------------------------------------------------------------------- 1 | ## Apache Flink® SQL Training 2 | 3 | - [GitHub](https://github.com/ververica/sql-training/tree/master) 4 | 5 | ### Sessions 6 | 7 | ```sql 8 | CREATE TABLE rides ( 9 | rideId INT, 10 | taxiId INT, 11 | isStart BOOLEAN, 12 | lon FLOAT, 13 | lat FLOAT, 14 | psgCnt INT, 15 | eventTime STRING, 16 | rideTime AS TO_TIMESTAMP(eventTime, 'yyyy-MM-dd''T''HH:mm:ss''Z'''), 17 | WATERMARK FOR rideTime AS rideTime - INTERVAL '60' SECOND 18 | ) 19 | WITH ( 20 | 'connector' = 'kafka', 21 | 'topic' = 'Rides', 22 | 'properties.bootstrap.servers' = 'kafka-0:9092', 23 | 'properties.group.id' = 'rides', 24 | 'scan.startup.mode' = 'earliest-offset', 25 | 'format' = 'json' 26 | ); 27 | 28 | CREATE TABLE fairs ( 29 | rideId INT, 30 | payMethod STRING, 31 | tip FLOAT, 32 | toll FLOAT, 33 | fare FLOAT, 34 | eventTime STRING, 35 | payTime AS TO_TIMESTAMP(eventTime, 'yyyy-MM-dd''T''HH:mm:ss''Z'''), 36 | WATERMARK FOR payTime AS payTime - INTERVAL '60' SECOND 37 | ) 38 | WITH ( 39 | 'connector' = 'kafka', 40 | 'topic' = 'Fares', 41 | 'properties.bootstrap.servers' = 'kafka-0:9092', 42 | 'properties.group.id' = 'fares', 43 | 'scan.startup.mode' = 'earliest-offset', 44 | 'format' = 'json' 45 | ); 46 | 47 | CREATE TABLE driver_changes ( 48 | taxiId INT, 49 | driverId INT, 50 | eventTime STRING, 51 | usageStartTime AS TO_TIMESTAMP(eventTime, 'yyyy-MM-dd''T''HH:mm:ss''Z'''), 52 | WATERMARK FOR usageStartTime AS usageStartTime - INTERVAL '60' SECOND 53 | ) 54 | WITH ( 55 | 'connector' = 'kafka', 56 | 'topic' = 'DriverChanges', 57 | 'properties.bootstrap.servers' = 'kafka-0:9092', 58 | 'properties.group.id' = 'driver-changes', 59 | 'scan.startup.mode' = 'earliest-offset', 60 | 'format' = 'json' 61 | ); 62 | ``` 63 | 64 | #### Introduction to SQL on Flink 65 | 66 | #### Querying Dynamic Tables with SQL 67 | 68 | #### Queries and Time 69 | 70 | #### Joining Dynamic Tables 71 | 72 | #### Pattern Matching with MATCH_RECOGNIZE 73 | 74 | #### Creating Tables & Writing Query Results to External Systems 75 | -------------------------------------------------------------------------------- /stream-processing-with-pyflink/src/chapter6/utils/model.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import dataclasses 3 | from typing import Iterable, Tuple 4 | 5 | from pyflink.common import Row 6 | from pyflink.common.typeinfo import Types 7 | 8 | from .type_helper import 
TypeMapping, set_type_info 9 | 10 | 11 | @dataclasses.dataclass 12 | class SensorReading(TypeMapping): 13 | id: str 14 | timestamp: int 15 | num_records: int 16 | temperature: float 17 | 18 | def to_row(self): 19 | return Row(**dataclasses.asdict(self)) 20 | 21 | @classmethod 22 | def from_row(cls, row: Row): 23 | return cls(**row.as_dict()) 24 | 25 | @classmethod 26 | def from_tuple(cls, tup: Tuple[int, int, datetime.datetime]): 27 | return cls( 28 | id=f"sensor_{tup[0]}", 29 | timestamp=int(tup[2].strftime("%s")) * 1000, 30 | num_records=1, 31 | temperature=65 + (tup[1] / 100 * 20), 32 | ) 33 | 34 | @staticmethod 35 | def process_elements(elements: Iterable[Tuple[int, int, datetime.datetime]]): 36 | id, count, temperature = None, 0, 0 37 | for e in elements: 38 | next_id = f"sensor_{e[0]}" 39 | if id is not None: 40 | assert id == next_id 41 | id = next_id 42 | count += 1 43 | temperature += 65 + (e[1] / 100 * 20) 44 | return id, count, temperature 45 | 46 | @staticmethod 47 | def type_mapping(): 48 | return { 49 | "id": Types.STRING(), 50 | "timestamp": Types.LONG(), 51 | "num_records": Types.INT(), 52 | "temperature": Types.DOUBLE(), 53 | } 54 | 55 | @staticmethod 56 | def set_key_type_info(): 57 | return set_type_info(SensorReading.type_mapping(), selects=["id"]) 58 | 59 | @staticmethod 60 | def set_value_type_info(): 61 | return set_type_info(SensorReading.type_mapping()) 62 | 63 | 64 | @dataclasses.dataclass 65 | class MinMaxTemp: 66 | id: str 67 | min_temp: float 68 | max_temp: float 69 | num_records: int 70 | timestamp: int 71 | -------------------------------------------------------------------------------- /stream-processing-with-pyflink/src/chapter7/utils/model.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import dataclasses 3 | from typing import Iterable, Tuple 4 | 5 | from pyflink.common import Row 6 | from pyflink.common.typeinfo import Types 7 | 8 | from .type_helper import TypeMapping, set_type_info 9 | 10 | 11 | @dataclasses.dataclass 12 | class SensorReading(TypeMapping): 13 | id: str 14 | timestamp: int 15 | num_records: int 16 | temperature: float 17 | 18 | def to_row(self): 19 | return Row(**dataclasses.asdict(self)) 20 | 21 | @classmethod 22 | def from_row(cls, row: Row): 23 | return cls(**row.as_dict()) 24 | 25 | @classmethod 26 | def from_tuple(cls, tup: Tuple[int, int, datetime.datetime]): 27 | return cls( 28 | id=f"sensor_{tup[0]}", 29 | timestamp=int(tup[2].strftime("%s")) * 1000, 30 | num_records=1, 31 | temperature=65 + (tup[1] / 100 * 20), 32 | ) 33 | 34 | @staticmethod 35 | def process_elements(elements: Iterable[Tuple[int, int, datetime.datetime]]): 36 | id, count, temperature = None, 0, 0 37 | for e in elements: 38 | next_id = f"sensor_{e[0]}" 39 | if id is not None: 40 | assert id == next_id 41 | id = next_id 42 | count += 1 43 | temperature += 65 + (e[1] / 100 * 20) 44 | return id, count, temperature 45 | 46 | @staticmethod 47 | def type_mapping(): 48 | return { 49 | "id": Types.STRING(), 50 | "timestamp": Types.LONG(), 51 | "num_records": Types.INT(), 52 | "temperature": Types.DOUBLE(), 53 | } 54 | 55 | @staticmethod 56 | def set_key_type_info(): 57 | return set_type_info(SensorReading.type_mapping(), selects=["id"]) 58 | 59 | @staticmethod 60 | def set_value_type_info(): 61 | return set_type_info(SensorReading.type_mapping()) 62 | 63 | 64 | @dataclasses.dataclass 65 | class MinMaxTemp: 66 | id: str 67 | min_temp: float 68 | max_temp: float 69 | num_records: int 70 | timestamp: int 71 | 
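# --- Illustrative usage sketch (added; not part of the original module) ---
# A minimal, hedged example of wiring these helpers into a PyFlink DataStream job,
# assuming `set_type_info` wraps Types.ROW_NAMED over the mapping above and that
# `readings` is a DataStream of (id, value, datetime) tuples defined elsewhere:
#
#   rows = readings.map(
#       lambda t: SensorReading.from_tuple(t).to_row(),
#       output_type=SensorReading.set_value_type_info(),
#   )
#   keyed = rows.key_by(lambda r: r.id, key_type=Types.STRING())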
-------------------------------------------------------------------------------- /stream-processing-with-flink/src/main/kotlin/me/jaehyeon/chapter5/KeyedTransformations.kt: -------------------------------------------------------------------------------- 1 | package me.jaehyeon.chapter5 2 | 3 | import me.jaehyeon.sensor.SensorReading 4 | import me.jaehyeon.sensor.SensorSource 5 | import org.apache.flink.api.common.eventtime.WatermarkStrategy 6 | import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment 7 | import java.time.Duration 8 | 9 | /** 10 | * This Flink job demonstrates transformations on a `KeyedStream`. 11 | * 12 | * It showcases the `reduce` operator, a powerful tool for maintaining running aggregates 13 | * for each key in a stream. 14 | * 15 | * The pipeline is as follows: 16 | * 1. **Source**: Ingests a stream of `SensorReading` events. 17 | * 2. **KeyBy**: Partitions the stream by the `id` of each sensor. All subsequent 18 | * operations will run independently for each sensor. 19 | * 3. **Reduce**: For each key, this operator maintains a running state of the `SensorReading` 20 | * with the maximum temperature seen so far. For every new reading that arrives, it 21 | * compares it to the current maximum and emits the new maximum downstream. 22 | * 4. **Sink**: Prints the continuous stream of running maximums for each sensor to the console. 23 | */ 24 | object KeyedTransformations { 25 | @JvmStatic 26 | fun main(args: Array<String>) { 27 | val env = StreamExecutionEnvironment.getExecutionEnvironment() 28 | 29 | val readings = 30 | env.fromSource( 31 | SensorSource(), 32 | WatermarkStrategy 33 | .forBoundedOutOfOrderness(Duration.ofSeconds(5)) 34 | .withTimestampAssigner { reading, _ -> 35 | reading.timestamp 36 | }, 37 | "Sensor Source", 38 | ) 39 | 40 | val keyed = readings.keyBy { it.id } 41 | val maxTempPerSensor = 42 | keyed.reduce { r1, r2 -> 43 | if (r1.temperature > r2.temperature) r1 else r2 44 | } 45 | 46 | maxTempPerSensor.print() 47 | 48 | env.execute("Keyed Transformations Example") 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /pyflink-getting-started-on-aws/remote/infra/outputs.tf: -------------------------------------------------------------------------------- 1 | # VPC 2 | output "vpc_id" { 3 | description = "The ID of the VPC" 4 | value = module.vpc.vpc_id 5 | } 6 | 7 | output "vpc_cidr_block" { 8 | description = "The CIDR block of the VPC" 9 | value = module.vpc.vpc_cidr_block 10 | } 11 | 12 | output "private_subnets" { 13 | description = "List of IDs of private subnets" 14 | value = module.vpc.private_subnets 15 | } 16 | 17 | output "public_subnets" { 18 | description = "List of IDs of public subnets" 19 | value = module.vpc.public_subnets 20 | } 21 | 22 | output "nat_public_ips" { 23 | description = "List of public Elastic IPs created for AWS NAT Gateway" 24 | value = module.vpc.nat_public_ips 25 | } 26 | 27 | output "azs" { 28 | description = "A list of availability zones specified as argument to this module" 29 | value = module.vpc.azs 30 | } 31 | 32 | # Default bucket 33 | output "default_bucket_name" { 34 | description = "Default bucket name" 35 | value = aws_s3_bucket.default_bucket.id 36 | } 37 | 38 | # VPN 39 | output "vpn_launch_template_arn" { 40 | description = "The ARN of the VPN launch template" 41 | value = { 42 | for k, v in module.vpn : k => v.launch_template_arn 43 | } 44 | } 45 | 46 | output "vpn_autoscaling_group_id" { 47 | description = "VPN autoscaling group id" 48 | value =
{ 49 | for k, v in module.vpn : k => v.autoscaling_group_id 50 | } 51 | } 52 | 53 | output "vpn_autoscaling_group_name" { 54 | description = "VPN autoscaling group name" 55 | value = { 56 | for k, v in module.vpn : k => v.autoscaling_group_name 57 | } 58 | } 59 | 60 | # MSK 61 | output "msk_arn" { 62 | description = "Amazon Resource Name (ARN) of the MSK cluster" 63 | value = aws_msk_cluster.msk_data_cluster.arn 64 | } 65 | 66 | output "msk_bootstrap_brokers_sasl_iam" { 67 | description = "One or more DNS names (or IP addresses) and SASL IAM port pairs" 68 | value = aws_msk_cluster.msk_data_cluster.bootstrap_brokers_sasl_iam 69 | } 70 | 71 | # KDA 72 | output "kda_app_arn" { 73 | description = "Kinesis Application ARN" 74 | value = local.kda.to_create ? aws_kinesisanalyticsv2_application.kda_app[0].arn : "NA" 75 | } 76 | -------------------------------------------------------------------------------- /pyflink-udemy/s4_05_producer.py: -------------------------------------------------------------------------------- 1 | import os 2 | import datetime 3 | import time 4 | import json 5 | import random 6 | 7 | from kafka import KafkaProducer 8 | 9 | 10 | class Sales: 11 | def __init__(self): 12 | self.products = [ 13 | {"product": "Toothpaste", "product_price": 4.99}, 14 | {"product": "Toothbrush", "product_price": 3.99}, 15 | {"product": "Dental Floss", "product_price": 1.99}, 16 | ] 17 | self.sellers = ["LNK", "OMA", "KC", "DEN"] 18 | 19 | def make_sales_item(self): 20 | return { 21 | **{ 22 | "seller_id": random.choice(self.sellers), 23 | "quantity": random.randint(1, 5), 24 | "sale_ts": int(time.time() * 1000), 25 | }, 26 | **random.choice(self.products), 27 | } 28 | 29 | def create(self, num: int): 30 | return [self.make_sales_item() for _ in range(num)] 31 | 32 | 33 | class Producer: 34 | def __init__(self, bootstrap_servers: list, topic: str): 35 | self.bootstrap_servers = bootstrap_servers 36 | self.topic = topic 37 | self.producer = self.create() 38 | 39 | def create(self): 40 | return KafkaProducer( 41 | bootstrap_servers=self.bootstrap_servers, 42 | value_serializer=lambda v: json.dumps(v, default=self.serialize).encode("utf-8"), 43 | ) 44 | 45 | def send(self, sales_items: list): 46 | for item in sales_items: 47 | self.producer.send(self.topic, value=item) 48 | self.producer.flush() 49 | 50 | def serialize(self, obj): 51 | if isinstance(obj, datetime.datetime): 52 | return obj.isoformat() 53 | if isinstance(obj, datetime.date): 54 | return str(obj) 55 | return obj 56 | 57 | 58 | if __name__ == "__main__": 59 | producer = Producer( 60 | bootstrap_servers=os.getenv("BOOTSTRAP_SERVERS", "localhost:29092").split(","), 61 | topic=os.getenv("TOPIC_NAME", "sales_items"), 62 | ) 63 | 64 | while True: 65 | sales_items = Sales().create(10) 66 | producer.send(sales_items) 67 | secs = random.randint(5, 10) 68 | print(f"messages sent... 
wait {secs} seconds") 69 | time.sleep(secs) 70 | -------------------------------------------------------------------------------- /stream-processing-with-flink/src/main/kotlin/me/jaehyeon/chapter5/BasicTransformations.kt: -------------------------------------------------------------------------------- 1 | package me.jaehyeon.chapter5 2 | 3 | import me.jaehyeon.sensor.SensorReading 4 | import me.jaehyeon.sensor.SensorSource 5 | import org.apache.flink.api.common.eventtime.WatermarkStrategy 6 | import org.apache.flink.api.common.typeinfo.Types 7 | import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment 8 | import org.apache.flink.util.Collector 9 | import java.time.Duration 10 | 11 | /** 12 | * This Flink job demonstrates basic, non-keyed transformations on a DataStream. 13 | * 14 | * It showcases a simple pipeline: 15 | * 1. **Source**: Ingests a stream of `SensorReading` events from a custom source. 16 | * 2. **Filter**: Discards readings with a temperature below 25. 17 | * 3. **Map**: Transforms the remaining `SensorReading` objects into just their String IDs. 18 | * 4. **FlatMap**: Splits each String ID into its constituent parts (e.g., "sensor_1" -> "sensor", "1"). 19 | * 5. **Sink**: Prints the final stream of ID parts to the console. 20 | * 21 | * This example highlights stateless, one-to-one (map, filter) and one-to-many (flatMap) transformations. 22 | */ 23 | object BasicTransformations { 24 | @JvmStatic 25 | fun main(args: Array) { 26 | val env = StreamExecutionEnvironment.getExecutionEnvironment() 27 | 28 | val readings = 29 | env.fromSource( 30 | SensorSource(), 31 | WatermarkStrategy 32 | .forBoundedOutOfOrderness(Duration.ofSeconds(5)) 33 | .withTimestampAssigner { reading, _ -> 34 | reading.timestamp 35 | }, 36 | "Sensor Source", 37 | ) 38 | 39 | val filteredSensors = readings.filter { r -> r.temperature >= 25 } 40 | val sensorIds = filteredSensors.map { r -> r.id } 41 | val splitIds = 42 | sensorIds 43 | .flatMap { id, out: Collector -> 44 | id.split("_").forEach { part -> 45 | out.collect(part) 46 | } 47 | }.returns(Types.STRING) 48 | 49 | splitIds.print() 50 | 51 | env.execute("Basic Transformations Example") 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /pyflink-udemy/s4_02_filtering.py: -------------------------------------------------------------------------------- 1 | from pyflink.table import EnvironmentSettings, TableEnvironment, DataTypes, CsvTableSource 2 | from pyflink.table.expressions import col, lit, and_ 3 | 4 | tbl_env = TableEnvironment.create(EnvironmentSettings.in_batch_mode()) 5 | 6 | field_names = "seller_id,product,quantity,product_price,sales_date".split(",") 7 | field_types = [ 8 | DataTypes.STRING(), 9 | DataTypes.STRING(), 10 | DataTypes.INT(), 11 | DataTypes.DOUBLE(), 12 | DataTypes.DATE(), 13 | ] 14 | source = CsvTableSource("./csv-input", field_names, field_types, ignore_first_line=True) 15 | 16 | tbl_env.register_table_source("product_locale_sales", source) 17 | tbl = tbl_env.from_path("product_locale_sales") 18 | 19 | high_sales = ( 20 | tbl.select( 21 | col("sales_date"), 22 | col("seller_id"), 23 | col("product"), 24 | (col("product_price") * col("quantity")).alias("sales"), 25 | ) 26 | .distinct() 27 | .where(col("sales") >= 80) 28 | ) 29 | print("\nhigh_sales data") 30 | high_sales.execute().print() 31 | 32 | high_sales2 = tbl_env.sql_query( 33 | """ 34 | WITH distinct_sales AS ( 35 | SELECT DISTINCT 36 | sales_date, seller_id, product, product_price * quantity AS 
sales 37 | FROM product_locale_sales 38 | ) 39 | SELECT * 40 | FROM distinct_sales 41 | WHERE sales >= 80 42 | """ 43 | ) 44 | print("\nhigh_sales2 data") 45 | high_sales2.execute().print() 46 | 47 | july1_high_sales = ( 48 | tbl.select( 49 | col("sales_date"), 50 | col("seller_id"), 51 | col("product"), 52 | (col("product_price") * col("quantity")).alias("sales"), 53 | ) 54 | .distinct() 55 | .where(and_(col("sales") >= 80, col("sales_date") == lit("2021-07-01").to_date)) 56 | ) 57 | print("\njuly1_high_sales data") 58 | july1_high_sales.execute().print() 59 | 60 | july1_high_sales2 = tbl_env.sql_query( 61 | """ 62 | WITH distinct_sales AS ( 63 | SELECT DISTINCT 64 | sales_date, seller_id, product, product_price * quantity AS sales 65 | FROM product_locale_sales 66 | ) 67 | SELECT * 68 | FROM distinct_sales 69 | WHERE sales >= 80 and sales_date = '2021-07-01' 70 | """ 71 | ) 72 | print("\njuly1_high_sales2 data") 73 | july1_high_sales2.execute().print() 74 | -------------------------------------------------------------------------------- /fraud-detection/remote/infra/variables.tf: -------------------------------------------------------------------------------- 1 | variable "vpn_to_create" { 2 | description = "Flag to indicate whether to create VPN" 3 | type = bool 4 | default = true 5 | } 6 | 7 | variable "vpn_to_use_spot" { 8 | description = "Flag to indicate whether to use a spot instance for VPN" 9 | type = bool 10 | default = false 11 | } 12 | 13 | variable "vpn_to_limit_vpn_ingress" { 14 | description = "Flag to indicate whether to limit ingress from the current machine's IP address" 15 | type = bool 16 | default = true 17 | } 18 | 19 | locals { 20 | name = "fraud-detection" 21 | region = data.aws_region.current.name 22 | environment = "dev" 23 | 24 | vpc = { 25 | cidr = "10.0.0.0/16" 26 | azs = slice(data.aws_availability_zones.available.names, 0, 3) 27 | } 28 | 29 | default_bucket = { 30 | name = "${local.name}-${data.aws_caller_identity.current.account_id}-${local.region}" 31 | to_set_acl = false 32 | } 33 | 34 | vpn = { 35 | to_create = var.vpn_to_create 36 | to_use_spot = var.vpn_to_use_spot 37 | ingress_cidr = var.vpn_to_limit_vpn_ingress ? 
"${data.http.local_ip_address.response_body}/32" : "0.0.0.0/0" 38 | spot_override = [ 39 | { instance_type : "t3.small" }, 40 | { instance_type : "t3a.small" }, 41 | ] 42 | } 43 | 44 | msk = { 45 | version = "2.8.1" 46 | instance_size = "kafka.m5.large" 47 | ebs_volume_size = 20 48 | log_retention_ms = 604800000 # 7 days 49 | number_of_broker_nodes = 2 50 | num_partitions = 2 51 | default_replication_factor = 2 52 | } 53 | 54 | msk_connect = { 55 | package_name = "camel-aws-ddb-sink-kafka-connector.zip" 56 | } 57 | 58 | kda = { 59 | runtime_env = "FLINK-1_15" 60 | package_name = "kda-package.zip" 61 | consumer_0 = { 62 | table_name = "flagged_accounts" 63 | topic_name = "flagged-accounts" 64 | } 65 | consumer_1 = { 66 | table_name = "transactions" 67 | topic_name = "transactions" 68 | } 69 | producer_0 = { 70 | table_name = "flagged_transactions" 71 | topic_name = "flagged-transactions" 72 | } 73 | } 74 | 75 | tags = { 76 | Name = local.name 77 | Environment = local.environment 78 | } 79 | } 80 | -------------------------------------------------------------------------------- /sql-cookbook/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM flink:1.17.1 2 | 3 | ARG PYTHON_VERSION 4 | ENV PYTHON_VERSION=${PYTHON_VERSION:-3.8.10} 5 | ARG FLINK_VERSION 6 | ENV FLINK_VERSION=${FLINK_VERSION:-1.17.1} 7 | 8 | RUN wget -P /opt/flink/lib/ https://repo.maven.apache.org/maven2/org/apache/flink/flink-connector-kafka/$FLINK_VERSION/flink-connector-kafka-$FLINK_VERSION.jar; \ 9 | wget -P /opt/flink/lib/ https://repo.maven.apache.org/maven2/org/apache/kafka/kafka-clients/3.2.3/kafka-clients-3.2.3.jar; \ 10 | wget -P /opt/flink/lib/ https://repo.maven.apache.org/maven2/org/apache/flink/flink-sql-connector-kafka/$FLINK_VERSION/flink-sql-connector-kafka-$FLINK_VERSION.jar; \ 11 | wget -P /opt/flink/lib/ https://github.com/knaufk/flink-faker/releases/download/v0.5.3/flink-faker-0.5.3.jar; 12 | 13 | ## Python version (3.7, 3.8, 3.9 or 3.10) is required, apt repo 14 | 15 | # Python 3.3 and later versions provide the lzma module. 16 | # However, if Python is installed using the source code and the lzma-dev package is not installed in the system, 17 | # the lzma module will not be installed. 18 | # https://support.huawei.com/enterprise/en/doc/EDOC1100289998/db0db8f0/modulenotfounderror-no-module-named-_lzma- 19 | # INFO:root:Starting up Python harness in a standalone process. 20 | # Traceback (most recent call last): 21 | # File "/usr/local/lib/python3.8/site-packages/fastavro/read.py", line 2, in 22 | # from . import _read 23 | # File "fastavro/_read.pyx", line 11, in init fastavro._read 24 | # File "/usr/local/lib/python3.8/lzma.py", line 27, in 25 | # from _lzma import * 26 | # ModuleNotFoundError: No module named '_lzma' 27 | 28 | RUN apt-get update -y && \ 29 | apt-get install -y build-essential libssl-dev zlib1g-dev libbz2-dev libffi-dev liblzma-dev && \ 30 | wget https://www.python.org/ftp/python/${PYTHON_VERSION}/Python-${PYTHON_VERSION}.tgz && \ 31 | tar -xvf Python-${PYTHON_VERSION}.tgz && \ 32 | cd Python-${PYTHON_VERSION} && \ 33 | ./configure --without-tests --enable-shared && \ 34 | make -j6 && \ 35 | make install && \ 36 | ldconfig /usr/local/lib && \ 37 | cd .. 
&& rm -f Python-${PYTHON_VERSION}.tgz && rm -rf Python-${PYTHON_VERSION} && \ 38 | ln -s /usr/local/bin/python3 /usr/local/bin/python && \ 39 | apt-get clean && \ 40 | rm -rf /var/lib/apt/lists/* 41 | 42 | # install PyFlink 43 | RUN pip3 install apache-flink==${FLINK_VERSION} -------------------------------------------------------------------------------- /stream-processing-with-flink/src/main/kotlin/me/jaehyeon/connector/HttpSplitEnumerator.kt: -------------------------------------------------------------------------------- 1 | package me.jaehyeon.connector 2 | 3 | import org.apache.flink.api.connector.source.SplitEnumerator 4 | import org.apache.flink.api.connector.source.SplitEnumeratorContext 5 | 6 | /** 7 | * The coordinator for the source. It runs on the JobManager. 8 | * Its job is to generate splits on demand from a configurable numeric range 9 | * and assign them to readers. It checkpoints its position in the cycle (`lastSeenId`). 10 | */ 11 | class HttpSplitEnumerator( 12 | private val context: SplitEnumeratorContext<HttpSplit>, 13 | private val baseUrlPattern: String, 14 | private val startId: Long, 15 | private val maxId: Long, 16 | restoredLastSeenId: Long?, 17 | ) : SplitEnumerator<HttpSplit, Long> { 18 | private var lastSeenId: Long = restoredLastSeenId ?: (startId - 1) 19 | 20 | override fun start() {} 21 | 22 | override fun handleSplitRequest( 23 | subtaskId: Int, 24 | requesterHostname: String?, 25 | ) { 26 | // 1. Calculate the next ID in the cycle. 27 | var nextId = lastSeenId + 1 28 | // 2. Apply the configurable wrap-around rule. 29 | if (nextId > maxId) { 30 | nextId = startId 31 | } 32 | 33 | // 3. Generate the URL and create the split. 34 | val url = baseUrlPattern.replace("{id}", nextId.toString()) 35 | val split = HttpSplit(url) 36 | 37 | // 4. Assign the split to the requesting reader. 38 | context.assignSplit(split, subtaskId) 39 | 40 | // 5. CRITICAL: Update the state for the next request. 41 | this.lastSeenId = nextId 42 | } 43 | 44 | override fun addSplitsBack( 45 | splits: MutableList<HttpSplit>, 46 | subtaskId: Int, 47 | ) { 48 | // This source is cyclical and state-based, not queue-based. If a reader 49 | // fails, we don't need to re-add its specific splits. The cyclical logic 50 | // will naturally re-assign a split for that ID when its turn comes again. 51 | } 52 | 53 | override fun addReader(subtaskId: Int) {} 54 | 55 | // --- Checkpointing --- 56 | override fun snapshotState(checkpointId: Long): Long { 57 | // On checkpoint, save the last ID that was successfully assigned.
58 | return lastSeenId 59 | } 60 | 61 | override fun close() {} 62 | } 63 | -------------------------------------------------------------------------------- /pyflink-udemy/s4_03_joining.py: -------------------------------------------------------------------------------- 1 | from pyflink.table import EnvironmentSettings, TableEnvironment, DataTypes, CsvTableSource 2 | from pyflink.table.expressions import col 3 | 4 | tbl_env = TableEnvironment.create(EnvironmentSettings.in_batch_mode()) 5 | 6 | # sales source 7 | sales_field_names = "seller_id,product,quantity,product_price,sales_date".split(",") 8 | sales_field_types = [ 9 | DataTypes.STRING(), 10 | DataTypes.STRING(), 11 | DataTypes.INT(), 12 | DataTypes.DOUBLE(), 13 | DataTypes.DATE(), 14 | ] 15 | sales_source = CsvTableSource( 16 | "./csv-input", sales_field_names, sales_field_types, ignore_first_line=True 17 | ) 18 | tbl_env.register_table_source("product_locale_sales", sales_source) 19 | 20 | # sellers source 21 | sellers_field_names = "id,city,state".split(",") 22 | sellers_field_types = [DataTypes.STRING(), DataTypes.STRING(), DataTypes.STRING()] 23 | sellers_source = CsvTableSource( 24 | "./seller-input", sellers_field_names, sellers_field_types, ignore_first_line=True 25 | ) 26 | tbl_env.register_table_source("seller_locales", sellers_source) 27 | 28 | sales_tbl = tbl_env.from_path("product_locale_sales") 29 | sellers_tbl = tbl_env.from_path("seller_locales") 30 | 31 | seller_products = ( 32 | sales_tbl.join(sellers_tbl, col("seller_id") == col("id")) 33 | .select(col("city"), col("state"), col("product"), col("product_price")) 34 | .distinct() 35 | ) 36 | print("\nseller_products data") 37 | seller_products.execute().print() 38 | 39 | seller_products2 = tbl_env.sql_query( 40 | """ 41 | SELECT DISTINCT city, state, product, product_price 42 | FROM product_locale_sales l 43 | JOIN seller_locales r ON l.seller_id = r.id 44 | """ 45 | ) 46 | print("\nseller_products2 data") 47 | seller_products2.execute().print() 48 | 49 | sellers_no_sales = ( 50 | sales_tbl.right_outer_join(sellers_tbl, col("seller_id") == col("id")) 51 | .where(col("product").is_null) 52 | .select(col("city"), col("state"), col("product")) 53 | .distinct() 54 | ) 55 | print("\nsellers_no_sales data") 56 | sellers_no_sales.execute().print() 57 | 58 | sellers_no_sales2 = tbl_env.sql_query( 59 | """ 60 | SELECT city, state, product 61 | FROM product_locale_sales l 62 | RIGHT JOIN seller_locales r ON l.seller_id = r.id 63 | WHERE product IS NULL 64 | """ 65 | ) 66 | print("\nsellers_no_sales2 data") 67 | sellers_no_sales2.execute().print() 68 | -------------------------------------------------------------------------------- /stream-processing-with-flink/src/main/kotlin/me/jaehyeon/smoke/SmokeLevelSourceReader.kt: -------------------------------------------------------------------------------- 1 | package me.jaehyeon.smoke 2 | 3 | import org.apache.flink.api.connector.source.ReaderOutput 4 | import org.apache.flink.api.connector.source.SourceReader 5 | import org.apache.flink.api.connector.source.SourceReaderContext 6 | import org.apache.flink.core.io.InputStatus 7 | import org.apache.flink.util.concurrent.FutureUtils 8 | import java.util.Random 9 | import java.util.concurrent.CompletableFuture 10 | import java.util.concurrent.LinkedBlockingQueue 11 | 12 | /** 13 | * The SourceReader for the SmokeLevelSource. It runs on TaskManagers and 14 | * generates the stream of SmokeLevel events in a background thread. 
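 * A single generator thread puts one SmokeLevel reading per second into a one-element blocking queue, which pollNext drains.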
15 | */ 16 | class SmokeLevelSourceReader( 17 | private val readerContext: SourceReaderContext, 18 | ) : SourceReader { 19 | private val buffer = LinkedBlockingQueue<SmokeLevel>(1) 20 | 21 | @Volatile 22 | private var running = false 23 | private var generatorThread: Thread? = null 24 | 25 | override fun start() { 26 | running = true 27 | } 28 | 29 | override fun addSplits(splits: List) { 30 | generatorThread = 31 | Thread { 32 | val rand = Random() 33 | try { 34 | while (running) { 35 | val smokeLevel = if (rand.nextGaussian() > 0.8) SmokeLevel.High else SmokeLevel.Low 36 | buffer.put(smokeLevel) 37 | Thread.sleep(1000) 38 | } 39 | } catch (e: InterruptedException) { 40 | // Thread interrupted, exit 41 | } 42 | } 43 | generatorThread?.start() 44 | } 45 | 46 | override fun pollNext(output: ReaderOutput<SmokeLevel>): InputStatus? { 47 | val level = buffer.poll() 48 | return if (level != null) { 49 | output.collect(level) 50 | InputStatus.MORE_AVAILABLE 51 | } else { 52 | InputStatus.NOTHING_AVAILABLE 53 | } 54 | } 55 | 56 | override fun isAvailable(): CompletableFuture<Void> = FutureUtils.completedVoidFuture() 57 | 58 | override fun snapshotState(checkpointId: Long): List = mutableListOf() 59 | 60 | override fun notifyNoMoreSplits() {} 61 | 62 | override fun close() { 63 | running = false 64 | generatorThread?.interrupt() 65 | generatorThread?.join() 66 | } 67 | } -------------------------------------------------------------------------------- /fraud-detection/local/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3.5" 2 | 3 | services: 4 | zookeeper: 5 | image: bitnami/zookeeper:3.5 6 | container_name: zookeeper 7 | ports: 8 | - "2181" 9 | networks: 10 | - kafkanet 11 | environment: 12 | - ALLOW_ANONYMOUS_LOGIN=yes 13 | volumes: 14 | - zookeeper_data:/bitnami/zookeeper 15 | kafka-0: 16 | image: bitnami/kafka:2.8.1 17 | container_name: kafka-0 18 | expose: 19 | - 9092 20 | ports: 21 | - "29092:29092" 22 | networks: 23 | - kafkanet 24 | environment: 25 | - ALLOW_PLAINTEXT_LISTENER=yes 26 | - KAFKA_CFG_ZOOKEEPER_CONNECT=zookeeper:2181 27 | - KAFKA_CFG_BROKER_ID=0 28 | - KAFKA_CFG_LISTENER_SECURITY_PROTOCOL_MAP=INTERNAL:PLAINTEXT,EXTERNAL:PLAINTEXT 29 | - KAFKA_CFG_LISTENERS=INTERNAL://:9092,EXTERNAL://:29092 30 | - KAFKA_CFG_ADVERTISED_LISTENERS=INTERNAL://kafka-0:9092,EXTERNAL://localhost:29092 31 | - KAFKA_CFG_INTER_BROKER_LISTENER_NAME=INTERNAL 32 | - KAFKA_CFG_NUM_PARTITIONS=2 33 | volumes: 34 | - kafka_0_data:/bitnami/kafka 35 | depends_on: 36 | - zookeeper 37 | kafka-connect: 38 | image: bitnami/kafka:2.8.1 39 | container_name: connect 40 | command: > 41 | /opt/bitnami/kafka/bin/connect-distributed.sh 42 | /opt/bitnami/kafka/config/connect-distributed.properties 43 | ports: 44 | - "8083:8083" 45 | networks: 46 | - kafkanet 47 | environment: 48 | AWS_ACCESS_KEY_ID: $AWS_ACCESS_KEY_ID 49 | AWS_SECRET_ACCESS_KEY: $AWS_SECRET_ACCESS_KEY 50 | AWS_SESSION_TOKEN: $AWS_SESSION_TOKEN 51 | volumes: 52 | - "./configs/connect-distributed.properties:/opt/bitnami/kafka/config/connect-distributed.properties" 53 | - "./connectors/camel-aws-ddb-sink-kafka-connector:/opt/connectors/camel-aws-ddb-sink-kafka-connector" 54 | depends_on: 55 | - zookeeper 56 | - kafka-0 57 | kpow: 58 | image: factorhouse/kpow-ce:91.2.1 59 | container_name: kpow 60 | ports: 61 | - "3000:3000" 62 | networks: 63 | - kafkanet 64 | environment: 65 | BOOTSTRAP: kafka-0:9092 66 | CONNECT_REST_URL: http://kafka-connect:8083 67 | depends_on: 68 | - zookeeper 69 | - kafka-0 70 | - kafka-connect
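# From the host, Kafka is exposed on localhost:29092, the Kafka Connect REST API on localhost:8083 and the kpow UI on http://localhost:3000; containers on kafka-network reach the broker at kafka-0:9092.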
71 | 72 | networks: 73 | kafkanet: 74 | name: kafka-network 75 | 76 | volumes: 77 | zookeeper_data: 78 | driver: local 79 | name: zookeeper_data 80 | kafka_0_data: 81 | driver: local 82 | name: kafka_0_data 83 | -------------------------------------------------------------------------------- /sql-training/client-image/java/sql-training-data-producer/src/main/java/com/ververica/sql_training/data_producer/FileReader.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2019 Ververica GmbH 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.ververica.sql_training.data_producer; 18 | 19 | import com.ververica.sql_training.data_producer.json_serde.JsonDeserializer; 20 | import com.ververica.sql_training.data_producer.records.TaxiRecord; 21 | 22 | import java.io.*; 23 | import java.nio.charset.StandardCharsets; 24 | import java.util.Iterator; 25 | import java.util.NoSuchElementException; 26 | import java.util.function.Supplier; 27 | import java.util.stream.Stream; 28 | import java.util.zip.GZIPInputStream; 29 | 30 | /** 31 | * Reads JSON-encoded TaxiRecords from a gzipped text file. 32 | */ 33 | public class FileReader implements Supplier { 34 | 35 | private final Iterator records; 36 | private final String filePath; 37 | 38 | public FileReader(String filePath, Class recordClazz) throws IOException { 39 | 40 | this.filePath = filePath; 41 | JsonDeserializer deserializer = new JsonDeserializer<>(recordClazz); 42 | try { 43 | 44 | BufferedReader reader = new BufferedReader( 45 | new InputStreamReader(new GZIPInputStream(new FileInputStream(filePath)), StandardCharsets.UTF_8)); 46 | 47 | Stream lines = reader.lines().sequential(); 48 | records = lines.map(l -> (TaxiRecord) deserializer.parseFromString(l)).iterator(); 49 | 50 | } catch (IOException e) { 51 | throw new IOException("Error reading TaxiRecords from file: " + filePath, e); 52 | } 53 | } 54 | 55 | @Override 56 | public TaxiRecord get() { 57 | 58 | if (records.hasNext()) { 59 | return records.next(); 60 | } else { 61 | throw new NoSuchElementException("All records read from " + filePath); 62 | } 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /stream-processing-with-flink/src/main/kotlin/me/jaehyeon/chapter6/MarkerBasedWatermarkGeneration.kt: -------------------------------------------------------------------------------- 1 | package me.jaehyeon.chapter6 2 | 3 | import me.jaehyeon.sensor.SensorReading 4 | import me.jaehyeon.sensor.SensorSource 5 | import org.apache.flink.api.common.eventtime.Watermark 6 | import org.apache.flink.api.common.eventtime.WatermarkGenerator 7 | import org.apache.flink.api.common.eventtime.WatermarkOutput 8 | import org.apache.flink.api.common.eventtime.WatermarkStrategy 9 | import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment 10 | 11 | /** 12 | * This Flink job demonstrates a custom, event-driven watermark generation 
strategy. 13 | * 14 | * This pattern, formerly known as "punctuated", emits a watermark on-the-fly 15 | * whenever it sees a specific "marker" event in the stream, rather than on a 16 | * periodic interval. 17 | */ 18 | object MarkerBasedWatermarkGeneration { 19 | @JvmStatic 20 | fun main(args: Array<String>) { 21 | val env = StreamExecutionEnvironment.getExecutionEnvironment() 22 | 23 | val readings = 24 | env.fromSource( 25 | SensorSource(), 26 | WatermarkStrategy 27 | .forGenerator { ctx -> MarkerBasedWatermarkGenerator() } 28 | .withTimestampAssigner { reading, _ -> reading.timestamp }, 29 | "Sensor Source", 30 | ) 31 | 32 | readings.print() 33 | env.execute("Marker-Based Watermark Generation") 34 | } 35 | } 36 | 37 | /** 38 | * A custom WatermarkGenerator that emits a new watermark every time it sees a 39 | * specific marker event (in this case, a reading from "sensor_1"). 40 | */ 41 | class MarkerBasedWatermarkGenerator : WatermarkGenerator<SensorReading> { 42 | /** 43 | * This method is called for every event. We inspect the event and decide whether to emit a watermark. 44 | */ 45 | override fun onEvent( 46 | event: SensorReading, 47 | eventTimestamp: Long, 48 | output: WatermarkOutput, 49 | ) { 50 | // Emit a new watermark if the event is from our marker, "sensor_1". 51 | if (event.id == "sensor_1") { 52 | output.emitWatermark(Watermark(eventTimestamp)) 53 | } 54 | } 55 | 56 | /** 57 | * This method is called periodically. Since our logic is purely event-driven, 58 | * we don't need to do anything here. 59 | */ 60 | override fun onPeriodicEmit(output: WatermarkOutput) { 61 | // This is not a periodic generator, so we do nothing here. 62 | } 63 | } -------------------------------------------------------------------------------- /fraud-detection/remote/infra/outputs.tf: -------------------------------------------------------------------------------- 1 | # VPC 2 | output "vpc_id" { 3 | description = "The ID of the VPC" 4 | value = module.vpc.vpc_id 5 | } 6 | 7 | output "vpc_cidr_block" { 8 | description = "The CIDR block of the VPC" 9 | value = module.vpc.vpc_cidr_block 10 | } 11 | 12 | output "private_subnets" { 13 | description = "List of IDs of private subnets" 14 | value = module.vpc.private_subnets 15 | } 16 | 17 | output "public_subnets" { 18 | description = "List of IDs of public subnets" 19 | value = module.vpc.public_subnets 20 | } 21 | 22 | output "nat_public_ips" { 23 | description = "List of public Elastic IPs created for AWS NAT Gateway" 24 | value = module.vpc.nat_public_ips 25 | } 26 | 27 | output "azs" { 28 | description = "A list of availability zones specified as argument to this module" 29 | value = module.vpc.azs 30 | } 31 | 32 | # Default bucket 33 | output "default_bucket_name" { 34 | description = "Default bucket name" 35 | value = aws_s3_bucket.default_bucket.id 36 | } 37 | 38 | # VPN 39 | output "vpn_launch_template_arn" { 40 | description = "The ARN of the VPN launch template" 41 | value = { 42 | for k, v in module.vpn : k => v.launch_template_arn 43 | } 44 | } 45 | 46 | output "vpn_autoscaling_group_id" { 47 | description = "VPN autoscaling group id" 48 | value = { 49 | for k, v in module.vpn : k => v.autoscaling_group_id 50 | } 51 | } 52 | 53 | output "vpn_autoscaling_group_name" { 54 | description = "VPN autoscaling group name" 55 | value = { 56 | for k, v in module.vpn : k => v.autoscaling_group_name 57 | } 58 | } 59 | 60 | # MSK 61 | output "msk_arn" { 62 | description = "Amazon Resource Name (ARN) of the MSK cluster" 63 | value = aws_msk_cluster.msk_data_cluster.arn 64 |
} 65 | 66 | output "msk_bootstrap_brokers_sasl_iam" { 67 | description = "One or more DNS names (or IP addresses) and SASL IAM port pairs" 68 | value = aws_msk_cluster.msk_data_cluster.bootstrap_brokers_sasl_iam 69 | } 70 | 71 | # MSK Connect 72 | output "ddb_sink_arn" { 73 | description = "Amazon Resource Name (ARN) of the Camel DynamoDB Sink connector" 74 | value = aws_mskconnect_connector.camel_ddb_sink.arn 75 | } 76 | 77 | output "ddb_sink_version" { 78 | description = "Current version of the Camel DynamoDB Sink connector" 79 | value = aws_mskconnect_connector.camel_ddb_sink.version 80 | } 81 | 82 | # KDA 83 | output "kda_app_arn" { 84 | description = "Kinesis Application ARN" 85 | value = aws_kinesisanalyticsv2_application.kda_app.arn 86 | } 87 | -------------------------------------------------------------------------------- /stream-processing-with-pyflink/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM flink:1.17.1 2 | 3 | ARG PYTHON_VERSION 4 | ENV PYTHON_VERSION=${PYTHON_VERSION:-3.8.10} 5 | ARG FLINK_VERSION 6 | ENV FLINK_VERSION=${FLINK_VERSION:-1.17.1} 7 | 8 | RUN wget -P /opt/flink/lib/ https://repo.maven.apache.org/maven2/org/apache/flink/flink-connector-kafka/$FLINK_VERSION/flink-connector-kafka-$FLINK_VERSION.jar; \ 9 | wget -P /opt/flink/lib/ https://repo.maven.apache.org/maven2/org/apache/kafka/kafka-clients/3.2.3/kafka-clients-3.2.3.jar; \ 10 | wget -P /opt/flink/lib/ https://repo.maven.apache.org/maven2/org/apache/flink/flink-sql-connector-kafka/$FLINK_VERSION/flink-sql-connector-kafka-$FLINK_VERSION.jar; \ 11 | wget -P /opt/flink/lib/ https://repo.maven.apache.org/maven2/org/apache/flink/flink-connector-jdbc/3.1.0-1.17/flink-connector-jdbc-3.1.0-1.17.jar; \ 12 | wget -P /opt/flink/lib/ https://jdbc.postgresql.org/download/postgresql-42.6.0.jar; \ 13 | wget -P /opt/flink/lib/ https://github.com/knaufk/flink-faker/releases/download/v0.5.3/flink-faker-0.5.3.jar; 14 | 15 | ## Python version (3.7, 3.8, 3.9 or 3.10) is required, apt repo 16 | # Python 3.3 and later versions provide the lzma module. 17 | # However, if Python is installed using the source code and the lzma-dev package is not installed in the system, 18 | # the lzma module will not be installed. 19 | # https://support.huawei.com/enterprise/en/doc/EDOC1100289998/db0db8f0/modulenotfounderror-no-module-named-_lzma- 20 | # INFO:root:Starting up Python harness in a standalone process. 21 | # Traceback (most recent call last): 22 | # File "/usr/local/lib/python3.8/site-packages/fastavro/read.py", line 2, in 23 | # from . import _read 24 | # File "fastavro/_read.pyx", line 11, in init fastavro._read 25 | # File "/usr/local/lib/python3.8/lzma.py", line 27, in 26 | # from _lzma import * 27 | # ModuleNotFoundError: No module named '_lzma' 28 | 29 | RUN apt-get update -y && \ 30 | apt-get install -y build-essential libssl-dev zlib1g-dev libbz2-dev libffi-dev liblzma-dev && \ 31 | wget https://www.python.org/ftp/python/${PYTHON_VERSION}/Python-${PYTHON_VERSION}.tgz && \ 32 | tar -xvf Python-${PYTHON_VERSION}.tgz && \ 33 | cd Python-${PYTHON_VERSION} && \ 34 | ./configure --without-tests --enable-shared && \ 35 | make -j6 && \ 36 | make install && \ 37 | ldconfig /usr/local/lib && \ 38 | cd ..
&& rm -f Python-${PYTHON_VERSION}.tgz && rm -rf Python-${PYTHON_VERSION} && \ 39 | ln -s /usr/local/bin/python3 /usr/local/bin/python && \ 40 | apt-get clean && \ 41 | rm -rf /var/lib/apt/lists/* 42 | 43 | # install PyFlink 44 | RUN pip3 install apache-flink==${FLINK_VERSION} -------------------------------------------------------------------------------- /stream-processing-with-pyflink/src/manage_topics.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import typing 4 | import logging 5 | 6 | from kafka import KafkaAdminClient 7 | from kafka.admin import NewTopic 8 | from kafka.errors import KafkaError, UnknownTopicOrPartitionError 9 | 10 | 11 | class KafkaClient: 12 | def __init__(self, bootstrap_servers: str) -> None: 13 | self.bootstrap_servers = bootstrap_servers 14 | self.admin_client = self.create_admin() 15 | 16 | def create_admin(self): 17 | return KafkaAdminClient(bootstrap_servers=self.bootstrap_servers) 18 | 19 | def delete_topics(self, topic_names: typing.List[str]): 20 | for name in topic_names: 21 | try: 22 | self.admin_client.delete_topics([name]) 23 | except UnknownTopicOrPartitionError: 24 | pass 25 | except Exception as err: 26 | raise RuntimeError(f"fails to delete topic - {name}") from err 27 | 28 | def create_topics(self, topics: typing.List[NewTopic], to_recreate: bool = True): 29 | if to_recreate: 30 | self.delete_topics([t.name for t in topics]) 31 | for topic in topics: 32 | try: 33 | resp = self.admin_client.create_topics([topic]) 34 | name, error_code, error_message = resp.topic_errors[0] 35 | logging.info( 36 | f"topic created, name - {name}, error code - {error_code}, error message - {error_message}" 37 | ) 38 | except KafkaError as err: 39 | raise RuntimeError( 40 | f"fails to create topics - {', '.join(t.name for t in topics)}" 41 | ) from err 42 | logging.info(f"topics created successfully - {', '.join([t.name for t in topics])}") 43 | 44 | 45 | if __name__ == "__main__": 46 | logging.basicConfig( 47 | level=logging.INFO, 48 | format="%(asctime)s.%(msecs)03d:%(levelname)s:%(name)s:%(message)s", 49 | datefmt="%Y-%m-%d %H:%M:%S", 50 | ) 51 | 52 | parser = argparse.ArgumentParser() 53 | parser.add_argument("--delete", action="store_true") 54 | parser.set_defaults(delete=False) 55 | parser.add_argument("--create", action="store_true") 56 | parser.set_defaults(create=False) 57 | args = parser.parse_args() 58 | 59 | client = KafkaClient(os.getenv("BOOTSTRAP_SERVERS", "localhost:29092")) 60 | 61 | topics = [NewTopic(name="sensor-reading", num_partitions=3, replication_factor=1)] 62 | 63 | if args.delete: 64 | client.delete_topics([t.name for t in topics]) 65 | if args.create: 66 | client.create_topics(topics, to_recreate=True) 67 | -------------------------------------------------------------------------------- /stream-processing-with-flink/build.gradle.kts: -------------------------------------------------------------------------------- 1 | import com.github.jengelman.gradle.plugins.shadow.tasks.ShadowJar 2 | import org.gradle.api.tasks.JavaExec 3 | import org.gradle.api.tasks.testing.Test 4 | 5 | plugins { 6 | kotlin("jvm") version "2.2.20" 7 | application 8 | id("com.github.johnrengelman.shadow") version "8.1.1" 9 | kotlin("plugin.serialization") version "2.2.20" 10 | } 11 | 12 | group = "me.jaehyeon" 13 | version = "1.0-SNAPSHOT" 14 | 15 | val localRunClasspath by configurations.creating { 16 | extendsFrom(configurations.implementation.get(), configurations.compileOnly.get(),
configurations.runtimeOnly.get()) 17 | } 18 | 19 | repositories { 20 | mavenCentral() 21 | } 22 | 23 | val flinkVersion = "1.20.1" 24 | val ktorVersion = "3.3.3" 25 | 26 | dependencies { 27 | // Flink Dependencies 28 | compileOnly("org.apache.flink:flink-streaming-java:$flinkVersion") 29 | compileOnly("org.apache.flink:flink-clients:$flinkVersion") 30 | compileOnly("org.apache.flink:flink-connector-base:$flinkVersion") 31 | // 'testImplementation' makes Flink available for test source compilation and execution. 32 | testImplementation("org.apache.flink:flink-streaming-java:$flinkVersion") 33 | testImplementation("org.apache.flink:flink-clients:$flinkVersion") 34 | testImplementation("org.apache.flink:flink-connector-base:$flinkVersion") 35 | // Ktor 36 | implementation("io.ktor:ktor-client-core:$ktorVersion") 37 | implementation("io.ktor:ktor-client-cio:$ktorVersion") 38 | implementation("io.ktor:ktor-client-content-negotiation:$ktorVersion") 39 | implementation("io.ktor:ktor-serialization-kotlinx-json:$ktorVersion") 40 | // Logging 41 | implementation("org.slf4j:slf4j-simple:2.0.17") 42 | // Testing 43 | testImplementation(kotlin("test")) 44 | testImplementation("org.junit.jupiter:junit-jupiter-api:5.14.1") 45 | testRuntimeOnly("org.junit.jupiter:junit-jupiter-engine:5.14.1") 46 | } 47 | 48 | kotlin { 49 | jvmToolchain(11) 50 | } 51 | 52 | application { 53 | mainClass.set( 54 | project.findProperty("mainClass")?.toString() 55 | ?: "me.jaehyeon.chapter1.AverageSensorReadings", 56 | ) 57 | } 58 | 59 | tasks.named<JavaExec>("run") { 60 | // Classpath = All library dependencies + The application's compiled code. 61 | classpath = localRunClasspath + sourceSets.main.get().output 62 | } 63 | 64 | tasks.withType<Test> { 65 | useJUnitPlatform() 66 | } 67 | 68 | tasks.withType<ShadowJar> { 69 | archiveBaseName.set(rootProject.name) 70 | archiveClassifier.set("") 71 | archiveVersion.set("1.0") 72 | mergeServiceFiles() 73 | } 74 | 75 | tasks.named("build") { 76 | dependsOn("shadowJar") 77 | } 78 | -------------------------------------------------------------------------------- /flink-sql-cookbook/README.md: -------------------------------------------------------------------------------- 1 | ## Flink SQL Cookbook on Docker 2 | 3 | A Flink cluster that can be used to run queries of the [Apache Flink SQL Cookbook](https://github.com/ververica/flink-sql-cookbook/tree/main) repo from Ververica. 4 | 5 | The Flink Docker image is updated with the [Flink SQL Faker Connector](https://github.com/knaufk/flink-faker) for fake data generation. Note that the example SQL queries are based on an old version of the connector, and some of them have to be modified. 6 | 7 | ### Flink Cluster on Docker 8 | 9 | The cookbook generates sample records using the [Flink SQL Faker Connector](https://github.com/knaufk/flink-faker), and we use a custom Docker image that downloads the connector JAR to the `/opt/flink/lib/` folder. In this way, we don't have to specify the connector JAR whenever we start the [SQL client](https://nightlies.apache.org/flink/flink-docs-master/docs/dev/table/sqlclient/). 10 | 11 | ```Dockerfile 12 | FROM flink:1.20.1 13 | 14 | # add faker connector 15 | RUN wget -P /opt/flink/lib/ \ 16 | https://github.com/knaufk/flink-faker/releases/download/v0.5.3/flink-faker-0.5.3.jar 17 | ``` 18 | 19 | A local Apache Flink cluster can be deployed using Docker Compose.
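The cluster is defined in `docker-compose.yml` in this directory: it builds the custom image above and starts a JobManager plus three identical TaskManagers on a shared `flink-sql-cookbook` network. Trimmed to a single TaskManager, the service definitions look roughly as follows (the full file also sets checkpoint/savepoint paths, health checks and task slots via `FLINK_PROPERTIES`).

```yaml
services:
  jobmanager:
    image: flink-sql-cookbook
    build: .
    command: jobmanager
    ports:
      - "8081:8081"
  taskmanager-1:
    image: flink-sql-cookbook
    build: .
    command: taskmanager
    depends_on:
      jobmanager:
        condition: service_healthy
```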
20 | 21 | ```bash 22 | # start containers 23 | $ docker compose up -d 24 | 25 | # list containers 26 | $ docker-compose ps 27 | # NAME COMMAND SERVICE STATUS PORTS 28 | # jobmanager "/docker-entrypoint.…" jobmanager running (healthy) 6123/tcp, 0.0.0.0:8081->8081/tcp, :::8081->8081/tcp 29 | # taskmanager-1 "/docker-entrypoint.…" taskmanager-1 running 6123/tcp, 8081/tcp 30 | # taskmanager-2 "/docker-entrypoint.…" taskmanager-2 running 6123/tcp, 8081/tcp 31 | # taskmanager-3 "/docker-entrypoint.…" taskmanager-3 running 6123/tcp, 8081/tcp 32 | ``` 33 | 34 | ### Flink SQL Client 35 | 36 | ```sql 37 | -- // create a temporary table 38 | CREATE TEMPORARY TABLE heros ( 39 | `name` STRING, 40 | `power` STRING, 41 | `age` INT 42 | ) WITH ( 43 | 'connector' = 'faker', 44 | 'fields.name.expression' = '#{superhero.name}', 45 | 'fields.power.expression' = '#{superhero.power}', 46 | 'fields.power.null-rate' = '0.05', 47 | 'fields.age.expression' = '#{number.numberBetween ''0'',''1000''}' 48 | ); 49 | -- [INFO] Execute statement succeeded. 50 | 51 | -- list tables 52 | SHOW TABLES; 53 | -- +------------+ 54 | -- | table name | 55 | -- +------------+ 56 | -- | heros | 57 | -- +------------+ 58 | -- 1 row in set 59 | 60 | -- query records from the heros table 61 | -- hit 'q' to exit the record view 62 | SELECT * FROM heros; 63 | 64 | -- quit sql shell 65 | quit; 66 | ``` 67 | 68 | ![](./img/sql-client.gif#center) 69 | -------------------------------------------------------------------------------- /datorios/README.md: -------------------------------------------------------------------------------- 1 | [![](https://github.com/metrolinkai/Datorios/blob/main/resources/Horizontal%20Positive.png)](https://datorios.con "See The Data Behind Your Data - Data Observability for Apache Flink") 2 | 3 | 4 | 5 | # See The Data Behind Your Data - Data Observability for Apache Flink ® 6 | 7 | ### **Unified Investigation Platform:** 8 | When all of your pipeline data is available in one place, you no longer have the need to waste time flipping between different places to access source data, application logging, and pipeline metrics. The precious time spent flipping between platforms could be time spent bringing your pipelines back to a healthy state. 9 | 10 | ### **Effortless Debugging of Operator Functionality & State:** 11 | See every record that passed through each of your operators, unlocking the ability to see how the record altered state while it was being processed. 12 | 13 | ### **Better Integration Testing:** 14 | A lot is left to be desired when testing your pipelines. Unit testing only gives us half the story, by seeing how all of your operators work together when the data flows you can be confident in the results provided by your pipelines. 15 | 16 | ### **Peace of Mind in Production Monitoring:** 17 | Access to your mission-critical performance metrics gives you the peace of mind knowing your jobs are resourced correctly and won't fall over when more data starts flowing in. 18 | 19 | ### **Breeze Through Window Investigation:** 20 | With the Window Investigation tool you can magnify the problems you are encountering with windowing. Problems can include late events, incorrect watermark settings, or even your aggregation functions. 
21 | 22 | # **High level Architecture:** 23 | Datorios consists of two components: 24 | 25 | 26 | - Datorios client running on Docker Compose - The client will install the Apache Flink engine on your local/cloud machine, where your jobs will be deployed (embedding Datorios to your current Flink is coming up soon). 27 | - A cloud observability service is used for deep investigation and debugging. 28 | 29 | ![](https://github.com/metrolinkai/Datorios/blob/main/resources/image-20240425-111715.png) 30 | 31 | [Signup](https://app.datorios.com/signup) to download the Datorios cluster - You can use your own or the demo Flink jobs in [this repository](https://github.com/metrolinkai/Datorios/tree/main/flink-examples) for a test run 32 | 33 | # **K8S Architecture:** 34 | 35 | ![](https://github.com/metrolinkai/Datorios/blob/main/resources/K8K%20Architecture%203.png) 36 | 37 | [SaaS side](https://github.com/metrolinkai/Datorios/blob/main/resources/SAAS.drawio.png) 38 | 39 | [![](https://github.com/metrolinkai/Datorios/blob/main/resources/Copy%20of%20squirrel%20xray%20(1).png)](https://datorios.con "Making your Flink transparent") 40 | -------------------------------------------------------------------------------- /pyflink-udemy/s3_06_kafka_sink.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pyflink.table import EnvironmentSettings, TableEnvironment 4 | 5 | BOOTSTRAP_SERVERS = os.getenv("BOOTSTRAP_SERVERS", "localhost:29092") 6 | # https://nightlies.apache.org/flink/flink-docs-release-1.16/docs/connectors/table/kafka/ 7 | version_map = {"15": "1.15.4", "16": "1.16.0"} 8 | FLINK_VERSION = version_map[os.getenv("MINOR_VERSION", "15")] 9 | FLINK_SQL_CONNECTOR_KAFKA = f"flink-sql-connector-kafka-{FLINK_VERSION}.jar" 10 | 11 | env_settings = EnvironmentSettings.in_streaming_mode() 12 | table_env = TableEnvironment.create(env_settings) 13 | # https://nightlies.apache.org/flink/flink-docs-release-1.16/docs/dev/python/dependency_management/ 14 | kafka_jar = os.path.join(os.path.abspath(os.path.dirname(__file__)), FLINK_SQL_CONNECTOR_KAFKA) 15 | table_env.get_config().set("pipeline.jars", f"file://{kafka_jar}") 16 | 17 | ## create kafka source table 18 | table_env.execute_sql( 19 | f""" 20 | CREATE TABLE product_sales ( 21 | `seller_id` VARCHAR, 22 | `product` VARCHAR, 23 | `quantity` INT, 24 | `product_price` DOUBLE, 25 | `sales_date` VARCHAR 26 | ) WITH ( 27 | 'connector' = 'kafka', 28 | 'topic' = 'product_sales', 29 | 'properties.bootstrap.servers' = '{BOOTSTRAP_SERVERS}', 30 | 'properties.group.id' = 'source-demo', 31 | 'format' = 'json', 32 | 'scan.startup.mode' = 'earliest-offset', 33 | 'json.fail-on-missing-field' = 'false', 34 | 'json.ignore-parse-errors' = 'true' 35 | ) 36 | """ 37 | ) 38 | 39 | ## create print sink table 40 | table_env.execute_sql( 41 | f""" 42 | CREATE TABLE print ( 43 | `seller_id` VARCHAR, 44 | `product` VARCHAR, 45 | `quantity` INT, 46 | `product_price` DOUBLE, 47 | `sales_date` VARCHAR 48 | ) WITH ( 49 | 'connector' = 'print' 50 | ) 51 | """ 52 | ) 53 | 54 | ## create kafka sink table 55 | table_env.execute_sql( 56 | f""" 57 | CREATE TABLE product_sales_sink ( 58 | `seller_id` VARCHAR, 59 | `product` VARCHAR, 60 | `quantity` INT, 61 | `product_price` DOUBLE, 62 | `sales_date` VARCHAR 63 | ) WITH ( 64 | 'connector' = 'kafka', 65 | 'topic' = 'product_sales_sink', 66 | 'properties.bootstrap.servers' = '{BOOTSTRAP_SERVERS}', 67 | 'format' = 'json', 68 | 'json.fail-on-missing-field' = 'false', 69 | 
'json.ignore-parse-errors' = 'true' 70 | ) 71 | """ 72 | ) 73 | 74 | ## insert into sink tables 75 | tbl = table_env.from_path("product_sales") 76 | statement_set = table_env.create_statement_set() 77 | statement_set.add_insert("print", tbl) 78 | statement_set.add_insert("product_sales_sink", tbl) 79 | statement_set.execute().wait() 80 | -------------------------------------------------------------------------------- /real-time-streaming-aws/compose-msk.yml: -------------------------------------------------------------------------------- 1 | version: "3.5" 2 | 3 | services: 4 | jobmanager: 5 | image: real-time-streaming-aws:1.17.1 6 | command: jobmanager 7 | container_name: jobmanager 8 | ports: 9 | - "8081:8081" 10 | networks: 11 | - appnet 12 | volumes: 13 | - ./:/etc/flink 14 | environment: 15 | - BOOTSTRAP_SERVERS=${BOOTSTRAP_SERVERS:-not_set} 16 | - OPENSEARCH_HOSTS=${OPENSEARCH_HOSTS:-not_set} 17 | - RUNTIME_ENV=DOCKER 18 | - AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID 19 | - AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY 20 | # - AWS_SESSION_TOKEN=$AWS_SESSION_TOKEN 21 | - | 22 | FLINK_PROPERTIES= 23 | jobmanager.rpc.address: jobmanager 24 | state.backend: filesystem 25 | state.checkpoints.dir: file:///tmp/flink-checkpoints 26 | heartbeat.interval: 1000 27 | heartbeat.timeout: 5000 28 | rest.flamegraph.enabled: true 29 | web.backpressure.refresh-interval: 10000 30 | taskmanager: 31 | image: real-time-streaming-aws:1.17.1 32 | command: taskmanager 33 | container_name: taskmanager 34 | networks: 35 | - appnet 36 | volumes: 37 | - flink_data:/tmp/ 38 | - ./:/etc/flink 39 | environment: 40 | - BOOTSTRAP_SERVERS=${BOOTSTRAP_SERVERS:-not_set} 41 | - OPENSEARCH_HOSTS=${OPENSEARCH_HOSTS:-not_set} 42 | - RUNTIME_ENV=DOCKER 43 | - AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID 44 | - AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY 45 | # - AWS_SESSION_TOKEN=$AWS_SESSION_TOKEN 46 | - | 47 | FLINK_PROPERTIES= 48 | jobmanager.rpc.address: jobmanager 49 | taskmanager.numberOfTaskSlots: 5 50 | state.backend: filesystem 51 | state.checkpoints.dir: file:///tmp/flink-checkpoints 52 | heartbeat.interval: 1000 53 | heartbeat.timeout: 5000 54 | depends_on: 55 | - jobmanager 56 | kpow: 57 | image: factorhouse/kpow-ce:91.5.1 58 | container_name: kpow 59 | ports: 60 | - "3000:3000" 61 | networks: 62 | - appnet 63 | environment: 64 | AWS_ACCESS_KEY_ID: $AWS_ACCESS_KEY_ID 65 | AWS_SECRET_ACCESS_KEY: $AWS_SECRET_ACCESS_KEY 66 | # AWS_SESSION_TOKEN: $AWS_SESSION_TOKEN 67 | BOOTSTRAP: ${BOOTSTRAP_SERVERS:-not_set} 68 | SECURITY_PROTOCOL: SASL_SSL 69 | SASL_MECHANISM: AWS_MSK_IAM 70 | SASL_JAAS_CONFIG: software.amazon.msk.auth.iam.IAMLoginModule required; 71 | SASL_CLIENT_CALLBACK_HANDLER_CLASS: software.amazon.msk.auth.iam.IAMClientCallbackHandler 72 | env_file: # https://kpow.io/get-started/#individual 73 | - ./kpow.env 74 | 75 | networks: 76 | appnet: 77 | name: app-network 78 | 79 | volumes: 80 | flink_data: 81 | driver: local 82 | name: flink_data 83 | -------------------------------------------------------------------------------- /stream-processing-with-pyflink/src/chapter5/utils/model.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import dataclasses 3 | from typing import Iterable, Tuple 4 | 5 | from pyflink.common import Row 6 | from pyflink.common.typeinfo import Types 7 | 8 | from .type_helper import TypeMapping, set_type_info 9 | 10 | 11 | @dataclasses.dataclass 12 | class SensorReading(TypeMapping): 13 | id: str 14 | timestamp: int 15 | num_records: int 16 | 
temperature: float 17 | 18 | def to_row(self): 19 | return Row(**dataclasses.asdict(self)) 20 | 21 | @classmethod 22 | def from_row(cls, row: Row): 23 | return cls(**row.as_dict()) 24 | 25 | @classmethod 26 | def from_tuple(cls, tup: Tuple[int, int, datetime.datetime]): 27 | return cls( 28 | id=f"sensor_{tup[0]}", 29 | timestamp=int(tup[2].strftime("%s")) * 1000, 30 | num_records=1, 31 | temperature=65 + (tup[1] / 100 * 20), 32 | ) 33 | 34 | @staticmethod 35 | def process_elements(elements: Iterable[Tuple[int, int, datetime.datetime]]): 36 | id, count, temperature = None, 0, 0 37 | for e in elements: 38 | next_id = f"sensor_{e[0]}" 39 | if id is not None: 40 | assert id == next_id 41 | id = next_id 42 | count += 1 43 | temperature += 65 + (e[1] / 100 * 20) 44 | return id, count, temperature 45 | 46 | @staticmethod 47 | def type_mapping(): 48 | return { 49 | "id": Types.STRING(), 50 | "timestamp": Types.LONG(), 51 | "num_records": Types.INT(), 52 | "temperature": Types.DOUBLE(), 53 | } 54 | 55 | @staticmethod 56 | def set_key_type_info(): 57 | return set_type_info(SensorReading.type_mapping(), selects=["id"]) 58 | 59 | @staticmethod 60 | def set_value_type_info(): 61 | return set_type_info(SensorReading.type_mapping()) 62 | 63 | 64 | @dataclasses.dataclass 65 | class SmokeLevel(TypeMapping): 66 | value: str 67 | 68 | @classmethod 69 | def from_tuple(cls, tup: Tuple[int]): 70 | return cls(value="High" if tup[0] / 100 > 0.8 else "Low") 71 | 72 | @staticmethod 73 | def type_mapping(): 74 | return {"value": Types.STRING()} 75 | 76 | @staticmethod 77 | def set_value_type_info(): 78 | return set_type_info(SmokeLevel.type_mapping()) 79 | 80 | 81 | @dataclasses.dataclass 82 | class Alert(TypeMapping): 83 | message: str 84 | timestamp: int 85 | temperature: float 86 | 87 | @staticmethod 88 | def type_mapping(): 89 | return {"message": Types.STRING(), "timestamp": Types.LONG(), "temperature": Types.DOUBLE()} 90 | 91 | @staticmethod 92 | def set_value_type_info(): 93 | return set_type_info(Alert.type_mapping()) 94 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Flink Demos 2 | 3 | This repository contains the source of the following posts, along with additional learning resources. 4 | 5 | - [Getting Started With Pyflink on AWS](https://jaehyeon.me/blog/2023-08-17-getting-started-with-pyflink-on-aws-part-1/) 6 | - Apache Flink is widely used for building real-time stream processing applications. On AWS, Amazon Managed Service for Apache Flink is the easiest option to develop a Flink app as it provides the underlying infrastructure. Updating a guide from AWS, this series of posts discuss how to develop and deploy a Flink (Pyflink) application on AWS where the data source and sink are Kafka topics. 7 | - [Kafka, Flink and DynamoDB for Real Time Fraud Detection](https://jaehyeon.me/blog/2023-08-10-fraud-detection-part-1/) 8 | - Re-implementing a solution from an AWS workshop, this series of posts discuss how to develop and deploy a fraud detection app using Kafka, Flink and DynamoDB. Part 1 covers local development using Docker while deployment on AWS will be discussed in part 2. 9 | - [Building Apache Flink Applications in Python](https://jaehyeon.me/blog/2023-10-19-build-pyflink-apps/) 10 | - Building Apache Flink Applications in Java by Confluent is a course to introduce Apache Flink through a series of hands-on exercises. 
Utilising the Flink DataStream API, the course develops three Flink applications from ingesting source data into calculating usage statistics. As part of learning the Flink DataStream API in Pyflink, I converted the Java apps into Python equivalent while performing the course exercises in Pyflink. This post summarises the progress of the conversion and shows the final output. 11 | - [Run Flink SQL Cookbook in Docker](https://jaehyeon.me/blog/2025-04-15-sql-cookbook/) 12 | - The [Flink SQL Cookbook](https://github.com/ververica/flink-sql-cookbook) is a practical guide packed with self-contained examples for learning [Apache Flink SQL](https://nightlies.apache.org/flink/flink-docs-master/docs/dev/table/sql/overview/), covering everything from basic queries to advanced stream processing patterns. Since it's designed for the Ververica Platform and lacks cluster setup instructions, this post shows how to run the examples locally using Docker Compose for a smooth, hands-on experience. 13 | - [Stream Processing With Flink in Kotlin](https://jaehyeon.me/blog/2025-12-10-streaming-processing-with-flink-in-kotlin/) 14 | - A couple of years ago, I read [Stream Processing with Apache Flink](https://www.oreilly.com/library/view/stream-processing-with/9781491974285/) and worked through the examples using PyFlink. While the book offered a solid introduction to Flink, I frequently hit limitations with the Python API, as many features from the book weren't supported. This time, I decided to revisit the material, but using Kotlin. The experience has been much more rewarding and fun. 15 | -------------------------------------------------------------------------------- /stream-processing-with-pyflink/src/chapter5/basic_transformations.py: -------------------------------------------------------------------------------- 1 | import os 2 | import datetime 3 | from typing import Tuple 4 | 5 | from pyflink.common import WatermarkStrategy 6 | from pyflink.common.typeinfo import Types 7 | from pyflink.common.watermark_strategy import TimestampAssigner, Duration 8 | from pyflink.datastream import StreamExecutionEnvironment, RuntimeExecutionMode 9 | from pyflink.table import StreamTableEnvironment 10 | 11 | from utils.model import SensorReading 12 | 13 | 14 | if __name__ == "__main__": 15 | """ 16 | ## local execution 17 | python src/chapter5/basic_transformations.py 18 | """ 19 | 20 | RUNTIME_ENV = os.getenv("RUNTIME_ENV", "local") 21 | 22 | env = StreamExecutionEnvironment.get_execution_environment() 23 | env.set_runtime_mode(RuntimeExecutionMode.STREAMING) 24 | if RUNTIME_ENV == "local": 25 | SRC_DIR = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) 26 | jar_files = ["flink-faker-0.5.3.jar"] 27 | jar_paths = tuple([f"file://{os.path.join(SRC_DIR, 'jars', name)}" for name in jar_files]) 28 | print(jar_paths) 29 | env.add_jars(*jar_paths) 30 | 31 | t_env = StreamTableEnvironment.create(stream_execution_environment=env) 32 | t_env.get_config().set_local_timezone("Australia/Sydney") 33 | t_env.execute_sql( 34 | """ 35 | CREATE TABLE sensor_source ( 36 | `id` INT, 37 | `rn` INT, 38 | `log_time` TIMESTAMP_LTZ(3) 39 | ) 40 | WITH ( 41 | 'connector' = 'faker', 42 | 'rows-per-second' = '1', 43 | 'fields.id.expression' = '#{number.numberBetween ''0'',''20''}', 44 | 'fields.rn.expression' = '#{number.numberBetween ''0'',''100''}', 45 | 'fields.log_time.expression' = '#{date.past ''10'',''5'',''SECONDS''}' 46 | ); 47 | """ 48 | ) 49 | 50 | class SourceTimestampAssigner(TimestampAssigner): 51 | def extract_timestamp( 
52 | self, value: Tuple[int, int, datetime.datetime], record_timestamp: int 53 | ): 54 | return int(value[2].strftime("%s")) * 1000 55 | 56 | source_stream = t_env.to_append_stream( 57 | t_env.from_path("sensor_source"), 58 | Types.TUPLE([Types.INT(), Types.INT(), Types.SQL_TIMESTAMP()]), 59 | ).assign_timestamps_and_watermarks( 60 | WatermarkStrategy.for_bounded_out_of_orderness( 61 | Duration.of_seconds(5) 62 | ).with_timestamp_assigner(SourceTimestampAssigner()) 63 | ) 64 | 65 | filtered_sensors = source_stream.map(SensorReading.from_tuple).filter( 66 | lambda e: e.temperature >= 25 67 | ) 68 | 69 | split_ids = filtered_sensors.flat_map(lambda e: e.id.split("_")) 70 | 71 | split_ids.print() 72 | 73 | env.execute("Basic Transformations Example") 74 | -------------------------------------------------------------------------------- /stream-processing-with-pyflink/src/chapter6/test_window_functions_reduce.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | from typing import Tuple 3 | 4 | import pytest 5 | from pyflink.common import WatermarkStrategy 6 | from pyflink.common.watermark_strategy import TimestampAssigner, Duration 7 | from pyflink.datastream import DataStream, StreamExecutionEnvironment 8 | 9 | from window_functions_reduce import define_workflow 10 | 11 | 12 | @pytest.fixture(scope="module") 13 | def env(): 14 | env = StreamExecutionEnvironment.get_execution_environment() 15 | yield env 16 | 17 | 18 | def test_define_workflow_should_return_records_having_mininum_temperature_by_id(env): 19 | source_1 = (1, 0, datetime.datetime.now()) 20 | source_2 = (1, 50, datetime.datetime.now()) 21 | source_3 = (2, 20, datetime.datetime.now()) 22 | source_4 = (2, 100, datetime.datetime.now()) 23 | 24 | class SourceTimestampAssigner(TimestampAssigner): 25 | def extract_timestamp( 26 | self, value: Tuple[int, int, datetime.datetime], record_timestamp: int 27 | ): 28 | return int(value[2].strftime("%s")) * 1000 29 | 30 | source_stream: DataStream = env.from_collection( 31 | collection=[source_1, source_2, source_3, source_4] 32 | ).assign_timestamps_and_watermarks( 33 | WatermarkStrategy.for_bounded_out_of_orderness( 34 | Duration.of_seconds(5) 35 | ).with_timestamp_assigner(SourceTimestampAssigner()) 36 | ) 37 | 38 | elements = list(define_workflow(source_stream).execute_and_collect()) 39 | assert len(elements) == 2 40 | for e in elements: 41 | if e.id == "sensor_1": 42 | assert e.temperature == 65 43 | else: 44 | assert e.temperature == 69 45 | 46 | 47 | def test_define_workflow_should_return_records_having_mininum_temperature_within_window(env): 48 | source_1 = (1, 0, datetime.datetime.now()) 49 | source_2 = (1, 50, datetime.datetime.now() + datetime.timedelta(milliseconds=100)) 50 | source_3 = (1, 100, datetime.datetime.now() + datetime.timedelta(milliseconds=1000)) 51 | 52 | class SourceTimestampAssigner(TimestampAssigner): 53 | def extract_timestamp( 54 | self, value: Tuple[int, int, datetime.datetime], record_timestamp: int 55 | ): 56 | return int(value[2].strftime("%s")) * 1000 57 | 58 | source_stream: DataStream = env.from_collection( 59 | collection=[source_1, source_2, source_3] 60 | ).assign_timestamps_and_watermarks( 61 | WatermarkStrategy.for_bounded_out_of_orderness( 62 | Duration.of_seconds(5) 63 | ).with_timestamp_assigner(SourceTimestampAssigner()) 64 | ) 65 | 66 | elements = list(define_workflow(source_stream).execute_and_collect()) 67 | assert len(elements) == 2 68 | assert elements[0].temperature == 65 69 | assert 
elements[1].temperature == 85 70 | -------------------------------------------------------------------------------- /real-time-streaming-aws/exporter/flinksql.sql: -------------------------------------------------------------------------------- 1 | -- docker exec -it jobmanager ./bin/sql-client.sh 2 | 3 | SET 'state.checkpoints.dir' = 'file:///tmp/checkpoints/'; 4 | SET 'execution.checkpointing.interval' = '60000'; 5 | 6 | ADD JAR '/etc/lib/kafka-clients-3.2.3.jar'; 7 | ADD JAR '/etc/flink/package/lib/lab3-pipeline-1.0.0.jar'; 8 | 9 | CREATE TABLE taxi_rides_src ( 10 | id VARCHAR, 11 | vendor_id INT, 12 | pickup_date VARCHAR, 13 | pickup_datetime AS TO_TIMESTAMP(REPLACE(pickup_date, 'T', ' ')), 14 | dropoff_date VARCHAR, 15 | dropoff_datetime AS TO_TIMESTAMP(REPLACE(dropoff_date, 'T', ' ')), 16 | passenger_count INT, 17 | pickup_longitude VARCHAR, 18 | pickup_latitude VARCHAR, 19 | dropoff_longitude VARCHAR, 20 | dropoff_latitude VARCHAR, 21 | store_and_fwd_flag VARCHAR, 22 | gc_distance INT, 23 | trip_duration INT, 24 | google_distance INT, 25 | google_duration INT 26 | ) WITH ( 27 | 'connector' = 'kafka', 28 | 'topic' = 'taxi-rides', 29 | 'properties.bootstrap.servers' = 'kafka-0:9092', 30 | 'properties.group.id' = 'soruce-group', 31 | 'format' = 'json', 32 | 'scan.startup.mode' = 'latest-offset' 33 | ); 34 | 35 | CREATE TABLE taxi_rides_sink ( 36 | id VARCHAR, 37 | vendor_id INT, 38 | pickup_datetime TIMESTAMP, 39 | dropoff_datetime TIMESTAMP, 40 | passenger_count INT, 41 | pickup_longitude VARCHAR, 42 | pickup_latitude VARCHAR, 43 | dropoff_longitude VARCHAR, 44 | dropoff_latitude VARCHAR, 45 | store_and_fwd_flag VARCHAR, 46 | gc_distance INT, 47 | trip_duration INT, 48 | google_distance INT, 49 | google_duration INT, 50 | `year` VARCHAR, 51 | `month` VARCHAR, 52 | `date` VARCHAR, 53 | `hour` VARCHAR 54 | ) PARTITIONED BY (`year`, `month`, `date`, `hour`) WITH ( 55 | 'connector' = 'filesystem', 56 | 'path' = 's3://real-time-streaming-ap-southeast-2/taxi-rides/', 57 | 'format' = 'parquet', 58 | 'sink.partition-commit.delay'='1 h', 59 | 'sink.partition-commit.policy.kind'='success-file' 60 | ); 61 | 62 | -- 'path' = '/tmp/taxi_rides', 63 | 64 | INSERT INTO taxi_rides_sink 65 | SELECT 66 | id, 67 | vendor_id, 68 | pickup_datetime, 69 | dropoff_datetime, 70 | passenger_count, 71 | pickup_longitude, 72 | pickup_latitude, 73 | dropoff_longitude, 74 | dropoff_latitude, 75 | store_and_fwd_flag, 76 | gc_distance, 77 | trip_duration, 78 | google_distance, 79 | google_duration, 80 | DATE_FORMAT(pickup_datetime, 'yyyy') AS `year`, 81 | DATE_FORMAT(pickup_datetime, 'MM') AS `month`, 82 | DATE_FORMAT(pickup_datetime, 'dd') AS `date`, 83 | DATE_FORMAT(pickup_datetime, 'HH') AS `hour` 84 | FROM taxi_rides_src; -------------------------------------------------------------------------------- /stream-processing-with-pyflink/src/chapter5/keyed_transformations.py: -------------------------------------------------------------------------------- 1 | import os 2 | import datetime 3 | from typing import Tuple 4 | 5 | from pyflink.common import WatermarkStrategy 6 | from pyflink.common.typeinfo import Types 7 | from pyflink.common.watermark_strategy import TimestampAssigner, Duration 8 | from pyflink.datastream import StreamExecutionEnvironment, RuntimeExecutionMode 9 | from pyflink.table import StreamTableEnvironment 10 | 11 | from utils.model import SensorReading 12 | 13 | 14 | if __name__ == "__main__": 15 | """ 16 | ## local execution 17 | python src/chapter5/keyed_transformations.py 18 | """ 19 | 20 | 
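# Connector jars (the flink-faker jar under src/jars) are only added for local runs; RUNTIME_ENV switches this behaviour below.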
RUNTIME_ENV = os.getenv("RUNTIME_ENV", "local") 21 | 22 | env = StreamExecutionEnvironment.get_execution_environment() 23 | env.set_runtime_mode(RuntimeExecutionMode.STREAMING) 24 | if RUNTIME_ENV == "local": 25 | SRC_DIR = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) 26 | jar_files = ["flink-faker-0.5.3.jar"] 27 | jar_paths = tuple([f"file://{os.path.join(SRC_DIR, 'jars', name)}" for name in jar_files]) 28 | print(jar_paths) 29 | env.add_jars(*jar_paths) 30 | 31 | t_env = StreamTableEnvironment.create(stream_execution_environment=env) 32 | t_env.get_config().set_local_timezone("Australia/Sydney") 33 | t_env.execute_sql( 34 | """ 35 | CREATE TABLE sensor_source ( 36 | `id` INT, 37 | `rn` INT, 38 | `log_time` TIMESTAMP_LTZ(3) 39 | ) 40 | WITH ( 41 | 'connector' = 'faker', 42 | 'rows-per-second' = '1', 43 | 'fields.id.expression' = '#{number.numberBetween ''0'',''20''}', 44 | 'fields.rn.expression' = '#{number.numberBetween ''0'',''100''}', 45 | 'fields.log_time.expression' = '#{date.past ''10'',''5'',''SECONDS''}' 46 | ); 47 | """ 48 | ) 49 | 50 | class SourceTimestampAssigner(TimestampAssigner): 51 | def extract_timestamp( 52 | self, value: Tuple[int, int, datetime.datetime], record_timestamp: int 53 | ): 54 | return int(value[2].strftime("%s")) * 1000 55 | 56 | source_stream = t_env.to_append_stream( 57 | t_env.from_path("sensor_source"), 58 | Types.TUPLE([Types.INT(), Types.INT(), Types.SQL_TIMESTAMP()]), 59 | ).assign_timestamps_and_watermarks( 60 | WatermarkStrategy.for_bounded_out_of_orderness( 61 | Duration.of_seconds(5) 62 | ).with_timestamp_assigner(SourceTimestampAssigner()) 63 | ) 64 | 65 | keyed = source_stream.map(SensorReading.from_tuple).key_by(lambda e: e.id) 66 | 67 | max_temp_per_sensor = keyed.reduce(lambda r1, r2: r1 if r1.temperature > r2.temperature else r2) 68 | 69 | # max_temp_per_sensor.filter(lambda e: e.id == "sensor_1").print() 70 | max_temp_per_sensor.print() 71 | 72 | env.execute("Keyed Transformations Example") 73 | -------------------------------------------------------------------------------- /flink-sql-cookbook/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3" 2 | 3 | services: 4 | jobmanager: 5 | image: flink-sql-cookbook 6 | build: . 7 | command: jobmanager 8 | container_name: jobmanager 9 | ports: 10 | - "8081:8081" 11 | networks: 12 | - cookbook 13 | environment: 14 | - | 15 | FLINK_PROPERTIES= 16 | jobmanager.rpc.address: jobmanager 17 | state.backend: filesystem 18 | state.checkpoints.dir: file:///tmp/flink-checkpoints 19 | state.savepoints.dir: file:///tmp/flink-savepoints 20 | heartbeat.interval: 1000 21 | heartbeat.timeout: 5000 22 | rest.flamegraph.enabled: true 23 | web.backpressure.refresh-interval: 10000 24 | healthcheck: 25 | test: ["CMD", "curl", "-f", "http://localhost:8081/config"] 26 | interval: 5s 27 | timeout: 5s 28 | retries: 5 29 | 30 | taskmanager-1: 31 | image: flink-sql-cookbook 32 | build: . 33 | command: taskmanager 34 | container_name: taskmanager-1 35 | networks: 36 | - cookbook 37 | depends_on: 38 | jobmanager: 39 | condition: service_healthy 40 | environment: 41 | - | 42 | FLINK_PROPERTIES= 43 | jobmanager.rpc.address: jobmanager 44 | taskmanager.numberOfTaskSlots: 10 45 | state.backend: filesystem 46 | state.checkpoints.dir: file:///tmp/flink-checkpoints 47 | state.savepoints.dir: file:///tmp/flink-savepoints 48 | heartbeat.interval: 1000 49 | heartbeat.timeout: 5000 50 | 51 | taskmanager-2: 52 | image: flink-sql-cookbook 53 | build: . 
54 | command: taskmanager 55 | container_name: taskmanager-2 56 | networks: 57 | - cookbook 58 | depends_on: 59 | jobmanager: 60 | condition: service_healthy 61 | environment: 62 | - | 63 | FLINK_PROPERTIES= 64 | jobmanager.rpc.address: jobmanager 65 | taskmanager.numberOfTaskSlots: 10 66 | state.backend: filesystem 67 | state.checkpoints.dir: file:///tmp/flink-checkpoints 68 | state.savepoints.dir: file:///tmp/flink-savepoints 69 | heartbeat.interval: 1000 70 | heartbeat.timeout: 5000 71 | 72 | taskmanager-3: 73 | image: flink-sql-cookbook 74 | build: . 75 | command: taskmanager 76 | container_name: taskmanager-3 77 | networks: 78 | - cookbook 79 | depends_on: 80 | jobmanager: 81 | condition: service_healthy 82 | environment: 83 | - | 84 | FLINK_PROPERTIES= 85 | jobmanager.rpc.address: jobmanager 86 | taskmanager.numberOfTaskSlots: 10 87 | state.backend: filesystem 88 | state.checkpoints.dir: file:///tmp/flink-checkpoints 89 | state.savepoints.dir: file:///tmp/flink-savepoints 90 | heartbeat.interval: 1000 91 | heartbeat.timeout: 5000 92 | 93 | networks: 94 | cookbook: 95 | name: flink-sql-cookbook 96 | --------------------------------------------------------------------------------