├── LICENSE ├── README.md ├── docker-compose.yaml ├── flink ├── sql-client │ ├── Dockerfile │ └── flink-conf.yaml └── sql-jobs │ └── clickstream-filtering.sql ├── img ├── e2e-pipeline.png ├── flink-job.png ├── superset_dashboard.png ├── topic-clickstream.png ├── trino-query.png └── warehouse-bucket.png ├── producer ├── Dockerfile ├── producer.py └── requirements.txt ├── superset ├── Dockerfile ├── superset-init.sh └── superset_config.py └── trino └── iceberg.properties /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 Abel dos Santos Tavares 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | **E2E Real-Time Data Pipeline with Kafka, Flink, Iceberg, Trino, MinIO, and Superset** 2 | ====================================================================================== 3 | 4 | ![Docker](https://img.shields.io/badge/Docker-Enabled-blue?logo=docker) 5 | ![Apache Kafka](https://img.shields.io/badge/Apache%20Kafka-Event%20Streaming-black?logo=apachekafka) 6 | ![Apache Flink](https://img.shields.io/badge/Apache%20Flink-Real%20Time%20Processing-orange?logo=apacheflink) 7 | ![Apache Iceberg](https://img.shields.io/badge/Apache%20Iceberg-Table%20Format-blue?logo=apache) 8 | ![Trino](https://img.shields.io/badge/Trino-SQL%20Query%20Engine-green?logo=trino) 9 | ![Apache Superset](https://img.shields.io/badge/Apache%20Superset-Visualization-ff69b4?logo=apache) 10 | 11 | 12 | **📖 Overview** 13 | --------------- 14 | 15 | This project demonstrates a **real-time end-to-end (E2E) data pipeline** designed to handle clickstream data. It shows how to ingest, process, store, query, and visualize streaming data using open-source tools, all containerized with Docker for easy deployment. 
16 |
17 | 🔎 **Technologies Used:**
18 |
19 | - **Data Ingestion:** [Apache Kafka](https://kafka.apache.org/)
20 | - **Stream Processing:** [Apache Flink](https://flink.apache.org/)
21 | - **Object Storage:** [MinIO (S3-compatible)](https://min.io/)
22 | - **Data Lake Table Format:** [Apache Iceberg](https://iceberg.apache.org/)
23 | - **Query Engine:** [Trino](https://trino.io/)
24 | - **Visualization:** [Apache Superset](https://superset.apache.org/)
25 |
26 |
27 | This pipeline is perfect for **data engineers** and **students** interested in learning how to design real-time data systems.
28 |
29 | * * * * *
30 |
31 | **🏗 Architecture**
32 | -----------------------------------
33 |
34 | ![Architecture Diagram](img/e2e-pipeline.png)
35 |
36 | 1. **Clickstream Data Generator** simulates real-time user events and pushes them to a **Kafka** topic.
37 | 2. **Apache Flink** processes Kafka streams and writes clean data to **Iceberg tables** stored on **MinIO**.
38 | 3. **Trino** connects to Iceberg for querying the processed data.
39 | 4. **Apache Superset** visualizes the data by connecting to Trino.
40 |
41 |
42 | 🛠 **Tech Stack**
43 | -----------------
44 |
45 | | **Component** | **Technology** | **Purpose** |
46 | |--------------------|-------------------------------------------------------------------------------|-------------------------------------------------|
47 | | **Data Generator** | [Python (Faker)](https://faker.readthedocs.io/) | Simulate clickstream events |
48 | | **Data Ingestion** | [Apache Kafka](https://kafka.apache.org/) | Real-time event streaming |
49 | | **Coordination Service** | [Apache ZooKeeper](https://zookeeper.apache.org/) | Kafka broker coordination and metadata management |
50 | | **Stream Processing** | [Apache Flink](https://flink.apache.org/) | Real-time data processing and transformation |
51 | | **Data Lake Storage** | [Apache Iceberg](https://iceberg.apache.org/) | Data storage and schema management |
52 | | **Object Storage** | [MinIO](https://min.io/) | S3-compatible storage for Iceberg tables |
53 | | **Query Engine** | [Trino](https://trino.io/) | Distributed SQL querying on Iceberg data |
54 | | **Visualization** | [Apache Superset](https://superset.apache.org/) | Interactive dashboards and data visualization |
55 |
56 | * * * * *
57 |
58 |
59 | **📦 Project Structure**
60 | ------------------------
61 |
62 | ```bash
63 | e2e-data-pipeline/
64 | ├── docker-compose.yaml   # Docker setup for all services
65 | ├── flink/                # Flink SQL client and streaming jobs
66 | ├── producer/             # Clickstream data producer using Faker
67 | ├── superset/             # Superset setup and configuration
68 | └── trino/                # Trino configuration for Iceberg
69 | ```
70 |
71 | * * * * *
72 |
73 | **🔧 Setup Instructions**
74 | -------------------------
75 |
76 | ### **1\. Prerequisites**
77 |
78 | - **Docker** and **Docker Compose** installed.
79 | - Minimum **16GB RAM** recommended.
80 |
81 | ### **2\. Clone the Repository**
82 |
83 | ```bash
84 | git clone https://github.com/abeltavares/real-time-data-pipeline.git
85 | cd real-time-data-pipeline
86 | ```
87 |
88 | ### **3\. Start All Services**
89 |
90 | ```bash
91 | docker-compose up -d
92 | ```
93 |
94 | ⚠️ **Note:** All components (Kafka, Flink, Iceberg, Trino, MinIO, and Superset) are containerized using Docker for easy deployment and scalability.
95 |
96 | ### **4\. Access the Services**
97 |
98 | | **Service** | **URL** | **Credentials** |
99 | | --- | --- | --- |
100 | | **Kafka Control Center** | `http://localhost:9021` | *No Auth* |
101 | | **Flink Dashboard** | `http://localhost:18081` | *No Auth* |
102 | | **MinIO Console** | `http://localhost:9001` | `admin` / `password` |
103 | | **Trino UI** | `http://localhost:8080/ui` | *No Auth* |
104 | | **Superset** | `http://localhost:8088` | `admin` / `admin` |
105 |
106 |
107 | 📥 **Data Ingestion**
108 | ---------------------
109 |
110 | ### 1\. **Clickstream Data Generation**
111 |
112 | Clickstream events are simulated using Python's **Faker** library. Here's the event structure:
113 |
114 | ```python
115 | {
116 |     "event_id": fake.uuid4(),
117 |     "user_id": fake.uuid4(),
118 |     "event_type": fake.random_element(elements=("page_view", "add_to_cart", "purchase", "logout")),
119 |     "url": fake.uri_path(),
120 |     "session_id": fake.uuid4(),
121 |     "device": fake.random_element(elements=("mobile", "desktop", "tablet")),
122 |     "timestamp": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
123 |     "geo_location": {
124 |         "lat": float(fake.latitude()),
125 |         "lon": float(fake.longitude())
126 |     },
127 |     "purchase_amount": float(random.uniform(0.0, 500.0)) if fake.boolean(chance_of_getting_true=30) else None
128 | }
129 | ```
130 |
131 | ⚠️ **Note:** The **Clickstream Producer** runs automatically when Docker Compose is up. No manual execution is needed.
132 |
133 | ### 2\. **Kafka Consumer**
134 |
135 | The clickstream events are consumed from the Kafka topic by **Apache Flink**, which processes them in real time.
136 |
137 | You can monitor the Kafka topic through the **Kafka Control Center**:
138 |
139 | - **Kafka Control Center URL:** `http://localhost:9021`
140 |
141 | ![Kafka Topic](img/topic-clickstream.png)
142 |
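If you prefer the command line, you can also peek at the raw events directly from the broker container. This is a quick sketch that assumes the default `broker` container name, the internal listener `broker:29092`, and the `clickstream` topic defined in `docker-compose.yaml` and `producer.py`:

```bash
# Read the first five events from the clickstream topic and exit
docker exec -it broker kafka-console-consumer \
  --bootstrap-server broker:29092 \
  --topic clickstream \
  --from-beginning \
  --max-messages 5
```

Each message should be a JSON document matching the event structure shown above.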
143 | * * * * *
144 |
145 | ⚡ **Real-Time Data Processing with Apache Flink**
146 | -------------------------------------------------
147 |
148 | ### 1\. **Flink Configuration**
149 |
150 | - **State Backend:** RocksDB
151 | - **Checkpointing:** Enabled for fault tolerance
152 | - **Connectors:** Kafka → Iceberg (via Flink SQL)
153 |
154 | ### 2\. **Flink SQL Job Execution**
155 |
156 | The `sql-client` service in Docker Compose automatically submits the Flink SQL job after the JobManager and TaskManager are running. It uses the `clickstream-filtering.sql` script to process Kafka streams and write to Iceberg.
157 |
158 | ```bash
159 | /opt/flink/bin/sql-client.sh -f /opt/flink/clickstream-filtering.sql
160 | ```
161 |
162 | ### 3\. **Flink Dashboard**
163 |
164 | Monitor real-time data processing jobs at:\
165 | 📊 http://localhost:18081
166 |
167 | ![Flink Job](img/flink-job.png)
168 |
169 | * * * * *
170 |
171 | 🗄️ **Data Lakehouse with Apache Iceberg**
172 | ------------------------------------------
173 |
174 | Processed data from Flink is stored in **Iceberg tables** on **MinIO**. This enables:
175 |
176 | - **Efficient Querying** with Trino
177 | - **Schema Evolution** and **Time Travel**
178 |
179 | To list the contents of the MinIO warehouse, you can use the following command:
180 |
181 | ```bash
182 | docker exec mc bash -c "mc ls -r minio/warehouse/"
183 | ```
184 |
185 | Alternatively, you can access the MinIO console in your browser at `http://localhost:9001`.
186 |
187 | - **Username:** `admin`
188 | - **Password:** `password`
189 |
190 | ![Warehouse Bucket](img/warehouse-bucket.png)
191 |
192 | **🔍 Query Data with Trino**
193 | ----------------------------
194 |
195 | **1\. Run Trino CLI**
196 |
197 | ```bash
198 | docker-compose exec trino trino
199 | ```
200 |
201 | **2\. Connect to Iceberg Catalog**
202 |
203 | ```sql
204 | USE iceberg.db;
205 | ```
206 |
207 | **3\. Query Processed Data**
208 |
209 | ```sql
210 | SELECT * FROM iceberg.db.clickstream_sink
211 | WHERE purchase_amount > 100
212 | LIMIT 10;
213 | ```
214 |
215 | ![Trino Query](img/trino-query.png)
216 |
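Because the sink is an Iceberg table, Trino can also show its snapshot history and query earlier versions (the **Time Travel** capability mentioned above). The sketch below uses Trino's Iceberg metadata tables; the snapshot ID in the second command is a placeholder and should be replaced with a value returned by the first query:

```bash
# List the snapshots committed to the sink table so far
docker-compose exec trino trino --execute \
  'SELECT snapshot_id, committed_at, operation FROM iceberg.db."clickstream_sink$snapshots" ORDER BY committed_at'

# Query the table as it looked at a given snapshot (time travel)
docker-compose exec trino trino --execute \
  'SELECT count(*) FROM iceberg.db.clickstream_sink FOR VERSION AS OF 1234567890123456789'
```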
217 | 📊 **Data Visualization with Apache Superset**
218 | ----------------------------------------------
219 |
220 | 1. **Access Superset:**
221 |
222 |     - **Username:** `admin`
223 |     - **Password:** `admin`
224 | 2. **Connect Superset to Trino:**
225 |
226 |     - **SQLAlchemy URI:**
227 |
228 |     ```bash
229 |     trino://trino@trino:8080/iceberg/db
230 |     ```
231 |     - **Configure in Superset:**
232 |
233 |         1. Open `http://localhost:8088`
234 |         2. Go to **Data** → **Databases** → **+**
235 |         3. Use the above SQLAlchemy URI.
236 |
237 | 3. **Create Dashboards:**
238 |
239 | ![Superset](img/superset_dashboard.png)
240 |
241 | 🏆 **Key Features**
242 | -------------------
243 |
244 | ### 🔄 **Real-Time Data Processing**
245 |
246 | - Stream processing with **Apache Flink**.
247 | - Clickstream events are transformed and filtered in real time.
248 |
249 | ### 📂 **Modern Data Lakehouse**
250 |
251 | - Data is stored in **Apache Iceberg** tables on S3-compatible **MinIO**, supporting schema evolution and time travel.
252 |
253 | ### ⚡ **Fast SQL Analytics**
254 |
255 | - **Trino** provides fast, distributed SQL queries on Iceberg data.
256 |
257 | ### 📊 **Interactive Dashboards**
258 |
259 | - **Apache Superset** delivers real-time visual analytics.
260 |
261 | ### 📦 **Fully Containerized Setup**
262 |
263 | - Simplified deployment using **Docker** and **Docker Compose** for seamless integration across all services.
264 |
265 | * * * * *
266 |
267 | 📈 **Future Enhancements**
268 | --------------------------
269 |
270 | - Implement **alerting** and **monitoring** with **Grafana** and **Prometheus**.
271 | - Introduce **machine learning pipelines** for predictive analytics.
272 | - Optimize **Iceberg partitioning** for faster queries.
273 |
274 | * * * * *
275 |
276 | 📎 **Quick Reference Commands**
277 | -------------------------------
278 |
279 | | **Component** | **Command** |
280 | | --- | --- |
281 | | **Start Services** | `docker-compose up --build -d` |
282 | | **Stop Services** | `docker-compose down` |
283 | | **View Running Containers** | `docker ps` |
284 | | **Check Logs** | `docker-compose logs -f` |
285 | | **Rebuild Containers** | `docker-compose up --build --force-recreate -d` |
286 |
287 | * * * * *
288 |
289 | 🙌 **Get Involved**
290 | -------------------
291 |
292 | Contributions are welcome! Feel free to submit issues or pull requests to improve this project.
293 |
294 | * * * * *
295 |
296 | 📜 License
297 | --------------
298 |
299 | This project is licensed under the [MIT License](LICENSE).
300 |
301 | * * * * *
302 |
303 | Enjoy exploring real-time data pipelines!
-------------------------------------------------------------------------------- /docker-compose.yaml: -------------------------------------------------------------------------------- 1 | services: 2 | zookeeper: 3 | image: confluentinc/cp-zookeeper:7.2.1 4 | hostname: zookeeper 5 | container_name: zookeeper 6 | ports: 7 | - 2181:2181 8 | environment: 9 | ZOOKEEPER_CLIENT_PORT: 2181 10 | networks: 11 | - iceberg_net 12 | 13 | broker: 14 | image: confluentinc/cp-server:7.2.1 15 | hostname: kafka 16 | container_name: broker 17 | depends_on: 18 | - zookeeper 19 | ports: 20 | - 9092:9092 21 | - 29092:29092 22 | healthcheck: 23 | test: ["CMD-SHELL", "sleep 1;"] 24 | interval: 30s 25 | timeout: 10s 26 | retries: 5 27 | environment: 28 | # Basic Kafka configurations 29 | KAFKA_BROKER_ID: 1 30 | KAFKA_ZOOKEEPER_CONNECT: 'zookeeper:2181' 31 | KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: PLAINTEXT:PLAINTEXT,PLAINTEXT_HOST:PLAINTEXT 32 | KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://broker:29092,PLAINTEXT_HOST://localhost:9092 33 | KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1 34 | KAFKA_TRANSACTION_STATE_LOG_REPLICATION_FACTOR: 1 35 | KAFKA_TRANSACTION_STATE_LOG_MIN_ISR: 1 36 | 37 | # Logging settings 38 | KAFKA_LOG4J_LOGGERS: org.apache.zookeeper=ERROR,org.apache.kafka=ERROR,kafka=ERROR,kafka.cluster=ERROR,kafka.controller=ERROR,kafka.coordinator=ERROR,kafka.log=ERROR,kafka.server=ERROR,kafka.zookeeper=ERROR,state.change.logger=ERROR 39 | KAFKA_LOG4J_ROOT_LOGLEVEL: ERROR 40 | 41 | # Metrics and telemetry configurations 42 | KAFKA_METRIC_REPORTERS: io.confluent.metrics.reporter.ConfluentMetricsReporter 43 | CONFLUENT_METRICS_REPORTER_BOOTSTRAP_SERVERS: broker:29092 44 | CONFLUENT_METRICS_ENABLE: 'false' 45 | CONFLUENT_TELEMETRY_ENABLED: 'false' # Ensure telemetry is fully disabled 46 | KAFKA_CONFLUENT_BALANCER_TOPIC_REPLICATION_FACTOR: 1 47 | KAFKA_CONFLUENT_SUPPORT_METRICS_TOPIC_REPLICATION_FACTOR: 1 48 | KAFKA_CONFLUENT_TELEMETRY_TOPIC_REPLICATION_FACTOR: 1 49 | KAFKA_CONFLUENT_METADATA_TOPIC_REPLICATION_FACTOR: 1 50 | 51 | networks: 52 | - iceberg_net 53 | 54 | control-center: 55 | image: confluentinc/cp-enterprise-control-center:7.2.1 56 | hostname: control-center 57 | container_name: control-center 58 | depends_on: 59 | - broker 60 | ports: 61 | - 9021:9021 62 | environment: 63 | CONTROL_CENTER_BOOTSTRAP_SERVERS: 'broker:29092' 64 | CONTROL_CENTER_REPLICATION_FACTOR: 1 65 | CONTROL_CENTER_INTERNAL_TOPICS_PARTITIONS: 1 66 | CONTROL_CENTER_MONITORING_INTERCEPTOR_TOPIC_PARTITIONS: 1 67 | CONFLUENT_METRICS_TOPIC_REPLICATION: 1 68 | PORT: 9021 69 | networks: 70 | - iceberg_net 71 | 72 | producer: 73 | container_name: clickstream-producer 74 | build: 75 | context: ./producer 76 | depends_on: 77 | - broker 78 | environment: 79 | KAFKA_BROKER: broker:29092 80 | networks: 81 | - iceberg_net 82 | 83 | ## FLINK 84 | jobmanager: 85 | container_name: jobmanager 86 | image: flink:1.18.1-scala_2.12-java11 87 | ports: 88 | - 18081:18081 89 | command: jobmanager 90 | environment: 91 | - | 92 | FLINK_PROPERTIES= 93 | jobmanager.rpc.address: jobmanager 94 | rest.port: 18081 95 | state.backend: rocksdb 96 | state.backend.incremental: true 97 | AWS_REGION=us-east-1 98 | AWS_ACCESS_KEY_ID=admin 99 | AWS_SECRET_ACCESS_KEY=password 100 | AWS_DEFAULT_REGION=us-east-1 101 | S3_ENDPOINT=http://minio:9000 102 | S3_PATH_STYLE_ACCESS=true 103 | JAVA_TOOL_OPTIONS=-Daws.accessKeyId=admin -Daws.secretKey=password 104 | healthcheck: 105 | test: ["CMD", "curl", "-f", "http://localhost:18081"] 106 | networks: 107 | - iceberg_net 108 | 109 | 
taskmanager: 110 | container_name: taskmanager 111 | image: flink:1.18.1-scala_2.12-java11 112 | depends_on: 113 | - jobmanager 114 | command: taskmanager 115 | environment: 116 | - | 117 | FLINK_PROPERTIES= 118 | jobmanager.rpc.address: jobmanager 119 | rest.port: 18081 120 | taskmanager.numberOfTaskSlots: 2 121 | state.backend: rocksdb 122 | state.backend.incremental: true 123 | AWS_REGION=us-east-1 124 | AWS_ACCESS_KEY_ID=admin 125 | AWS_SECRET_ACCESS_KEY=password 126 | AWS_DEFAULT_REGION=us-east-1 127 | S3_ENDPOINT=http://minio:9000 128 | S3_PATH_STYLE_ACCESS=true 129 | JAVA_TOOL_OPTIONS=-Daws.accessKeyId=admin -Daws.secretKey=password 130 | 131 | networks: 132 | - iceberg_net 133 | 134 | sql-client: 135 | container_name: sql-client 136 | depends_on: 137 | jobmanager: 138 | condition: service_healthy 139 | taskmanager: 140 | condition: service_started 141 | build: 142 | context: ./flink/sql-client/ 143 | environment: 144 | FLINK_JOBMANAGER_HOST: jobmanager 145 | S3_ENDPOINT: http://minio:9000 146 | S3_PATH_STYLE_ACCESS: true 147 | AWS_ACCESS_KEY_ID: admin 148 | AWS_SECRET_ACCESS_KEY: password 149 | AWS_REGION: us-east-1 150 | AWS_DEFAULT_REGION: us-east-1 151 | JAVA_TOOL_OPTIONS: -Daws.accessKeyId=admin -Daws.secretKey=password 152 | volumes: 153 | - type: bind 154 | source: ${PWD}/flink/sql-client/flink-conf.yaml 155 | target: /opt/flink/conf/flink-conf.yaml 156 | - type: bind 157 | source: ${PWD}/flink/sql-jobs/clickstream-filtering.sql 158 | target: /opt/flink/clickstream-filtering.sql 159 | command: > 160 | /bin/sh -c " 161 | /opt/flink/bin/sql-client.sh -f /opt/flink/clickstream-filtering.sql; 162 | tail -f /dev/null 163 | " 164 | networks: 165 | - iceberg_net 166 | 167 | minio: 168 | image: minio/minio 169 | container_name: minio 170 | environment: 171 | - MINIO_ROOT_USER=admin 172 | - MINIO_ROOT_PASSWORD=password 173 | - MINIO_DOMAIN=minio 174 | networks: 175 | iceberg_net: 176 | aliases: 177 | - warehouse.minio 178 | ports: 179 | - 9001:9001 180 | - 9000:9000 181 | command: ["server", "/data", "--console-address", ":9001"] 182 | mc: 183 | depends_on: 184 | - minio 185 | image: minio/mc 186 | container_name: mc 187 | networks: 188 | - iceberg_net 189 | environment: 190 | - AWS_ACCESS_KEY_ID=admin 191 | - AWS_SECRET_ACCESS_KEY=password 192 | - AWS_REGION=us-east-1 193 | - AWS_DEFAULT_REGION=us-east-1 194 | entrypoint: > 195 | /bin/sh -c " 196 | until (/usr/bin/mc config host add minio http://minio:9000 admin password) do echo '...waiting...' 
&& sleep 1; done; 197 | /usr/bin/mc rm -r --force minio/warehouse; 198 | /usr/bin/mc mb minio/warehouse; 199 | /usr/bin/mc policy set public minio/warehouse; 200 | tail -f /dev/null 201 | " 202 | 203 | rest: 204 | image: tabulario/iceberg-rest 205 | container_name: iceberg-rest 206 | ports: 207 | - 8181:8181 208 | environment: 209 | - AWS_ACCESS_KEY_ID=admin 210 | - AWS_SECRET_ACCESS_KEY=password 211 | - AWS_REGION=us-east-1 212 | - CATALOG_WAREHOUSE=s3://warehouse/ 213 | - CATALOG_IO__IMPL=org.apache.iceberg.aws.s3.S3FileIO 214 | - CATALOG_S3_ENDPOINT=http://minio:9000 215 | networks: 216 | - iceberg_net 217 | 218 | trino: 219 | image: trinodb/trino:latest 220 | container_name: trino 221 | networks: 222 | - iceberg_net 223 | environment: 224 | - TRINO_USER=admin 225 | - TRINO_PASSWORD=admin 226 | ports: 227 | - 8080:8080 228 | depends_on: 229 | - rest 230 | - minio 231 | volumes: 232 | - ./trino/iceberg.properties:/etc/trino/catalog/iceberg.properties 233 | 234 | superset: 235 | build: 236 | context: ./superset 237 | container_name: superset 238 | networks: 239 | - iceberg_net 240 | environment: 241 | - ADMIN_USERNAME=admin 242 | - ADMIN_EMAIL=admin@superset.com 243 | - ADMIN_PASSWORD=admin 244 | ports: 245 | - 8088:8088 246 | 247 | networks: 248 | iceberg_net: 249 | driver: bridge 250 | 251 | volumes: 252 | minio_data: 253 | superset-data: 254 | -------------------------------------------------------------------------------- /flink/sql-client/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM flink:1.18.1-scala_2.12-java11 2 | 3 | RUN curl -o ${FLINK_HOME}/lib/flink-sql-connector-kafka-3.1.0-1.18.jar https://repo1.maven.org/maven2/org/apache/flink/flink-sql-connector-kafka/3.1.0-1.18/flink-sql-connector-kafka-3.1.0-1.18.jar && \ 4 | curl -o ${FLINK_HOME}/lib/flink-json-1.18.1.jar https://repo.maven.apache.org/maven2/org/apache/flink/flink-json/1.18.1/flink-json-1.18.1.jar && \ 5 | curl -o ${FLINK_HOME}/lib/iceberg-flink-runtime-1.18-1.5.0.jar https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-flink-runtime-1.18/1.5.0/iceberg-flink-runtime-1.18-1.5.0.jar && \ 6 | curl -o ${FLINK_HOME}/lib/hadoop-common-2.8.3.jar https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-common/2.8.3/hadoop-common-2.8.3.jar && \ 7 | curl -o ${FLINK_HOME}/lib/hadoop-hdfs-2.8.3.jar https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-hdfs/2.8.3/hadoop-hdfs-2.8.3.jar && \ 8 | curl -o ${FLINK_HOME}/lib/hadoop-client-2.8.3.jar https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-client/2.8.3/hadoop-client-2.8.3.jar && \ 9 | curl -o ${FLINK_HOME}/lib/flink-shaded-hadoop-2-uber-2.8.3-10.0.jar https://repo.maven.apache.org/maven2/org/apache/flink/flink-shaded-hadoop-2-uber/2.8.3-10.0/flink-shaded-hadoop-2-uber-2.8.3-10.0.jar && \ 10 | curl -o ${FLINK_HOME}/lib/bundle-2.20.18.jar https://repo1.maven.org/maven2/software/amazon/awssdk/bundle/2.20.18/bundle-2.20.18.jar 11 | 12 | WORKDIR /opt/flink 13 | 14 | CMD ["bash", "-c", "${FLINK_HOME}/bin/sql-client.sh && tail -f /dev/null"] 15 | -------------------------------------------------------------------------------- /flink/sql-client/flink-conf.yaml: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # Copyright 2019 Ververica GmbH 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at
7 | #
8 | #     http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | ################################################################################
16 |
17 | jobmanager.rpc.address: jobmanager
18 | rest.port: 18081
19 | state.backend: rocksdb
20 | state.backend.incremental: true
21 | state.checkpoint-storage: filesystem
22 | blob.server.port: 6124
23 | query.server.port: 6125
24 | classloader.resolve-order: parent-first
25 | jobmanager.archive.fs.cleanup-expired-jobs: false
26 | jobmanager.archive.fs.job-expiration-time: 24h
27 | rest.client.max-content-length: 104857600 # 100MB
28 | taskmanager.network.request-backoff.max: 120000
29 | heartbeat.timeout: 300000
30 | akka.ask.timeout: 60s
--------------------------------------------------------------------------------
/flink/sql-jobs/clickstream-filtering.sql:
--------------------------------------------------------------------------------
1 | -- Configure Flink Settings for Streaming and State Management
2 | SET 'state.backend' = 'rocksdb';
3 | SET 'state.backend.incremental' = 'true';
4 | SET 'execution.checkpointing.mode' = 'EXACTLY_ONCE';
5 | SET 'execution.checkpointing.interval' = '10s';
6 | SET 'execution.checkpointing.min-pause' = '10s';
7 | SET 'sql-client.execution.result-mode' = 'TABLEAU';
8 | SET 'parallelism.default' = '1';
9 |
10 | -- Load Required Jars
11 | ADD JAR '/opt/flink/lib/flink-sql-connector-kafka-3.1.0-1.18.jar';
12 | ADD JAR '/opt/flink/lib/flink-json-1.18.1.jar';
13 | ADD JAR '/opt/flink/lib/iceberg-flink-runtime-1.18-1.5.0.jar';
14 | ADD JAR '/opt/flink/lib/hadoop-common-2.8.3.jar';
15 | ADD JAR '/opt/flink/lib/hadoop-hdfs-2.8.3.jar';
16 | ADD JAR '/opt/flink/lib/hadoop-client-2.8.3.jar';
17 | ADD JAR '/opt/flink/lib/flink-shaded-hadoop-2-uber-2.8.3-10.0.jar';
18 | ADD JAR '/opt/flink/lib/bundle-2.20.18.jar';
19 |
20 | -- Confirm Jars are Loaded
21 | SHOW JARS;
22 |
23 | DROP CATALOG IF EXISTS iceberg;
24 | CREATE CATALOG iceberg WITH (
25 |     'type' = 'iceberg',
26 |     'catalog-impl' = 'org.apache.iceberg.rest.RESTCatalog',  -- Use REST catalog
27 |     'uri' = 'http://iceberg-rest:8181',                      -- REST catalog server URL
28 |     'warehouse' = 's3://warehouse/',                         -- Warehouse location
29 |     'io-impl' = 'org.apache.iceberg.aws.s3.S3FileIO',        -- S3 file IO
30 |     's3.endpoint' = 'http://minio:9000',                     -- MinIO endpoint
31 |     's3.path-style-access' = 'true',                         -- Enable path-style access
32 |     'client.region' = 'us-east-1',                           -- S3 region
33 |     's3.access-key-id' = 'admin',                            -- MinIO access key
34 |     's3.secret-access-key' = 'password'                      -- MinIO secret key
35 | );
36 |
37 | -- Define Kafka Source Table
38 | DROP TABLE IF EXISTS clickstream_source;
39 | CREATE TABLE IF NOT EXISTS clickstream_source (
40 |     event_id STRING,
41 |     user_id STRING,
42 |     event_type STRING,
43 |     url STRING,
44 |     session_id STRING,
45 |     device STRING,
46 |     event_time TIMESTAMP_LTZ(3) METADATA FROM 'timestamp',
47 |     geo_location ROW<lat DOUBLE, lon DOUBLE>,
48 |     purchase_amount DOUBLE
49 | ) WITH (
50 |     'connector' = 'kafka',
51 |     'topic' = 'clickstream',
52 |     'properties.bootstrap.servers' = 'broker:29092',
53 |     'scan.startup.mode' = 'earliest-offset',
54 |     'format' = 'json',
55 |     'json.ignore-parse-errors' = 'true',
56 | 'json.timestamp-format.standard' = 'ISO-8601' 57 | ); 58 | 59 | -- Define Iceberg Sink Table 60 | CREATE DATABASE IF NOT EXISTS iceberg.db; 61 | DROP TABLE IF EXISTS iceberg.db.clickstream_sink; 62 | CREATE TABLE iceberg.db.clickstream_sink 63 | WITH ( 64 | 'catalog-name' = 'iceberg', 65 | 'format' = 'parquet' 66 | ) 67 | AS 68 | SELECT 69 | event_id, 70 | user_id, 71 | event_type, 72 | url, 73 | session_id, 74 | device, 75 | event_time, 76 | geo_location.lat AS latitude, 77 | geo_location.lon AS longitude, 78 | purchase_amount 79 | FROM clickstream_source 80 | WHERE event_type = 'purchase' 81 | AND device IS NOT NULL; 82 | 83 | -------------------------------------------------------------------------------- /img/e2e-pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abeltavares/real-time-data-pipeline/4b80eef11803254fc912dae613f697faeefa3017/img/e2e-pipeline.png -------------------------------------------------------------------------------- /img/flink-job.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abeltavares/real-time-data-pipeline/4b80eef11803254fc912dae613f697faeefa3017/img/flink-job.png -------------------------------------------------------------------------------- /img/superset_dashboard.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abeltavares/real-time-data-pipeline/4b80eef11803254fc912dae613f697faeefa3017/img/superset_dashboard.png -------------------------------------------------------------------------------- /img/topic-clickstream.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abeltavares/real-time-data-pipeline/4b80eef11803254fc912dae613f697faeefa3017/img/topic-clickstream.png -------------------------------------------------------------------------------- /img/trino-query.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abeltavares/real-time-data-pipeline/4b80eef11803254fc912dae613f697faeefa3017/img/trino-query.png -------------------------------------------------------------------------------- /img/warehouse-bucket.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abeltavares/real-time-data-pipeline/4b80eef11803254fc912dae613f697faeefa3017/img/warehouse-bucket.png -------------------------------------------------------------------------------- /producer/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.9-slim 2 | 3 | WORKDIR /app 4 | 5 | COPY requirements.txt . 6 | RUN pip install --no-cache-dir -r requirements.txt 7 | 8 | COPY producer.py . 
9 | 10 | #(optional, for debugging) 11 | EXPOSE 5000 12 | 13 | CMD ["python", "producer.py"] 14 | -------------------------------------------------------------------------------- /producer/producer.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import random 4 | import json 5 | import logging 6 | import signal 7 | import sys 8 | from faker import Faker 9 | from confluent_kafka import Producer 10 | 11 | fake = Faker() 12 | 13 | # Set up logging 14 | logging.basicConfig(level=logging.INFO) 15 | logger = logging.getLogger(__name__) 16 | 17 | # Kafka configuration 18 | kafka_broker = os.getenv("KAFKA_BROKER") 19 | if not kafka_broker: 20 | raise ValueError("KAFKA_BROKER environment variable is not set.") 21 | 22 | kafka_config = { 23 | 'bootstrap.servers': kafka_broker 24 | } 25 | producer = Producer(kafka_config) 26 | 27 | topic = 'clickstream' 28 | 29 | def generate_clickstream_event(): 30 | return { 31 | "event_id": fake.uuid4(), 32 | "user_id": fake.uuid4(), 33 | "event_type": fake.random_element(elements=("page_view", "add_to_cart", "purchase", "logout")), 34 | "url": fake.uri_path(), 35 | "session_id": fake.uuid4(), 36 | "device": fake.random_element(elements=("mobile", "desktop", "tablet")), 37 | "geo_location": { 38 | "lat": float(fake.latitude()), 39 | "lon": float(fake.longitude()) 40 | }, 41 | "purchase_amount": float(random.uniform(0.0, 500.0)) if fake.boolean(chance_of_getting_true=30) else None 42 | } 43 | 44 | def delivery_report(err, msg): 45 | if err is not None: 46 | logger.error(f"Message delivery failed: {err}") 47 | else: 48 | logger.info(f"Message delivered to {msg.topic()} [{msg.partition()}]") 49 | 50 | def signal_handler(sig, frame): 51 | logger.info("Data generation stopped.") 52 | producer.flush() 53 | sys.exit(0) 54 | 55 | signal.signal(signal.SIGINT, signal_handler) 56 | 57 | if __name__ == "__main__": 58 | try: 59 | while True: 60 | event = generate_clickstream_event() 61 | try: 62 | producer.produce(topic, key=event["session_id"], value=json.dumps(event), callback=delivery_report) 63 | except BufferError as e: 64 | logger.error(f"Buffer error: {e}") 65 | except Exception as e: 66 | logger.error(f"Unexpected error: {e}") 67 | logger.info(json.dumps(event, indent=2)) 68 | time.sleep(1) 69 | producer.poll(1) 70 | except KeyboardInterrupt: 71 | logger.info("Data generation stopped.") 72 | finally: 73 | producer.flush() -------------------------------------------------------------------------------- /producer/requirements.txt: -------------------------------------------------------------------------------- 1 | faker 2 | confluent-kafka 3 | -------------------------------------------------------------------------------- /superset/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM apache/superset:latest 2 | 3 | USER root 4 | 5 | RUN pip install psycopg2-binary 6 | RUN pip install sqlalchemy-trino 7 | 8 | ENV ADMIN_USERNAME $ADMIN_USERNAME 9 | ENV ADMIN_EMAIL $ADMIN_EMAIL 10 | ENV ADMIN_PASSWORD $ADMIN_PASSWORD 11 | 12 | COPY ./superset-init.sh /superset-init.sh 13 | RUN chmod +x /superset-init.sh 14 | 15 | COPY superset_config.py /app/ 16 | ENV SUPERSET_CONFIG_PATH /app/superset_config.py 17 | 18 | USER superset 19 | ENTRYPOINT [ "/superset-init.sh" ] -------------------------------------------------------------------------------- /superset/superset-init.sh: -------------------------------------------------------------------------------- 1 | 
#!/bin/bash
2 |
3 | # Create the admin user (credentials are read from the environment)
4 | superset fab create-admin --username "$ADMIN_USERNAME" --firstname Superset --lastname Admin --email "$ADMIN_EMAIL" --password "$ADMIN_PASSWORD"
5 |
6 | # Upgrade the Superset metastore
7 | superset db upgrade
8 |
9 | # Set up default roles and permissions
10 | superset init
11 |
12 | # Start the server
13 | /bin/sh -c /usr/bin/run-server.sh
--------------------------------------------------------------------------------
/superset/superset_config.py:
--------------------------------------------------------------------------------
1 | FEATURE_FLAGS = {
2 |     "ENABLE_TEMPLATE_PROCESSING": True,
3 | }
4 |
5 | ENABLE_PROXY_FIX = True
6 | SECRET_KEY = "YOUR_OWN_RANDOM_GENERATED_STRING"
--------------------------------------------------------------------------------
/trino/iceberg.properties:
--------------------------------------------------------------------------------
1 | connector.name=iceberg
2 | iceberg.catalog.type=rest
3 | iceberg.rest-catalog.uri=http://rest:8181
4 | iceberg.rest-catalog.warehouse=s3://warehouse/
5 | iceberg.file-format=PARQUET
6 |
7 | # S3 Configuration for Iceberg
8 | fs.native-s3.enabled=true
9 | s3.endpoint=http://minio:9000
10 | s3.region=us-east-1
11 | s3.path-style-access=true
12 | s3.aws-access-key=admin
13 | s3.aws-secret-key=password
--------------------------------------------------------------------------------