├── .gitignore
├── ARCHITECTURE.md
├── LICENSE
├── README.md
├── airflow
│   ├── Dockerfile-Airflow
│   ├── Dockerfile-Airflow-test
│   ├── config
│   │   └── airflow.cfg
│   ├── dags
│   │   ├── pipeline.py
│   │   └── spark.py
│   └── tests
│       ├── common.py
│       ├── conftest.py
│       ├── test_callbacks.py
│       ├── test_clients.py
│       ├── test_dag_integrity.py
│       ├── test_e2e.py
│       ├── test_integration_report.py
│       ├── test_integration_stream.py
│       ├── test_unit_report.py
│       ├── test_unit_spark.py
│       └── test_unit_stream.py
├── dashboard
│   ├── Dockerfile-Dashboard-api
│   ├── Dockerfile-Dashboard-api-test
│   ├── Dockerfile-Dashboard-ui
│   ├── Dockerfile-Dashboard-ui-test
│   ├── dashboard_api.py
│   ├── dashboard_ui.py
│   └── tests
│       ├── common.py
│       ├── test_integration_api.py
│       ├── test_unit_api.py
│       └── test_unit_ui.py
├── db
│   ├── Dockerfile-DB-test
│   ├── clickhouse_table_schema.sql
│   └── tests
│       ├── test_clickhouse_schema.py
│       └── test_minio_setup.py
├── docker-compose.yml
├── env
│   ├── airflow.creds
│   ├── airflow.env
│   ├── clickhouse.creds
│   ├── clickhouse.env
│   ├── kafka.env
│   ├── minio.creds
│   ├── minio.env
│   ├── postgres.creds
│   ├── postgres.env
│   └── spark.env
├── images
│   ├── Airflow_UI.png
│   ├── Chart.png
│   └── Pipeline.png
├── kafka
│   ├── Dockerfile-Connect-helper
│   ├── clickhouse_connector_configuration.json
│   └── connectors
│       └── clickhouse-kafka-connect-v1.3.1-confluent.jar
├── producer
│   ├── Dockerfile-Producer
│   ├── Dockerfile-Producer-test
│   ├── config.py
│   ├── custom_types.py
│   ├── producer.py
│   ├── schema_registry.py
│   ├── tests
│   │   ├── conftest.py
│   │   ├── test_integration.py
│   │   └── test_unit.py
│   └── user_event_schema.avsc
├── pyproject.toml
├── spark
│   ├── Dockerfile-Spark
│   ├── Dockerfile-Spark-test
│   └── tests
│       └── test_spark.py
└── tests
    ├── airflow
    │   └── config
    │       └── airflow.cfg
    ├── docker-compose.test.airflow.yml
    ├── docker-compose.test.dashboard.yml
    ├── docker-compose.test.db.yml
    ├── docker-compose.test.producer.yml
    ├── docker-compose.test.spark.yml
    ├── env-test
    │   ├── airflow.creds
    │   ├── airflow.env
    │   ├── clickhouse.creds
    │   ├── clickhouse.env
    │   ├── kafka.env
    │   ├── minio.creds
    │   ├── minio.env
    │   ├── postgres.creds
    │   ├── postgres.env
    │   └── spark.env
    └── files
        └── clickhouse_table_schema.sql
/.gitignore:
--------------------------------------------------------------------------------
1 | # env/ should be here, but left out for demonstration.
2 | *__pycache__*
3 | *logs/
4 |
--------------------------------------------------------------------------------
/ARCHITECTURE.md:
--------------------------------------------------------------------------------
1 | # System Architecture
2 |
3 | ## 1. Data Generation (`producer`)
4 | - **Purpose:** Simulates simplified user interaction events for an e-commerce platform.
5 | - **Technology:** Python, `confluent-kafka-python`.
6 | - **Key Logic:**
7 | - Generates events.
8 | - Uses an Avro schema (`user_event_schema.avsc`) for data contracts.
9 | - Serializes message values with `AvroSerializer` which automatically registers schemas with the Schema Registry.
10 | - Serializes message keys (`user_id`) with a custom UUID serializer (see the sketch below).
11 | - Runs as a multi-process application to generate a higher volume of data.
12 |
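Below is a minimal sketch (not the project's `producer.py`) of how the serialization described above could be wired with `confluent-kafka-python`; the broker and Schema Registry addresses and the schema path are assumptions:

```python
# Sketch only: Avro value serialization plus a custom UUID key serializer.
from uuid import UUID, uuid4

from confluent_kafka import SerializingProducer
from confluent_kafka.schema_registry import SchemaRegistryClient
from confluent_kafka.schema_registry.avro import AvroSerializer


def uuid_key_serializer(key: UUID, ctx) -> bytes:
    """One possible UUID key serializer: the 16-byte representation."""
    return key.bytes


registry = SchemaRegistryClient({'url': 'http://schema-registry:8081'})  # assumed address
with open('user_event_schema.avsc') as f:
    value_serializer = AvroSerializer(registry, f.read())  # registers the schema on first use

producer = SerializingProducer({
    'bootstrap.servers': 'broker:9092',  # assumed address
    'key.serializer': uuid_key_serializer,
    'value.serializer': value_serializer,
})
# producer.produce(topic='user-interactions', key=uuid4(), value=event_dict)
```
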
13 | ## 2. Ingestion & Streaming (`broker`, `schema-registry`, `connect`)
14 | - **Kafka (`broker`):**
15 | - Runs in KRaft mode.
16 | - **Schema Registry (`schema-registry`):**
17 | - Stores and serves the Avro schemas.
18 | - Ensures that data written to Kafka conforms to a known, versioned structure.
19 | - **Kafka Connect (`connect` & `connect-helper`):**
20 | - Provides a scalable and reliable way to stream data between Kafka and other systems.
21 | - The official ClickHouse Sink Connector is used to move data from the Kafka topic to the ClickHouse database.
22 | - The configuration (`clickhouse_connector_configuration.json`) is dynamically populated with environment variables by the `connect-helper` service, which uses `envsubst` (a registration sketch follows below).
23 | - It uses the `AvroConverter` to deserialize messages, validating them against the schema from the Schema Registry before writing to ClickHouse.
24 |
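Registering the connector is an HTTP call to the Kafka Connect REST API. A minimal Python sketch of the registration step, assuming the `connect` service listens on its default port 8083 (the actual helper may use `curl` instead):

```python
# Sketch only: register the rendered ClickHouse sink connector with Kafka Connect.
import json

import requests

with open('clickhouse_connector_configuration.json') as f:
    connector = json.load(f)  # expected to contain the connector name and config

resp = requests.post('http://connect:8083/connectors', json=connector, timeout=30)
resp.raise_for_status()
print('Connector registered:', resp.json().get('name'))
```
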
25 | ## 3. Data Warehouse (`clickhouse`)
26 | - **Purpose:** Stores the raw event stream.
27 | - **Technology:** ClickHouse.
28 | - **Key Features:**
29 | - **Schema:** The table `user_interactions` is defined in `clickhouse_table_schema.sql`.
30 | - **Partitioning:** Data is partitioned by `event_minute` (a `MATERIALIZED` column). This is critical for performance, as it allows Airflow to efficiently query only the data for a specific minute without scanning the entire table (see the query sketch below).
31 | - **Engine:** Uses the `MergeTree` engine, which is optimized for high-volume writes and fast analytical queries.
32 |
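A minimal sketch of the minute-level query this enables, using `clickhouse-connect` (host, port, and credentials here are placeholders):

```python
# Sketch only: filtering on the partition key lets ClickHouse prune every
# partition except the requested minute instead of scanning the whole table.
from datetime import datetime

import clickhouse_connect

client = clickhouse_connect.get_client(host='clickhouse', port=8123,
                                        user='default', password='')
df = client.query_df(
    'SELECT event_type, status FROM user_interactions WHERE event_minute = %(ts)s',
    parameters={'ts': datetime(2025, 8, 9, 10, 30)},
)
print(len(df), 'rows for that minute')
```
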
33 | ## 4. Orchestration (`airflow`)
34 | - **Purpose:** Manages the periodic batch analysis pipeline.
35 | - **Technology:** Apache Airflow with the `CeleryExecutor`.
36 | - **Components:**
37 | - `Postgres`: Stores Airflow metadata (DAG states, task instances, connections, etc.).
38 | - `Redis`: Acts as the message broker for Celery, queuing tasks for workers.
39 | - **The `etar_pipeline` DAG:**
40 | 1. **Extract:** Runs every minute. Queries ClickHouse for data from the *previous* minute.
41 | 2. **Transform/Load:** If data exists, each chunk is converted to a Pandas DataFrame, then to an Arrow Table, and appended to a Parquet file.
42 | 3. **Store:** The Parquet file is streamed directly into a MinIO bucket. The object name is the timestamp (e.g., `2025-08-09_10-30.parquet`).
43 | 4. **Analyze:** Triggers a `SparkSubmitOperator` task, passing the S3A path of the Parquet file as an argument.
44 | 5. **Report:** After the Spark job completes, a final task fetches the JSON analysis result from MinIO and POSTs it to the Dashboard API.
45 |
46 | ## 5. Batch Processing (`spark`)
47 | - **Purpose:** Performs the analysis on the minutely data extracts.
48 | - **Technology:** Apache Spark, PySpark.
49 | - **Key Logic:**
50 | - The Spark application (`spark.py`) is submitted by Airflow.
51 | - It reads a single Parquet file from MinIO. The S3A connector configuration is passed directly from the Airflow DAG.
52 | - It performs a simple aggregation: counts total events, total errors, and success/error counts per event type.
53 | - The result is written back to MinIO as a JSON file (e.g., `2025-08-09_10-30.json`).
54 | - Exits with code `0` on success or non-zero on failure, signaling the status back to the Airflow task instance.
55 |
56 | ## 6. Storage (`minio`)
57 | - **Purpose:** Acts as the intermediate storage layer between the "Extract" and "Analyze" stages.
58 | - **Technology:** MinIO (S3-compatible object storage).
59 | - **Usage:**
60 | - Stores minutely data extracts in Parquet format.
61 | - Stores the JSON analysis results from Spark.
62 |
63 | ## 7. Presentation (`dashboard`)
64 | - **Purpose:** Displays the latest analysis results to the user.
65 | - **Technology:** FastAPI, Streamlit.
66 | - **Architecture:**
67 | - **`dashboard-api`:** A simple FastAPI application with a single in-memory deque to store the most recent report. It provides a `/report` endpoint for Airflow to POST results to and for the UI to GET results from (see the sketches below).
68 | - **`dashboard-ui`:** A Streamlit application that runs in a loop, periodically polling the `/report` endpoint of the API. When it receives a new report, it updates the displayed chart and statistics.
69 |
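Minimal sketches of the two services (assumed shapes, not the project's `dashboard_api.py` / `dashboard_ui.py`; the API URL and port are placeholders):

```python
# dashboard-api sketch: keep only the latest report in an in-memory deque.
from collections import deque

from fastapi import FastAPI, HTTPException

app = FastAPI()
reports: deque = deque(maxlen=1)


@app.post('/report')
def post_report(report: dict) -> dict:
    reports.append(report)
    return {'status': 'stored'}


@app.get('/report')
def get_report() -> dict:
    if not reports:
        raise HTTPException(status_code=404, detail='No report available yet.')
    return reports[0]
```

```python
# dashboard-ui sketch: poll the API and redraw the chart when a report arrives.
import time

import pandas as pd
import requests
import streamlit as st

API_URL = 'http://dashboard-api:8000/report'  # placeholder

st.title('Latest Analysis Report')
placeholder = st.empty()

while True:
    try:
        report = requests.get(API_URL, timeout=5).json()['report']
        with placeholder.container():
            st.metric('Total events', report['total_events'])
            st.metric('Total errors', report['total_errors'])
            st.bar_chart(pd.DataFrame(report['by_event_type']).T)
    except (requests.RequestException, KeyError, TypeError):
        pass  # no report yet (or a "no data" report); retry on the next poll
    time.sleep(10)
```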
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2025 Xadra
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | 
2 | 
3 | 
4 | # Goal
5 | This project demonstrates an end-to-end data pipeline that simulates, ingests, analyzes, and visualizes user interaction events in near real-time.
6 |
7 |
8 | # Architecture
9 | 1. **Data Simulation:** A Python script generates simplified user interaction events.
10 | 2. **Ingestion:** Events are produced to a Kafka topic using an Avro schema for data integrity.
11 | 3. **Real-time Sink:** Kafka Connect, using the official ClickHouse sink connector, streams data from Kafka into a ClickHouse database for permanent storage and fast querying.
12 | 4. **Batch Analysis:** An Airflow DAG runs every minute to:
13 |
14 | a. Extract the previous minute's data from ClickHouse and save it as a Parquet file in MinIO (S3-compatible object storage).
15 |
16 | b. Trigger a PySpark job to analyze the Parquet file, performing anomaly detection (calculating success/error rates for each event type).
17 |
18 | c. Push the JSON analysis result to a dashboard API.
19 | 5. **Visualization:** A Streamlit dashboard polls the API and displays the latest analysis results.
20 |
21 | For a more detailed explanation of the components and their interactions, please see [`ARCHITECTURE.md`](./ARCHITECTURE.md).
22 |
23 |
24 | # Component Overview
25 | The project is broken down into several standalone services, each with a specific responsibility.
26 |
27 | 1. **Producer**:
28 | - Generates user interaction data.
29 | - Sends data to Kafka, serialized using Avro.
30 | - Defines the Avro schema and handles schema registration.
31 |
32 | 2. **Kafka Stack**:
33 | - **Kafka Broker**: Manages the real-time data stream from the producer to Kafka Connect. Runs in KRaft mode.
34 | - **Schema Registry**: Enforces the data contract via Avro schemas.
35 | - **Kafka Connect**: Streams data from the `user-interactions` topic into ClickHouse.
36 |
37 | 3. **Databases & Storage**:
38 | - **ClickHouse**: Stores the raw user interaction event stream.
39 | - **Postgres**: Serves as the metadata backend for Airflow.
40 | - **Redis**: Serves as the message broker for Airflow's CeleryExecutor.
41 | - **MinIO**: An S3-compatible object storage used to:
42 | - Store minutely data extracts in Parquet format for Spark.
43 | - Store the JSON analysis results generated by Spark.
44 |
45 | 4. **Orchestration & Processing**:
46 | - **Airflow**: Orchestrates the minutely batch analysis workflow (Extract -> Analyze -> Report).
47 | - **Spark**: Performs the batch analysis on the Parquet data stored in MinIO.
48 |
49 | 5. **Dashboard**:
50 | - **Dashboard API**: A FastAPI endpoint that receives analysis results from Airflow.
51 | - **Dashboard UI**: A Streamlit application that visualizes the latest analysis report.
52 |
53 |
54 | # How to Run the Project
55 | Ensure you have **Docker and Docker Compose** installed.
56 |
57 | First, run `airflow-init` to create the Airflow config file:
58 | ```bash
59 | docker compose --project-name lp --project-directory . --env-file ./env/airflow.env --env-file ./env/airflow.creds --env-file ./env/clickhouse.env --env-file ./env/clickhouse.creds --env-file ./env/kafka.env --env-file ./env/minio.env --env-file ./env/minio.creds --env-file ./env/postgres.env --env-file ./env/postgres.creds --env-file ./env/spark.env up airflow-init -d
60 | ```
61 |
62 | Wait until `airflow-init` finishes, then start the stack with:
63 | ```bash
64 | docker compose --project-name lp --project-directory . --env-file ./env/airflow.env --env-file ./env/airflow.creds --env-file ./env/clickhouse.env --env-file ./env/clickhouse.creds --env-file ./env/kafka.env --env-file ./env/minio.env --env-file ./env/minio.creds --env-file ./env/postgres.env --env-file ./env/postgres.creds --env-file ./env/spark.env up -d
65 | ```
66 |
67 | Stop it with:
68 | ```bash
69 | docker compose --project-name lp --project-directory . --env-file ./env/airflow.env --env-file ./env/airflow.creds --env-file ./env/clickhouse.env --env-file ./env/clickhouse.creds --env-file ./env/kafka.env --env-file ./env/minio.env --env-file ./env/minio.creds --env-file ./env/postgres.env --env-file ./env/postgres.creds --env-file ./env/spark.env down -v --remove-orphans
70 | ```
71 |
72 | Then head to `localhost:8501` to see the analysis result.
73 |
74 | ## Running Tests
75 |
76 | airflow:
77 | ```bash
78 | docker compose --project-name test --project-directory . --env-file ./tests/env-test/airflow.env --env-file ./tests/env-test/airflow.creds --env-file ./tests/env-test/clickhouse.env --env-file ./tests/env-test/clickhouse.creds --env-file ./tests/env-test/minio.env --env-file ./tests/env-test/minio.creds --env-file ./tests/env-test/postgres.env --env-file ./tests/env-test/postgres.creds --env-file ./tests/env-test/spark.env -f tests/docker-compose.test.airflow.yml up -d
79 | docker compose --project-name test --project-directory . --env-file ./tests/env-test/airflow.env --env-file ./tests/env-test/airflow.creds --env-file ./tests/env-test/clickhouse.env --env-file ./tests/env-test/clickhouse.creds --env-file ./tests/env-test/minio.env --env-file ./tests/env-test/minio.creds --env-file ./tests/env-test/postgres.env --env-file ./tests/env-test/postgres.creds --env-file ./tests/env-test/spark.env -f tests/docker-compose.test.airflow.yml down -v --remove-orphans
80 | ```
81 |
82 | dashboard:
83 | ```bash
84 | docker compose --project-name test --project-directory . -f tests/docker-compose.test.dashboard.yml up -d
85 | docker compose --project-name test --project-directory . -f tests/docker-compose.test.dashboard.yml down -v --remove-orphans
86 | ```
87 |
88 | db:
89 | ```bash
90 | docker compose --project-name test --project-directory . -f tests/docker-compose.test.db.yml up -d
91 | docker compose --project-name test --project-directory . -f tests/docker-compose.test.db.yml down -v --remove-orphans
92 | ```
93 |
94 | producer:
95 | ```bash
96 | docker compose --project-name test --project-directory . -f tests/docker-compose.test.producer.yml up -d
97 | docker compose --project-name test --project-directory . -f tests/docker-compose.test.producer.yml down -v --remove-orphans
98 | ```
99 |
100 | spark:
101 | ```bash
102 | docker compose --project-name test --project-directory . -f tests/docker-compose.test.spark.yml up -d
103 | docker compose --project-name test --project-directory . -f tests/docker-compose.test.spark.yml down -v --remove-orphans
104 | ```
105 |
106 |
107 | # Contribution Areas
108 | There are many aspects to be further improved and features to be added:
109 | - Add `just` to simplify the long `docker compose` commands.
110 | - Bake application files into Docker images instead of using volume mounts for better performance and immutable infrastructure.
111 | - Improve error message consistency between the code and tests.
112 | - Improve the `command` attribute of containers by replacing hard-coded values with variables.
113 | - Improve `clickhouse_table_schema.sql` by replacing hard-coded values with variables (similar to `clickhouse_connector_configuration.json`).
114 | - Code improvements, like creating a function to get the ClickHouse client (similar to `get_minio_client`), to make the code cleaner and more testable.
115 | - Improve error handling and avoid catching exceptions blindly.
116 | - Switch to `SASL_SSL` from `PLAINTEXT`.
117 | - Refactor shared logic (e.g., client initializations) into common utility modules to adhere to the DRY principle.
118 | - Simulate and handle service failures.
119 | - Implement a Dead-Letter Queue (DLQ) in Kafka Connect and the producer to handle malformed or problematic messages gracefully.
120 | - Use an event-driven approach instead of a REST API for reporting the analysis result.
121 |
--------------------------------------------------------------------------------
/airflow/Dockerfile-Airflow:
--------------------------------------------------------------------------------
1 | FROM apache/airflow:3.0.2-python3.11
2 |
3 | RUN pip install --no-cache-dir \
4 | "apache-airflow==${AIRFLOW_VERSION}" \
5 | clickhouse-connect==0.8.18 \
6 | dotenv==0.9.9 \
7 | pandas==2.1.4 \
8 | pyarrow==16.1.0 \
9 | requests==2.32.3 \
10 | minio==7.2.16 \
11 | apache-airflow-providers-apache-spark==5.3.2
12 |
13 | USER root
14 |
15 | ENV DEBIAN_FRONTEND=noninteractive
16 |
17 | RUN apt-get update && apt-get install -y --no-install-recommends openjdk-17-jre-headless wget && rm -rf /var/lib/apt/lists/*
18 |
19 | ENV JAVA_HOME=/usr/lib/jvm/java-17-openjdk-amd64
20 |
21 | ENV SPARK_VERSION=3.5.6
22 |
23 | RUN wget https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop3.tgz && \
24 | tar -xzf spark-${SPARK_VERSION}-bin-hadoop3.tgz -C /opt && \
25 | ln -s /opt/spark-${SPARK_VERSION}-bin-hadoop3 /opt/spark && \
26 | rm spark-${SPARK_VERSION}-bin-hadoop3.tgz
27 |
28 | ARG HADOOP_AWS_VERSION=3.3.4
29 | ARG AWS_JAVA_SDK_VERSION=1.12.262
30 |
31 | RUN mkdir -p /opt/spark/jars && \
32 | wget -P /opt/spark/jars https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/${HADOOP_AWS_VERSION}/hadoop-aws-${HADOOP_AWS_VERSION}.jar && \
33 | wget -P /opt/spark/jars https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/${AWS_JAVA_SDK_VERSION}/aws-java-sdk-bundle-${AWS_JAVA_SDK_VERSION}.jar && \
34 | apt-get remove -y wget && \
35 | apt-get autoremove -y && \
36 | rm -rf /var/lib/apt/lists/*
37 |
38 | ENV SPARK_HOME=/opt/spark
39 | ENV PATH=$PATH:$SPARK_HOME/bin
40 |
41 | USER airflow
42 |
--------------------------------------------------------------------------------
/airflow/Dockerfile-Airflow-test:
--------------------------------------------------------------------------------
1 | FROM lp/airflow:latest
2 |
3 | RUN pip install --no-cache-dir "apache-airflow==${AIRFLOW_VERSION}" pytest==8.4.1 pytest-mock==3.14.1
4 |
--------------------------------------------------------------------------------
/airflow/dags/pipeline.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 | import json
3 | import logging
4 | import os
5 | from datetime import datetime, timedelta
6 | from typing import Any
7 | from uuid import UUID
8 | try:
9 | from zoneinfo import ZoneInfo # Python 3.9+
10 | except ImportError:
11 | from backports.zoneinfo import ZoneInfo # For test on spark which has python 3.8
12 |
13 | import clickhouse_connect
14 | import pyarrow as pa
15 | import pyarrow.fs as fs
16 | import pyarrow.parquet as pq
17 | import requests
18 | from airflow.hooks.base import BaseHook
19 | from airflow.providers.apache.spark.operators.spark_submit import SparkSubmitOperator
20 | from airflow.sdk import dag, task
21 | from clickhouse_connect.driver.exceptions import ClickHouseError
22 | from dotenv import load_dotenv
23 | from minio import Minio
24 | from minio.error import S3Error
25 |
26 | row_type = tuple[UUID, UUID, UUID, str, datetime, int, str, int | None, int | None]
27 |
28 | load_dotenv()
29 |
30 | schema = pa.schema([
31 | pa.field('event_type', pa.string()),
32 | pa.field('status', pa.string()),
33 | ])
34 |
35 | logger = logging.getLogger(__name__)
36 |
37 | CLICKHOUSE_CONN_NAME = os.environ['CLICKHOUSE_CONN_NAME']
38 | MINIO_CONN_NAME = os.environ['MINIO_CONN_NAME']
39 | SPARK_CONN_NAME = os.environ['SPARK_CONN_NAME']
40 | SPARK_APPLICATION_PATH = os.environ['SPARK_APPLICATION_PATH']
41 | MINIO_BUCKET_NAME = os.environ['MINIO_BUCKET_NAME']
42 |
43 |
44 | def get_minio_client() -> Minio:
45 | minio_conn = BaseHook.get_connection(MINIO_CONN_NAME)
46 | minio_client = Minio(
47 | endpoint=minio_conn.extra_dejson.get('host').replace('http://', ''),
48 | access_key=minio_conn.login,
49 | secret_key=minio_conn.password,
50 | secure=False
51 | )
52 | return minio_client
53 |
54 |
55 | def on_success_callback_func(context: dict[str, Any]) -> None:
56 | """Log successful task completion."""
57 | dag_run = context['dag_run']
58 | task_instance = context['task_instance']
59 | logger.info(
60 | "DAG '%s' - Task '%s' succeeded. Run ID: %s",
61 | dag_run.dag_id,
62 | task_instance.task_id,
63 | dag_run.run_id
64 | )
65 |
66 |
67 | def on_failure_callback_func(context: dict[str, Any]) -> None:
68 | """Log failed task and exception."""
69 | dag_run = context['dag_run']
70 | task_instance = context['task_instance']
71 | exception = context.get('exception')
72 | logger.error(
73 | "DAG '%s' - Task '%s' failed. Run ID: %s. Exception: %s",
74 | dag_run.dag_id,
75 | task_instance.task_id,
76 | dag_run.run_id,
77 | exception
78 | )
79 |
80 |
81 | @dag(
82 | dag_id='clickHouse_pyspark_dashboard',
83 | description='Extract data from ClickHouse, stream to minio, run spark analysis, report to dashboard.',
84 | schedule='* * * * *',
85 | start_date=datetime(2025, 8, 9, tzinfo=ZoneInfo('UTC')),
86 | default_args={
87 | 'retries': 1,
88 | 'retry_delay': timedelta(seconds=3),
89 | 'on_success_callback': on_success_callback_func,
90 | 'on_failure_callback': on_failure_callback_func,
91 | },
92 | max_active_runs=2,
93 | catchup=False,
94 | doc_md="""
95 | ### ETAR Pipeline
96 | 1. Extract the previous minute data from ClickHouse and stream it into MinIO.
97 | 2. Analyze the data with Spark.
98 | 3. Send the analysis result to the dashboard API.
99 | """,
100 | is_paused_upon_creation=False,
101 | fail_fast=True,
102 | )
103 | def etar_pipeline() -> None:
104 | """Extract-Transform-Analyze-Report Pipeline:
105 | 1- Stream the previous minute data from ClickHouse into MinIO as a Parquet file.
106 | 2- Trigger Spark analysis.
107 | 3- Report the result back to the dashboard.
108 | """
109 |
110 | @task
111 | def stream_from_clickhouse_to_minio(data_interval_start: datetime) -> str:
112 |         """Stream data from ClickHouse to MinIO and return the s3a path of the stored Parquet file; if no data was found, return the same path without the ".parquet" suffix.
113 |
114 | Args:
115 | data_interval_start: Task start time. Comes from Airflow.
116 |
117 | Returns:
118 |             s3a path of the Parquet file, or the extension-less s3a path when no data exists for that minute.
119 |
120 | Raises:
121 |             ClickHouseError: If a ClickHouse error occurs.
122 |             S3Error: If a MinIO error occurs.
123 | """
124 | ch_conn = BaseHook.get_connection(CLICKHOUSE_CONN_NAME)
125 | clickhouse_client = clickhouse_connect.get_client(
126 | host=ch_conn.host,
127 | port=ch_conn.port,
128 | user=ch_conn.login,
129 | password=ch_conn.password,
130 | database=ch_conn.schema,
131 | )
132 |
133 | minio_conn = BaseHook.get_connection(MINIO_CONN_NAME)
134 | s3_fs = fs.S3FileSystem(
135 | access_key=minio_conn.login,
136 | secret_key=minio_conn.password,
137 | endpoint_override=minio_conn.extra_dejson['host']
138 | )
139 |
140 | timestamp = data_interval_start.astimezone(ZoneInfo('Asia/Tehran')).replace(second=0, microsecond=0) - timedelta(minutes=1)
141 | timestamp_str = timestamp.strftime('%Y-%m-%d_%H-%M')
142 | parquet_path = f'{MINIO_BUCKET_NAME}/{timestamp_str}.parquet'
143 |
144 | table = os.environ['CLICKHOUSE_TABLE']
145 | query = 'SELECT event_type, status FROM %(table)s WHERE event_minute = %(timestamp)s;'
146 | total_rows = 0
147 | try:
148 | with (
149 | s3_fs.open_output_stream(path=parquet_path) as s3_stream,
150 | pq.ParquetWriter(where=s3_stream, schema=schema) as writer,
151 | clickhouse_client.query_df_stream(
152 | query=query,
153 | parameters={'table': table, 'timestamp': timestamp},
154 | settings={'max_block_size': 100000}
155 | ) as ch_stream
156 | ):
157 | for df_chunk in ch_stream:
158 | if df_chunk.empty:
159 | break
160 | total_rows += len(df_chunk)
161 |
162 |                 arrow_table = pa.Table.from_pandas(df=df_chunk, schema=schema, preserve_index=False)
163 |                 writer.write_table(table=arrow_table)
164 | except ClickHouseError:
165 | logger.exception('ClickHouse error occurred while streaming from ClickHouse to MinIO.')
166 | raise
167 | except S3Error:
168 |             logger.exception('MinIO error occurred while streaming from ClickHouse to MinIO.')
169 | raise
170 | except Exception:
171 |             logger.exception('Unexpected error occurred while streaming from ClickHouse to MinIO.')
172 | raise
173 | finally:
174 | clickhouse_client.close()
175 |
176 | if total_rows == 0:
177 | logger.warning('No data found for minute: %s.', timestamp_str)
178 | s3_fs.delete_file(parquet_path)
179 | return 's3a://' + parquet_path.replace('.parquet', '')
180 |
181 | logger.info('Successfully uploaded Parquet file to %s. Number of rows written: %d', parquet_path, total_rows)
182 |
183 | return 's3a://' + parquet_path
184 |
185 | file_path = stream_from_clickhouse_to_minio()
186 |
187 | spark_analysis = SparkSubmitOperator(
188 | task_id='spark_analysis',
189 | conn_id=SPARK_CONN_NAME,
190 | application=SPARK_APPLICATION_PATH,
191 | application_args=[file_path],
192 | deploy_mode='client',
193 | conf={
194 | 'spark.hadoop.fs.s3a.endpoint': f'{{{{ conn.{MINIO_CONN_NAME}.extra_dejson.get("host") }}}}',
195 | 'spark.hadoop.fs.s3a.access.key': f'{{{{ conn.{MINIO_CONN_NAME}.login }}}}',
196 | 'spark.hadoop.fs.s3a.secret.key': f'{{{{ conn.{MINIO_CONN_NAME}.password }}}}',
197 | 'spark.hadoop.fs.s3a.path.style.access': 'true',
198 | 'spark.hadoop.fs.s3a.impl': 'org.apache.hadoop.fs.s3a.S3AFileSystem',
199 | 'spark.hadoop.fs.s3a.connection.ssl.enabled': 'false',
200 | 'spark.eventLog.enabled': os.environ['SPARK_EVENT_LOG_ENABLED'],
201 | 'spark.eventLog.dir': '/opt/airflow/logs/spark',
202 | },
203 | driver_memory='512m',
204 | executor_memory='512m',
205 | executor_cores=2,
206 | num_executors=2,
207 | verbose=False
208 | )
209 |
210 | @task
211 | def send_to_dashboard(file_path: str) -> None:
212 | """Send analysis result to the dashboard api.
213 |
214 | Args:
215 | file_path: MinIO path for the analysis report.
216 |
217 | Raises:
218 | S3Error: If the file cannot be fetched from MinIO.
219 | JSONDecodeError: If the file contains invalid JSON.
220 | RequestException: If the dashboard API request fails.
221 | """
222 | if 'parquet' in file_path:
223 | file_path = file_path.replace('parquet', 'json')
224 | else:
225 | file_path += '.json'
226 |
227 | file_name = file_path.split(os.sep)[-1]
228 | minio_client = get_minio_client()
229 | minio_response = None
230 | try:
231 | minio_response = minio_client.get_object(bucket_name=MINIO_BUCKET_NAME, object_name=file_name)
232 | result = json.loads(minio_response.read())
233 | dashboard_response = requests.post(url=os.environ['DASHBOARD_API_URL'], json=result)
234 | dashboard_response.raise_for_status()
235 | except S3Error:
236 | logger.exception('Failed to fetch %s from MinIO', file_name)
237 | raise
238 | except json.JSONDecodeError:
239 | logger.exception('Invalid JSON payload in %s', file_name)
240 | raise
241 | except requests.RequestException:
242 | logger.exception('Dashboard API request failed for %s', file_name)
243 | raise
244 | except Exception:
245 |             logger.exception('An unexpected error occurred in send_to_dashboard')
246 | raise
247 | finally:
248 | if minio_response:
249 | minio_response.close()
250 | minio_response.release_conn()
251 |
252 | file_path >> spark_analysis >> send_to_dashboard(file_path=file_path)
253 |
254 |
255 | etar_pipeline()
256 |
--------------------------------------------------------------------------------
/airflow/dags/spark.py:
--------------------------------------------------------------------------------
1 | import io
2 | import json
3 | import logging
4 | import os
5 | import time
6 | import sys
7 | from typing import Any, Dict
8 |
9 | from minio import Minio
10 | from pyspark.sql import SparkSession
11 | from pyspark.sql import functions as F
12 |
13 |
14 | logger = logging.getLogger(__name__)
15 |
16 |
17 | def analyze_events(*, spark: SparkSession, file_path: str) -> Dict[str, Any]:
18 | """Read a Parquet file from S3, perform analysis and return results.
19 |
20 | Returns:
21 | Analysis result.
22 | """
23 | result = {}
24 | df = spark.read.parquet(file_path).cache()
25 | result['total_events'] = df.count()
26 |
27 | status_counts_df = (
28 | df.groupBy('event_type')
29 | .pivot('status', ['ERROR', 'SUCCESS'])
30 | .count()
31 | .fillna(0)
32 | ).orderBy('event_type')
33 | error_count = status_counts_df.select(F.sum('ERROR')).first()[0]
34 | result['total_errors'] = int(error_count) if error_count else 0
35 |
36 | event_type_stats = {}
37 | for row in status_counts_df.collect():
38 | event_type = row.asDict()['event_type']
39 | event_type_stats[event_type] = {
40 | 'SUCCESS': row.asDict().get('SUCCESS', 0),
41 | 'ERROR': row.asDict().get('ERROR', 0),
42 | }
43 | result['by_event_type'] = event_type_stats
44 |
45 | df.unpersist()
46 | return result
47 |
48 |
49 | def main() -> None:
50 | """Run the analysis on the given Parquet file path and save the result to MinIO."""
51 | spark = SparkSession.builder.appName('EventAnalysis').getOrCreate()
52 |
53 | if len(sys.argv) != 2:
54 |         logger.error('Error in calling spark.py. Usage: spark.py <file_path>')
55 | spark.stop()
56 | sys.exit(-1)
57 |
58 | bucket_name = os.environ['MINIO_BUCKET_NAME']
59 | minio_client = Minio(
60 | endpoint='minio:9000',
61 | access_key=os.environ['MINIO_ROOT_USER'],
62 | secret_key=os.environ['MINIO_ROOT_PASSWORD'],
63 | secure=False
64 | )
65 |
66 | file_path = sys.argv[1]
67 | file_name = file_path.split(os.sep)[-1]
68 |
69 | if 'parquet' not in file_name:
70 | logger.info('Empty file for spark: %s', file_name)
71 | analysis_result = json.dumps({'report': f'No data for {file_name}.'})
72 | file_name += '.json'
73 | spark.stop()
74 | else:
75 | start_time = time.time()
76 | analysis_result = {}
77 | try:
78 | analysis_result.update(analyze_events(spark=spark, file_path=file_path))
79 | except Exception as e:
80 | logger.exception('Analysis failed for %s', file_name)
81 | analysis_result['error'] = str(e)
82 | raise
83 | finally:
84 | spark.stop()
85 |
86 | file_name = file_name.replace('parquet', 'json')
87 | analysis_result['process_time'] = time.time() - start_time
88 | analysis_result['file_name'] = file_name
89 | analysis_result = {'report': analysis_result}
90 | analysis_result = json.dumps(analysis_result)
91 |
92 | minio_client.put_object(
93 | bucket_name=bucket_name,
94 | object_name=file_name,
95 | data=io.BytesIO(analysis_result.encode('utf-8')),
96 |         length=len(analysis_result.encode('utf-8'))
97 | )
98 | sys.exit(0)
99 |
100 |
101 | if __name__ == "__main__":
102 | main()
103 |
--------------------------------------------------------------------------------
/airflow/tests/common.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import os
3 | import pandas as pd
4 | import random
5 | import sys
6 | from datetime import datetime
7 | from uuid import uuid4
8 |
9 |
10 | CLICKHOUSE_HOST = os.environ['CLICKHOUSE_HOST']
11 | CLICKHOUSE_PORT = int(os.environ['CLICKHOUSE_PORT'])
12 | CLICKHOUSE_TABLE = os.environ['CLICKHOUSE_TABLE']
13 | CLICKHOUSE_USER = os.environ['CLICKHOUSE_USER']
14 | CLICKHOUSE_PASSWORD = os.environ['CLICKHOUSE_PASSWORD']
15 | CLICKHOUSE_DB = os.environ['CLICKHOUSE_DB']
16 |
17 | DASHBOARD_API_URL = os.environ['DASHBOARD_API_URL']
18 |
19 | EVENTS = ['ADD_TO_CART', 'CHECKOUT', 'PAYMENT', 'SEARCH', 'VIEW_PRODUCT']
20 |
21 | MINIO_BUCKET_NAME = os.environ['MINIO_BUCKET_NAME']
22 |
23 | REPORT_SAMPLE = {
24 | 'report': {
25 | 'total_events': 5805,
26 | 'total_errors': 1398,
27 | 'by_event_type': {
28 | 'ADD_TO_CART': {'SUCCESS': 876, 'ERROR': 292},
29 | 'CHECKOUT': {'SUCCESS': 846, 'ERROR': 289},
30 | 'PAYMENT': {'SUCCESS': 884, 'ERROR': 281},
31 | 'SEARCH': {'SUCCESS': 933, 'ERROR': 261},
32 | 'VIEW_PRODUCT': {'SUCCESS': 868, 'ERROR': 275}
33 | },
34 | 'process_time': 22.15983009338379,
35 | 'file_name': '2025-08-04_19-04.json'
36 | }
37 | }
38 |
39 | logger = logging.getLogger(__name__)
40 | logging.basicConfig(
41 | level=logging.INFO,
42 | format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
43 | stream=sys.stdout,
44 | )
45 |
46 |
47 | def insert_test_data(clickhouse_client, timestamp: datetime, num_rows: int = 10) -> pd.DataFrame:
48 | """Create test data, insert it into ClickHouse and return the DataFrame.
49 |
50 | Returns:
51 | Test data.
52 | """
53 | row_count = clickhouse_client.command(f'SELECT COUNT(*) FROM {CLICKHOUSE_TABLE}')
54 |     logger.info('ClickHouse table: %s, number of rows before data insertion: %d.', CLICKHOUSE_TABLE, row_count)
55 | logger.info('Inserting data to ClickHouse...')
56 |
57 | rows = []
58 | for _ in range(num_rows):
59 | error_probability = random.uniform(0, 0.5)
60 | has_error = random.random() < error_probability
61 | event_type = random.choice(EVENTS)
62 | row = {
63 | 'event_id': str(uuid4()),
64 | 'user_id': str(uuid4()),
65 | 'session_id': str(uuid4()),
66 | 'event_type': event_type,
67 | 'event_timestamp': timestamp,
68 | 'request_latency_ms': random.randint(50, 1500),
69 | 'status': 'ERROR' if has_error else 'SUCCESS',
70 | 'error_code': random.randint(400, 599) if has_error else None,
71 | 'product_id': random.randint(1, 10000) if event_type in {'VIEW_PRODUCT', 'ADD_TO_CART'} else None
72 | }
73 | rows.append(row)
74 |
75 | df = pd.DataFrame(rows)
76 | clickhouse_client.insert_df(CLICKHOUSE_TABLE, df)
77 |
78 | row_count = clickhouse_client.command(f'SELECT COUNT(*) FROM {CLICKHOUSE_TABLE}')
79 |     logger.info('ClickHouse table: %s, number of rows after data insertion: %d.', CLICKHOUSE_TABLE, row_count)
80 | return df
81 |
--------------------------------------------------------------------------------
/airflow/tests/conftest.py:
--------------------------------------------------------------------------------
1 | import logging
2 | from datetime import datetime
3 | from unittest.mock import Mock
4 | from zoneinfo import ZoneInfo
5 |
6 | import clickhouse_connect
7 | import pytest
8 | from minio import Minio
9 | from minio.deleteobjects import DeleteObject
10 | from minio.error import S3Error
11 |
12 | from pipeline import etar_pipeline, get_minio_client
13 | from common import CLICKHOUSE_HOST, CLICKHOUSE_PORT, CLICKHOUSE_TABLE, CLICKHOUSE_USER, CLICKHOUSE_PASSWORD, CLICKHOUSE_DB, MINIO_BUCKET_NAME
14 |
15 |
16 | logger = logging.getLogger(__name__)
17 |
18 |
19 | @pytest.fixture(scope='module')
20 | def dag():
21 | """Load the DAG instance."""
22 | return etar_pipeline()
23 |
24 |
25 | @pytest.fixture(scope='session')
26 | def minio_client() -> Minio:
27 | """Provide a real Minio client."""
28 | return get_minio_client()
29 |
30 |
31 | @pytest.fixture(scope='module')
32 | def report_func(dag):
33 | """Extract the send_to_dashboard task callable."""
34 | return dag.get_task('send_to_dashboard').python_callable
35 |
36 |
37 | @pytest.fixture(scope='module')
38 | def stream_func(dag):
39 | """Extract the stream_from_clickhouse_to_minio task callable."""
40 | return dag.get_task('stream_from_clickhouse_to_minio').python_callable
41 |
42 |
43 | @pytest.fixture
44 | def mock_ch_client(mocker):
45 | """Mock ClickHouse client."""
46 | mock_client = Mock()
47 | mocker.patch('pipeline.clickhouse_connect.get_client', return_value=mock_client)
48 | return mock_client
49 |
50 |
51 | @pytest.fixture
52 | def clickhouse_client():
53 | """Create a real ClickHouse client for integration testing."""
54 | client = clickhouse_connect.get_client(
55 | host=CLICKHOUSE_HOST,
56 | port=CLICKHOUSE_PORT,
57 | user=CLICKHOUSE_USER,
58 | password=CLICKHOUSE_PASSWORD,
59 | database=CLICKHOUSE_DB,
60 | )
61 | yield client
62 | client.close()
63 |
64 |
65 | @pytest.fixture
66 | def test_timestamp() -> datetime:
67 | """Generate a test timestamp for consistent testing."""
68 | return datetime(2025, 1, 1, 10, 30, tzinfo=ZoneInfo('UTC'))
69 |
70 |
71 | @pytest.fixture
72 | def delete_all_data(clickhouse_client, minio_client):
73 | """Clean up all test data before and after each test.
74 |
75 | This fixture does not need to be called inside the test.
76 | """
77 | logger.info('Calling `delete_all_data` before test.')
78 | # Before test
79 | _delete_all_data_clickhouse(clickhouse_client)
80 | _delete_all_data_minio(minio_client)
81 |
82 | yield
83 |
84 | # After test
85 | logger.info('Calling `delete_all_data` after test.')
86 | _delete_all_data_clickhouse(clickhouse_client)
87 | _delete_all_data_minio(minio_client)
88 |
89 |
90 | def _delete_all_data_clickhouse(client) -> None:
91 | """Remove all data from ClickHouse."""
92 | count_query = 'SELECT COUNT(*) FROM {table}'
93 | report = 'ClickHouse: db: {db}, table: {table}, number of rows {stage} truncating: {row_count}.'
94 |
95 | row_count = client.command(count_query.format(table=CLICKHOUSE_TABLE))
96 | logger.info(report.format(db=CLICKHOUSE_DB, table=CLICKHOUSE_TABLE, stage='before', row_count=row_count))
97 |
98 | client.command(f'TRUNCATE TABLE IF EXISTS {CLICKHOUSE_TABLE}')
99 |
100 | row_count = client.command(count_query.format(table=CLICKHOUSE_TABLE))
101 | logger.info(report.format(db=CLICKHOUSE_DB, table=CLICKHOUSE_TABLE, stage='after', row_count=row_count))
102 |
103 |
104 | def _delete_all_data_minio(client) -> None:
105 | """Remove all files from MinIO.
106 |
107 | Args:
108 | client: MinIO client.
109 |
110 | Raises:
111 |         S3Error: If one or more objects cannot be deleted from MinIO.
112 | """
113 | report = 'Minio: bucket: {bucket}, objects {stage} delete: {object_names}'
114 |
115 | objects_to_delete = client.list_objects(bucket_name=MINIO_BUCKET_NAME, recursive=True)
116 | object_names = [obj.object_name for obj in objects_to_delete]
117 | logger.info(report.format(bucket=MINIO_BUCKET_NAME, stage='before', object_names=object_names))
118 |
119 | if object_names:
120 | delete_object_list = [DeleteObject(name) for name in object_names]
121 | errors = client.remove_objects(bucket_name=MINIO_BUCKET_NAME, delete_object_list=delete_object_list)
122 |
123 | has_errors = False
124 | for error in errors:
125 | has_errors = True
126 | logger.error('Error occurred when trying to delete object %s from MinIO bucket %s.', error, MINIO_BUCKET_NAME)
127 |
128 | if has_errors:
129 |             raise S3Error(code='DeleteError', message='Failed to delete one or more objects from Minio. Check logs for details.', resource=MINIO_BUCKET_NAME, request_id=None, host_id=None, response=None)
130 |
131 | logger.info('Minio: bucket %s cleared.', MINIO_BUCKET_NAME)
132 | else:
133 | logger.info('Minio bucket %s was empty.', MINIO_BUCKET_NAME)
134 |
135 | objects_to_delete = client.list_objects(bucket_name=MINIO_BUCKET_NAME, recursive=True)
136 | object_names = [obj.object_name for obj in objects_to_delete]
137 | logger.info(report.format(bucket=MINIO_BUCKET_NAME, stage='after', object_names=object_names))
138 |
--------------------------------------------------------------------------------
/airflow/tests/test_callbacks.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from unittest.mock import Mock, patch
4 |
5 | from pipeline import on_success_callback_func, on_failure_callback_func
6 |
7 |
8 | def test_on_success_callback_func() -> None:
9 | """Test the success callback logs correctly."""
10 | mock_dag_run = Mock()
11 | dag_id = 'test_dag'
12 | run_id = 'test_run_123'
13 | task_id = 'test_task'
14 | mock_dag_run.dag_id = dag_id
15 | mock_dag_run.run_id = run_id
16 |
17 | mock_task_instance = Mock()
18 | mock_task_instance.task_id = task_id
19 |
20 | context = {
21 | 'dag_run': mock_dag_run,
22 | 'task_instance': mock_task_instance
23 | }
24 |
25 | with patch('pipeline.logger', autospec=True) as mock_logger:
26 | on_success_callback_func(context)
27 |
28 | mock_logger.info.assert_called_once_with(
29 | "DAG '%s' - Task '%s' succeeded. Run ID: %s",
30 | dag_id,
31 | task_id,
32 | run_id,
33 | )
34 |
35 |
36 | def test_on_failure_callback_func() -> None:
37 | """Test the failure callback logs correctly."""
38 | mock_dag_run = Mock()
39 | dag_id = 'test_dag'
40 | run_id = 'test_run_123'
41 | task_id = 'test_task'
42 | mock_dag_run.dag_id = dag_id
43 | mock_dag_run.run_id = run_id
44 |
45 | mock_task_instance = Mock()
46 | mock_task_instance.task_id = task_id
47 |
48 | test_exception = ValueError('Test error')
49 |
50 | context = {
51 | 'dag_run': mock_dag_run,
52 | 'task_instance': mock_task_instance,
53 | 'exception': test_exception
54 | }
55 |
56 | with patch('pipeline.logger', autospec=True) as mock_logger:
57 | on_failure_callback_func(context)
58 |
59 | mock_logger.error.assert_called_once_with(
60 | "DAG '%s' - Task '%s' failed. Run ID: %s. Exception: %s",
61 | dag_id,
62 | task_id,
63 | run_id,
64 | test_exception
65 | )
66 |
--------------------------------------------------------------------------------
/airflow/tests/test_clients.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import os
4 | from unittest.mock import MagicMock
5 |
6 | from airflow.hooks.base import BaseHook
7 | from airflow.models import Connection
8 |
9 | from pipeline import get_minio_client
10 |
11 |
12 | MINIO_ENDPOINT = 'minio:9000'
13 | MINIO_ROOT_USER = os.environ['MINIO_ROOT_USER']
14 | MINIO_ROOT_PASSWORD = os.environ['MINIO_ROOT_PASSWORD']
15 |
16 |
17 | def test_get_minio_client(mocker):
18 | """Test get_minio_client retrieves connection and creates MinIO client correctly."""
19 | mock_minio_conn = MagicMock(spec=Connection)
20 | mock_minio_conn.extra_dejson = {'host': MINIO_ENDPOINT}
21 | mock_minio_conn.login = MINIO_ROOT_USER
22 | mock_minio_conn.password = MINIO_ROOT_PASSWORD
23 | mocker.patch.object(BaseHook, 'get_connection', return_value=mock_minio_conn, autospec=True)
24 | mock_minio_class = mocker.patch('pipeline.Minio', autospec=True)
25 |
26 | client = get_minio_client()
27 |
28 | mock_minio_class.assert_called_once_with(
29 | endpoint=MINIO_ENDPOINT,
30 | access_key=MINIO_ROOT_USER,
31 | secret_key=MINIO_ROOT_PASSWORD,
32 | secure=False
33 | )
34 | assert client == mock_minio_class.return_value
35 |
--------------------------------------------------------------------------------
/airflow/tests/test_dag_integrity.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import pytest
4 | from airflow.models.dagbag import DagBag
5 |
6 |
7 | @pytest.fixture(scope='module')
8 | def dagbag():
9 | """Load the DAGBag."""
10 | return DagBag(dag_folder='.')
11 |
12 |
13 | def test_dag_imports_without_error(dagbag):
14 | errors = dagbag.import_errors
15 | assert len(errors) == 0, f'Found error(s) importing dagbag: {errors}'
16 |
17 |
18 | def test_dag_is_present(dagbag):
19 | """Verify 'clickHouse_pyspark_dashboard' dag is present."""
20 | dag_id = 'clickHouse_pyspark_dashboard'
21 | assert dag_id in dagbag.dags, f"'{dag_id}' NOT found in dagbag. Available dags are: {list(dagbag.dags.keys())}"
22 | dag = dagbag.dags.get(dag_id)
23 | assert dag is not None, f"'{dag_id}' NOT found in dagbag."
24 |
25 |
26 | def test_tasks_are_present(dagbag):
27 | """Verify 'clickHouse_pyspark_dashboard' tasks are present."""
28 | dag_id = 'clickHouse_pyspark_dashboard'
29 | dag = dagbag.dags.get(dag_id)
30 | tasks = {t.task_id for t in dag.tasks}
31 | assert tasks == {
32 | 'stream_from_clickhouse_to_minio',
33 | 'spark_analysis',
34 | 'send_to_dashboard',
35 | }, f"Found these tasks for '{dag_id}': [{tasks}]"
36 |
37 |
38 | def test_upstream_and_downstream_relations(dagbag):
39 | """Verify the structure of the 'clickHouse_pyspark_dashboard' dag."""
40 | dag = dagbag.dags.get('clickHouse_pyspark_dashboard')
41 | stream_task = dag.get_task('stream_from_clickhouse_to_minio')
42 | spark_task = dag.get_task('spark_analysis')
43 | dashboard_task = dag.get_task('send_to_dashboard')
44 |
45 | assert len(stream_task.upstream_task_ids) == 0
46 | assert len(dashboard_task.downstream_task_ids) == 0
47 | assert spark_task.task_id in stream_task.downstream_task_ids, "The spark task is NOT correctly set as the stream's downstream task."
48 | assert dashboard_task.task_id in spark_task.downstream_task_ids, "The dashboard task is NOT correctly set as the spark's downstream task."
49 | assert stream_task.task_id in spark_task.upstream_task_ids, "The stream task is NOT correctly set as the spark's upstream task."
50 | assert spark_task.task_id in dashboard_task.upstream_task_ids, "The spark task is NOT correctly set as the dashboard's upstream task."
51 |
--------------------------------------------------------------------------------
/airflow/tests/test_e2e.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import json
4 | from datetime import datetime, timedelta
5 | from zoneinfo import ZoneInfo
6 |
7 | from common import MINIO_BUCKET_NAME, insert_test_data
8 |
9 |
10 | def test_e2e_with_data(dag, clickhouse_client, minio_client, delete_all_data):
11 | """Test end-to-end with data."""
12 | num_rows = 5
13 | test_timestamp = datetime.now(tz=ZoneInfo('UTC')) - timedelta(minutes=1)
14 | insert_test_data(clickhouse_client, test_timestamp, num_rows=num_rows)
15 |
16 | dag.test(logical_date=test_timestamp + timedelta(minutes=1))
17 |
18 | timestamp_str = test_timestamp.astimezone(ZoneInfo('Asia/Tehran')).strftime('%Y-%m-%d_%H-%M')
19 | file_name = f'{timestamp_str}.json'
20 | minio_response = minio_client.get_object(bucket_name=MINIO_BUCKET_NAME, object_name=file_name)
21 | report = json.loads(minio_response.read())
22 | assert isinstance(report['report'], dict)
23 | assert report['report']['total_events'] == num_rows
24 |
25 |
26 | def test_e2e_without_data(dag, minio_client, delete_all_data):
27 |     """Test end-to-end without data."""
28 | test_timestamp = datetime.now(tz=ZoneInfo('UTC')) - timedelta(minutes=1)
29 |
30 | dag.test(logical_date=test_timestamp + timedelta(minutes=1))
31 |
32 | timestamp_str = test_timestamp.astimezone(ZoneInfo('Asia/Tehran')).strftime('%Y-%m-%d_%H-%M')
33 | file_name = f'{timestamp_str}.json'
34 | minio_response = minio_client.get_object(bucket_name=MINIO_BUCKET_NAME, object_name=file_name)
35 | report = json.loads(minio_response.read())
36 | assert report['report'] == f'No data for {timestamp_str}.'
37 |
--------------------------------------------------------------------------------
/airflow/tests/test_integration_report.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import io
4 | import json
5 | import os
6 |
7 | import pytest
8 | import requests
9 | from minio.error import S3Error
10 |
11 | from common import DASHBOARD_API_URL, MINIO_BUCKET_NAME, REPORT_SAMPLE
12 |
13 |
14 | @pytest.fixture(autouse=True)
15 | def test_setup_teardown(minio_client):
16 | """Clean the api storage before and after each test.
17 |
18 | Raises:
19 |         S3Error: If an object cannot be removed from MinIO.
20 | """
21 | delete_url = f'{DASHBOARD_API_URL}/report'
22 | requests.delete(delete_url)
23 |
24 | yield
25 |
26 | requests.delete(delete_url)
27 |
28 | try:
29 | objects = minio_client.list_objects(MINIO_BUCKET_NAME, recursive=True)
30 | for obj in objects:
31 | minio_client.remove_object(MINIO_BUCKET_NAME, obj.object_name)
32 | except S3Error as e:
33 | print(f'Could not clean up MinIO bucket: {e}')
34 | raise
35 |
36 |
37 | def test_integration_dashboard_success(report_func, minio_client):
38 | """Test that a valid JSON report is read from MinIO and sent to the dashboard API."""
39 | report_json = json.dumps(REPORT_SAMPLE)
40 | object_name = '2025-08-10_12-00.json'
41 |
42 | minio_client.put_object(
43 | bucket_name=MINIO_BUCKET_NAME,
44 | object_name=object_name,
45 | data=io.BytesIO(report_json.encode('utf-8')),
46 | length=len(report_json)
47 | )
48 |
49 | file_path = f's3a://{MINIO_BUCKET_NAME}/{object_name.replace("json", "parquet")}'
50 | report_func(file_path=file_path)
51 |
52 | response = requests.get(DASHBOARD_API_URL)
53 | response.raise_for_status()
54 |
55 | received_report = response.json()
56 | assert received_report == REPORT_SAMPLE
57 |
58 |
59 | def test_integration_dashboard_invalid_filename_failure(report_func):
60 | """Test that an invalid file path causes S3Error."""
61 | object_name = 'invalid_filename.json'
62 |
63 | file_path = f's3a://{MINIO_BUCKET_NAME}/{object_name.replace("json", "parquet")}'
64 |
65 | with pytest.raises(S3Error) as exc_info:
66 | report_func(file_path=file_path)
67 |
68 | assert exc_info.value.code == 'NoSuchKey'
69 |
70 |
71 | def test_integration_dashboard_invalid_json_failure(report_func, minio_client):
72 | """Test that an invalid json file causes JSONDecodeError."""
73 | report_json = "{'bad dict': {'total_events': }}"
74 | object_name = '2025-08-10_12-00.json'
75 |
76 | minio_client.put_object(
77 | bucket_name=MINIO_BUCKET_NAME,
78 | object_name=object_name,
79 | data=io.BytesIO(report_json.encode('utf-8')),
80 | length=len(report_json)
81 | )
82 |
83 | file_path = f's3a://{MINIO_BUCKET_NAME}/{object_name.replace("json", "parquet")}'
84 |
85 | with pytest.raises(json.JSONDecodeError):
86 | report_func(file_path=file_path)
87 |
--------------------------------------------------------------------------------
/airflow/tests/test_integration_stream.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import tempfile
4 | from datetime import datetime, timedelta
5 | from uuid import uuid4
6 | from zoneinfo import ZoneInfo
7 |
8 | import pandas as pd
9 | import pyarrow.parquet as pq
10 | import pytest
11 | from minio.error import S3Error
12 |
13 | from common import insert_test_data, CLICKHOUSE_TABLE, MINIO_BUCKET_NAME
14 | from pipeline import schema
15 |
16 |
17 | def test_integration_stream_with_data(stream_func, clickhouse_client, minio_client, test_timestamp, delete_all_data):
18 | """Test streaming data from ClickHouse to MinIO with real services."""
19 | num_rows = 5
20 | insert_test_data(clickhouse_client, test_timestamp, num_rows=num_rows)
21 |
22 | data_interval_start = test_timestamp + timedelta(minutes=1)
23 | result = stream_func(data_interval_start=data_interval_start)
24 |
25 | timestamp_str = test_timestamp.astimezone(ZoneInfo('Asia/Tehran')).strftime('%Y-%m-%d_%H-%M')
26 | expected_path = f's3a://{MINIO_BUCKET_NAME}/{timestamp_str}.parquet'
27 | assert result == expected_path
28 |
29 | object_name = f'{timestamp_str}.parquet'
30 | try:
31 | stat = minio_client.stat_object(MINIO_BUCKET_NAME, object_name)
32 | assert stat.size > 0
33 | except S3Error:
34 | pytest.fail(f"Expected object {object_name} not found in MinIO")
35 |
36 | with tempfile.NamedTemporaryFile(suffix='.parquet') as tmp:
37 | minio_client.fget_object(MINIO_BUCKET_NAME, object_name, tmp.name)
38 |
39 | table = pq.read_table(tmp.name)
40 | assert table.num_rows == num_rows
41 | assert table.schema == schema
42 |
43 |
44 | def test_integration_stream_no_data(stream_func, minio_client, test_timestamp, delete_all_data):
45 | """Test streaming when no data exists in ClickHouse."""
46 | data_interval_start = test_timestamp + timedelta(minutes=1)
47 | result = stream_func(data_interval_start=data_interval_start)
48 |
49 | timestamp_str = test_timestamp.astimezone(ZoneInfo('Asia/Tehran')).strftime('%Y-%m-%d_%H-%M')
50 | expected_path = f's3a://{MINIO_BUCKET_NAME}/{timestamp_str}'
51 | assert result == expected_path
52 |
53 | object_name = f'{timestamp_str}.parquet'
54 | with pytest.raises(S3Error) as exc_info:
55 | minio_client.stat_object(MINIO_BUCKET_NAME, object_name)
56 |
57 | assert exc_info.value.code == 'NoSuchKey'
58 |
59 |
60 | def test_integration_stream_large_dataset(stream_func, clickhouse_client, minio_client, test_timestamp, delete_all_data):
61 | """Test streaming with a larger dataset to verify chunking works correctly."""
62 | insert_test_data(clickhouse_client, test_timestamp, num_rows=1000)
63 |
64 | data_interval_start = test_timestamp + timedelta(minutes=1)
65 | stream_func(data_interval_start=data_interval_start)
66 |
67 | timestamp_str = test_timestamp.astimezone(ZoneInfo('Asia/Tehran')).strftime('%Y-%m-%d_%H-%M')
68 | object_name = f'{timestamp_str}.parquet'
69 |
70 | with tempfile.NamedTemporaryFile(suffix='.parquet') as tmp:
71 | minio_client.fget_object(MINIO_BUCKET_NAME, object_name, tmp.name)
72 |
73 | table = pq.read_table(tmp.name)
74 | assert table.num_rows == 1000
75 |
76 | df_result = table.to_pandas()
77 | assert df_result['event_type'].dtype == 'object'
78 | assert df_result['status'].dtype == 'object'
79 |
80 |
81 | def test_integration_stream_data_transformation(stream_func, clickhouse_client, minio_client, test_timestamp, delete_all_data):
82 | """Test that data transformations are applied correctly in the real pipeline."""
83 | timestamp_with_microseconds = test_timestamp.replace(microsecond=123456)
84 |
85 | latency = 100
86 | product_id = 9900
87 | event_type = 'VIEW_PRODUCT'
88 | status = 'SUCCESS'
89 | test_data = pd.DataFrame([{
90 | 'event_id': str(uuid4()),
91 | 'user_id': str(uuid4()),
92 | 'session_id': str(uuid4()),
93 | 'event_type': event_type,
94 | 'event_timestamp': timestamp_with_microseconds,
95 | 'request_latency_ms': latency,
96 | 'status': status,
97 | 'error_code': None,
98 | 'product_id': product_id,
99 | }])
100 |
101 | clickhouse_client.insert_df(CLICKHOUSE_TABLE, test_data)
102 |
103 | data_interval_start = test_timestamp + timedelta(minutes=1)
104 | stream_func(data_interval_start=data_interval_start)
105 |
106 | timestamp_str = test_timestamp.astimezone(ZoneInfo('Asia/Tehran')).strftime('%Y-%m-%d_%H-%M')
107 | object_name = f'{timestamp_str}.parquet'
108 |
109 | with tempfile.NamedTemporaryFile(suffix='.parquet') as tmp:
110 | minio_client.fget_object(MINIO_BUCKET_NAME, object_name, tmp.name)
111 |
112 | table = pq.read_table(tmp.name)
113 | df_result = table.to_pandas()
114 |
115 | assert isinstance(df_result['event_type'].iloc[0], str)
116 | assert isinstance(df_result['status'].iloc[0], str)
117 |
118 | assert df_result['event_type'].iloc[0] == event_type
119 | assert df_result['status'].iloc[0] == status
120 |
121 |
122 | def test_integration_timezone_handling(stream_func, clickhouse_client, minio_client, delete_all_data):
123 | """Test that timezone conversions are handled correctly."""
124 | utc_timestamp = datetime(2025, 1, 2, 6, 30, 0, tzinfo=ZoneInfo('UTC'))
125 | tehran_timestamp = utc_timestamp.astimezone(ZoneInfo('Asia/Tehran'))
126 |
127 | insert_test_data(clickhouse_client, tehran_timestamp, num_rows=1)
128 |
129 | data_interval_start = utc_timestamp + timedelta(minutes=1)
130 | result = stream_func(data_interval_start=data_interval_start)
131 |
132 | expected_filename = tehran_timestamp.strftime('%Y-%m-%d_%H-%M')
133 | assert expected_filename in result
134 |
135 | object_name = f'{expected_filename}.parquet'
136 | try:
137 | stat = minio_client.stat_object(MINIO_BUCKET_NAME, object_name)
138 | assert stat.size > 0
139 | except S3Error:
140 | pytest.fail(f"Expected object {object_name} not found in MinIO")
141 |
--------------------------------------------------------------------------------
/airflow/tests/test_unit_report.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import json
4 | from unittest.mock import Mock
5 | from urllib3.response import HTTPResponse
6 |
7 | import pytest
8 | from minio import Minio
9 |
10 | from common import DASHBOARD_API_URL, MINIO_BUCKET_NAME, REPORT_SAMPLE
11 |
12 |
13 | @pytest.fixture
14 | def mock_minio_response(mocker):
15 | """Mock minio client and its `get_object` response."""
16 | mock_response = Mock(spec=HTTPResponse)
17 | mock_response.read.return_value = json.dumps(REPORT_SAMPLE)
18 |
19 | mock_minio = Mock(spec=Minio)
20 | mock_minio.get_object.return_value = mock_response
21 |
22 | mocker.patch('pipeline.get_minio_client', return_value=mock_minio, autospec=True)
23 | return mock_minio, mock_response
24 |
25 |
26 | @pytest.fixture
27 | def mock_request_post(mocker):
28 | """Mock `requests.post`."""
29 | mock_post = mocker.patch('pipeline.requests.post', autospec=True)
30 | return mock_post
31 |
32 |
33 | def test_send_to_dashboard_with_parquet_path(report_func, mock_minio_response, mock_request_post):
34 | """Test send_to_dashboard handles path with '.parquet', fetches JSON, and sends to dashboard API."""
35 | mock_minio, mock_response = mock_minio_response
36 | filename = '2025-08-10_12-00'
37 | report_func(f's3a://{MINIO_BUCKET_NAME}/{filename}.parquet')
38 |
39 | mock_minio.get_object.assert_called_once_with(
40 | bucket_name=MINIO_BUCKET_NAME,
41 | object_name=f'{filename}.json'
42 | )
43 |
44 | mock_request_post.assert_called_once_with(
45 | url=DASHBOARD_API_URL,
46 | json=REPORT_SAMPLE
47 | )
48 |
49 | mock_response.close.assert_called_once()
50 | mock_response.release_conn.assert_called_once()
51 |
52 |
53 | def test_send_to_dashboard_without_parquet_path(report_func, mock_minio_response, mock_request_post):
54 | """Test send_to_dashboard handles path without '.parquet' by appending '.json'."""
55 | mock_minio, mock_response = mock_minio_response
56 |
57 | filename = '2025-08-10_12-00'
58 | report_func(f's3a://{MINIO_BUCKET_NAME}/{filename}')
59 |
60 | mock_minio.get_object.assert_called_once_with(
61 | bucket_name=MINIO_BUCKET_NAME,
62 | object_name=f'{filename}.json'
63 | )
64 |
65 | mock_request_post.assert_called_once_with(
66 | url=DASHBOARD_API_URL,
67 | json=REPORT_SAMPLE
68 | )
69 | mock_response.close.assert_called_once()
70 | mock_response.release_conn.assert_called_once()
71 |
72 |
73 | def test_send_to_dashboard_exception(mocker, report_func):
74 | """Test send_to_dashboard raises exception on failure and cleans up."""
75 | mock_response = Mock()
76 | err_msg = 'Read failed'
77 | mock_response.read.side_effect = ValueError(err_msg)
78 | mock_minio = Mock()
79 | mock_minio.get_object.return_value = mock_response
80 |
81 | mocker.patch('pipeline.get_minio_client', return_value=mock_minio, autospec=True)
82 |
83 | with pytest.raises(ValueError, match=err_msg):
84 | report_func(f's3a://{MINIO_BUCKET_NAME}/2025-08-10_12-00.parquet')
85 |
86 | mock_response.close.assert_called_once()
87 | mock_response.release_conn.assert_called_once()
88 |
89 |
90 | def test_send_to_dashboard_s3_error(mocker, report_func):
91 | """Test send_to_dashboard handles S3Error correctly."""
92 | from minio.error import S3Error
93 |
94 | mock_minio = Mock(spec=Minio)
95 | filename = '2025-08-10_12-00'
96 | code = 'NoSuchKey'
97 | resource = f'{filename}.json'
98 | mock_minio.get_object.side_effect = S3Error(
99 | code=code,
100 | message='The specified key does not exist.',
101 | resource=resource,
102 | request_id='test-request-id',
103 | host_id='test-host-id',
104 | response='test-response'
105 | )
106 |
107 | mocker.patch('pipeline.get_minio_client', return_value=mock_minio)
108 |
109 | with pytest.raises(S3Error) as exc_info:
110 | report_func(f's3a://{MINIO_BUCKET_NAME}/{filename}.parquet')
111 |
112 | assert exc_info.value.code == 'NoSuchKey'
113 | assert exc_info.value._resource == resource
114 |
115 |
116 | def test_send_to_dashboard_json_decode_error(report_func, mock_minio_response):
117 | """Test send_to_dashboard handles JSONDecodeError correctly."""
118 | mock_minio, mock_response = mock_minio_response
119 | mock_response.read.return_value = 'invalid json'
120 |
121 |     with pytest.raises(json.JSONDecodeError):  # exc_info was not very informative here
122 | report_func(f's3a://{MINIO_BUCKET_NAME}/2025-08-10_12-00.parquet')
123 |
124 | mock_response.close.assert_called_once()
125 | mock_response.release_conn.assert_called_once()
126 |
127 |
128 | def test_send_to_dashboard_request_exception(report_func, mock_minio_response, mock_request_post):
129 | """Test send_to_dashboard handles RequestException correctly."""
130 | import requests
131 |
132 | mock_minio, mock_response = mock_minio_response
133 | mock_request_post.side_effect = requests.RequestException('Connection failed')
134 |
135 | with pytest.raises(requests.RequestException):
136 | report_func(f's3a://{MINIO_BUCKET_NAME}/2025-08-10_12-00.parquet')
137 |
138 | mock_response.close.assert_called_once()
139 | mock_response.release_conn.assert_called_once()
140 |
--------------------------------------------------------------------------------
/airflow/tests/test_unit_spark.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import os
4 |
5 |
6 | def test_spark_task_configuration(dag):
7 | """Test that the Spark task is properly configured."""
8 | spark_task = dag.get_task('spark_analysis')
9 |
10 | assert spark_task._conn_id == os.environ['SPARK_CONN_NAME']
11 | assert spark_task._deploy_mode == 'client'
12 | assert spark_task._driver_memory == '512m'
13 | assert spark_task._executor_memory == '512m'
14 | assert spark_task._executor_cores == 2
15 | assert spark_task._num_executors == 2
16 |
17 | expected_conf_keys = {
18 | 'spark.hadoop.fs.s3a.endpoint',
19 | 'spark.hadoop.fs.s3a.access.key',
20 | 'spark.hadoop.fs.s3a.secret.key',
21 | 'spark.hadoop.fs.s3a.path.style.access',
22 | 'spark.hadoop.fs.s3a.impl',
23 | 'spark.hadoop.fs.s3a.connection.ssl.enabled',
24 | 'spark.eventLog.enabled',
25 | 'spark.eventLog.dir'
26 | }
27 |
28 | actual_conf_keys = set(spark_task.conf.keys())
29 | assert expected_conf_keys.issubset(actual_conf_keys)
30 |
--------------------------------------------------------------------------------
/airflow/tests/test_unit_stream.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import os
4 | from datetime import datetime, timedelta
5 | from unittest.mock import Mock, MagicMock
6 | from zoneinfo import ZoneInfo
7 |
8 | import pandas as pd
9 | import pyarrow as pa
10 | import pytest
11 | from airflow.sdk import Connection
12 | from clickhouse_connect.driver.client import Client
13 | from pyarrow.fs import S3FileSystem
14 |
15 | from common import CLICKHOUSE_HOST, CLICKHOUSE_PORT, CLICKHOUSE_USER, CLICKHOUSE_PASSWORD, CLICKHOUSE_DB, MINIO_BUCKET_NAME
16 | from pipeline import schema
17 |
18 |
19 | df_chunk = pd.DataFrame({
20 | 'event_type': ['VIEW_PRODUCT'],
21 | 'status': ['SUCCESS'],
22 | })
23 |
24 |
25 | @pytest.fixture
26 | def mock_ch_client(mocker) -> Mock:
27 | """Mock ClickHouse client."""
28 | mock_client = Mock(spec=Client)
29 | mocker.patch('pipeline.clickhouse_connect.get_client', return_value=mock_client)
30 | return mock_client
31 |
32 |
33 | @pytest.fixture
34 | def mock_connections(mocker):
35 | """Mock ClickHouse and MinIO connections and patch `get_connection`."""
36 | mock_ch_conn = Mock(spec=Connection)
37 | mock_ch_conn.host = CLICKHOUSE_HOST
38 | mock_ch_conn.port = CLICKHOUSE_PORT
39 | mock_ch_conn.login = CLICKHOUSE_USER
40 | mock_ch_conn.password = CLICKHOUSE_PASSWORD
41 | mock_ch_conn.schema = CLICKHOUSE_DB
42 |
43 | mock_minio_conn = Mock(spec=Connection)
44 | mock_minio_conn.login = os.environ['MINIO_ROOT_USER']
45 | mock_minio_conn.password = os.environ['MINIO_ROOT_PASSWORD']
46 | mock_minio_conn.extra_dejson = {'host': 'http://minio:9000'}
47 |
48 | def get_connection_side_effect(conn_name: str):
49 | if conn_name == os.environ['CLICKHOUSE_CONN_NAME']:
50 | return mock_ch_conn
51 |
52 | if conn_name == os.environ['MINIO_CONN_NAME']:
53 | return mock_minio_conn
54 |
55 | msg = f'Unknown connection name: {conn_name}'
56 | raise ValueError(msg)
57 |
58 | mocker.patch('pipeline.BaseHook.get_connection', side_effect=get_connection_side_effect)
59 | return mock_ch_conn, mock_minio_conn
60 |
61 |
62 | @pytest.fixture
63 | def mock_s3_fs(mocker):
64 | """Mock S3 Filesystem and its stream operation."""
65 | mock_fs = MagicMock(spec=S3FileSystem)
66 | mock_s3_stream = Mock()
67 | mock_fs.open_output_stream.return_value.__enter__.return_value = mock_s3_stream
68 | mocker.patch('pipeline.fs.S3FileSystem', return_value=mock_fs)
69 | return mock_fs, mock_s3_stream
70 |
71 |
72 | @pytest.fixture
73 | def mock_parquet_writer(mocker):
74 | """Mock Parquet Writer."""
75 | mock_writer_instance = Mock()
76 | mock_context_manager = MagicMock()
77 | mock_context_manager.__enter__.return_value = mock_writer_instance
78 | mock_writer_class = mocker.patch('pipeline.pq.ParquetWriter', return_value=mock_context_manager)
79 | return mock_writer_class, mock_writer_instance
80 |
81 |
82 | def create_mock_df_stream(dataframes):
83 | """Helper to create a mock stream that yields dataframes."""
84 | mock_stream = MagicMock()
85 | mock_stream.__enter__.return_value = iter(dataframes)
86 | return mock_stream
87 |
88 |
89 | def test_stream_from_clickhouse_to_minio_with_data(stream_func, mock_ch_client, mock_s3_fs, mock_parquet_writer, mock_connections):
90 | """Test stream_from_clickhouse_to_minio handles data streaming and MinIO upload."""
91 | date_time = datetime(2025, 8, 10, 13, 5, tzinfo=ZoneInfo('UTC'))
92 | mock_ch_client.query_df_stream.return_value = create_mock_df_stream([df_chunk])
93 | data_interval_start = date_time + timedelta(minutes=1)
94 | date_time_str = date_time.astimezone(ZoneInfo('Asia/Tehran')).strftime('%Y-%m-%d_%H-%M')
95 | parquet_path = f's3a://{MINIO_BUCKET_NAME}/{date_time_str}.parquet'
96 | mock_s3, mock_stream = mock_s3_fs
97 | mock_writer_class, mock_writer_instance = mock_parquet_writer
98 |
99 | result = stream_func(data_interval_start=data_interval_start)
100 |
101 | mock_ch_client.query_df_stream.assert_called_once_with(
102 | query='SELECT event_type, status FROM %(table)s WHERE event_minute = %(timestamp)s;',
103 | parameters={'table': os.environ['CLICKHOUSE_TABLE'], 'timestamp': date_time},
104 | settings={'max_block_size': 100000}
105 | )
106 | mock_writer_class.assert_called_once_with(where=mock_stream, schema=schema)
107 | mock_writer_instance.write_table.assert_called_once()
108 | written_table = mock_writer_instance.write_table.call_args[1]['table']
109 | assert written_table.schema == schema
110 | mock_s3.open_output_stream.assert_called_once_with(path=parquet_path.replace('s3a://', ''))
111 | assert result == parquet_path
112 |
113 |
114 | def test_stream_from_clickhouse_to_minio_no_data(mocker, stream_func, mock_parquet_writer, mock_ch_client, mock_s3_fs, mock_connections):
115 | """Test stream_from_clickhouse_to_minio handles no data case without upload."""
116 | mock_ch_client.query_df_stream.return_value = create_mock_df_stream([])
117 | data_interval_start = datetime(2025, 8, 10, 8, 31, 0, tzinfo=ZoneInfo('UTC'))
118 | filename = (data_interval_start.astimezone(ZoneInfo('Asia/Tehran')) - timedelta(minutes=1)).strftime("%Y-%m-%d_%H-%M")
119 | mock_table = Mock()
120 | mocker.patch('pipeline.pa.Table', return_value=mock_table)
121 | mock_s3, mock_stream = mock_s3_fs
122 | mock_writer_class, mock_writer_instance = mock_parquet_writer
123 |
124 | result = stream_func(data_interval_start=data_interval_start)
125 |
126 | assert result == f's3a://{MINIO_BUCKET_NAME}/{filename}'
127 | mock_writer_class.assert_called_once_with(where=mock_stream, schema=schema)
128 | mock_table.from_pandas.assert_not_called()
129 | mock_writer_instance.write_table.assert_not_called()
130 | mock_s3.delete_file.assert_called_once_with(f'{MINIO_BUCKET_NAME}/{filename}.parquet')
131 | mock_ch_client.close.assert_called_once()
132 |
133 |
134 | def test_stream_from_clickhouse_to_minio_exception(stream_func, mock_ch_client):
135 | """Test stream_from_clickhouse_to_minio raises exception on failure."""
136 | mock_ch_client.query_df_stream.side_effect = ValueError('Query failed')
137 | data_interval_start = datetime(2025, 8, 10, 8, 31, 0, tzinfo=ZoneInfo('UTC'))
138 |
139 | with pytest.raises(ValueError, match='Query failed'):
140 | stream_func(data_interval_start)
141 |
142 |
143 | def test_error_propagation(mocker, stream_func) -> None:
144 | """Test that Exception error is propagated."""
145 | err_msg = 'Connection not found'
146 | mocker.patch('pipeline.BaseHook.get_connection', side_effect=Exception(err_msg))
147 |
148 | with pytest.raises(Exception) as exc_info:
149 | stream_func(data_interval_start=datetime.now(ZoneInfo('Asia/Tehran')))
150 |
151 | assert err_msg in str(exc_info.value)
152 |
153 |
154 | def test_data_transformation_in_stream(stream_func, mock_ch_client, mock_parquet_writer, mock_s3_fs, mock_connections):
155 | """Test that data transformations are applied correctly."""
156 | date_time = datetime(2025, 8, 10, 12, 0, tzinfo=ZoneInfo('UTC'))
157 | chunk1 = df_chunk.copy()
158 | chunk2 = df_chunk.copy()
159 | chunk2['status'] = ['ERROR']
160 | mock_ch_client.query_df_stream.return_value = create_mock_df_stream([chunk1, chunk2])
161 | _, mock_writer_instance = mock_parquet_writer
162 | written_tables = []
163 | mock_writer_instance.write_table.side_effect = lambda table: written_tables.append(table)
164 | mock_s3, _ = mock_s3_fs
165 | expected_filename = date_time.astimezone(ZoneInfo('Asia/Tehran')).strftime('%Y-%m-%d_%H-%M')
166 |
167 | result = stream_func(data_interval_start=date_time + timedelta(minutes=1))
168 |
169 | mock_s3.open_output_stream.assert_called_once_with(path=f'{MINIO_BUCKET_NAME}/{expected_filename}.parquet')
170 | assert result == f's3a://{MINIO_BUCKET_NAME}/{expected_filename}.parquet'
171 | assert len(written_tables) == 2
172 | assert mock_writer_instance.write_table.call_count == 2
173 | for written_table, chunk in zip(written_tables, [chunk1, chunk2]):
174 | assert written_table.column('status').type == pa.string()
175 | assert written_table.column('event_type').to_pylist()[0] == chunk['event_type'].iloc[0]
176 | assert written_table.column('status').to_pylist()[0] == chunk['status'].iloc[0]
177 | assert written_table.schema == schema
178 |
179 |
180 | def test_stream_from_clickhouse_to_minio_empty_chunk(stream_func, mock_ch_client, mock_parquet_writer, mock_s3_fs, mock_connections):
181 | """Test stream_from_clickhouse_to_minio handles empty DataFrame correctly."""
182 | date_time = datetime(2025, 8, 10, 12, 1, tzinfo=ZoneInfo('UTC'))
183 | empty_df = pd.DataFrame(columns=df_chunk.columns)
184 | mock_ch_client.query_df_stream.return_value = create_mock_df_stream([empty_df])
185 | data_interval_start = date_time + timedelta(minutes=1)
186 | date_time_str = date_time.astimezone(ZoneInfo('Asia/Tehran')).strftime('%Y-%m-%d_%H-%M')
187 | mock_s3, mock_stream = mock_s3_fs
188 | mock_writer_class, mock_writer_instance = mock_parquet_writer
189 |
190 | result = stream_func(data_interval_start=data_interval_start)
191 |
192 | mock_writer_class.assert_called_once_with(where=mock_stream, schema=schema)
193 | mock_writer_instance.write_table.assert_not_called()
194 | mock_s3.delete_file.assert_called_once_with(f'{MINIO_BUCKET_NAME}/{date_time_str}.parquet')
195 | assert result == f's3a://{MINIO_BUCKET_NAME}/{date_time_str}'
196 |
--------------------------------------------------------------------------------
/dashboard/Dockerfile-Dashboard-api:
--------------------------------------------------------------------------------
1 | FROM python:3.11.13-bookworm
2 |
3 | RUN pip install fastapi==0.116.1 pydantic==2.11.7 uvicorn[standard]==0.35.0
4 |
--------------------------------------------------------------------------------
/dashboard/Dockerfile-Dashboard-api-test:
--------------------------------------------------------------------------------
1 | FROM lp/dashboard-api:latest
2 |
3 | RUN pip install pytest==8.4.1 httpx==0.28.1 requests==2.32.4
4 |
--------------------------------------------------------------------------------
/dashboard/Dockerfile-Dashboard-ui:
--------------------------------------------------------------------------------
1 | FROM python:3.11.13-bookworm
2 |
3 | RUN pip install matplotlib==3.10.5 streamlit==1.47.1
4 |
--------------------------------------------------------------------------------
/dashboard/Dockerfile-Dashboard-ui-test:
--------------------------------------------------------------------------------
1 | FROM lp/dashboard-ui:latest
2 |
3 | RUN pip install pytest==8.4.1 pytest-mock==3.14.1
4 |
--------------------------------------------------------------------------------
/dashboard/dashboard_api.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 | import logging
3 | from collections import deque
4 | from typing import Any
5 |
6 | from fastapi import FastAPI, HTTPException, status
7 | from pydantic import BaseModel
8 |
9 |
10 | logger = logging.getLogger(__name__)
11 |
12 | app = FastAPI(title='Dashboard API')
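   | # maxlen=1: only the most recent report is kept; posting a new one silently replaces the previous one.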
13 | storage = deque(maxlen=1)
14 | NO_REPORT_STORED = 'No report stored.'
15 |
16 |
17 | class AnalysisReport(BaseModel):
18 | """Incoming analysis report from Airflow."""
19 |
20 | report: dict[str, Any] | str
21 |
22 |
23 | @app.post('/report')
24 | async def receive_report(report: AnalysisReport) -> None:
25 | """Enpoint for Airflow to push analysis reports.
26 |
27 | Cases of a report:
28 | Case 1: Data:
29 | {'report': {
30 | 'total_events': 5805,
31 | 'total_errors': 1398,
32 | 'by_event_type': {
33 | 'ADD_TO_CART': {'SUCCESS': 876, 'ERROR': 292},
34 | 'CHECKOUT': {'SUCCESS': 846, 'ERROR': 289},
35 | 'PAYMENT': {'SUCCESS': 884, 'ERROR': 281},
36 | 'SEARCH': {'SUCCESS': 933, 'ERROR': 261},
37 | 'VIEW_PRODUCT': {'SUCCESS': 868, 'ERROR': 275}
38 | },
39 |
40 | 'process_time': 22.15983009338379,
41 | 'file_name': '2025-08-04_19-04.json'
42 | }
43 | }
44 |
45 | Case 2: No Data:
46 | {'report': 'No data for 2025-08-04_19-04.json.'}
47 |
48 | Args:
49 | report: Analysis report.
50 | """
51 | print('Received data:', report)
52 | storage.append(report)
53 | print(f'number of reports in storage: {len(storage)}')
54 | logger.info('log report: %s', report)
55 |
56 |
57 | @app.get(
58 | path='/report',
59 | response_model=AnalysisReport,
60 | summary='Get the most recent report.',
61 | responses={status.HTTP_404_NOT_FOUND: {'description': NO_REPORT_STORED}}
62 | )
63 | async def get_report() -> AnalysisReport:
64 | """Return the most recent report.
65 |
66 | Returns:
67 | The most recent report.
68 |
69 | Raises:
70 | HTTPException: If no valid reports exist in storage. The status code is HTTP_404_NOT_FOUND.
71 | """
72 | print('Got request to send the most recent report')
73 | if storage:
74 | print('Responding with', storage[0])
75 | return storage[0]
76 | print('No data to send back.')
77 | raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail=NO_REPORT_STORED)
78 |
79 |
80 | @app.get('/health')
81 | async def health_check() -> dict[str, Any]:
82 | """Health check endpoint.
83 |
84 | Returns:
85 | Status and number of reports in the storage.
86 | """
87 | return {'status': 'healthy', 'reports_count': len(storage)}
88 |
89 |
90 | # This is for testability (airflow test_integration_report.py)
91 | @app.delete('/report')
92 | def clear_storage() -> None:
93 | """Endpoint to clear storage between tests."""
94 | storage.clear()
95 |
--------------------------------------------------------------------------------
/dashboard/dashboard_ui.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import logging
4 | import os
5 | import time
6 | from http import HTTPStatus
7 | from typing import Any
8 |
9 | import requests
10 | import streamlit as st
11 | import matplotlib.pyplot as plt
12 | from requests.exceptions import HTTPError
13 |
14 |
15 | def prepare_timestamp(*, file_path: str) -> str:
16 | """Convert file path to a proper timestamp format.
17 |
18 | Example: 's3/some_bucket/2025-08-04_19-04.json' -> '2025/08/04 19:04'
19 | Args:
20 | file_path: Path of the file.
21 |
22 | Returns:
23 | Desired timestamp format.
24 | """
25 | return (
26 | file_path
27 | .rsplit('/', maxsplit=1)[-1]
28 | .replace('.json', '')
29 | .replace('_', ' ')
30 | .replace('-', '/', 2)
31 | .replace('-', ':')
32 | )
33 |
34 |
35 | def fetch_report(*, url: str, timeout: int) -> dict[str, Any] | None:
36 | """Get the most recent report via REST API.
37 |
38 | Args:
39 | url: API url.
40 | timeout: Request timeout.
41 |
42 | Returns:
43 | Report.
44 | """
45 | try:
46 | response = requests.get(url, timeout=timeout)
47 | logger.info('fetch_reports - response: %s', response)
48 | response.raise_for_status()
49 | logger.info('fetch_reports - response.json: %s', response.json())
50 | return response.json()
51 | except requests.Timeout:
52 |         logger.info('Timeout occurred.')
53 | return None
54 | except HTTPError as e:
55 | if e.response.status_code == HTTPStatus.NOT_FOUND:
56 | error_detail = e.response.json().get('detail')
57 | logger.info("Request successful but no data. Detail: %s", error_detail)
58 | else:
59 | logger.exception('Unexpected error.')
60 | return None
61 | except requests.RequestException:
62 | logger.exception('Network error connecting to API')
63 | return None
64 |
65 |
66 | def show_report(*, report: dict[str, Any]) -> None:
67 | """Display a bar chart and summary stats for an analysis report.
68 |
69 | Sample report:
70 | {
71 | 'total_events': 5805,
72 | 'total_errors': 1398,
73 | 'by_event_type': {
74 | 'ADD_TO_CART': {'SUCCESS': 876, 'ERROR': 292},
75 | 'CHECKOUT': {'SUCCESS': 846, 'ERROR': 289},
76 | 'PAYMENT': {'SUCCESS': 884, 'ERROR': 281},
77 | 'SEARCH': {'SUCCESS': 933, 'ERROR': 261},
78 | 'VIEW_PRODUCT': {'SUCCESS': 868, 'ERROR': 275}
79 | },
80 |
81 | 'process_time': 22.15983009338379,
82 | 'file_name': '2025-08-04_19-04.json'
83 | }
84 |
85 | Args:
86 | report: User event analysis report.
87 | """
88 | event_types = []
89 | successes = []
90 | errors = []
91 | for key, value in report['by_event_type'].items():
92 | event_types.append(key)
93 | successes.append(value['SUCCESS'])
94 | errors.append(value['ERROR'])
95 |
96 | plt.style.use('dark_background')
97 | fig, ax = plt.subplots(figsize=(5, 2))
98 | bar_width = 0.35
99 | n = list(range(len(event_types)))
100 |
101 | bars1 = ax.bar([i - bar_width / 2 for i in n], successes, width=bar_width, label='Success', color='#4CAF50')
102 |
103 | bars2 = ax.bar([i + bar_width / 2 for i in n], errors, width=bar_width, label='Error', color='#FF5252')
104 |
105 | for bar in bars1:
106 | height = bar.get_height()
107 | ax.annotate(
108 | f'{height:,}', xy=(bar.get_x() + bar.get_width() / 2, height),
109 | xytext=(0, 3), textcoords="offset points", ha='center', va='bottom', fontsize=7, color='#4CAF50'
110 | )
111 | for bar in bars2:
112 | height = bar.get_height()
113 | ax.annotate(
114 | f'{height:,}', xy=(bar.get_x() + bar.get_width() / 2, height),
115 | xytext=(0, 3), textcoords="offset points", ha='center', va='bottom', fontsize=7, color='#FF5252'
116 | )
117 |
118 | ax.set_xticks(n)
119 | ax.set_xticklabels([event_type.replace('_', ' ').title() for event_type in event_types], rotation=0, color='white', fontsize=7)
120 | ax.set_ylabel('Count', color='white')
121 | ax.tick_params(axis='y', colors='white')
122 | ax.legend(facecolor='#222', edgecolor='white', labelcolor='white', bbox_to_anchor=(1.1, 1))
123 |
124 | ax.spines['top'].set_visible(False)
125 | ax.spines['right'].set_visible(False)
126 | ax.spines['bottom'].set_color('white')
127 | ax.spines['left'].set_color('white')
128 |
129 | total_events = f"Total Events: {report.get('total_events', 'N/A'):,}"
130 | total_errors = f"Total Errors: {report.get('total_errors', 'N/A'):,}"
131 | timestamp = prepare_timestamp(file_path=report['file_name'])
132 | summary_text = f'{total_events} | {total_errors} | Timestamp: {timestamp}'
133 | ax.text(0.5, -0.18, summary_text, ha='center', va='top', fontsize=14, color='#FFD600', transform=ax.transAxes)
134 | ax.text(
135 | 0.5, -0.38,
136 | f'Spark process took {report["process_time"]:.2f} seconds.', ha='center',
137 | va='top', fontsize=14, color='#FFD600', transform=ax.transAxes
138 | )
139 |
140 | st.pyplot(fig)
141 |
142 |
143 | def prepare_no_data(*, report: str) -> str:
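    |     """Format a 'No data for <file>.json' report string into a user-friendly message.
    |
    |     Example: 'No data for 2025-08-04_19-04.json' -> 'No data for 2025/08/04 19:04'
    |     """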
144 | timestamp = prepare_timestamp(file_path=report.replace('No data for ', ''))
145 | return f'No data for {timestamp}'
146 |
147 |
148 | def prepare_report(*, report: dict[str, Any]) -> None:
149 | """Prepare output based on incoming report.
150 |
151 | Args:
152 | report: Analysis report.
153 | """
154 | report = report['report']
155 | if isinstance(report, str):
156 | st.subheader(prepare_no_data(report=report))
157 | else:
158 | show_report(report=report)
159 |
160 |
161 | logger = logging.getLogger(__name__)
162 |
163 | if __name__ == '__main__':
164 | st.set_page_config(
165 | page_icon='📊',
166 | page_title='Dashboard',
167 | layout='wide',
168 | initial_sidebar_state='collapsed'
169 | )
170 |
171 | st.title('Event Analysis Dashboard')
172 | timeout = 5
173 | placeholder = st.empty()
174 | logger.info('Starting streamlit.')
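    |     # Poll the dashboard API roughly every 45 seconds and redraw the single placeholder with the latest report.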
175 | while True:
176 | with placeholder.container():
177 | logger.info('Fetching reports ...')
178 | report = fetch_report(url=os.environ['REPORTS_URL'], timeout=timeout)
179 | logger.info('Report: %s', report)
180 | if not report:
181 | st.subheader('No analysis report yet...')
182 | else:
183 | prepare_report(report=report)
184 | time.sleep(45)
185 |
--------------------------------------------------------------------------------
/dashboard/tests/common.py:
--------------------------------------------------------------------------------
1 | SAMPLE_REPORT_WITH_DATA = {
2 | 'report': {
3 | 'total_events': 5805,
4 | 'total_errors': 1398,
5 | 'by_event_type': {
6 | 'ADD_TO_CART': {'SUCCESS': 876, 'ERROR': 292},
7 | 'CHECKOUT': {'SUCCESS': 846, 'ERROR': 289},
8 | 'PAYMENT': {'SUCCESS': 884, 'ERROR': 281},
9 | 'SEARCH': {'SUCCESS': 933, 'ERROR': 261},
10 | 'VIEW_PRODUCT': {'SUCCESS': 868, 'ERROR': 275}
11 | },
12 | 'process_time': 22.15983009338379,
13 | 'file_name': '2025-08-04_19-04.json'
14 | }
15 | }
16 |
--------------------------------------------------------------------------------
/dashboard/tests/test_integration_api.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import os
4 | from collections.abc import Iterator
5 | from http import HTTPStatus
6 |
7 | import pytest
8 | import requests
9 |
10 | from common import SAMPLE_REPORT_WITH_DATA
11 | from dashboard_api import NO_REPORT_STORED
12 |
13 |
14 | REPORT_URL = os.environ['REPORT_URL']
15 | HEALTH_URL = os.environ['HEALTH_URL']
16 |
17 |
18 | @pytest.fixture
19 | def api_client() -> Iterator[requests.Session]:
20 | """Provide a requests session for making API calls.
21 |
22 | Yields:
23 | An initialized requests session object.
24 | """
25 | with requests.Session() as session:
26 | yield session
27 |
28 |
29 | def test_health_check(api_client: requests.Session) -> None:
30 | """Verify that the API health check endpoint is working."""
31 | response = api_client.get(HEALTH_URL)
32 |
33 | assert response.status_code == HTTPStatus.OK
34 | assert response.json()['status'] == 'healthy'
35 | assert 'reports_count' in response.json()
36 |
37 |
38 | def test_get_report_when_storage_is_empty(api_client: requests.Session) -> None:
39 | """Verify the behavior when the storage is empty.
40 |
41 | Scenario: The UI starts before any report has been sent.
42 | Behavior: The API should return a 404 Not Found.
43 | """
44 | api_client.delete(REPORT_URL)
45 | health_response = api_client.get(HEALTH_URL)
46 | assert health_response.json()['reports_count'] == 0
47 |
48 | response = api_client.get(REPORT_URL)
49 |
50 | assert response.status_code == HTTPStatus.NOT_FOUND
51 | assert response.json()['detail'] == NO_REPORT_STORED
52 |
53 |
54 | def test_post_and_get_valid_data_report(api_client: requests.Session) -> None:
55 | """Verify the behavior when posting a valid report and getting it.
56 |
57 | Scenario: Airflow posts a valid analysis report. The UI then fetches it.
58 | Behavior: The API should store the report and return it on a subsequent GET request.
59 | """
60 | post_response = api_client.post(REPORT_URL, json=SAMPLE_REPORT_WITH_DATA)
61 | assert post_response.status_code == HTTPStatus.OK
62 |
63 | get_response = api_client.get(REPORT_URL)
64 | assert get_response.status_code == HTTPStatus.OK
65 | assert get_response.json() == SAMPLE_REPORT_WITH_DATA
66 |
67 |
68 | def test_post_and_get_no_data_report(api_client: requests.Session) -> None:
69 | """Verify the behavior when posting an empty report and getting it.
70 |
71 | Scenario: Airflow reports that there was no data to process. The UI fetches this status.
72 | Behavior: The API should store the string-based report and return it.
73 | """
74 | report_data = {'report': 'No data for 2025-08-04_19-04.json.'}
75 |
76 | post_response = api_client.post(REPORT_URL, json=report_data)
77 | assert post_response.status_code == HTTPStatus.OK
78 |
79 | get_response = api_client.get(REPORT_URL)
80 | assert get_response.status_code == HTTPStatus.OK
81 | assert get_response.json() == report_data
82 |
83 |
84 | def test_storage_holds_only_the_latest_report(api_client: requests.Session) -> None:
85 | """Verify the storage only keeps the last report.
86 |
87 | Scenario: Airflow sends two reports in quick succession.
88 | Behavior: The API should only store and return the most recent report.
89 | """
90 | first_report = {'report': {'total_events': 100, 'file_name': 'first.json'}}
91 | second_report = {'report': {'total_events': 200, 'file_name': 'second.json'}}
92 |
93 | post_one_response = api_client.post(REPORT_URL, json=first_report)
94 | assert post_one_response.status_code == HTTPStatus.OK
95 |
96 | post_two_response = api_client.post(REPORT_URL, json=second_report)
97 | assert post_two_response.status_code == HTTPStatus.OK
98 |
99 | get_response = api_client.get(REPORT_URL)
100 | assert get_response.status_code == HTTPStatus.OK
101 | assert get_response.json() == second_report
102 |
103 | health_response = api_client.get(HEALTH_URL)
104 | assert health_response.json()['reports_count'] == 1
105 |
--------------------------------------------------------------------------------
/dashboard/tests/test_unit_api.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | import pytest
4 | from fastapi import status
5 | from fastapi.testclient import TestClient
6 |
7 | from common import SAMPLE_REPORT_WITH_DATA
8 | from dashboard_api import app, NO_REPORT_STORED
9 |
10 |
11 | @pytest.fixture
12 | def client() -> TestClient:
13 | """Fixture to provide a FastAPI test client."""
14 | return TestClient(app)
15 |
16 |
17 | @pytest.fixture(autouse=True)
18 | def clear_storage(client: TestClient) -> None:
19 | """Clear the storage before each test to ensure isolation."""
20 | client.delete(os.environ['REPORT_URL'])
21 | health_response = client.get(os.environ['HEALTH_URL'])
22 | assert health_response.json()['reports_count'] == 0, 'Storage is not cleared!'
23 |
24 |
25 | def test_receive_report(client: TestClient) -> None:
26 | """Test posting a report stores it correctly and returns 200."""
27 | report_data = SAMPLE_REPORT_WITH_DATA
28 |
29 | response = client.post('/report', json=report_data)
30 |
31 | assert response.status_code == status.HTTP_200_OK
32 | assert response.json() is None
33 |
34 |
35 | def test_get_report(client: TestClient) -> None:
36 | """Test getting a report returns the stored report correctly."""
37 | report_data = SAMPLE_REPORT_WITH_DATA
38 | response = client.post('/report', json=report_data)
39 |
40 | response = client.get('/report')
41 |
42 | assert response.status_code == status.HTTP_200_OK
43 | assert response.json() == report_data
44 |
45 |
46 | def test_get_report_no_data(client: TestClient) -> None:
47 | """Test getting a report with no data returns 404 with the correct detail."""
48 | response = client.get('/report')
49 |
50 | assert response.status_code == status.HTTP_404_NOT_FOUND
51 | assert response.json()['detail'] == NO_REPORT_STORED
52 |
53 |
54 | def test_health_check(client: TestClient) -> None:
55 | """Test the health check endpoint returns the expected status and metrics count."""
56 | response = client.get('/health')
57 |
58 | assert response.status_code == 200
59 | assert response.json() == {'status': 'healthy', 'reports_count': 0}
60 |
--------------------------------------------------------------------------------
/dashboard/tests/test_unit_ui.py:
--------------------------------------------------------------------------------
1 | from http import HTTPStatus
2 | from unittest.mock import Mock
3 |
4 | import requests
5 |
6 | from common import SAMPLE_REPORT_WITH_DATA
7 | from dashboard_ui import prepare_timestamp, fetch_report, prepare_no_data
8 |
9 |
10 | TIMEOUT = 1
11 |
12 |
13 | def test_prepare_timestamp() -> None:
14 | """Test that prepare_timestamp correctly formats the file path into a timestamp string."""
15 | file_path = 's3/some_bucket/2025-08-04_19-04.json'
16 | expected = '2025/08/04 19:04'
17 |
18 | result = prepare_timestamp(file_path=file_path)
19 |
20 | assert result == expected
21 |
22 |
23 | def test_prepare_no_data() -> None:
24 | """Test that prepare_no_data correctly formats the incoming report into a user-friendly message."""
25 | report = 'No data for 2025-08-04_19-04.json'
26 | expected = 'No data for 2025/08/04 19:04'
27 |
28 | result = prepare_no_data(report=report)
29 |
30 | assert result == expected
31 |
32 |
33 | def test_fetch_report_success(mocker) -> None:
34 | """Test that fetch_report handles a successful API response and returns the JSON data."""
35 | report = SAMPLE_REPORT_WITH_DATA
36 |
37 | mock_response = Mock()
38 | mock_response.status_code = HTTPStatus.OK
39 | mock_response.json.return_value = report
40 | mock_get = mocker.patch('dashboard_ui.requests.get', return_value=mock_response)
41 |
42 | result = fetch_report(url='http://test-url', timeout=TIMEOUT)
43 |
44 | assert result == report
45 | mock_get.assert_called_once_with('http://test-url', timeout=TIMEOUT)
46 |
47 |
48 | def test_fetch_report_timeout(mocker) -> None:
49 | """Test that fetch_report handles a timeout exception and returns None."""
50 | mock_get = mocker.patch('dashboard_ui.requests.get', side_effect=requests.Timeout)
51 |
52 | result = fetch_report(url='http://test-url', timeout=TIMEOUT)
53 |
54 | assert result is None
55 | mock_get.assert_called_once_with('http://test-url', timeout=TIMEOUT)
56 |
57 |
58 | def test_fetch_report_404(mocker) -> None:
59 | """Test that fetch_report handles a 404 HTTP error and returns None."""
60 | mock_response = Mock()
61 | mock_response.status_code = HTTPStatus.NOT_FOUND
62 | mock_response.json.return_value = {'detail': 'No report stored.'}
63 | mock_response.raise_for_status.side_effect = requests.exceptions.HTTPError(response=mock_response)
64 | mock_get = mocker.patch('dashboard_ui.requests.get', return_value=mock_response)
65 |
66 | result = fetch_report(url='http://test-url', timeout=TIMEOUT)
67 |
68 | assert result is None
69 | mock_get.assert_called_once_with('http://test-url', timeout=TIMEOUT)
70 |
71 |
72 | def test_fetch_report_unexpected_http_error(mocker) -> None:
73 | """Test that fetch_report handles an unexpected HTTP error (non-404) and returns None."""
74 | mock_response = Mock()
75 | mock_response.status_code = HTTPStatus.INTERNAL_SERVER_ERROR
76 | mock_response.raise_for_status.side_effect = requests.exceptions.HTTPError(response=mock_response)
77 | mock_get = mocker.patch('dashboard_ui.requests.get', return_value=mock_response)
78 |
79 | result = fetch_report(url='http://test-url', timeout=TIMEOUT)
80 |
81 | assert result is None
82 | mock_get.assert_called_once_with('http://test-url', timeout=TIMEOUT)
83 |
84 |
85 | def test_fetch_report_network_error(mocker) -> None:
86 | """Test that fetch_report handles a general network error and returns None."""
87 | mock_get = mocker.patch('dashboard_ui.requests.get', side_effect=requests.RequestException('Network error'))
88 |
89 | result = fetch_report(url='http://test-url', timeout=TIMEOUT)
90 |
91 | assert result is None
92 | mock_get.assert_called_once_with('http://test-url', timeout=TIMEOUT)
93 |
--------------------------------------------------------------------------------
/db/Dockerfile-DB-test:
--------------------------------------------------------------------------------
1 | FROM python:3.11.13-bookworm
2 |
3 | RUN pip install --no-cache-dir clickhouse-connect==0.8.18 minio==7.2.16 pytest==8.4.1
4 |
--------------------------------------------------------------------------------
/db/clickhouse_table_schema.sql:
--------------------------------------------------------------------------------
1 | CREATE DATABASE IF NOT EXISTS default;
2 |
3 | CREATE TABLE IF NOT EXISTS default.user_interactions
4 | (
5 | event_id UUID,
6 | user_id UUID,
7 | session_id UUID,
8 |     event_type Enum8('VIEW_PRODUCT' = 1, 'ADD_TO_CART' = 2, 'CHECKOUT' = 3, 'PAYMENT' = 4, 'SEARCH' = 5), -- 8 bits is enough for 5 items
9 |     event_timestamp DateTime64(3, 'UTC'), -- precision 3 (10^-3 s, i.e. milliseconds), UTC timezone; stored as a native datetime in the DB
10 | request_latency_ms UInt32,
11 | status Enum8('SUCCESS' = 1, 'ERROR' = 2),
12 | error_code Nullable(UInt32),
13 | product_id Nullable(UInt32),
14 |
15 | event_minute DateTime MATERIALIZED toStartOfMinute(event_timestamp)
16 | )
17 |
18 | ENGINE = MergeTree()
19 | PARTITION BY event_minute
20 | ORDER BY (event_minute, event_type);
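21 |
22 | -- Illustrative only (not executed at init time): downstream consumers are expected to filter on the
23 | -- materialized event_minute column, e.g. the per-minute export query shape used by the Airflow stream task:
24 | --   SELECT event_type, status
25 | --   FROM default.user_interactions
26 | --   WHERE event_minute = toDateTime('2025-08-04 19:04:00');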
--------------------------------------------------------------------------------
/db/tests/test_clickhouse_schema.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import os
4 | import uuid
5 | from collections.abc import Iterator
6 | from datetime import datetime, timezone
7 | from socket import gaierror
8 |
9 | import clickhouse_connect
10 | import pytest
11 | from clickhouse_connect.driver.client import Client
12 | from clickhouse_connect.driver.exceptions import ClickHouseError
13 |
14 |
15 | CLICKHOUSE_TABLE = os.environ['CLICKHOUSE_TABLE']
16 |
17 |
18 | @pytest.fixture(scope='module')
19 | def clickhouse_client() -> Iterator[Client]:
20 | """Establish a connection to ClickHouse.
21 |
22 | Yields:
23 | ClickHouse client.
24 | """
25 | try:
26 | client = clickhouse_connect.get_client(
27 | host=os.environ['CLICKHOUSE_HOST'],
28 | port=int(os.environ['CLICKHOUSE_PORT']),
29 | user=os.environ['CLICKHOUSE_USER'],
30 | password=os.environ['CLICKHOUSE_PASSWORD'],
31 | database=os.environ['CLICKHOUSE_DB']
32 | )
33 | client.ping()
34 | yield client
35 | client.command(f'TRUNCATE TABLE IF EXISTS {CLICKHOUSE_TABLE}')
36 | except (ConnectionRefusedError, gaierror) as e:
37 | pytest.fail(f'Could not connect to ClickHouse due to a network error: {e}')
38 | except ClickHouseError as e:
39 | pytest.fail(f'A ClickHouse server error occurred during connection: {e}')
40 | except Exception as e:
41 | pytest.fail(f'An unexpected error occurred while connecting to ClickHouse: {type(e).__name__} - {e}')
42 |
43 |
44 | def test_clickhouse_insert_and_select_valid_data(clickhouse_client: Client):
45 | """Test that a valid row can be inserted and retrieved correctly, verifying data types and materialized column."""
46 | event_ts = datetime.now()
47 |
48 | test_row = (
49 | uuid.uuid4(), # event_id
50 | uuid.uuid4(), # user_id
51 | uuid.uuid4(), # session_id
52 | 'ADD_TO_CART', # event_type
53 | event_ts, # event_timestamp
54 | 250, # request_latency_ms
55 | 'SUCCESS', # status
56 | None, # error_code
57 | 12345, # product_id
58 | )
59 | clickhouse_client.insert(table=CLICKHOUSE_TABLE, data=[test_row])
60 |
61 | result = clickhouse_client.query(
62 | 'SELECT *, event_minute FROM %(table)s WHERE event_id = %(event_id)s',
63 | parameters={'table': CLICKHOUSE_TABLE, 'event_id': test_row[0]}
64 | )
65 |
66 | retrieved_row = result.result_rows[0]
67 | assert result.row_count == 1
68 | assert retrieved_row[0] == test_row[0]
69 | assert retrieved_row[3] == test_row[3]
70 | assert retrieved_row[4].replace(microsecond=0) == event_ts.replace(microsecond=0)
71 | assert retrieved_row[5] == test_row[5]
72 | assert retrieved_row[7] == test_row[7]
73 | assert retrieved_row[9] == event_ts.replace(second=0, microsecond=0), 'Materialized event_minute column is incorrect.'
74 |
75 |
76 | def test_clickhouse_handles_nullable_fields(clickhouse_client: Client):
77 | """Test inserting a row where nullable fields are explicitly None."""
78 | column_names = [
79 | 'event_id', 'user_id', 'session_id', 'event_type', 'event_timestamp',
80 | 'request_latency_ms', 'status', 'error_code', 'product_id'
81 | ]
82 | test_data = [(
83 | uuid.uuid4(), uuid.uuid4(), uuid.uuid4(), 'PAYMENT', datetime.now(tz=timezone.utc),
84 | 250, 'ERROR', 503, None
85 | )]
86 |
87 | clickhouse_client.insert(table=CLICKHOUSE_TABLE, data=test_data, column_names=column_names)
88 | result = clickhouse_client.query(
89 | query="SELECT product_id, error_code FROM %(table)s WHERE event_type = 'PAYMENT';",
90 | parameters={'table': CLICKHOUSE_TABLE}
91 | )
92 | assert result.row_count > 0
93 | retrieved_row = result.result_rows[0]
94 | assert retrieved_row[0] == test_data[0][8]
95 | assert retrieved_row[1] == test_data[0][7]
96 |
--------------------------------------------------------------------------------
/db/tests/test_minio_setup.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | from minio import Minio
4 | import pytest
5 |
6 |
7 | @pytest.fixture(scope='module')
8 | def minio_client() -> Minio:
9 | """Establish a connection to MinIO.
10 |
11 | Returns:
12 | MinIO client.
13 | """
14 | try:
15 | minio_client = Minio(
16 | endpoint='minio:9000',
17 | access_key=os.environ['MINIO_ROOT_USER'],
18 | secret_key=os.environ['MINIO_ROOT_PASSWORD'],
19 | secure=False
20 | )
21 | minio_client.list_buckets() # Ping the server
22 | return minio_client
23 | except Exception as e:
24 | pytest.fail(f'An unexpected error occurred while connecting to MinIO: {type(e).__name__} - {e}')
25 |
26 |
27 | def test_minio_bucket_exists(minio_client: Minio):
28 | """Test that the bucket was created by the minio-init service."""
29 | bucket_name = os.environ['MINIO_BUCKET_NAME']
30 | assert minio_client.bucket_exists(bucket_name=bucket_name), f"Bucket '{bucket_name}' should exist, but it doesn't."
31 |
--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
1 | x-airflow-common:
2 | &airflow-common
3 | build:
4 | context: ./airflow
5 | dockerfile: Dockerfile-Airflow
6 | depends_on:
7 | &airflow-common-depends-on
8 | postgres:
9 | condition: service_healthy
10 | redis:
11 | condition: service_healthy
12 | env_file:
13 | - ./env/clickhouse.env
14 | - ./env/minio.env
15 | - ./env/minio.creds
16 | - ./env/spark.env
17 | environment:
18 | &airflow-common-env
19 | AIRFLOW__API__SECRET_KEY: ${AIRFLOW__API__SECRET_KEY}
20 | AIRFLOW__CELERY__BROKER_URL: redis://:@redis:6379/0
21 | AIRFLOW__CELERY__RESULT_BACKEND: db+postgresql://${POSTGRES_USER}:${POSTGRES_PASSWORD}@postgres/${POSTGRES_DB}
22 | AIRFLOW__CORE__AUTH_MANAGER: airflow.providers.fab.auth_manager.fab_auth_manager.FabAuthManager
23 | AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION: 'false'
24 | AIRFLOW__CORE__EXECUTION_API_SERVER_URL: 'http://airflow-apiserver:8080/execution/'
25 | AIRFLOW__CORE__EXECUTOR: CeleryExecutor
26 | AIRFLOW__CORE__FERNET_KEY: ${AIRFLOW__CORE__FERNET_KEY}
27 | AIRFLOW__CORE__LOAD_EXAMPLES: 'false'
28 | AIRFLOW__DATABASE__SQL_ALCHEMY_CONN: postgresql+psycopg2://${POSTGRES_USER}:${POSTGRES_PASSWORD}@postgres/${POSTGRES_DB}
29 | AIRFLOW__SCHEDULER__ENABLE_HEALTH_CHECK: 'true'
30 | AIRFLOW_CONFIG: '/opt/airflow/config/airflow.cfg'
31 | AIRFLOW_UID: ${AIRFLOW_UID}
32 | AIRFLOW__WEBSERVER__DEFAULT_UI_TIMEZONE: 'Asia/Tehran'
33 | _PIP_ADDITIONAL_REQUIREMENTS: ''
34 | DASHBOARD_API_URL: http://dashboard-api:8080/report
35 | image: lp/airflow
36 | networks:
37 | - lp
38 | user: "${AIRFLOW_UID}:0"
39 | volumes:
40 | - ./airflow/config:/opt/airflow/config
41 | - ./airflow/dags:/opt/airflow/dags
42 | - ./airflow/logs:/opt/airflow/logs
43 |
44 | x-healthcheck:
45 | healthcheck:
46 | &healthcheck-common
47 | interval: 30s
48 | timeout: 5s
49 | retries: 3
50 | start_period: 30s
51 |
52 | services:
53 | airflow-apiserver:
54 | <<: *airflow-common
55 | command: api-server
56 | depends_on:
57 | <<: *airflow-common-depends-on
58 | airflow-init:
59 | condition: service_completed_successfully
60 | healthcheck:
61 | <<: *healthcheck-common
62 | test: ["CMD", "curl", "--fail", "http://localhost:8080/api/v2/version"] # Set by airflow docker-compose.yml
63 | # test: ["CMD", "curl", "--fail", "http://localhost:8080/monitor/health"] # https://airflow.apache.org/docs/apache-airflow/stable/stable-rest-api-ref.html
64 | ports:
65 | - "8080:8080"
66 | restart: always
67 |
68 | airflow-init:
69 | <<: *airflow-common
70 | command:
71 | - -c
72 | - |
73 | if [[ -z "${AIRFLOW_UID}" ]]; then
74 | echo
75 | echo -e "\033[1;33mWARNING!!!: AIRFLOW_UID not set!\e[0m"
76 | echo "If you are on Linux, you SHOULD follow the instructions below to set "
77 | echo "AIRFLOW_UID environment variable, otherwise files will be owned by root."
78 | echo "For other operating systems you can get rid of the warning with manually created .env file:"
79 | echo " See: https://airflow.apache.org/docs/apache-airflow/stable/howto/docker-compose/index.html#setting-the-right-airflow-user"
80 | echo
81 | export AIRFLOW_UID=$$(id -u)
82 | fi
83 | one_meg=1048576
84 | mem_available=$$(($$(getconf _PHYS_PAGES) * $$(getconf PAGE_SIZE) / one_meg))
85 | cpus_available=$$(grep -cE 'cpu[0-9]+' /proc/stat)
86 | disk_available=$$(df / | tail -1 | awk '{print $$4}')
87 | warning_resources="false"
88 | if (( mem_available < 4000 )) ; then
89 | echo
90 | echo -e "\033[1;33mWARNING!!!: Not enough memory available for Docker.\e[0m"
91 | echo "At least 4GB of memory required. You have $$(numfmt --to iec $$((mem_available * one_meg)))"
92 | echo
93 | warning_resources="true"
94 | fi
95 | if (( cpus_available < 2 )); then
96 | echo
97 | echo -e "\033[1;33mWARNING!!!: Not enough CPUS available for Docker.\e[0m"
98 | echo "At least 2 CPUs recommended. You have $${cpus_available}"
99 | echo
100 | warning_resources="true"
101 | fi
102 | if (( disk_available < one_meg * 10 )); then
103 | echo
104 | echo -e "\033[1;33mWARNING!!!: Not enough Disk space available for Docker.\e[0m"
105 | echo "At least 10 GBs recommended. You have $$(numfmt --to iec $$((disk_available * 1024 )))"
106 | echo
107 | warning_resources="true"
108 | fi
109 | if [[ $${warning_resources} == "true" ]]; then
110 | echo
111 | echo -e "\033[1;33mWARNING!!!: You have not enough resources to run Airflow (see above)!\e[0m"
112 | echo "Please follow the instructions to increase amount of resources available:"
113 | echo " https://airflow.apache.org/docs/apache-airflow/stable/howto/docker-compose/index.html#before-you-begin"
114 | echo
115 | fi
116 | echo
117 | echo "Creating missing opt dirs if missing:"
118 | echo
119 | mkdir -v -p /opt/airflow/{logs,dags,plugins,config}
120 |         mkdir -p /opt/airflow/logs/spark
121 | echo
122 | echo "Airflow version:"
123 | /entrypoint airflow version
124 | echo
125 | echo "Files in shared volumes:"
126 | echo
127 | ls -la /opt/airflow/{logs,dags,plugins,config}
128 | echo
129 | echo "Running airflow config list to create default config file if missing."
130 | echo
131 | /entrypoint airflow config list >/dev/null
132 | echo
133 | echo "Files in shared volumes:"
134 | echo
135 | ls -la /opt/airflow/{logs,dags,plugins,config}
136 | echo
137 | echo "Change ownership of files in /opt/airflow to ${AIRFLOW_UID}:0"
138 | echo
139 | chown -R "${AIRFLOW_UID}:0" /opt/airflow/
140 | echo
141 | echo "Change ownership of files in shared volumes to ${AIRFLOW_UID}:0"
142 | echo
143 | chown -v -R "${AIRFLOW_UID}:0" /opt/airflow/{logs,dags,plugins,config}
144 | echo
145 | echo "Files in shared volumes:"
146 | echo
147 | ls -la /opt/airflow/{logs,dags,plugins,config}
148 | echo
149 | echo "Waiting for db to create ClickHouse and MinIO connections"
150 | until airflow db check; do
151 | >&2 echo "Airflow DB not reachable. Waiting..."
152 | sleep 5
153 | done
154 | echo
155 | echo "Add/Update ClickHouse connection"
156 | airflow connections add "$CLICKHOUSE_CONN_NAME" \
157 | --conn-uri "clickhouse://$CLICKHOUSE_USER:$CLICKHOUSE_PASSWORD@$CLICKHOUSE_HOST:$CLICKHOUSE_PORT/$CLICKHOUSE_DB"
158 | echo
159 | echo "Add/Update MinIO connection"
160 | airflow connections add "$MINIO_CONN_NAME" \
161 | --conn-type 'aws' \
162 | --conn-login "$MINIO_ROOT_USER" \
163 | --conn-password "$MINIO_ROOT_PASSWORD" \
164 | --conn-extra '{"host": "http://minio:9000"}'
165 | echo
166 | echo "Add/Update Spark connection"
167 | airflow connections add "$SPARK_CONN_NAME" \
168 | --conn-type "spark" \
169 | --conn-host "spark://spark-master" \
170 | --conn-port "7077"
171 | echo
172 | echo "Done."
173 | exit 0;
174 | depends_on:
175 | <<: *airflow-common-depends-on
176 | entrypoint: /bin/bash
177 | environment:
178 | <<: *airflow-common-env
179 | _AIRFLOW_DB_MIGRATE: 'true'
180 | _AIRFLOW_WWW_USER_CREATE: 'true'
181 | _AIRFLOW_WWW_USER_USERNAME: ${_AIRFLOW_WWW_USER_USERNAME}
182 | _AIRFLOW_WWW_USER_PASSWORD: ${_AIRFLOW_WWW_USER_PASSWORD}
183 | _PIP_ADDITIONAL_REQUIREMENTS: ''
184 | user: "0:0"
185 |
186 | airflow-dag-processor:
187 | <<: *airflow-common
188 | command: dag-processor
189 | depends_on:
190 | <<: *airflow-common-depends-on
191 | airflow-init:
192 | condition: service_completed_successfully
193 | healthcheck:
194 | <<: *healthcheck-common
195 | test: ["CMD-SHELL", 'airflow jobs check --job-type DagProcessorJob --hostname "$${HOSTNAME}"']
196 | restart: always
197 |
198 | airflow-scheduler:
199 | <<: *airflow-common
200 | command: scheduler
201 | depends_on:
202 | <<: *airflow-common-depends-on
203 | airflow-init:
204 | condition: service_completed_successfully
205 | healthcheck:
206 | <<: *healthcheck-common
207 | test: ["CMD", "curl", "--fail", "http://localhost:8974/health"]
208 | restart: always
209 |
210 | airflow-worker:
211 | <<: *airflow-common
212 | command: celery worker
213 | depends_on:
214 | <<: *airflow-common-depends-on
215 | airflow-apiserver:
216 | condition: service_healthy
217 | airflow-init:
218 | condition: service_completed_successfully
219 | environment:
220 | <<: *airflow-common-env
221 | DUMB_INIT_SETSID: "0"
222 | healthcheck:
223 | <<: *healthcheck-common
224 | test:
225 | - "CMD-SHELL"
226 | - 'celery --app airflow.providers.celery.executors.celery_executor.app inspect ping -d "celery@$${HOSTNAME}" || celery --app airflow.executors.celery_executor.app inspect ping -d "celery@$${HOSTNAME}"'
227 | restart: always
228 | hostname: airflow-worker
229 |
230 | broker:
231 | env_file:
232 | - ./env/kafka.env
233 | environment:
234 | KAFKA_NODE_ID: 1
235 | KAFKA_CONTROLLER_QUORUM_VOTERS: '1@broker:29093'
236 | KAFKA_PROCESS_ROLES: 'broker,controller'
237 | KAFKA_LISTENERS: 'PLAINTEXT://broker:29092,CONTROLLER://broker:29093,PLAINTEXT_HOST://0.0.0.0:9092'
238 | KAFKA_ADVERTISED_LISTENERS: 'PLAINTEXT://broker:29092,PLAINTEXT_HOST://localhost:9092'
239 | KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: 'CONTROLLER:PLAINTEXT,PLAINTEXT:PLAINTEXT,PLAINTEXT_HOST:PLAINTEXT'
240 | KAFKA_INTER_BROKER_LISTENER_NAME: 'PLAINTEXT'
241 | KAFKA_CONTROLLER_LISTENER_NAMES: 'CONTROLLER'
242 |       KAFKA_LOG_DIRS: '/var/lib/kafka/data'
243 | KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1
244 | KAFKA_TRANSACTION_STATE_LOG_MIN_ISR: 1
245 | KAFKA_TRANSACTION_STATE_LOG_REPLICATION_FACTOR: 1
246 | KAFKA_GROUP_INITIAL_REBALANCE_DELAY_MS: 0
247 | image: confluentinc/cp-kafka:7.9.1-1-ubi8
248 | healthcheck:
249 | <<: *healthcheck-common
250 | test: ['CMD', 'kafka-broker-api-versions', '--bootstrap-server', 'broker:29092']
251 | hostname: broker
252 | networks:
253 | - lp
254 | restart: always
255 | volumes:
256 | - broker:/var/lib/kafka/data
257 |
258 | broker-helper:
259 | command:
260 | - -c
261 | - |
262 | echo "Waiting for broker..."
263 | until kafka-topics --bootstrap-server broker:29092 --list; do sleep 2; done
264 | echo "broker available. Checking $${KAFKA_TOPIC} topic..."
265 | if kafka-topics --bootstrap-server broker:29092 --list | grep -q "^$${KAFKA_TOPIC}$$"; then
266 | echo "Topic exists. Checking partitions count..."
267 | TOPIC_INFO=$$(kafka-topics --bootstrap-server broker:29092 --describe --topic $${KAFKA_TOPIC})
268 |           PARTITION_COUNT=$$(echo "$$TOPIC_INFO" | grep -oE "PartitionCount: [0-9]+" | awk '{print $$2}')
269 |           echo "Current partition count: $$PARTITION_COUNT"
270 |           if [ "$$PARTITION_COUNT" != "6" ]; then
271 | echo "Changing partition count to 6..."
272 | kafka-topics --bootstrap-server broker:29092 --alter --topic $${KAFKA_TOPIC} --partitions 6
273 | echo "Partition count updated to 6"
274 | else
275 | echo "Partition count is already 6"
276 | fi
277 | else
278 | echo "Creating $${KAFKA_TOPIC} topic with 6 partitions..."
279 | kafka-topics --bootstrap-server broker:29092 --create --topic $${KAFKA_TOPIC} --partitions 6 --replication-factor 1
280 | fi
281 | echo "Topic info:"
282 | kafka-topics --bootstrap-server broker:29092 --describe --topic $${KAFKA_TOPIC}
283 | echo "Done"
284 | exit 0;
285 | entrypoint: /bin/bash
286 | env_file:
287 | - ./env/kafka.env
288 | depends_on:
289 | broker:
290 | condition: service_healthy
291 | image: confluentinc/cp-kafka:7.9.1-1-ubi8
292 | networks:
293 | - lp
294 |
295 | clickhouse:
296 | env_file:
297 | - ./env/clickhouse.creds
298 | healthcheck:
299 | <<: *healthcheck-common
300 | test:
301 | - "CMD-SHELL"
302 | - "clickhouse-client --user $${CLICKHOUSE_USER} --password $${CLICKHOUSE_PASSWORD} -q 'SELECT 1'"
303 | hostname: clickhouse
304 | image: clickhouse/clickhouse-server:25.6.3.116
305 | networks:
306 | - lp
307 | ports:
308 | - '8123:8123'
309 | restart: always
310 | volumes:
311 | - ./db/clickhouse_table_schema.sql:/docker-entrypoint-initdb.d/init.sql:ro
312 | - clickhouse:/var/lib/clickhouse/
313 |
314 | connect:
315 | depends_on:
316 | broker:
317 | condition: service_healthy
318 | schema-registry:
319 | condition: service_healthy
320 | environment:
321 | CONNECT_BOOTSTRAP_SERVERS: 'PLAINTEXT://broker:29092'
322 | CONNECT_REST_ADVERTISED_HOST_NAME: connect
323 | CONNECT_REST_PORT: 8083
324 | CONNECT_GROUP_ID: 'clickhouse-connect-group'
325 | CONNECT_CONFIG_STORAGE_TOPIC: '_connect-configs'
326 | CONNECT_OFFSET_STORAGE_TOPIC: '_connect-offsets'
327 | CONNECT_STATUS_STORAGE_TOPIC: '_connect-status'
328 | CONNECT_REPLICATION_FACTOR: 1
329 | CONNECT_CONFIG_STORAGE_REPLICATION_FACTOR: 1
330 | CONNECT_OFFSET_STORAGE_REPLICATION_FACTOR: 1
331 | CONNECT_STATUS_STORAGE_REPLICATION_FACTOR: 1
332 | CONNECT_KEY_CONVERTER: 'org.apache.kafka.connect.converters.ByteArrayConverter'
333 | CONNECT_VALUE_CONVERTER: 'io.confluent.connect.avro.AvroConverter'
334 | CONNECT_VALUE_CONVERTER_SCHEMA_REGISTRY_URL: 'http://schema-registry:8081'
335 | CONNECT_SCHEMA_REGISTRY_URL: 'http://schema-registry:8081'
336 | CONNECT_VALUE_CONVERTER_SCHEMA_ENABLE: 'true'
337 | CONNECT_PLUGIN_PATH: '/usr/share/java/clickhouse'
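    |       # The ClickHouse sink connector JAR is provided by the ./kafka/connectors volume mounted at this path (see volumes below).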
338 | healthcheck:
339 | <<: *healthcheck-common
340 | test: ['CMD', 'curl', '-f', 'http://localhost:8083/connectors']
341 | hostname: connect
342 | image: confluentinc/cp-kafka-connect:7.9.1-1-ubi8.amd64
343 | networks:
344 | - lp
345 | restart: always
346 | volumes:
347 | - ./kafka/connectors:/usr/share/java/clickhouse:ro
348 |
349 | connect-helper:
350 | build:
351 | context: ./kafka
352 | dockerfile: Dockerfile-Connect-helper
353 | command:
354 | - -c
355 | - |
356 | # Exit immediately if a command exits with a non-zero status.
357 | set -e
358 | echo "--- Verifying Environment Variables ---"
359 | printenv | grep CLICKHOUSE || echo "CLICKHOUSE variables not found!"
360 | echo "-------------------------------------"
361 |
362 | # Read secrets into env vars
363 | export CLICKHOUSE_USER
364 | export CLICKHOUSE_PASSWORD
365 | export CLICKHOUSE_HOST
366 | export CLICKHOUSE_PORT
367 | export CLICKHOUSE_DB
368 | export KAFKA_TOPIC
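    |           # DOLLAR is presumably referenced in the connector JSON template so envsubst can emit a literal '$'.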
369 | export DOLLAR="$"
370 |
371 | # Prepare a temporary config file with substituted variables
372 | CONNECTOR_CONFIG_FILE="/tmp/config.json"
373 | envsubst < /home/clickhouse_connector_configuration.json > "$${CONNECTOR_CONFIG_FILE}"
374 | echo "--- Generated Connector Configuration ---"
375 | cat "$${CONNECTOR_CONFIG_FILE}"
376 | echo "---------------------------------------"
377 |
378 | CONNECT_HOST="$${CONNECT_HOST:-connect}"
379 | CONNECT_PORT="$${CONNECT_PORT:-8083}"
380 | CONNECT_URL="http://$${CONNECT_HOST}:$${CONNECT_PORT}/connectors"
381 | echo "$${CONNECT_URL}"
382 | echo "Waiting for Kafka Connect to be available at $${CONNECT_URL}..."
383 | # Use a loop to check if the Connect REST API is up and running.
384 | # The -f flag makes curl fail silently on server errors.
385 |           until curl -s -f -o /dev/null "$${CONNECT_URL}"; do
386 | sleep 2
387 | done
388 | echo "Kafka Connect is available. Checking if the configuration is already applied..."
389 | CONNECTOR_NAME=$$(jq -r .name "$${CONNECTOR_CONFIG_FILE}")
390 | if curl -s -f "http://$${CONNECT_HOST}:$${CONNECT_PORT}/connectors/$${CONNECTOR_NAME}/status" > /dev/null; then
391 | echo "Connector '$$CONNECTOR_NAME' already configured. No further action needed."
392 | else
393 | echo "Connector '$$CONNECTOR_NAME' is NOT applied. Configuring the ClickHouse sink connector..."
394 | HTTP_CODE=$$(curl -s -o response.txt -w "%{http_code}" -X POST -H "Content-Type:application/json" --data "@$${CONNECTOR_CONFIG_FILE}" "$${CONNECT_URL}")
395 | if [ "$$HTTP_CODE" -eq 200 ] || [ "$$HTTP_CODE" -eq 201 ]; then
396 | echo "ClickHouse sink connector configured."
397 | else
398 |           echo "Failed to configure the ClickHouse sink connector."
399 | cat response.txt
400 | exit 1
401 | fi
402 | fi
403 | exit 0
404 | depends_on:
405 | connect:
406 | condition: service_healthy
407 | broker-helper:
408 | condition: service_completed_successfully
409 | entrypoint: /bin/bash
410 | env_file:
411 | - ./env/clickhouse.env
412 | - ./env/clickhouse.creds
413 | - ./env/kafka.env
414 | image: lp/connect-helper
415 | networks:
416 | - lp
417 | volumes:
418 | - ./kafka/clickhouse_connector_configuration.json:/home/clickhouse_connector_configuration.json:ro
419 |
420 | consumer: # This is just for debugging
421 | command:
422 | - -c
423 | - |
424 | echo "Starting Kafka Consumer..."
425 | kafka-console-consumer --bootstrap-server broker:29092 --topic $${KAFKA_TOPIC} --from-beginning
426 | depends_on:
427 | - broker
428 | entrypoint: /bin/bash
429 | env_file:
430 | - ./env/kafka.env
431 | environment:
432 | KAFKA_BOOTSTRAP_SERVERS: 'PLAINTEXT://broker:29092'
433 | KAFKA_GROUP_ID: my-debug-consumer-group
434 | KAFKA_AUTO_OFFSET_RESET: earliest
435 | KAFKA_ENABLE_AUTO_COMMIT: 'true'
436 | image: confluentinc/cp-kafka:7.9.1-1-ubi8
437 | networks:
438 | - lp
439 | profiles:
440 | - debug
441 |
442 | dashboard-api:
443 | build:
444 | context: ./dashboard
445 | dockerfile: Dockerfile-Dashboard-api
446 | command: ["uvicorn", "dashboard_api:app", "--host", "0.0.0.0", "--port", "8080"]
447 | healthcheck:
448 | <<: *healthcheck-common
449 | test: ['CMD', 'curl', '-f', 'http://localhost:8080/health']
450 | hostname: dashboard-api
451 | image: lp/dashboard-api
452 | networks:
453 | - lp
454 | restart: always
455 | volumes:
456 | - ./dashboard/dashboard_api.py:/home/dashboard_api.py:ro
457 | working_dir: /home
458 |
459 | dashboard-ui:
460 | build:
461 | context: ./dashboard
462 | dockerfile: Dockerfile-Dashboard-ui
463 |     command: ["streamlit", "run", "dashboard_ui.py",
464 |               "--server.address", "0.0.0.0",
465 |               "--server.port", "8501",
466 |               "--server.headless", "true",
467 |               "--server.fileWatcherType", "none",
468 |               "--browser.gatherUsageStats", "false"]
469 | environment:
470 | REPORTS_URL: http://dashboard-api:8080/report
471 | hostname: dashboard-ui
472 | image: lp/dashboard-ui
473 | networks:
474 | - lp
475 | ports:
476 | - "8501:8501"
477 | restart: always
478 | volumes:
479 | - ./dashboard/dashboard_ui.py:/home/dashboard_ui.py:ro
480 | working_dir: /home
481 |
482 | minio:
483 | command: server /data --console-address ":9001"
484 | env_file:
485 | - ./env/minio.creds
486 | healthcheck:
487 | <<: *healthcheck-common
488 | test: ['CMD', 'mc', 'ready', 'local']
489 | hostname: minio
490 | image: minio/minio:RELEASE.2025-07-18T21-56-31Z
491 | networks:
492 | - lp
493 | restart: always
494 | volumes:
495 | - minio:/data
496 |
497 | minio-init:
498 | command:
499 | - -c
500 | - |
501 |         # Although this service depends on minio, readiness is not guaranteed, so wait briefly before continuing.
502 | sleep 5
503 |
504 | echo "Setup alias for MinIO server"
505 | mc alias set $$MINIO_CONN_NAME http://minio:9000 $$MINIO_ROOT_USER $$MINIO_ROOT_PASSWORD;
506 |
507 | echo "Create the bucket if it doesn't exist"
508 | mc mb $$MINIO_CONN_NAME/$$MINIO_BUCKET_NAME || true;
509 |
510 | exit 0;
511 | depends_on:
512 | minio:
513 | condition: service_healthy
514 | entrypoint: /bin/bash
515 | env_file:
516 | - ./env/minio.env
517 | - ./env/minio.creds
518 | hostname: minio-init
519 | image: minio/minio:RELEASE.2025-07-18T21-56-31Z
520 | networks:
521 | - lp
522 |
523 | postgres:
524 | env_file:
525 | - ./env/postgres.env
526 | - ./env/postgres.creds
527 | healthcheck:
528 | <<: *healthcheck-common
529 |       test: ["CMD-SHELL", "pg_isready -U $${POSTGRES_USER}"]
530 | image: postgres:17.4-bookworm
531 | networks:
532 | - lp
533 | restart: always
534 | volumes:
535 | - postgres:/var/lib/postgresql/data
536 |
537 | producer:
538 | build:
539 | context: ./producer
540 | dockerfile: Dockerfile-Producer
541 | command: ["python", "producer.py"]
542 | depends_on:
543 | broker:
544 | condition: service_healthy
545 | schema-registry:
546 | condition: service_healthy
547 | env_file:
548 | - ./env/kafka.env
549 | image: lp/producer
550 | networks:
551 | - lp
552 | restart: always
553 | volumes:
554 | - ./producer:/home/app
555 | working_dir: /home/app
556 |
557 | redis:
558 | image: redis:7.2.10-bookworm
559 | expose:
560 | - 6379
561 | healthcheck:
562 | <<: *healthcheck-common
563 | test: ["CMD", "redis-cli", "ping"]
564 | networks:
565 | - lp
566 | restart: always
567 |
568 | schema-registry:
569 | depends_on:
570 | broker:
571 | condition: service_started
572 | environment:
573 | SCHEMA_REGISTRY_HOST_NAME: schema-registry
574 | SCHEMA_REGISTRY_LISTENERS: 'http://0.0.0.0:8081'
575 | SCHEMA_REGISTRY_KAFKASTORE_BOOTSTRAP_SERVERS: 'PLAINTEXT://broker:29092'
576 | hostname: schema-registry
577 | image: confluentinc/cp-schema-registry:7.9.1-1-ubi8.amd64
578 | healthcheck:
579 | <<: *healthcheck-common
580 | test: ["CMD", "curl", "-f", "http://localhost:8081/subjects"]
581 | networks:
582 | - lp
583 | restart: always
584 |
585 | spark-master:
586 | build:
587 | context: ./spark
588 | dockerfile: Dockerfile-Spark
589 | command: /opt/spark/bin/spark-class org.apache.spark.deploy.master.Master --host spark-master
590 | depends_on:
591 | - minio
592 | healthcheck:
593 | <<: *healthcheck-common
594 | test: ['CMD', 'curl', '-f', 'http://localhost:8080']
595 | hostname: spark-master
596 | image: lp/spark
597 | networks:
598 | - lp
599 | ports:
600 | - "8182:8080"
601 | restart: always
602 | volumes:
603 | - ./airflow/dags:/opt/airflow/dags
604 |
605 | spark-worker:
606 | build:
607 | context: ./spark
608 | dockerfile: Dockerfile-Spark
609 | command: /opt/spark/bin/spark-class org.apache.spark.deploy.worker.Worker spark://spark-master:7077
610 | depends_on:
611 | - spark-master
612 | env_file:
613 | - ./env/minio.env
614 | - ./env/minio.creds
615 | environment:
616 | SPARK_WORKER_CORES: 2
617 | SPARK_WORKER_MEMORY: 1g
618 | healthcheck:
619 | <<: *healthcheck-common
620 | test: ['CMD', 'curl', '-f', 'http://localhost:8081']
621 | hostname: spark-worker
622 | image: lp/spark
623 | networks:
624 | - lp
625 | ports:
626 | - "8183:8081"
627 | restart: always
628 | volumes:
629 | - ./airflow/dags:/opt/airflow/dags
630 |
631 | networks:
632 | lp:
633 | name: lp
634 |
635 | volumes:
636 | broker:
637 | clickhouse:
638 | minio:
639 | postgres:
640 |
--------------------------------------------------------------------------------
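A note on the compose file above: the debugging `consumer` service sits behind the `debug` profile, so it only starts when that profile is requested explicitly. A minimal usage sketch, assuming the commands are run from the repository root (exact flags may differ in your workflow):

    docker compose up -d --build                 # start the full pipeline
    docker compose ps                            # check service health
    docker compose --profile debug up consumer   # attach the debug console consumer
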
/env/airflow.creds:
--------------------------------------------------------------------------------
1 | AIRFLOW__API__SECRET_KEY=bfaed84e69f0ae94c5e1b721ae443fa3
2 | _AIRFLOW_WWW_USER_USERNAME=airflow
3 | _AIRFLOW_WWW_USER_PASSWORD=airflow
4 | AIRFLOW__CORE__FERNET_KEY=FDhw_XNW-bppePFnhse2QlJn7ZfNaLy9gjlQZwPh2a0=
--------------------------------------------------------------------------------
/env/airflow.env:
--------------------------------------------------------------------------------
1 | AIRFLOW_UID=1000
--------------------------------------------------------------------------------
/env/clickhouse.creds:
--------------------------------------------------------------------------------
1 | CLICKHOUSE_USER=default
2 | CLICKHOUSE_PASSWORD=password
--------------------------------------------------------------------------------
/env/clickhouse.env:
--------------------------------------------------------------------------------
1 | CLICKHOUSE_HOST=clickhouse
2 | CLICKHOUSE_PORT=8123
3 | CLICKHOUSE_DB=default
4 | CLICKHOUSE_TABLE=user_interactions
5 | CLICKHOUSE_CONN_NAME=clickhouse_connection
--------------------------------------------------------------------------------
/env/kafka.env:
--------------------------------------------------------------------------------
1 | KAFKA_TOPIC=user_interactions
2 | SCHEMA_REGISTRY_URL=http://schema-registry:8081
3 | CLUSTER_ID='8e42df2d-b58f-47d0-a2cf-e75ff2f2e58e'
4 |
5 | KAFKA_BOOTSTRAP_SERVERS=broker:29092
6 |
7 | FLUSH_SECONDS=2
--------------------------------------------------------------------------------
/env/minio.creds:
--------------------------------------------------------------------------------
1 | MINIO_ROOT_USER=admin
2 | MINIO_ROOT_PASSWORD=adminpsw
--------------------------------------------------------------------------------
/env/minio.env:
--------------------------------------------------------------------------------
1 | MINIO_BUCKET_NAME=user-interactions
2 | MINIO_CONN_NAME=minio_connection
--------------------------------------------------------------------------------
/env/postgres.creds:
--------------------------------------------------------------------------------
1 | POSTGRES_USER=airflow
2 | POSTGRES_PASSWORD=airflow
--------------------------------------------------------------------------------
/env/postgres.env:
--------------------------------------------------------------------------------
1 | POSTGRES_DB=airflow
--------------------------------------------------------------------------------
/env/spark.env:
--------------------------------------------------------------------------------
1 | SPARK_CONN_NAME=spark_connection
2 | SPARK_APPLICATION_PATH='/opt/airflow/dags/spark.py'
3 | HADOOP_AWS_VERSION=3.3.4
4 | AWS_JAVA_SDK_VERSION=1.12.262
5 | SPARK_EVENT_LOG_ENABLED=true
--------------------------------------------------------------------------------
/images/Airflow_UI.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Xadra-T/End2End-Data-Pipeline/48a430687115d20f58a6d3f4ade67369674f1aca/images/Airflow_UI.png
--------------------------------------------------------------------------------
/images/Chart.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Xadra-T/End2End-Data-Pipeline/48a430687115d20f58a6d3f4ade67369674f1aca/images/Chart.png
--------------------------------------------------------------------------------
/images/Pipeline.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Xadra-T/End2End-Data-Pipeline/48a430687115d20f58a6d3f4ade67369674f1aca/images/Pipeline.png
--------------------------------------------------------------------------------
/kafka/Dockerfile-Connect-helper:
--------------------------------------------------------------------------------
1 | FROM alpine:3.22.0
2 |
3 | RUN apk add --no-cache curl jq bash gettext
4 |
--------------------------------------------------------------------------------
/kafka/clickhouse_connector_configuration.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "clickhouse-connect-sink-user-interactions",
3 | "config": {
4 | "connector.class": "com.clickhouse.kafka.connect.ClickHouseSinkConnector",
5 | "tasks.max": "6",
6 | "topics": "${KAFKA_TOPIC}",
7 | "database": "${CLICKHOUSE_DB}",
8 | "hostname": "${CLICKHOUSE_HOST}",
9 | "port": "${CLICKHOUSE_PORT}",
10 | "username": "${CLICKHOUSE_USER}",
11 | "password": "${CLICKHOUSE_PASSWORD}",
12 | "ssl": "false",
13 | "key.converter": "org.apache.kafka.connect.converters.ByteArrayConverter",
14 | "value.converter": "io.confluent.connect.avro.AvroConverter",
15 | "value.converter.schema.registry.url": "http://schema-registry:8081",
16 | "value.converter.schemas.enable": "true",
17 | "errors.tolerance": "none",
18 | "errors.log.enable": "true",
19 | "errors.log.include.message": "true",
20 |
21 | "transforms": "castTimestamp",
22 | "transforms.castTimestamp.type": "org.apache.kafka.connect.transforms.TimestampConverter${DOLLAR}Value",
23 | "transforms.castTimestamp.target.type": "Timestamp",
24 | "transforms.castTimestamp.field": "event_timestamp"
25 | }
26 | }
--------------------------------------------------------------------------------
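A note on the configuration above: `${DOLLAR}` is replaced by `envsubst` in the connect-helper, which is what keeps the literal `$Value` suffix of the TimestampConverter SMT class intact while the other placeholders are filled in. An illustrative sketch of that behaviour (not part of the repo):

    export DOLLAR='$' KAFKA_TOPIC=user_interactions
    echo 'type: TimestampConverter${DOLLAR}Value, topics: ${KAFKA_TOPIC}' | envsubst
    # -> type: TimestampConverter$Value, topics: user_interactions
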
/kafka/connectors/clickhouse-kafka-connect-v1.3.1-confluent.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Xadra-T/End2End-Data-Pipeline/48a430687115d20f58a6d3f4ade67369674f1aca/kafka/connectors/clickhouse-kafka-connect-v1.3.1-confluent.jar
--------------------------------------------------------------------------------
/producer/Dockerfile-Producer:
--------------------------------------------------------------------------------
1 | FROM python:3.11.13-bookworm
2 |
3 | ENV PYTHONUNBUFFERED=1
4 |
5 | RUN pip install --no-cache-dir python-dotenv==1.1.1 confluent-kafka[avro]==2.11.0
6 |
--------------------------------------------------------------------------------
/producer/Dockerfile-Producer-test:
--------------------------------------------------------------------------------
1 | FROM lp/producer:latest
2 |
3 | ENV PYTHONUNBUFFERED=1
4 |
5 | RUN pip install --no-cache-dir pytest==8.4.1 pytest-mock==3.14.1 clickhouse-connect==0.8.18
6 |
--------------------------------------------------------------------------------
/producer/config.py:
--------------------------------------------------------------------------------
1 | import os
2 | from enum import Enum
3 |
4 | from dotenv import load_dotenv
5 | from schema_registry import avro_serializer, uuid_serializer
6 |
7 |
8 | load_dotenv()
9 |
10 |
11 | class Events(str, Enum):
12 | ADD_TO_CART = 'ADD_TO_CART'
13 | CHECKOUT = 'CHECKOUT'
14 | PAYMENT = 'PAYMENT'
15 | SEARCH = 'SEARCH'
16 | VIEW_PRODUCT = 'VIEW_PRODUCT'
17 |
18 |
19 | class Status(str, Enum):
20 | SUCCESS = 'SUCCESS'
21 | ERROR = 'ERROR'
22 |
23 |
24 | NUM_WORKERS = 1
25 | KAFKA_TOPIC = os.environ['KAFKA_TOPIC']
26 | EVENT_INTERVAL_SECONDS = 0.01
27 | NEW_USER_SESSION_PROBABILITY = 0.01
28 |
29 | PRODUCER_CONF = {
30 | 'acks': 'all',
31 | 'batch.size': 32768, # 32 KB
32 | 'linger.ms': 20,
33 | 'bootstrap.servers': os.environ['KAFKA_BOOTSTRAP_SERVERS'],
34 | 'compression.type': 'snappy',
35 | 'key.serializer': uuid_serializer,
36 | 'value.serializer': avro_serializer,
37 | }
38 |
--------------------------------------------------------------------------------
/producer/custom_types.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from typing import TypedDict
4 |
5 |
6 | class Event(TypedDict):
7 | event_id: str
8 | user_id: str
9 | session_id: str
10 | event_type: str
11 | event_timestamp: int
12 | request_latency_ms: int
13 | status: str
14 | error_code: int | None
15 | product_id: int | None
16 |
--------------------------------------------------------------------------------
/producer/producer.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import logging
4 | import os
5 | import random
6 | import sys
7 | import time
8 | import uuid
9 | from multiprocessing import Process
10 | from uuid import UUID
11 |
12 | from confluent_kafka import Message
13 | from confluent_kafka.error import KafkaError, KafkaException, ValueSerializationError
14 | from confluent_kafka.serializing_producer import SerializingProducer
15 |
16 | from config import EVENT_INTERVAL_SECONDS, Events, Status, NUM_WORKERS, NEW_USER_SESSION_PROBABILITY, PRODUCER_CONF, KAFKA_TOPIC
17 | from custom_types import Event
18 |
19 |
20 | logger = logging.getLogger(__name__)
21 |
22 |
23 | def generate_event(user_id: UUID, session_id: UUID) -> Event:
24 | """Generate a user event dictionary.
25 |
26 | Args:
27 | user_id: The UUID of the user.
28 | session_id: The UUID of the session.
29 |
30 | Returns:
31 | A dictionary representing the event log.
32 | """
33 | error_probability = random.uniform(0, 0.5)
34 | has_error = random.random() < error_probability
35 | event_type = random.choice(list(Events))
36 |
37 | return {
38 | 'event_id': str(uuid.uuid4()),
39 | 'user_id': str(user_id),
40 | 'session_id': str(session_id),
41 | 'event_type': event_type,
42 |
43 | 'event_timestamp': int(time.time() * 1000),
44 | # time.time() unit is second (in UTC). Avro timestamp-millis expects milliseconds. Multiplying by 1000 is the unit conversion.
45 | # ClickHouse column is set DateTime64(3, 'UTC') to store and display the value in UTC with millisecond precision.
46 |
47 | 'request_latency_ms': random.randint(50, 1500),
48 | 'status': Status.ERROR if has_error else Status.SUCCESS,
49 | 'error_code': random.randint(400, 599) if has_error else None,
50 | 'product_id': random.randint(1, 10000) if event_type in {Events.VIEW_PRODUCT, Events.ADD_TO_CART} else None
51 | }
52 |
53 |
54 | def delivery_report(err: KafkaError | None, msg: Message) -> None:
55 | """Report delivery failures.
56 |
57 | Args:
58 | err: KafkaError on failure; None on success.
59 | msg: The Message containing topic/partition/offset metadata (on success), and the original key/value.
60 | """
61 | if err is not None:
62 | try:
63 | code = err.code()
64 | reason = err.str()
65 | except Exception:
66 | code = 'unknown'
67 | reason = str(err)
68 | logger.error(
69 | 'Delivery failed: topic=%s, partition=%s, key=%s, error_code=%s, reason=%s',
70 | msg.topic(),
71 | msg.partition(),
72 | msg.key(),
73 | code,
74 | reason,
75 | )
76 |
77 |
78 | def worker(worker_id: int, max_messages: int | None = None) -> None:
79 | """Continuously generate data and send it to Kafka.
80 |
81 | Args:
82 | worker_id: A unique identifier for the worker process.
83 | max_messages: If provided, the worker will stop after producing this many messages. Used for testing.
84 | """
85 | logger.info('Starting worker %d (PID: %d)', worker_id, os.getpid())
86 | producer = SerializingProducer(PRODUCER_CONF)
87 |
88 | user_id = uuid.uuid4()
89 | session_id = uuid.uuid4()
90 | count = 0
91 | time_start = time.time()
92 |
93 | while True if max_messages is None else count < max_messages:
94 | count += 1
95 | user_event = generate_event(user_id, session_id)
96 | if count % 1000 == 0:
97 | time_sofar = time.time() - time_start
98 | logger.info('Worker %d produced %d messages in %f seconds with an average speed of %.2f MPS.', worker_id, count, time_sofar, count / time_sofar)
99 | try:
100 | producer.produce(
101 | topic=KAFKA_TOPIC,
102 | key=user_id,
103 | value=user_event,
104 | on_delivery=delivery_report
105 | )
106 | producer.poll(0)
107 | except BufferError:
108 | logger.info('Worker %d: Producer buffer full. Polling for 1s before retrying...', worker_id)
109 | producer.poll(1)
110 | except ValueSerializationError:
111 | logger.exception('Worker %d: Message serialization failed:', worker_id)
112 | except KafkaException:
113 | logger.exception('Worker %d: Kafka error:', worker_id)
114 | except Exception:
115 | logger.exception('Worker %d: Unexpected error occurred.', worker_id)
116 | producer.poll(5)
117 |
118 | if random.random() < NEW_USER_SESSION_PROBABILITY:
119 | user_id = uuid.uuid4()
120 | session_id = uuid.uuid4()
121 |
122 | producer.poll(EVENT_INTERVAL_SECONDS)
123 |
124 | if max_messages:
125 | logger.info('Worker %d: Loop done. Flushing producer...', worker_id)
126 | while remaining_messages := producer.flush(timeout=1):
127 | logger.info('Worker %d: %d messages still in queue after flush.', worker_id, remaining_messages)
128 | logger.info('Worker %d: All messages flushed successfully.', worker_id)
129 |
130 |
131 | if __name__ == '__main__':
132 | logging.basicConfig(
133 | level=logging.INFO,
134 | format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
135 | stream=sys.stdout,
136 | )
137 |
138 | processes = []
139 | logger.info('Spawning %d worker processes...', NUM_WORKERS)
140 | for i in range(NUM_WORKERS):
141 | p = Process(target=worker, args=(i + 1,))
142 | processes.append(p)
143 | p.start()
144 |
145 | try:
146 | for p in processes:
147 | p.join()
148 | except KeyboardInterrupt:
149 | logger.info('Shutdown signal received. Terminating workers.')
150 | for p in processes:
151 | p.terminate()
152 |
--------------------------------------------------------------------------------
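With the stack from docker-compose.yml running, a couple of quick manual checks on the producer path are possible. A sketch, assuming the `clickhouse` service name and the credentials/table from the env/ files above (adjust if they differ):

    docker compose logs -f producer
    docker compose exec clickhouse clickhouse-client \
      --user default --password password \
      -q 'SELECT count() FROM user_interactions'
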
/producer/schema_registry.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import os
4 | from uuid import UUID
5 |
6 | from confluent_kafka import avro
7 | from confluent_kafka.schema_registry import SchemaRegistryClient
8 | from confluent_kafka.schema_registry.avro import AvroSerializer, AvroDeserializer
9 | from confluent_kafka.serialization import SerializationContext
10 | from dotenv import load_dotenv
11 |
12 |
13 | def uuid_serializer(uuid_obj: UUID, _: SerializationContext | None) -> bytes | None:
14 | """Serialize a uuid object to bytes."""
15 | if not uuid_obj:
16 | return None
17 | if not isinstance(uuid_obj, UUID):
18 |         msg = f'Expected a UUID object, got {type(uuid_obj)}'
19 | raise TypeError(msg)
20 | return uuid_obj.bytes
21 |
22 |
23 | load_dotenv()
24 |
25 | schema = str(avro.load('user_event_schema.avsc'))
26 | sr_config = {'url': os.environ['SCHEMA_REGISTRY_URL']}
27 | serializer_config = {'auto.register.schemas': True}
28 | sr_client = SchemaRegistryClient(sr_config)
29 | avro_serializer = AvroSerializer(schema_registry_client=sr_client, schema_str=schema, conf=serializer_config)
30 | avro_deserializer = AvroDeserializer(schema_registry_client=sr_client, schema_str=schema)
31 |
--------------------------------------------------------------------------------
/producer/tests/conftest.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import os
3 | import sys
4 | from socket import gaierror
5 |
6 | import clickhouse_connect
7 | import pytest
8 | from clickhouse_connect.driver.client import Client
9 | from clickhouse_connect.driver.exceptions import ClickHouseError
10 |
11 |
12 | logging.basicConfig(
13 | level=logging.INFO,
14 | format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
15 | stream=sys.stdout,
16 | )
17 |
18 |
19 | @pytest.fixture
20 | def clickhouse_client() -> Client:
21 | """Establish a connection to ClickHouse.
22 |
23 | Returns:
24 | ClickHouse client.
25 | """
26 | try:
27 | client = clickhouse_connect.get_client(
28 | host=os.environ['CLICKHOUSE_HOST'],
29 | port=int(os.environ['CLICKHOUSE_PORT']),
30 | user=os.environ['CLICKHOUSE_USER'],
31 | password=os.environ['CLICKHOUSE_PASSWORD'],
32 | database=os.environ['CLICKHOUSE_DB']
33 | )
34 | client.ping()
35 | return client
36 | except (ConnectionRefusedError, gaierror) as e:
37 | pytest.fail(f'Could not connect to ClickHouse due to a network error: {e}')
38 | except ClickHouseError as e:
39 | pytest.fail(f'A ClickHouse server error occurred during connection: {e}')
40 | except Exception as e:
41 | pytest.fail(f'An unexpected error occurred while connecting to ClickHouse: {type(e).__name__} - {e}')
42 |
--------------------------------------------------------------------------------
/producer/tests/test_integration.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import os
3 | import time
4 | from datetime import datetime
5 | import pytest
6 | from confluent_kafka import DeserializingConsumer
7 |
8 | from config import Events, Status
9 | from producer import worker
10 | from schema_registry import avro_deserializer
11 |
12 |
13 | logger = logging.getLogger(__name__)
14 | TABLE = os.environ['CLICKHOUSE_TABLE']
15 |
16 |
17 | def test_produces_to_clickhouse_pipeline(clickhouse_client):
18 | """Test the pipeline: Producer -> Kafka -> Connect -> ClickHouse."""
19 | num_test_messages = 2
20 |
21 | clickhouse_client.command(f'TRUNCATE TABLE {TABLE}')
22 |
23 | worker(worker_id=101, max_messages=num_test_messages)
24 |
25 | poll_interval_seconds = 2 # Takes time for messages to reach ClickHouse
26 | start = time.time()
27 | end_time = poll_interval_seconds * num_test_messages + start
28 | final_count = 0
29 | while time.time() < end_time:
30 | try:
31 | final_count = clickhouse_client.query(f'SELECT count() FROM {TABLE};').result_rows[0][0]
32 | if final_count >= num_test_messages:
33 | break
34 | logger.info(f'Found {final_count}/{num_test_messages} rows. Waiting...')
35 | time.sleep(poll_interval_seconds)
36 | except Exception as e:
37 | logger.info(f'An error occurred while polling ClickHouse: {e}. Retrying...')
38 | time.sleep(poll_interval_seconds)
39 |
40 | assert final_count == num_test_messages, f'Expected {num_test_messages} rows, but found {final_count}.'
41 |
42 | row = clickhouse_client.query('SELECT event_type, status, event_timestamp FROM %(table)s LIMIT 1', parameters={'table': TABLE}).result_rows[0]
43 | assert isinstance(row[0], str)
44 | assert row[0] in Events.__members__
45 | assert isinstance(row[1], str)
46 | assert row[1] in Status.__members__
47 | assert isinstance(row[2], datetime), f'Timestamp should be a datetime object, but got {type(row[2])}'
48 |
49 |
50 | def test_producer_worker_sends_valid_avro_messages(clickhouse_client):
51 | """Verify the worker function produces valid Avro messages to the Kafka topic."""
52 | kafka_consumer_conf = {
53 | 'bootstrap.servers': os.environ['KAFKA_BOOTSTRAP_SERVERS'],
54 | 'group.id': 'test-integration-consumer-group',
55 | 'auto.offset.reset': 'earliest',
56 | 'value.deserializer': avro_deserializer,
57 | }
58 |
59 | max_messages = 3
60 | worker(worker_id=99, max_messages=max_messages)
61 |
62 | consumer = DeserializingConsumer(conf=kafka_consumer_conf)
63 | consumer.subscribe([os.environ['KAFKA_TOPIC']])
64 | consumed_messages = []
65 | try:
66 | while len(consumed_messages) < max_messages:
67 | msg = consumer.poll(timeout=1)
68 |
69 | if msg is None:
70 | pytest.fail(reason=f'Timed out waiting for messages. Received {len(consumed_messages)} out of {max_messages}.')
71 |
72 | if msg.error():
73 | pytest.fail(reason=f'Consumer error: {msg.error()}')
74 |
75 | consumed_messages.append(msg.value())
76 | finally:
77 | consumer.close()
78 |
79 | assert len(consumed_messages) == max_messages
80 |
81 | first_event = consumed_messages[0]
82 | assert isinstance(first_event, dict)
83 | assert 'event_id' in first_event
84 | assert isinstance(first_event['event_type'], str)
85 | assert first_event['status'] in Status.__members__
86 |
87 | clickhouse_client.command(f'TRUNCATE TABLE {TABLE}')
88 |
--------------------------------------------------------------------------------
/producer/tests/test_unit.py:
--------------------------------------------------------------------------------
1 | import random
2 | import uuid
3 | from unittest.mock import call
4 | from uuid import UUID
5 |
6 | import pytest
7 | from confluent_kafka.error import KafkaException, ValueSerializationError
8 |
9 | from config import EVENT_INTERVAL_SECONDS, KAFKA_TOPIC, Events, Status
10 | from custom_types import Event
11 | from producer import delivery_report, generate_event, worker
12 | from schema_registry import uuid_serializer
13 |
14 |
15 | COLUMNS = Event.__annotations__.keys()
16 |
17 |
18 | def test_generate_event_with_productid_relevant_eventtype_no_error(mocker):
19 | """Test that generate_event produces a dictionary that has product_id with relevant event_type and no error."""
20 | epoch = 1000000
21 |     rand_int = 123
22 | event_type = 'VIEW_PRODUCT'
23 | mocker.patch('random.choice', return_value=event_type)
24 | mocker.patch('random.random', return_value=1)
25 |     mocker.patch('random.randint', return_value=rand_int)
26 | mocker.patch('time.time', return_value=epoch)
27 | user_id = uuid.uuid4()
28 | session_id = uuid.uuid4()
29 |
30 | event = generate_event(user_id, session_id)
31 |
32 | assert isinstance(event, dict)
33 | assert event.keys() == COLUMNS
34 | assert event['user_id'] == str(user_id)
35 | assert event['session_id'] == str(session_id)
36 | assert event['event_type'] == event_type
37 | assert event['event_timestamp'] == epoch * 1000
38 |     assert event['request_latency_ms'] == rand_int
39 | assert event['status'] == Status.SUCCESS
40 | assert event['error_code'] is None
41 |     assert event['product_id'] == rand_int
42 |
43 |
44 | def test_generate_event_without_productid_nonrelevant_eventtype_no_error(mocker):
45 | """Test that generate_event produces a dictionary that doesn't have product_id with non-relevant event_type and no error."""
46 | event_type = Events.SEARCH
47 | mocker.patch('random.choice', return_value=event_type)
48 | mocker.patch('random.random', return_value=1)
49 | user_id = uuid.uuid4()
50 | session_id = uuid.uuid4()
51 |
52 | event = generate_event(user_id, session_id)
53 |
54 | assert isinstance(event, dict)
55 | assert event.keys() == COLUMNS
56 | assert event['event_type'] == event_type
57 | assert event['status'] == Status.SUCCESS
58 | assert event['error_code'] is None
59 | assert event['product_id'] is None
60 |
61 |
62 | def test_generate_event_without_productid_relevant_eventtype_error(mocker):
63 | """Test that generate_event produces a dictionary that has product_id with relevant event_type and has error."""
64 | error_code = 503
65 | event_type = Events.VIEW_PRODUCT
66 | latency = 100
67 | product_id = 1000
68 | mocker.patch('random.choice', return_value=event_type)
69 | mocker.patch('random.random', return_value=-1)
70 | mocker.patch('random.randint', side_effect=[latency, error_code, product_id])
71 |
72 | event = generate_event(uuid.uuid4(), uuid.uuid4())
73 |
74 | assert event['event_type'] == event_type
75 | assert event['status'] == Status.ERROR
76 | assert event['request_latency_ms'] == latency
77 | assert event['error_code'] == error_code
78 | assert event['product_id'] == product_id
79 |
80 |
81 | def test_uuid_serializer_success():
82 | """Test that the UUID serializer correctly converts a UUID to bytes."""
83 | test_uuid = uuid.uuid4()
84 | assert uuid_serializer(uuid_obj=test_uuid, _=None) == test_uuid.bytes
85 |
86 |
87 | def test_uuid_serializer_invalid_uuid_failure():
88 | """Test that the UUID serializer raises a TypeError for invalid uuid input."""
89 | with pytest.raises(TypeError):
90 | uuid_serializer(uuid_obj='not-a-uuid', _=None)
91 |
92 |
93 | def test_uuid_serializer_None_type_failure():
94 | """Test that the UUID serializer returns None for None input."""
95 | assert uuid_serializer(None, None) is None
96 |
97 |
98 | def test_worker_produces_messages(mocker):
99 | """Test the worker function's core logic of producing a set number of messages."""
100 | user_id = UUID('123e4567-e89b-12d3-a456-426614174000')
101 | user_event = {'event_id': 'test-event'}
102 | max_messages = 2
103 | mocker.patch('producer.generate_event', return_value=user_event)
104 | mocker.patch('random.random', side_effect=[random.random(), 1] * max_messages)
105 | mocker.patch('uuid.uuid4', return_value=user_id)
106 |
107 | mock_producer = mocker.Mock()
108 | mocker.patch('producer.SerializingProducer', return_value=mock_producer, autospec=True)
109 | mock_producer.flush.return_value = 0
110 |
111 | worker(worker_id=102, max_messages=max_messages)
112 |
113 | assert mock_producer.produce.call_count == max_messages
114 |
115 | calls_list = mock_producer.produce.call_args_list
116 | assert all(
117 | call == mocker.call(
118 | topic=KAFKA_TOPIC,
119 | key=user_id,
120 | value=user_event,
121 | on_delivery=delivery_report
122 | )
123 | for call in calls_list
124 | )
125 |
126 | first_call_args = calls_list[0]
127 | assert first_call_args.kwargs['topic'] == KAFKA_TOPIC
128 | assert first_call_args.kwargs['key'] == user_id
129 | assert first_call_args.kwargs['value'] == user_event
130 | assert first_call_args.kwargs['on_delivery'] is not None
131 |
132 |
133 | def test_worker_polls_and_handles_buffer_error(mocker):
134 | """Test that the worker polls correctly and handles BufferError."""
135 | mock_producer = mocker.Mock()
136 | mock_producer.produce.side_effect = [None, BufferError]
137 | mocker.patch('producer.SerializingProducer', return_value=mock_producer)
138 | mock_producer.flush.return_value = 0
139 |
140 | worker(worker_id=101, max_messages=2)
141 |
142 | assert mock_producer.produce.call_count == 2
143 |
144 | assert mock_producer.poll.call_args_list == [
145 | mocker.call(0),
146 | mocker.call(EVENT_INTERVAL_SECONDS),
147 | mocker.call(1),
148 | mocker.call(EVENT_INTERVAL_SECONDS),
149 | ]
150 |
151 |
152 | def test_worker_survives_serialization_error_and_logs_exception(mocker):
153 | """Verify that, given a ValueSerializationError, the worker logs the exception and gracefully finishes its execution loop without crashing."""
154 | mock_producer = mocker.Mock()
155 | mocker.patch('producer.SerializingProducer', return_value=mock_producer)
156 | mocker.patch('producer.generate_event', return_value={'event_id': 'bad-data'})
157 | mock_producer.produce.side_effect = ValueSerializationError('Invalid Avro schema')
158 | mock_producer.flush.return_value = 0
159 | logger_exception_mock = mocker.patch('producer.logger.exception')
160 |
161 | worker_id = 103
162 | worker(worker_id=worker_id, max_messages=1)
163 |
164 | mock_producer.produce.assert_called_once()
165 | assert mocker.call(0) not in mock_producer.poll.call_args_list
166 |
167 | logger_exception_mock.assert_called_once()
168 | log_call_args = logger_exception_mock.call_args[0]
169 | assert 'Message serialization failed:' in log_call_args[0]
170 | assert log_call_args[1] == worker_id
171 |
172 |
173 | def test_worker_survives_kafka_exception_and_logs_it(mocker):
174 | """Verify that, given a KafkaException during produce, the worker logs the exception and continues without crashing."""
175 | mock_producer = mocker.Mock()
176 | mocker.patch('producer.SerializingProducer', return_value=mock_producer)
177 | mocker.patch('producer.generate_event', return_value={'event_id': 'event'})
178 | mock_producer.produce.side_effect = KafkaException("Broker is down")
179 | mock_producer.flush.return_value = 0
180 | logger_exception_mock = mocker.patch('producer.logger.exception')
181 |
182 | worker(worker_id=104, max_messages=1)
183 |
184 | mock_producer.produce.assert_called_once()
185 | logger_exception_mock.assert_called_once()
186 | log_call_args = logger_exception_mock.call_args[0]
187 | assert 'Kafka error:' in log_call_args[0]
188 | assert mocker.call(0) not in mock_producer.poll.call_args_list
189 |
190 |
191 | def test_worker_pauses_on_unexpected_exception(mocker):
192 | """Verify that, given an unexpected RuntimeError, the worker logs the exception and pauses for 5 seconds, regardless of other pauses in the loop."""
193 | mock_producer = mocker.Mock()
194 | mocker.patch('producer.SerializingProducer', return_value=mock_producer)
195 | mocker.patch('producer.generate_event', return_value={'event_id': 'event'})
196 | mock_producer.produce.side_effect = RuntimeError('Something completely unexpected happened')
197 | mock_producer.flush.return_value = 0
198 | logger_exception_mock = mocker.patch('producer.logger.exception')
199 |
200 | worker_id = 105
201 | worker(worker_id=worker_id, max_messages=1)
202 |
203 | mock_producer.produce.assert_called_once()
204 |
205 | logger_exception_mock.assert_called_once()
206 | call_args, _ = logger_exception_mock.call_args
207 | assert call_args[0] == 'Worker %d: Unexpected error occurred.'
208 | assert call_args[1] == worker_id
209 |
210 | mock_producer.poll.assert_any_call(5)
211 |
212 | assert len(mock_producer.poll.call_args_list) == 2
213 | assert mock_producer.poll.call_args_list[0] == call(5)
214 |
--------------------------------------------------------------------------------
/producer/user_event_schema.avsc:
--------------------------------------------------------------------------------
1 | {
2 | "namespace": "com.ecommerce.events.v1",
3 | "type": "record",
4 | "name": "user_interactions",
5 | "doc": "Defines a single user interaction event.",
6 | "fields": [
7 | {"name": "event_id", "type": "string", "logicalType": "uuid"},
8 | {"name": "user_id", "type": "string", "logicalType": "uuid"},
9 | {"name": "session_id", "type": "string", "logicalType": "uuid"},
10 | {
11 | "name": "event_type",
12 | "type": {
13 | "type": "enum",
14 | "name": "EventType",
15 | "symbols": ["VIEW_PRODUCT", "ADD_TO_CART", "CHECKOUT", "PAYMENT", "SEARCH"]
16 | }
17 | },
18 | {"name": "event_timestamp", "type": "long", "logicalType": "timestamp-millis"},
19 | {"name": "request_latency_ms", "type": "int"},
20 | {
21 | "name": "status",
22 | "type": {
23 | "type": "enum",
24 | "name": "StatusType",
25 | "symbols": ["SUCCESS", "ERROR"]
26 | }
27 | },
28 | {"name": "error_code", "type": ["null", "int"], "default": null},
29 | {"name": "product_id", "type": ["null", "int"], "default": null}
30 | ]
31 | }
--------------------------------------------------------------------------------
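Once the producer has registered this schema, it can be inspected through the Schema Registry REST API. A sketch, assuming the default TopicNameStrategy (subject `user_interactions-value`) and a shell on the compose network, e.g. via `docker compose exec connect bash`:

    curl -s http://schema-registry:8081/subjects
    curl -s http://schema-registry:8081/subjects/user_interactions-value/versions/latest
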
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [tool.ruff.per-file-ignores]
2 | 'test_*.py' = ['ANN001', 'ANN002', 'ANN003', 'ANN202', 'ANN201', 'ARG001', 'ARG005', 'BLE001', 'DOC201', 'INP001', 'N802', 'PLR2004', 'S101', 'S105', 'S106', 'S608']
3 | '*.py' = ["INP001"]
--------------------------------------------------------------------------------
/spark/Dockerfile-Spark:
--------------------------------------------------------------------------------
1 | FROM apache/spark:3.5.6-scala2.12-java11-python3-ubuntu
2 |
3 | USER root
4 |
5 | RUN pip install --no-cache-dir \
6 | fastapi==0.116.1 \
7 | uvicorn[standard]==0.33.0 \
8 | pydantic==2.10.6 \
9 | requests==2.32.4 \
10 | pyspark==3.5.6 \
11 | pandas==2.0.3 \
12 | pyarrow==17.0.0 \
13 | grpcio==1.70.0 \
14 | protobuf==5.29.5 \
15 | grpcio-status==1.70.0 \
16 | minio==7.2.10
17 |
18 | WORKDIR /opt/spark/work-dir
19 |
20 | ARG HADOOP_AWS_VERSION=3.3.4
21 | ARG AWS_JAVA_SDK_VERSION=1.12.262
22 |
23 | RUN apt-get update && apt-get install -y wget && \
24 | mkdir -p /opt/spark/jars && \
25 | wget -P /opt/spark/jars https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/${HADOOP_AWS_VERSION}/hadoop-aws-${HADOOP_AWS_VERSION}.jar && \
26 | wget -P /opt/spark/jars https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/${AWS_JAVA_SDK_VERSION}/aws-java-sdk-bundle-${AWS_JAVA_SDK_VERSION}.jar && \
27 | apt-get remove -y wget && \
28 | apt-get autoremove -y && \
29 | rm -rf /var/lib/apt/lists/*
30 |
31 | USER spark
32 |
33 | CMD ["tail", "-f", "/dev/null"]
--------------------------------------------------------------------------------
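The hadoop-aws and aws-java-sdk jars baked into the image above are what let Spark read `s3a://` paths from MinIO. A hypothetical manual submission of the batch job against the standalone master, with endpoint and credentials taken from the env/ files (the Airflow DAG normally drives this, and the object name here is only an example):

    docker compose exec spark-master /opt/spark/bin/spark-submit \
      --master spark://spark-master:7077 \
      --conf spark.hadoop.fs.s3a.endpoint=http://minio:9000 \
      --conf spark.hadoop.fs.s3a.access.key=admin \
      --conf spark.hadoop.fs.s3a.secret.key=adminpsw \
      --conf spark.hadoop.fs.s3a.path.style.access=true \
      --conf spark.hadoop.fs.s3a.connection.ssl.enabled=false \
      /opt/airflow/dags/spark.py s3a://user-interactions/2025-01-15_10-00.parquet
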
/spark/Dockerfile-Spark-test:
--------------------------------------------------------------------------------
1 | FROM lp/spark:latest
2 |
3 | USER root
4 |
5 | RUN pip install --no-cache-dir pytest==8.3.5 pytest-mock==3.14.1
6 |
7 | USER spark
8 |
--------------------------------------------------------------------------------
/spark/tests/test_spark.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import json
4 | import os
5 | import tempfile
6 | from datetime import datetime
7 | from uuid import uuid4
8 |
9 | import pandas as pd
10 | import pyarrow as pa
11 | import pyarrow.parquet as pq
12 | import pytest
13 | from minio import Minio
14 | from pyspark.sql import SparkSession
15 |
16 | from spark import analyze_events, main
17 |
18 |
19 | MINIO_BUCKET_NAME = os.environ['MINIO_BUCKET_NAME']
20 | NUM_ERROR = 3
21 | NUM_SUCCESS = 17
22 | EVENTS = {'VIEW_PRODUCT', 'ADD_TO_CART', 'CHECKOUT', 'PAYMENT', 'SEARCH'}
23 | SCHEMA = pa.schema([
24 | pa.field('event_id', pa.string()),
25 | pa.field('user_id', pa.string()),
26 | pa.field('session_id', pa.string()),
27 | pa.field('event_type', pa.string()),
28 | pa.field('event_timestamp', pa.timestamp('ms', tz='Asia/Tehran')),
29 | pa.field('request_latency_ms', pa.int32()),
30 | pa.field('status', pa.string()),
31 | pa.field('error_code', pa.int32(), nullable=True),
32 | pa.field('product_id', pa.int32(), nullable=True),
33 | ])
34 |
35 |
36 | @pytest.fixture(scope='module')
37 | def spark():
38 | """Create a SparkSession for integration testing."""
39 | spark_session = SparkSession.builder \
40 | .appName('TestEventAnalysis') \
41 | .master('spark://spark-master:7077') \
42 | .config('spark.driver.host', 'spark-test-runner') \
43 | .config('spark.hadoop.fs.s3a.endpoint', 'http://minio:9000') \
44 | .config('spark.hadoop.fs.s3a.access.key', os.environ['MINIO_ROOT_USER']) \
45 | .config('spark.hadoop.fs.s3a.secret.key', os.environ['MINIO_ROOT_PASSWORD']) \
46 | .config('spark.hadoop.fs.s3a.path.style.access', 'true') \
47 | .config('spark.hadoop.fs.s3a.impl', 'org.apache.hadoop.fs.s3a.S3AFileSystem') \
48 | .config('spark.hadoop.fs.s3a.connection.ssl.enabled', 'false') \
49 | .getOrCreate()
50 |
51 | yield spark_session
52 | spark_session.stop()
53 |
54 |
55 | @pytest.fixture
56 | def minio_client() -> Minio:
57 | """Create a real MinIO client for integration testing."""
58 | minio_client = Minio(
59 | endpoint='minio:9000',
60 | access_key=os.environ['MINIO_ROOT_USER'],
61 | secret_key=os.environ['MINIO_ROOT_PASSWORD'],
62 | secure=False
63 | )
64 | return minio_client
65 |
66 |
67 | @pytest.fixture
68 | def parquet_file(minio_client):
69 | """Create a test parquet file in MinIO and yield its S3 path."""
70 | timestamp = datetime(2025, 1, 15, 10, 0)
71 | timestamp_str = timestamp.strftime('%Y-%m-%d_%H-%M')
72 | object_name = f'{timestamp_str}.parquet'
73 |
74 | test_data = []
75 |
76 | for event_type in EVENTS:
77 | test_data.extend(
78 | {
79 | 'event_id': str(uuid4()),
80 | 'user_id': str(uuid4()),
81 | 'session_id': str(uuid4()),
82 | 'event_type': event_type,
83 | 'event_timestamp': timestamp,
84 | 'request_latency_ms': 50,
85 | 'status': 'ERROR',
86 | 'error_code': 500,
87 | 'product_id': 1000 if event_type in {'VIEW_PRODUCT', 'ADD_TO_CART'} else None,
88 | }
89 | for _ in range(NUM_ERROR)
90 | )
91 | test_data.extend(
92 | {
93 | 'event_id': str(uuid4()),
94 | 'user_id': str(uuid4()),
95 | 'session_id': str(uuid4()),
96 | 'event_type': event_type,
97 | 'event_timestamp': timestamp,
98 | 'request_latency_ms': 50,
99 | 'status': 'SUCCESS',
100 | 'error_code': None,
101 | 'product_id': 1000 if event_type in {'VIEW_PRODUCT', 'ADD_TO_CART'} else None,
102 | }
103 | for _ in range(NUM_SUCCESS)
104 | )
105 |
106 | df = pd.DataFrame(test_data)
107 |
108 | with tempfile.NamedTemporaryFile(suffix='.parquet') as tmp:
109 | table = pa.Table.from_pandas(df, schema=SCHEMA)
110 | pq.write_table(table, tmp.name)
111 |
112 | minio_client.fput_object(
113 | bucket_name=MINIO_BUCKET_NAME,
114 | object_name=object_name,
115 | file_path=tmp.name
116 | )
117 |
118 | s3_path = f's3a://{MINIO_BUCKET_NAME}/{object_name}'
119 |
120 | yield s3_path
121 |
122 | minio_client.remove_object(MINIO_BUCKET_NAME, object_name)
123 |
124 |
125 | def test_spark_analyze_events_with_data(spark: SparkSession, parquet_file: str) -> None:
126 | """Test `analyze_events` with real data."""
127 | result = analyze_events(spark=spark, file_path=parquet_file)
128 |
129 | assert result['total_events'] == len(EVENTS) * (NUM_ERROR + NUM_SUCCESS)
130 | assert result['total_errors'] == len(EVENTS) * NUM_ERROR
131 |
132 | for event_type, stats in result['by_event_type'].items():
133 | assert event_type in EVENTS
134 | assert stats['SUCCESS'] == NUM_SUCCESS
135 | assert stats['ERROR'] == NUM_ERROR
136 |
137 |
138 | def test_spark_analyze_events_empty_file(spark: SparkSession, minio_client: Minio) -> None:
139 | """Test `analyze_events` with an empty parquet file."""
140 | object_name = 'empty-test.parquet'
141 |
142 | empty_df = pd.DataFrame(
143 | columns=[
144 | 'event_id', 'user_id', 'session_id', 'event_type',
145 | 'event_timestamp', 'request_latency_ms', 'status',
146 | 'error_code', 'product_id'
147 | ]
148 | )
149 |
150 | with tempfile.NamedTemporaryFile(suffix='.parquet') as tmp:
151 | table = pa.Table.from_pandas(empty_df, schema=SCHEMA)
152 | pq.write_table(table, tmp.name)
153 |
154 | minio_client.fput_object(
155 | bucket_name=MINIO_BUCKET_NAME,
156 | object_name=object_name,
157 | file_path=tmp.name
158 | )
159 |
160 | s3_path = f's3a://{MINIO_BUCKET_NAME}/{object_name}'
161 |
162 | try:
163 | result = analyze_events(spark=spark, file_path=s3_path)
164 |
165 | assert result['total_events'] == 0
166 | assert result['total_errors'] == 0
167 | assert result['by_event_type'] == {}
168 | finally:
169 | minio_client.remove_object(MINIO_BUCKET_NAME, object_name)
170 |
171 |
172 | def test_spark_main_with_data(mocker, minio_client: Minio, parquet_file: str) -> None:
173 | """Test the main function of spark.py with real data."""
174 | mocker.patch('sys.argv', ['spark.py', parquet_file])
175 |
176 | with pytest.raises(SystemExit) as exc_info:
177 | main()
178 |
179 | assert exc_info.value.code == 0
180 |
181 | json_object_name = parquet_file.split(os.sep)[-1].replace('.parquet', '.json')
182 | try:
183 | response = minio_client.get_object(MINIO_BUCKET_NAME, json_object_name)
184 | result_data = json.loads(response.read())
185 |
186 | report = result_data['report']
187 | assert report['total_events'] == len(EVENTS) * (NUM_ERROR + NUM_SUCCESS)
188 | assert report['total_errors'] == len(EVENTS) * NUM_ERROR
189 | finally:
190 | response.close()
191 | response.release_conn()
192 |
193 |
194 | def test_spark_main_no_data(mocker, minio_client: Minio) -> None:
195 | """Test spark `main` function with no parquet file."""
196 | timestamp_str = '2025-01-15_11-00'
197 | s3_path = f's3a://{MINIO_BUCKET_NAME}/{timestamp_str}'
198 |
199 | mocker.patch('sys.argv', ['spark.py', s3_path])
200 |
201 | with pytest.raises(SystemExit) as exc_info:
202 | main()
203 |
204 | assert exc_info.value.code == 0
205 |
206 | json_object_name = f'{timestamp_str}.json'
207 |
208 | try:
209 | response = minio_client.get_object(MINIO_BUCKET_NAME, json_object_name)
210 | result_data = json.loads(response.read())
211 |
212 | assert 'report' in result_data
213 | assert result_data['report'] == f'No data for {timestamp_str}.'
214 | finally:
215 | response.close()
216 | response.release_conn()
217 | minio_client.remove_object(MINIO_BUCKET_NAME, json_object_name)
218 |
--------------------------------------------------------------------------------
/tests/docker-compose.test.airflow.yml:
--------------------------------------------------------------------------------
1 | x-airflow-common:
2 | &airflow-common
3 | build:
4 | context: ./airflow
5 | dockerfile: Dockerfile-Airflow
6 | depends_on:
7 | &airflow-common-depends-on
8 | postgres:
9 | condition: service_healthy
10 | redis:
11 | condition: service_healthy
12 | env_file:
13 | - ./tests/env-test/clickhouse.env
14 | - ./tests/env-test/minio.env
15 | - ./tests/env-test/minio.creds
16 | - ./tests/env-test/spark.env
17 | environment:
18 | &airflow-common-env
19 | AIRFLOW__API__SECRET_KEY: ${AIRFLOW__API__SECRET_KEY}
20 | AIRFLOW__CELERY__BROKER_URL: redis://:@redis:6379/0
21 | AIRFLOW__CELERY__RESULT_BACKEND: db+postgresql://${POSTGRES_USER}:${POSTGRES_PASSWORD}@postgres/${POSTGRES_DB}
22 | AIRFLOW__CORE__AUTH_MANAGER: airflow.providers.fab.auth_manager.fab_auth_manager.FabAuthManager
23 | AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION: 'false'
24 | AIRFLOW__CORE__EXECUTION_API_SERVER_URL: 'http://airflow-apiserver:8080/execution/'
25 | AIRFLOW__CORE__EXECUTOR: CeleryExecutor
26 | AIRFLOW__CORE__FERNET_KEY: ${AIRFLOW__CORE__FERNET_KEY}
27 | AIRFLOW__CORE__LOAD_EXAMPLES: 'false'
28 | AIRFLOW__DATABASE__SQL_ALCHEMY_CONN: postgresql+psycopg2://${POSTGRES_USER}:${POSTGRES_PASSWORD}@postgres/${POSTGRES_DB}
29 | AIRFLOW__SCHEDULER__ENABLE_HEALTH_CHECK: 'true'
30 | AIRFLOW_CONFIG: '/opt/airflow/config/airflow.cfg'
31 | AIRFLOW_UID: ${AIRFLOW_UID}
32 | AIRFLOW__WEBSERVER__DEFAULT_UI_TIMEZONE: 'Asia/Tehran'
33 | _PIP_ADDITIONAL_REQUIREMENTS: ''
34 | DASHBOARD_API_URL: http://dashboard-api:8080/report
35 | image: lp/airflow
36 | networks:
37 | - airflow
38 | user: "${AIRFLOW_UID}:0"
39 | volumes:
40 | - ./tests/airflow/config:/opt/airflow/config
41 | - ./tests/airflow/dags:/opt/airflow/dags
42 | - ./tests/airflow/logs:/opt/airflow/logs
43 |
44 | x-healthcheck:
45 | healthcheck:
46 | &healthcheck-common
47 | interval: 30s
48 | timeout: 5s
49 | retries: 3
50 | start_period: 30s
51 |
52 | services:
53 | airflow-apiserver:
54 | <<: *airflow-common
55 | command: api-server
56 | depends_on:
57 | <<: *airflow-common-depends-on
58 | airflow-init:
59 | condition: service_completed_successfully
60 | healthcheck:
61 | <<: *healthcheck-common
62 | test: ["CMD", "curl", "--fail", "http://localhost:8080/api/v2/version"]
63 |
64 | airflow-init:
65 | <<: *airflow-common
66 | command:
67 | - -c
68 | - |
69 | if [[ -z "${AIRFLOW_UID}" ]]; then
70 | echo
71 | echo -e "\033[1;33mWARNING!!!: AIRFLOW_UID not set!\e[0m"
72 | echo "If you are on Linux, you SHOULD follow the instructions below to set "
73 | echo "AIRFLOW_UID environment variable, otherwise files will be owned by root."
74 | echo "For other operating systems you can get rid of the warning with manually created .env file:"
75 | echo " See: https://airflow.apache.org/docs/apache-airflow/stable/howto/docker-compose/index.html#setting-the-right-airflow-user"
76 | echo
77 | export AIRFLOW_UID=$$(id -u)
78 | fi
79 | one_meg=1048576
80 | mem_available=$$(($$(getconf _PHYS_PAGES) * $$(getconf PAGE_SIZE) / one_meg))
81 | cpus_available=$$(grep -cE 'cpu[0-9]+' /proc/stat)
82 | disk_available=$$(df / | tail -1 | awk '{print $$4}')
83 | warning_resources="false"
84 | if (( mem_available < 4000 )) ; then
85 | echo
86 | echo -e "\033[1;33mWARNING!!!: Not enough memory available for Docker.\e[0m"
87 | echo "At least 4GB of memory required. You have $$(numfmt --to iec $$((mem_available * one_meg)))"
88 | echo
89 | warning_resources="true"
90 | fi
91 | if (( cpus_available < 2 )); then
92 | echo
93 | echo -e "\033[1;33mWARNING!!!: Not enough CPUS available for Docker.\e[0m"
94 | echo "At least 2 CPUs recommended. You have $${cpus_available}"
95 | echo
96 | warning_resources="true"
97 | fi
98 | if (( disk_available < one_meg * 10 )); then
99 | echo
100 | echo -e "\033[1;33mWARNING!!!: Not enough Disk space available for Docker.\e[0m"
101 | echo "At least 10 GBs recommended. You have $$(numfmt --to iec $$((disk_available * 1024 )))"
102 | echo
103 | warning_resources="true"
104 | fi
105 | if [[ $${warning_resources} == "true" ]]; then
106 | echo
107 | echo -e "\033[1;33mWARNING!!!: You have not enough resources to run Airflow (see above)!\e[0m"
108 | echo "Please follow the instructions to increase amount of resources available:"
109 | echo " https://airflow.apache.org/docs/apache-airflow/stable/howto/docker-compose/index.html#before-you-begin"
110 | echo
111 | fi
112 | echo
113 | echo "Creating missing opt dirs if missing:"
114 | echo
115 | mkdir -v -p /opt/airflow/{logs,dags,plugins,config}
116 |         mkdir -p /opt/airflow/logs/spark
117 | echo
118 | echo "Airflow version:"
119 | /entrypoint airflow version
120 | echo
121 | echo "Files in shared volumes:"
122 | echo
123 | ls -la /opt/airflow/{logs,dags,plugins,config}
124 | echo
125 | echo "Running airflow config list to create default config file if missing."
126 | echo
127 | /entrypoint airflow config list >/dev/null
128 | echo
129 | echo "Files in shared volumes:"
130 | echo
131 | ls -la /opt/airflow/{logs,dags,plugins,config}
132 | echo
133 | echo "Change ownership of files in /opt/airflow to ${AIRFLOW_UID}:0"
134 | echo
135 | chown -R "${AIRFLOW_UID}:0" /opt/airflow/
136 | echo
137 | echo "Change ownership of files in shared volumes to ${AIRFLOW_UID}:0"
138 | echo
139 | chown -v -R "${AIRFLOW_UID}:0" /opt/airflow/{logs,dags,plugins,config}
140 | echo
141 | echo "Files in shared volumes:"
142 | echo
143 | ls -la /opt/airflow/{logs,dags,plugins,config}
144 | echo
145 | echo "Waiting for db to create ClickHouse and MinIO connections"
146 | until airflow db check; do
147 | >&2 echo "Airflow DB not reachable. Waiting..."
148 | sleep 5
149 | done
150 | echo
151 | echo "Add/Update ClickHouse connection"
152 | airflow connections add "$CLICKHOUSE_CONN_NAME" \
153 | --conn-uri "clickhouse://$CLICKHOUSE_USER:$CLICKHOUSE_PASSWORD@$CLICKHOUSE_HOST:$CLICKHOUSE_PORT/$CLICKHOUSE_DB"
154 | echo
155 | echo "Add/Update MinIO connection"
156 | airflow connections add "$MINIO_CONN_NAME" \
157 | --conn-type 'aws' \
158 | --conn-login "$MINIO_ROOT_USER" \
159 | --conn-password "$MINIO_ROOT_PASSWORD" \
160 | --conn-extra '{"host": "http://minio:9000"}'
161 | echo
162 | echo "Add/Update Spark connection"
163 | airflow connections add "$SPARK_CONN_NAME" \
164 | --conn-type "spark" \
165 | --conn-host "spark://spark-master" \
166 | --conn-port "7077"
167 | echo
168 | echo "Done."
169 | exit 0;
170 | depends_on:
171 | <<: *airflow-common-depends-on
172 | entrypoint: /bin/bash
173 | environment:
174 | <<: *airflow-common-env
175 | _AIRFLOW_DB_MIGRATE: 'true'
176 | _AIRFLOW_WWW_USER_CREATE: 'true'
177 | _AIRFLOW_WWW_USER_USERNAME: ${_AIRFLOW_WWW_USER_USERNAME}
178 | _AIRFLOW_WWW_USER_PASSWORD: ${_AIRFLOW_WWW_USER_PASSWORD}
179 | _PIP_ADDITIONAL_REQUIREMENTS: ''
180 | user: "0:0"
181 |
182 | airflow-dag-processor:
183 | <<: *airflow-common
184 | command: dag-processor
185 | depends_on:
186 | <<: *airflow-common-depends-on
187 | airflow-init:
188 | condition: service_completed_successfully
189 | healthcheck:
190 | <<: *healthcheck-common
191 | test: ["CMD-SHELL", 'airflow jobs check --job-type DagProcessorJob --hostname "$${HOSTNAME}"']
192 | restart: always
193 |
194 | airflow-scheduler:
195 | <<: *airflow-common
196 | command: scheduler
197 | depends_on:
198 | <<: *airflow-common-depends-on
199 | airflow-init:
200 | condition: service_completed_successfully
201 | healthcheck:
202 | <<: *healthcheck-common
203 | test: ["CMD", "curl", "--fail", "http://localhost:8974/health"]
204 | restart: always
205 |
206 | airflow-triggerer:
207 | <<: *airflow-common
208 | command: triggerer
209 | depends_on:
210 | <<: *airflow-common-depends-on
211 | airflow-init:
212 | condition: service_completed_successfully
213 | healthcheck:
214 | <<: *healthcheck-common
215 | test: ["CMD-SHELL", 'airflow jobs check --job-type TriggererJob --hostname "$${HOSTNAME}"']
216 | restart: always
217 |
218 | airflow-worker:
219 | <<: *airflow-common
220 | command: celery worker
221 | depends_on:
222 | <<: *airflow-common-depends-on
223 | airflow-apiserver:
224 | condition: service_healthy
225 | airflow-init:
226 | condition: service_completed_successfully
227 | environment:
228 | <<: *airflow-common-env
229 | DUMB_INIT_SETSID: "0"
230 | healthcheck:
231 | <<: *healthcheck-common
232 | test:
233 | - "CMD-SHELL"
234 | - 'celery --app airflow.providers.celery.executors.celery_executor.app inspect ping -d "celery@$${HOSTNAME}" || celery --app airflow.executors.celery_executor.app inspect ping -d "celery@$${HOSTNAME}"'
235 | restart: always
236 | hostname: airflow-worker
237 |
238 | clickhouse:
239 | env_file:
240 | - ./tests/env-test/clickhouse.creds
241 | healthcheck:
242 | <<: *healthcheck-common
243 | test:
244 | - "CMD-SHELL"
245 | - "clickhouse-client --user $${CLICKHOUSE_USER} --password $${CLICKHOUSE_PASSWORD} -q 'SELECT 1'"
246 | hostname: clickhouse
247 | image: clickhouse/clickhouse-server:25.6.3.116
248 | networks:
249 | - airflow
250 | restart: always
251 | volumes:
252 | - ./tests/files/clickhouse_table_schema.sql:/docker-entrypoint-initdb.d/init.sql:ro
253 |
254 | dashboard-api:
255 | build:
256 | context: ./dashboard
257 | dockerfile: Dockerfile-Dashboard-api
258 | command: ["uvicorn", "dashboard_api:app", "--host", "0.0.0.0", "--port", "8080"]
259 | healthcheck:
260 | <<: *healthcheck-common
261 | test: ['CMD', 'curl', '-f', 'http://localhost:8080/health']
262 | hostname: dashboard-api
263 | image: lp/dashboard-api
264 | networks:
265 | - airflow
266 | restart: always
267 | volumes:
268 | - ./dashboard/dashboard_api.py:/home/dashboard_api.py:ro
269 | working_dir: /home
270 |
271 | minio:
272 | command: server /data --console-address ":9001"
273 | env_file:
274 | - ./tests/env-test/minio.creds
275 | healthcheck:
276 | <<: *healthcheck-common
277 | test: ['CMD', 'mc', 'ready', 'local']
278 | hostname: minio
279 | image: minio/minio:RELEASE.2025-07-18T21-56-31Z
280 | networks:
281 | - airflow
282 | restart: always
283 |
284 | minio-init:
285 | command:
286 | - -c
287 | - |
288 |         # Although this service depends on minio, readiness is not guaranteed, so wait briefly before continuing.
289 | sleep 5
290 |
291 | echo "Setup alias for MinIO server"
292 | mc alias set $$MINIO_CONN_NAME http://minio:9000 $$MINIO_ROOT_USER $$MINIO_ROOT_PASSWORD;
293 |
294 | echo "Create the bucket if it doesn't exist"
295 | mc mb $$MINIO_CONN_NAME/$$MINIO_BUCKET_NAME || true;
296 |
297 | exit 0;
298 | depends_on:
299 | minio:
300 | condition: service_healthy
301 | entrypoint: /bin/bash
302 | env_file:
303 | - ./tests/env-test/minio.env
304 | - ./tests/env-test/minio.creds
305 | hostname: minio-init
306 | image: minio/minio:RELEASE.2025-07-18T21-56-31Z
307 | networks:
308 | - airflow
309 |
310 | postgres:
311 | env_file:
312 | - ./tests/env-test/postgres.env
313 | - ./tests/env-test/postgres.creds
314 | healthcheck:
315 | <<: *healthcheck-common
316 | test: ["CMD", "pg_isready", "-U", "${POSTGRES_USER}"]
317 | image: postgres:17.4-bookworm
318 | networks:
319 | - airflow
320 | restart: always
321 |
322 | redis:
323 | image: redis:7.2.10-bookworm
324 | expose:
325 | - 6379
326 | healthcheck:
327 | <<: *healthcheck-common
328 | test: ["CMD", "redis-cli", "ping"]
329 | networks:
330 | - airflow
331 | restart: always
332 |
333 | spark-master:
334 | build:
335 | context: ./spark
336 | dockerfile: Dockerfile-Spark
337 | command: /opt/spark/bin/spark-class org.apache.spark.deploy.master.Master --host spark-master
338 | depends_on:
339 | - minio
340 | hostname: spark-master
341 | image: lp/spark
342 | networks:
343 | - airflow
344 | restart: always
345 | volumes:
346 | - ./airflow/dags:/opt/airflow/dags
347 |
348 | spark-worker:
349 | build:
350 | context: ./spark
351 | dockerfile: Dockerfile-Spark
352 | command: /opt/spark/bin/spark-class org.apache.spark.deploy.worker.Worker spark://spark-master:7077
353 | depends_on:
354 | - spark-master
355 | env_file:
356 | - ./tests/env-test/minio.env
357 | - ./tests/env-test/minio.creds
358 | environment:
359 | SPARK_WORKER_CORES: 2
360 | SPARK_WORKER_MEMORY: 1g
361 | hostname: spark-worker
362 | image: lp/spark
363 | networks:
364 | - airflow
365 | restart: always
366 | volumes:
367 | - ./airflow/dags:/opt/airflow/dags
368 |
369 | airflow-test-runner:
370 | build:
371 | context: ./airflow/
372 | dockerfile: Dockerfile-Airflow-test
373 | command: python -m pytest --disable-warnings
374 | depends_on:
375 | <<: *airflow-common-depends-on
376 | airflow-init:
377 | condition: service_completed_successfully
378 | env_file:
379 | - ./tests/env-test/clickhouse.creds
380 | - ./tests/env-test/clickhouse.env
381 | - ./tests/env-test/minio.creds
382 | - ./tests/env-test/minio.env
383 | - ./tests/env-test/spark.env
384 | environment:
385 | <<: *airflow-common-env
386 | image: lp/test-airflow
387 | networks:
388 | - airflow
389 | volumes:
390 | - ./airflow/dags:/opt/airflow/dags
391 | - ./airflow/tests:/opt/airflow/dags/tests:ro
392 | - ./tests/airflow/config:/opt/airflow/config
393 | - ./tests/airflow/logs:/opt/airflow/logs
394 | working_dir: /opt/airflow/dags
395 |
396 | networks:
397 | airflow:
398 | name: airflow
399 |
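400 | # Assumed usage (not shown in this file): the relative paths above (./tests/env-test,
401 | # ./airflow, ./spark, ./dashboard) resolve from the repository root, so one plausible
402 | # invocation runs this stack with the project directory set to that root, e.g.
403 | #   docker compose --project-directory . -f tests/docker-compose.test.airflow.yml \
404 | #     up --build --exit-code-from airflow-test-runner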
--------------------------------------------------------------------------------
/tests/docker-compose.test.dashboard.yml:
--------------------------------------------------------------------------------
1 | x-healthcheck:
2 | healthcheck:
3 | &healthcheck-common
4 | interval: 30s
5 | timeout: 5s
6 | retries: 3
7 | start_period: 30s
8 |
9 | services:
10 | dashboard-api:
11 | build:
12 | context: ./dashboard
13 | dockerfile: Dockerfile-Dashboard-api
14 | command: ["uvicorn", "dashboard_api:app", "--host", "0.0.0.0", "--port", "8080"]
15 | healthcheck:
16 | <<: *healthcheck-common
17 | test: ['CMD', 'curl', '-f', 'http://localhost:8080/health']
18 | hostname: dashboard-api
19 | image: lp/dashboard-api
20 | networks:
21 | - dashboard
22 | restart: always
23 | volumes:
24 | - ./dashboard/dashboard_api.py:/home/dashboard_api.py:ro
25 | working_dir: /home
26 |
27 | dashboard-api-test-runner:
28 | build:
29 | context: ./dashboard
30 | dockerfile: Dockerfile-Dashboard-api-test
31 | command: python -m pytest -v --disable-warnings tests/test_unit_api.py tests/test_integration_api.py
32 | depends_on:
33 | dashboard-api:
34 | condition: service_healthy
35 | environment:
36 | REPORT_URL: http://dashboard-api:8080/report
37 | HEALTH_URL: http://dashboard-api:8080/health
38 | image: lp/test-dashboard-api
39 | hostname: dashboard-api-test
40 | networks:
41 | - dashboard
42 | volumes:
43 | - ./dashboard:/home:ro
44 | working_dir: /home
45 |
46 | dashboard-ui-test-runner:
47 | build:
48 | context: ./dashboard
49 | dockerfile: Dockerfile-Dashboard-ui-test
50 | command: python -m pytest -v --disable-warnings tests/test_unit_ui.py
51 | image: lp/test-dashboard-ui
52 | hostname: dashboard-ui-test
53 | networks:
54 | - dashboard
55 | volumes:
56 | - ./dashboard:/home:ro
57 | working_dir: /home
58 |
59 | networks:
60 | dashboard:
61 | name: dashboard
62 |
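63 | # Illustrative manual check (assumes the stack is already up; curl is available in the
64 | # image, as the healthcheck above relies on it):
65 | #   docker compose exec dashboard-api curl -f http://localhost:8080/health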
--------------------------------------------------------------------------------
/tests/docker-compose.test.db.yml:
--------------------------------------------------------------------------------
1 | x-healthcheck:
2 | healthcheck:
3 | &healthcheck-common
4 | interval: 30s
5 | timeout: 5s
6 | retries: 3
7 | start_period: 30s
8 |
9 | services:
10 | clickhouse:
11 | env_file:
12 | - ./tests/env-test/clickhouse.creds
13 | healthcheck:
14 | <<: *healthcheck-common
15 | test:
16 | - "CMD-SHELL"
17 | - "clickhouse-client --user $${CLICKHOUSE_USER} --password $${CLICKHOUSE_PASSWORD} -q 'SELECT 1'"
18 | hostname: clickhouse
19 | image: clickhouse/clickhouse-server:25.6.3.116
20 | networks:
21 | - db
22 | ports:
23 | - '8123:8123'
24 | restart: always
25 | volumes:
26 | - ./tests/files/clickhouse_table_schema.sql:/docker-entrypoint-initdb.d/init.sql:ro
27 |
28 | minio:
29 | command: server /data --console-address ":9001"
30 | env_file:
31 | - ./tests/env-test/minio.creds
32 | healthcheck:
33 | <<: *healthcheck-common
34 | test: ['CMD', 'mc', 'ready', 'local']
35 | hostname: minio
36 | image: minio/minio:RELEASE.2025-07-18T21-56-31Z
37 | networks:
38 | - db
39 | restart: always
40 |
41 | minio-init:
42 | command:
43 | - -c
44 | - |
45 | # Although this service depends on minio, readiness is not guaranteed, so do a short sleep first.
46 | sleep 5
47 |
48 | echo "Setup alias for MinIO server"
49 | mc alias set minio_connection http://minio:9000 $$MINIO_ROOT_USER $$MINIO_ROOT_PASSWORD;
50 |
51 | echo "Create the bucket if it doesn't exist"
52 | mc mb minio_connection/$$MINIO_BUCKET_NAME || true;
53 |
54 | exit 0;
55 | depends_on:
56 | minio:
57 | condition: service_healthy
58 | entrypoint: /bin/bash
59 | env_file:
60 | - ./tests/env-test/minio.env
61 | - ./tests/env-test/minio.creds
62 | hostname: minio-init
63 | image: minio/minio:RELEASE.2025-07-18T21-56-31Z
64 | networks:
65 | - db
66 |
67 | db-test-runner:
68 | build:
69 | context: ./db
70 | dockerfile: Dockerfile-DB-test
71 | command: python -m pytest -v -W "ignore::pytest.PytestCacheWarning"
72 | depends_on:
73 | clickhouse:
74 | condition: service_healthy
75 | minio-init:
76 | condition: service_completed_successfully
77 | env_file:
78 | - ./tests/env-test/clickhouse.creds
79 | - ./tests/env-test/clickhouse.env
80 | - ./tests/env-test/minio.creds
81 | - ./tests/env-test/minio.env
82 | - ./tests/env-test/clickhouse.env
83 | environment:
84 | KAFKA_BOOTSTRAP_SERVERS: 'broker:29092'
85 | SCHEMA_REGISTRY_URL: 'http://schema-registry:8081'
86 | image: lp/test-db
87 | networks:
88 | - db
89 | volumes:
90 | - ./db:/home/app:ro
91 | working_dir: /home/app
92 |
93 | networks:
94 | db:
95 |
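96 | # Illustrative check that the init schema was applied (assumes the stack was started with
97 | # the credentials from tests/env-test/clickhouse.creds):
98 | #   docker compose exec clickhouse clickhouse-client --user default-test \
99 | #     --password password-test -q 'SHOW TABLES FROM default_test'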
--------------------------------------------------------------------------------
/tests/docker-compose.test.producer.yml:
--------------------------------------------------------------------------------
1 | x-healthcheck:
2 | healthcheck:
3 | &healthcheck-common
4 | interval: 30s
5 | timeout: 5s
6 | retries: 3
7 | start_period: 30s
8 |
9 | services:
10 | broker:
11 | env_file:
12 | - ./tests/env-test/kafka.env
13 | environment:
14 | KAFKA_NODE_ID: 1
15 | KAFKA_PROCESS_ROLES: 'broker,controller'
16 | KAFKA_CONTROLLER_QUORUM_VOTERS: '1@broker:29093'
17 | KAFKA_LISTENERS: 'PLAINTEXT://broker:29092,CONTROLLER://broker:29093,PLAINTEXT_HOST://0.0.0.0:9092'
18 | KAFKA_ADVERTISED_LISTENERS: 'PLAINTEXT://broker:29092,PLAINTEXT_HOST://localhost:9092'
19 | KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: 'CONTROLLER:PLAINTEXT,PLAINTEXT:PLAINTEXT,PLAINTEXT_HOST:PLAINTEXT'
20 | KAFKA_INTER_BROKER_LISTENER_NAME: 'PLAINTEXT'
21 | KAFKA_CONTROLLER_LISTENER_NAMES: 'CONTROLLER'
22 | KAFKA_LOG_DIRS: '/var/lib/kafka/data'
23 | KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1
24 | KAFKA_TRANSACTION_STATE_LOG_MIN_ISR: 1
25 | KAFKA_TRANSACTION_STATE_LOG_REPLICATION_FACTOR: 1
26 | KAFKA_GROUP_INITIAL_REBALANCE_DELAY_MS: 0
27 | image: confluentinc/cp-kafka:7.9.1-1-ubi8
28 | healthcheck:
29 | <<: *healthcheck-common
30 | test: ['CMD', 'kafka-broker-api-versions', '--bootstrap-server', 'broker:29092']
31 | hostname: broker
32 | networks:
33 | - producer
34 | restart: always
35 |
36 | clickhouse:
37 | env_file:
38 | - ./tests/env-test/clickhouse.creds
39 | healthcheck:
40 | <<: *healthcheck-common
41 | test:
42 | - "CMD-SHELL"
43 | - "clickhouse-client --user $${CLICKHOUSE_USER} --password $${CLICKHOUSE_PASSWORD} -q 'SELECT 1'"
44 | hostname: clickhouse
45 | image: clickhouse/clickhouse-server:25.6.3.116
46 | networks:
47 | - producer
48 | restart: always
49 | volumes:
50 | - ./tests/files/clickhouse_table_schema.sql:/docker-entrypoint-initdb.d/init.sql:ro
51 |
52 | connect:
53 | depends_on:
54 | broker:
55 | condition: service_healthy
56 | schema-registry:
57 | condition: service_healthy
58 | environment:
59 | CONNECT_BOOTSTRAP_SERVERS: 'PLAINTEXT://broker:29092'
60 | CONNECT_REST_ADVERTISED_HOST_NAME: connect
61 | CONNECT_REST_PORT: 8083
62 | CONNECT_GROUP_ID: 'clickhouse-connect-group'
63 | CONNECT_CONFIG_STORAGE_TOPIC: '_connect-configs'
64 | CONNECT_OFFSET_STORAGE_TOPIC: '_connect-offsets'
65 | CONNECT_STATUS_STORAGE_TOPIC: '_connect-status'
66 | CONNECT_REPLICATION_FACTOR: 1
67 | CONNECT_CONFIG_STORAGE_REPLICATION_FACTOR: 1
68 | CONNECT_OFFSET_STORAGE_REPLICATION_FACTOR: 1
69 | CONNECT_STATUS_STORAGE_REPLICATION_FACTOR: 1
70 | CONNECT_KEY_CONVERTER: 'org.apache.kafka.connect.converters.ByteArrayConverter'
71 | CONNECT_VALUE_CONVERTER: 'io.confluent.connect.avro.AvroConverter'
72 | CONNECT_VALUE_CONVERTER_SCHEMA_REGISTRY_URL: 'http://schema-registry:8081'
73 | CONNECT_SCHEMA_REGISTRY_URL: 'http://schema-registry:8081'
74 | CONNECT_VALUE_CONVERTER_SCHEMAS_ENABLE: 'true'
75 | CONNECT_PLUGIN_PATH: '/usr/share/java/clickhouse'
76 | healthcheck:
77 | <<: *healthcheck-common
78 | test: ['CMD', 'curl', '-f', 'http://localhost:8083/connectors']
79 | hostname: connect
80 | image: confluentinc/cp-kafka-connect:7.9.1-1-ubi8.amd64
81 | networks:
82 | - producer
83 | restart: always
84 | volumes:
85 | - ./kafka/connectors:/usr/share/java/clickhouse:ro
86 |
87 | connect-helper:
88 | build:
89 | context: ./kafka
90 | dockerfile: Dockerfile-Connect-helper
91 | command:
92 | - -c
93 | - |
94 | # Exit immediately if a command exits with a non-zero status.
95 | set -e
96 | echo "--- Verifying Environment Variables ---"
97 | printenv | grep CLICKHOUSE || echo "CLICKHOUSE variables not found!"
98 | echo "-------------------------------------"
99 |
100 | # Ensure the credential variables from the env files are exported for envsubst below
101 | export CLICKHOUSE_USER
102 | export CLICKHOUSE_PASSWORD
103 | export CLICKHOUSE_HOST
104 | export CLICKHOUSE_PORT
105 | export CLICKHOUSE_DB
106 | export KAFKA_TOPIC
107 | export DOLLAR="$"
108 |
109 | # Prepare a temporary config file with substituted variables
110 | CONNECTOR_CONFIG_FILE="/tmp/config.json"
111 | envsubst < /home/clickhouse_connector_configuration.json > "$${CONNECTOR_CONFIG_FILE}"
112 | echo "--- Generated Connector Configuration ---"
113 | cat "$${CONNECTOR_CONFIG_FILE}"
114 | echo "---------------------------------------"
115 |
116 | CONNECT_HOST="$${CONNECT_HOST:-connect}"
117 | CONNECT_PORT="$${CONNECT_PORT:-8083}"
118 | CONNECT_URL="http://$${CONNECT_HOST}:$${CONNECT_PORT}/connectors"
119 | echo "$${CONNECT_URL}"
120 | echo "Waiting for Kafka Connect to be available at $${CONNECT_URL}..."
121 | # Use a loop to check if the Connect REST API is up and running.
122 | # The -f flag makes curl fail silently on server errors.
123 | until curl -s -f -o /dev/null "$${CONNECT_URL}"; do
124 | sleep 2
125 | done
126 | echo "Kafka Connect is available. Checking if the configuration is already applied..."
127 | CONNECTOR_NAME=$$(jq -r .name "$${CONNECTOR_CONFIG_FILE}")
128 | if curl -s -f "http://$${CONNECT_HOST}:$${CONNECT_PORT}/connectors/$${CONNECTOR_NAME}/status" > /dev/null; then
129 | echo "Connector '$$CONNECTOR_NAME' already configured. No further action needed."
130 | else
131 | echo "Connector '$$CONNECTOR_NAME' is NOT applied. Configuring the ClickHouse sink connector..."
132 | HTTP_CODE=$$(curl -s -o response.txt -w "%{http_code}" -X POST -H "Content-Type:application/json" --data "@$${CONNECTOR_CONFIG_FILE}" "$${CONNECT_URL}")
133 | if [ "$$HTTP_CODE" -eq 200 ] || [ "$$HTTP_CODE" -eq 201 ]; then
134 | echo "ClickHouse sink connector configured."
135 | else
136 | echo "Failed to configure clickhouse sink connector."
137 | cat response.txt
138 | exit 1
139 | fi
140 | fi
141 | exit 0
142 | depends_on:
143 | connect:
144 | condition: service_healthy
145 | entrypoint: /bin/bash
146 | env_file:
147 | - ./tests/env-test/clickhouse.env
148 | - ./tests/env-test/clickhouse.creds
149 | - ./tests/env-test/kafka.env
150 | image: lp/connect-helper
151 | networks:
152 | - producer
153 | volumes:
154 | - ./kafka/clickhouse_connector_configuration.json:/home/clickhouse_connector_configuration.json:ro
155 |
156 | schema-registry:
157 | depends_on:
158 | broker:
159 | condition: service_started
160 | environment:
161 | SCHEMA_REGISTRY_HOST_NAME: schema-registry
162 | SCHEMA_REGISTRY_LISTENERS: 'http://0.0.0.0:8081'
163 | SCHEMA_REGISTRY_KAFKASTORE_BOOTSTRAP_SERVERS: 'PLAINTEXT://broker:29092'
164 | hostname: schema-registry
165 | image: confluentinc/cp-schema-registry:7.9.1-1-ubi8.amd64
166 | healthcheck:
167 | <<: *healthcheck-common
168 | test: ["CMD", "curl", "-f", "http://localhost:8081/subjects"]
169 | networks:
170 | - producer
171 | restart: always
172 |
173 | producer-test-runner:
174 | build:
175 | context: ./producer
176 | dockerfile: Dockerfile-Producer-test
177 | command: python -m pytest --disable-warnings
178 | depends_on:
179 | broker:
180 | condition: service_healthy
181 | clickhouse:
182 | condition: service_healthy
183 | connect-helper:
184 | condition: service_completed_successfully
185 | schema-registry:
186 | condition: service_healthy
187 | env_file:
188 | - ./tests/env-test/kafka.env
189 | - ./tests/env-test/clickhouse.env
190 | - ./tests/env-test/clickhouse.creds
191 | environment:
192 | KAFKA_BOOTSTRAP_SERVERS: 'broker:29092'
193 | SCHEMA_REGISTRY_URL: 'http://schema-registry:8081'
194 | image: lp/test-producer
195 | networks:
196 | - producer
197 | volumes:
198 | - ./producer:/home/app:ro
199 | working_dir: /home/app
200 |
201 | networks:
202 | producer:
203 |
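204 | # Illustrative check (assumes the stack is up and connect-helper completed successfully):
205 | # list the connectors registered on the Connect worker via its REST API.
206 | #   docker compose exec connect curl -s http://localhost:8083/connectors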
--------------------------------------------------------------------------------
/tests/docker-compose.test.spark.yml:
--------------------------------------------------------------------------------
1 | x-healthcheck:
2 | healthcheck:
3 | &healthcheck-common
4 | interval: 30s
5 | timeout: 5s
6 | retries: 3
7 | start_period: 30s
8 |
9 | services:
10 | minio:
11 | command: server /data --console-address ":9001"
12 | env_file:
13 | - ./tests/env-test/minio.creds
14 | healthcheck:
15 | <<: *healthcheck-common
16 | test: ['CMD', 'mc', 'ready', 'local']
17 | hostname: minio
18 | image: minio/minio:RELEASE.2025-07-18T21-56-31Z
19 | networks:
20 | - airflow
21 | restart: always
22 |
23 | minio-init:
24 | command:
25 | - -c
26 | - |
27 | # Although this service depends on minio, readiness is not guaranteed, so do a short sleep first.
28 | sleep 5
29 |
30 | echo "Setup alias for MinIO server"
31 | mc alias set $$MINIO_CONN_NAME http://minio:9000 $$MINIO_ROOT_USER $$MINIO_ROOT_PASSWORD;
32 |
33 | echo "Create the bucket if it doesn't exist"
34 | mc mb $$MINIO_CONN_NAME/$$MINIO_BUCKET_NAME || true;
35 |
36 | exit 0;
37 | depends_on:
38 | minio:
39 | condition: service_healthy
40 | entrypoint: /bin/bash
41 | env_file:
42 | - ./tests/env-test/minio.env
43 | - ./tests/env-test/minio.creds
44 | hostname: minio-init
45 | image: minio/minio:RELEASE.2025-07-18T21-56-31Z
46 | networks:
47 | - airflow
48 |
49 | spark-master:
50 | build:
51 | context: ./spark
52 | dockerfile: Dockerfile-Spark
53 | command: /opt/spark/bin/spark-class org.apache.spark.deploy.master.Master --host spark-master
54 | depends_on:
55 | - minio
56 | healthcheck:
57 | <<: *healthcheck-common
58 | test: ['CMD', 'curl', '-f', 'http://localhost:8080']
59 | hostname: spark-master
60 | image: lp/spark
61 | networks:
62 | - airflow
63 | restart: always
64 | volumes:
65 | - ./airflow/dags:/opt/airflow/dags
66 |
67 | spark-worker:
68 | build:
69 | context: ./spark
70 | dockerfile: Dockerfile-Spark
71 | command: /opt/spark/bin/spark-class org.apache.spark.deploy.worker.Worker spark://spark-master:7077
72 | depends_on:
73 | - spark-master
74 | env_file:
75 | - ./tests/env-test/minio.env
76 | - ./tests/env-test/minio.creds
77 | environment:
78 | SPARK_WORKER_CORES: 2
79 | SPARK_WORKER_MEMORY: 1g
80 | healthcheck:
81 | <<: *healthcheck-common
82 | test: ['CMD', 'curl', '-f', 'http://localhost:8081']
83 | hostname: spark-worker
84 | image: lp/spark
85 | networks:
86 | - airflow
87 | restart: always
88 | volumes:
89 | - ./airflow/dags:/opt/airflow/dags
90 |
91 | spark-test-runner:
92 | build:
93 | context: ./spark/
94 | dockerfile: Dockerfile-Spark-test
95 | command: python3 -m pytest --disable-warnings
96 | depends_on:
97 | minio:
98 | condition: service_healthy
99 | minio-init:
100 | condition: service_completed_successfully
101 | spark-master:
102 | condition: service_healthy
103 | spark-worker:
104 | condition: service_healthy
105 | env_file:
106 | - ./tests/env-test/clickhouse.creds
107 | - ./tests/env-test/clickhouse.env
108 | - ./tests/env-test/minio.creds
109 | - ./tests/env-test/minio.env
110 | - ./tests/env-test/spark.env
111 | image: lp/test-spark
112 | networks:
113 | - airflow
114 | volumes:
115 | - ./airflow/dags:/opt/airflow/dags
116 | - ./spark/tests:/opt/airflow/dags/tests:ro
117 | working_dir: /opt/airflow/dags
118 |
119 | networks:
120 | airflow:
121 | name: airflow
122 |
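123 | # Illustrative manual submission (an assumption, not part of the test flow; a real run may
124 | # need extra --conf/--packages options for MinIO/S3 access): the master URL and application
125 | # path match the services above and tests/env-test/spark.env.
126 | #   docker compose exec spark-master /opt/spark/bin/spark-submit \
127 | #     --master spark://spark-master:7077 /opt/airflow/dags/spark.py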
--------------------------------------------------------------------------------
/tests/env-test/airflow.creds:
--------------------------------------------------------------------------------
1 | AIRFLOW__API__SECRET_KEY=testd84e69f0ae94c5e1b721ae443fa3
2 | _AIRFLOW_WWW_USER_USERNAME=airflow-test
3 | _AIRFLOW_WWW_USER_PASSWORD=airflow-test
4 | AIRFLOW__CORE__FERNET_KEY=test_XNW-bppePFnhse2QlJn7ZfNaLy9gjlQZwPh2a0=
--------------------------------------------------------------------------------
/tests/env-test/airflow.env:
--------------------------------------------------------------------------------
1 | AIRFLOW_UID=1000
--------------------------------------------------------------------------------
/tests/env-test/clickhouse.creds:
--------------------------------------------------------------------------------
1 | CLICKHOUSE_USER=default-test
2 | CLICKHOUSE_PASSWORD=password-test
--------------------------------------------------------------------------------
/tests/env-test/clickhouse.env:
--------------------------------------------------------------------------------
1 | CLICKHOUSE_HOST=clickhouse
2 | CLICKHOUSE_PORT=8123
3 | CLICKHOUSE_DB=default_test
4 | CLICKHOUSE_TABLE=user_interactions_test
5 | CLICKHOUSE_CONN_NAME=clickhouse_connection_test
--------------------------------------------------------------------------------
/tests/env-test/kafka.env:
--------------------------------------------------------------------------------
1 | KAFKA_TOPIC=user_interactions_test
2 | SCHEMA_REGISTRY_URL=http://schema-registry:8081
3 | CLUSTER_ID='testdf2d-b58f-47d0-a2cf-e75ff2f2e58e'
4 |
5 | KAFKA_BOOTSTRAP_SERVERS=broker:29092
6 |
7 | FLUSH_SECONDS=0
--------------------------------------------------------------------------------
/tests/env-test/minio.creds:
--------------------------------------------------------------------------------
1 | MINIO_ROOT_USER=admin-test
2 | MINIO_ROOT_PASSWORD=adminpsw-test
--------------------------------------------------------------------------------
/tests/env-test/minio.env:
--------------------------------------------------------------------------------
1 | MINIO_BUCKET_NAME=user-interactions-test
2 | MINIO_CONN_NAME=minio_connection_test
--------------------------------------------------------------------------------
/tests/env-test/postgres.creds:
--------------------------------------------------------------------------------
1 | POSTGRES_USER=airflow-test
2 | POSTGRES_PASSWORD=airflow-test
--------------------------------------------------------------------------------
/tests/env-test/postgres.env:
--------------------------------------------------------------------------------
1 | POSTGRES_DB=airflow-test
--------------------------------------------------------------------------------
/tests/env-test/spark.env:
--------------------------------------------------------------------------------
1 | SPARK_CONN_NAME=spark_connection_test
2 | SPARK_APPLICATION_PATH='/opt/airflow/dags/spark.py'
3 | HADOOP_AWS_VERSION=3.3.4
4 | AWS_JAVA_SDK_VERSION=1.12.262
5 | SPARK_EVENT_LOG_ENABLED=false
6 |
--------------------------------------------------------------------------------
/tests/files/clickhouse_table_schema.sql:
--------------------------------------------------------------------------------
1 | CREATE DATABASE IF NOT EXISTS default_test;
2 |
3 | CREATE TABLE IF NOT EXISTS default_test.user_interactions_test
4 | (
5 | event_id UUID,
6 | user_id UUID,
7 | session_id UUID,
8 | event_type Enum8('VIEW_PRODUCT' = 1, 'ADD_TO_CART' = 2, 'CHECKOUT' = 3, 'PAYMENT' = 4, 'SEARCH' = 5), -- an 8-bit enum is sufficient for 5 values
9 | event_timestamp DateTime64(3, 'UTC'), -- precision 3 (milliseconds), stored in UTC
10 | request_latency_ms UInt32,
11 | status Enum8('SUCCESS' = 1, 'ERROR' = 2),
12 | error_code Nullable(UInt32),
13 | product_id Nullable(UInt32),
14 | event_minute DateTime MATERIALIZED toStartOfMinute(event_timestamp)
15 | )
16 | ENGINE = MergeTree()
17 | PARTITION BY event_minute
18 | ORDER BY (event_minute, event_type);
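19 |
20 | -- Illustrative query (not executed by this init script): per-minute error counts using the
21 | -- MATERIALIZED event_minute column defined above.
22 | -- SELECT
23 | --     event_minute,
24 | --     countIf(status = 'ERROR') AS errors,
25 | --     count() AS total
26 | -- FROM default_test.user_interactions_test
27 | -- GROUP BY event_minute
28 | -- ORDER BY event_minute;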
--------------------------------------------------------------------------------