├── .github
│   └── workflows
│       └── docker-image.yml
├── .gitignore
├── README.md
├── compose.yml
├── dags
│   └── kafka_stream.py
├── dockerfile
├── image
│   └── architecture.png
├── requirements.txt
├── script
│   └── entrypoint.sh
└── spark_stream.py

/.github/workflows/docker-image.yml:
--------------------------------------------------------------------------------
name: Build and Push Docker Image

on:
  push:
    branches:
      - main

jobs:
  build:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout code
        uses: actions/checkout@v2

      - name: Login to Docker Hub
        uses: docker/login-action@v1
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_TOKEN }}

      - name: Build and push Docker image
        uses: docker/build-push-action@v2
        with:
          context: .
          file: ./dockerfile
          push: true
          tags: labchiri/data-streaming

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
venv

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
## Architecture

![Architecture diagram](image/architecture.png)

## Description

I used Airflow, PostgreSQL, Kafka, Spark and Cassandra to build a fully automated ETL pipeline that runs entirely in containers, with a GitHub Actions CI workflow that rebuilds the service's Docker image and pushes it to Docker Hub on every push to `main`.

## Get Started

- Clone the repository
  - `git clone https://github.com/moontucer/Data-Streaming-Project/`
- Go to the project folder
  - `cd Data-Streaming-Project`
- Build the environment with Docker Compose (a quick smoke test follows below)
  - `docker compose up`
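Once the stack is up, a quick way to confirm that the `user_automation` DAG is publishing events is to read a few messages back from the `users_created` topic. The snippet below is a minimal sketch, assuming `kafka-python` is installed on the host and the broker's `9092` port is exposed on `localhost` (as configured in `compose.yml`); trigger the DAG from the Airflow UI at `http://localhost:8080` first.

```python
# smoke_test_kafka.py -- minimal check that events reach the users_created topic.
# Assumes `pip install kafka-python` on the host and the broker exposed on localhost:9092.
import json

from kafka import KafkaConsumer

consumer = KafkaConsumer(
    'users_created',
    bootstrap_servers=['localhost:9092'],
    auto_offset_reset='earliest',   # read from the beginning of the topic
    consumer_timeout_ms=10000,      # stop iterating after 10 seconds of silence
)

for i, message in enumerate(consumer):
    print(json.loads(message.value.decode('utf-8')))
    if i >= 4:                      # five messages are enough for a smoke test
        break

consumer.close()
```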
## Link to the Medium article

https://medium.com/@moontucer/data-streaming-project-real-time-end-to-end-data-pipeline-082f0d9cfbdb

--------------------------------------------------------------------------------
/compose.yml:
--------------------------------------------------------------------------------
version: '3'

services:
  zookeeper:
    image: confluentinc/cp-zookeeper:7.4.0
    hostname: zookeeper
    container_name: zookeeper
    ports:
      - "2181:2181"
    environment:
      ZOOKEEPER_CLIENT_PORT: 2181
      ZOOKEEPER_TICK_TIME: 2000
    healthcheck:
      test: ['CMD', 'bash', '-c', "echo 'ruok' | nc localhost 2181"]
      interval: 10s
      timeout: 5s
      retries: 5
    networks:
      - confluent

  broker:
    image: confluentinc/cp-server:7.4.0
    hostname: broker
    container_name: broker
    depends_on:
      zookeeper:
        condition: service_healthy
    ports:
      - "9092:9092"
      - "9101:9101"
    environment:
      KAFKA_BROKER_ID: 1
      KAFKA_ZOOKEEPER_CONNECT: 'zookeeper:2181'
      KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: PLAINTEXT:PLAINTEXT,PLAINTEXT_HOST:PLAINTEXT
      KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://broker:29092,PLAINTEXT_HOST://localhost:9092
      KAFKA_METRIC_REPORTERS: io.confluent.metrics.reporter.ConfluentMetricsReporter
      KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1
      KAFKA_GROUP_INITIAL_REBALANCE_DELAY_MS: 0
      KAFKA_CONFLUENT_LICENSE_TOPIC_REPLICATION_FACTOR: 1
      KAFKA_CONFLUENT_BALANCER_TOPIC_REPLICATION_FACTOR: 1
      KAFKA_TRANSACTION_STATE_LOG_MIN_ISR: 1
      KAFKA_TRANSACTION_STATE_LOG_REPLICATION_FACTOR: 1
      KAFKA_JMX_PORT: 9101
      KAFKA_JMX_HOSTNAME: localhost
      KAFKA_CONFLUENT_SCHEMA_REGISTRY_URL: http://schema-registry:8081
      CONFLUENT_METRICS_REPORTER_BOOTSTRAP_SERVERS: broker:29092
      CONFLUENT_METRICS_REPORTER_TOPIC_REPLICAS: 1
      CONFLUENT_METRICS_ENABLE: 'false'
      CONFLUENT_SUPPORT_CUSTOMER_ID: 'anonymous'
    networks:
      - confluent
    healthcheck:
      test: [ "CMD", "bash", "-c", 'nc -z localhost 9092' ]
      interval: 10s
      timeout: 5s
      retries: 5

  schema-registry:
    image: confluentinc/cp-schema-registry:7.4.0
    hostname: schema-registry
    container_name: schema-registry
    depends_on:
      broker:
        condition: service_healthy
    ports:
      - "8081:8081"
    environment:
      SCHEMA_REGISTRY_HOST_NAME: schema-registry
      SCHEMA_REGISTRY_KAFKASTORE_BOOTSTRAP_SERVERS: 'broker:29092'
      SCHEMA_REGISTRY_LISTENERS: http://0.0.0.0:8081
    networks:
      - confluent
    healthcheck:
      test: [ "CMD", "curl", "-f", "http://localhost:8081/" ]
      interval: 30s
      timeout: 10s
      retries: 5

  control-center:
    image: confluentinc/cp-enterprise-control-center:7.4.0
    hostname: control-center
    container_name: control-center
    depends_on:
      broker:
        condition: service_healthy
      schema-registry:
        condition: service_healthy
    ports:
      - "9021:9021"
    environment:
      CONTROL_CENTER_BOOTSTRAP_SERVERS: 'broker:29092'
      CONTROL_CENTER_SCHEMA_REGISTRY_URL: "http://schema-registry:8081"
      CONTROL_CENTER_REPLICATION_FACTOR: 1
      CONTROL_CENTER_INTERNAL_TOPICS_PARTITIONS: 1
      CONTROL_CENTER_MONITORING_INTERCEPTOR_TOPIC_PARTITIONS: 1
      CONFLUENT_METRICS_TOPIC_REPLICATION: 1
      CONFLUENT_METRICS_ENABLE: 'false'
      PORT: 9021
    networks:
      - confluent
    healthcheck:
      test: [ "CMD", "curl", "-f", "http://localhost:9021/health" ]
      interval: 30s
      timeout: 10s
      retries: 5

  webserver:
    image: apache/airflow:2.6.0-python3.9
    command: webserver
    entrypoint: ['/opt/airflow/script/entrypoint.sh']
    depends_on:
      - postgres
    environment:
      - LOAD_EX=n
      - EXECUTOR=Sequential
      - AIRFLOW__DATABASE__SQL_ALCHEMY_CONN=postgresql+psycopg2://airflow:airflow@postgres:5432/airflow
      - AIRFLOW__WEBSERVER__SECRET_KEY=this_is_a_very_secured_key
    logging:
      options:
        max-size: 10m
        max-file: "3"
    volumes:
      - ./dags:/opt/airflow/dags
      - ./script/entrypoint.sh:/opt/airflow/script/entrypoint.sh
      - ./requirements.txt:/opt/airflow/requirements.txt
    ports:
      - "8080:8080"
    healthcheck:
      test: ['CMD-SHELL', "[ -f /opt/airflow/airflow-webserver.pid ]"]
      interval: 30s
      timeout: 30s
      retries: 3
    networks:
      - confluent

  scheduler:
    image: apache/airflow:2.6.0-python3.9
    depends_on:
      webserver:
        condition: service_healthy
    volumes:
      - ./dags:/opt/airflow/dags
      - ./script/entrypoint.sh:/opt/airflow/script/entrypoint.sh
      - ./requirements.txt:/opt/airflow/requirements.txt
    environment:
      - LOAD_EX=n
      - EXECUTOR=Sequential
      - AIRFLOW__DATABASE__SQL_ALCHEMY_CONN=postgresql+psycopg2://airflow:airflow@postgres:5432/airflow
      - AIRFLOW__WEBSERVER__SECRET_KEY=this_is_a_very_secured_key
    command: bash -c "pip install -r ./requirements.txt && airflow db upgrade && airflow scheduler"
    networks:
      - confluent

  postgres:
    image: postgres:14.0
    environment:
      - POSTGRES_USER=airflow
      - POSTGRES_PASSWORD=airflow
      - POSTGRES_DB=airflow
    logging:
      options:
        max-size: 10m
        max-file: "3"
    networks:
      - confluent

  spark-master:
    image: bitnami/spark:latest
    command: bin/spark-class org.apache.spark.deploy.master.Master
    ports:
      - "9090:8080"
      - "7077:7077"
    networks:
      - confluent

  spark-worker:
    image: bitnami/spark:latest
    command: bin/spark-class org.apache.spark.deploy.worker.Worker spark://spark-master:7077
    depends_on:
      - spark-master
    environment:
      SPARK_MODE: worker
      SPARK_WORKER_CORES: 2
      SPARK_WORKER_MEMORY: 1g
      SPARK_MASTER_URL: spark://spark-master:7077
    networks:
      - confluent

  cassandra_db:
    image: cassandra:latest
    container_name: cassandra
    hostname: cassandra
    ports:
      - "9042:9042"
    environment:
      - MAX_HEAP_SIZE=512M
      - HEAP_NEWSIZE=100M
      - CASSANDRA_USERNAME=cassandra
      - CASSANDRA_PASSWORD=cassandra
    networks:
      - confluent

  myapp:
    image: myapp:latest
    depends_on:
      - broker # the service needs to wait for the Kafka broker
    networks:
      - confluent

networks:
  confluent:

--------------------------------------------------------------------------------
/dags/kafka_stream.py:
--------------------------------------------------------------------------------
import uuid
from datetime import datetime

from airflow import DAG
from airflow.operators.python import PythonOperator

default_args = {
    'owner': 'airscholar',
    'start_date': datetime(2023, 9, 3, 10, 00)
}


def get_data():
    import requests

    res = requests.get("https://randomuser.me/api/")
    res = res.json()
    res = res['results'][0]

    return res


def format_data(res):
    data = {}
    location = res['location']
    data['id'] = str(uuid.uuid4())  # str() keeps the payload JSON-serializable
    data['first_name'] = res['name']['first']
    data['last_name'] = res['name']['last']
    data['gender'] = res['gender']
    data['address'] = f"{str(location['street']['number'])} {location['street']['name']}, " \
                      f"{location['city']}, {location['state']}, {location['country']}"
    data['post_code'] = location['postcode']
    data['email'] = res['email']
    data['username'] = res['login']['username']
    data['dob'] = res['dob']['date']
    data['registered_date'] = res['registered']['date']
    data['phone'] = res['phone']
    data['picture'] = res['picture']['medium']

    return data


def stream_data():
    import json
    import logging
    import time

    from kafka import KafkaProducer

    producer = KafkaProducer(bootstrap_servers=['broker:29092'], max_block_ms=5000)
    curr_time = time.time()

    while True:
        if time.time() > curr_time + 60:  # stream for 1 minute
            break
        try:
            res = get_data()
            res = format_data(res)

            producer.send('users_created', json.dumps(res).encode('utf-8'))
        except Exception as e:
            logging.error(f'An error occurred: {e}')
            continue


with DAG('user_automation',
         default_args=default_args,
         schedule_interval='@daily',
         catchup=False) as dag:

    streaming_task = PythonOperator(
        task_id='stream_data_from_api',
        python_callable=stream_data
    )

--------------------------------------------------------------------------------
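For reference, each message on `users_created` is a flat JSON object produced by `format_data`. The record below is purely illustrative (the values are made up, not real API output), but the field names are the ones the Cassandra table and the Spark schema in `spark_stream.py` expect.

```python
# Illustrative payload only -- values are hypothetical, field names mirror format_data().
sample_event = {
    "id": "0b7fa7a6-2c1e-4f6a-9a1d-3c5d2e8b9f10",
    "first_name": "Ada",
    "last_name": "Lovelace",
    "gender": "female",
    "address": "12 Example Street, Springfield, Oregon, United States",
    "post_code": "97401",
    "email": "ada.lovelace@example.com",
    "username": "ada123",
    "dob": "1990-01-01T00:00:00.000Z",
    "registered_date": "2020-05-17T08:30:00.000Z",
    "phone": "555-0100",
    "picture": "https://randomuser.me/api/portraits/med/women/1.jpg",
}
```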
/dockerfile:
--------------------------------------------------------------------------------
# Use an appropriate base image, for instance, python:3.x
FROM python:3.9-slim

# Install any OS-level dependencies if necessary
RUN apt-get update && apt-get install -y \
    gcc \
    python3-dev \
    && apt-get clean && rm -rf /var/lib/apt/lists/*

# Set the working directory in the container
WORKDIR /app

# Copy the requirements file into the container
COPY requirements.txt .

# Install Python dependencies with force reinstall
RUN pip install --no-cache-dir --force-reinstall -r requirements.txt

# Copy the entire project into the container (this can be narrowed down as required)
COPY . .

# The default command to run when starting the container
CMD ["python", "./spark_stream.py"]

--------------------------------------------------------------------------------
/image/architecture.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/moontucer/Data-Streaming-Project/f529dfbc60e2ddf583f1e54fc9cbada90fe39f7e/image/architecture.png

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
aiohttp==3.8.5
aiosignal==1.3.1
alembic==1.12.0
anyio==4.0.0
apache-airflow==2.7.0
apache-airflow-providers-common-sql==1.7.1
apache-airflow-providers-ftp==3.5.1
apache-airflow-providers-http==4.5.1
apache-airflow-providers-imap==3.3.1
apache-airflow-providers-sqlite==3.4.3
apispec==6.3.0
argcomplete==3.1.1
asgiref==3.7.2
async-timeout==4.0.3
attrs==23.1.0
Babel==2.12.1
backoff==2.2.1
blinker==1.6.2
cachelib==0.9.0
cassandra-driver==3.28.0
cattrs==23.1.2
certifi==2023.7.22
cffi==1.15.1
charset-normalizer==3.2.0
click==8.1.7
clickclick==20.10.2
colorama==0.4.6
colorlog==4.8.0
ConfigUpdater==3.1.1
connexion==2.14.2
cron-descriptor==1.4.0
croniter==1.4.1
cryptography==41.0.3
Deprecated==1.2.14
dill==0.3.7
dnspython==2.4.2
docutils==0.20.1
email-validator==1.3.1
Flask==2.2.5
Flask-AppBuilder==4.3.3
Flask-Babel==2.0.0
Flask-Caching==2.0.2
Flask-JWT-Extended==4.5.2
Flask-Limiter==3.5.0
Flask-Login==0.6.2
Flask-Session==0.5.0
Flask-SQLAlchemy==2.5.1
Flask-WTF==1.1.1
frozenlist==1.4.0
geomet==0.2.1.post1
google-re2==1.1
googleapis-common-protos==1.60.0
graphviz==0.20.1
grpcio==1.57.0
gunicorn==21.2.0
h11==0.14.0
httpcore==0.17.3
httpx==0.24.1
idna==3.4
importlib-resources==6.0.1
inflection==0.5.1
itsdangerous==2.1.2
Jinja2==3.1.2
jsonschema==4.19.0
jsonschema-specifications==2023.7.1
kafka-python==2.0.2
lazy-object-proxy==1.9.0
limits==3.6.0
linkify-it-py==2.0.2
lockfile==0.12.2
Mako==1.2.4
Markdown==3.4.4
markdown-it-py==3.0.0
MarkupSafe==2.1.3
marshmallow==3.20.1
marshmallow-oneofschema==3.0.1
marshmallow-sqlalchemy==0.26.1
mdit-py-plugins==0.4.0
mdurl==0.1.2
multidict==6.0.4
opentelemetry-api==1.15.0
opentelemetry-exporter-otlp==1.15.0
opentelemetry-exporter-otlp-proto-grpc==1.15.0
opentelemetry-exporter-otlp-proto-http==1.15.0
opentelemetry-proto==1.15.0
opentelemetry-sdk==1.15.0
opentelemetry-semantic-conventions==0.36b0
ordered-set==4.1.0
packaging==23.1
pathspec==0.11.2
pendulum==2.1.2
pluggy==1.3.0
prison==0.2.1
protobuf==4.24.2
psutil==5.9.5
py4j==0.10.9.7
pycparser==2.21
pydantic==1.10.12
Pygments==2.16.1
PyJWT==2.8.0
pyspark==3.4.1
python-daemon==3.0.1
python-dateutil==2.8.2
python-nvd3==0.15.0
python-slugify==8.0.1
pytz==2023.3
pytzdata==2020.1
PyYAML==6.0.1
referencing==0.30.2
requests==2.31.0
requests-toolbelt==1.0.0
rfc3339-validator==0.1.4
rich==13.5.2
rich-argparse==1.3.0
rpds-py==0.10.0
setproctitle==1.3.2
six==1.16.0
sniffio==1.3.0
SQLAlchemy==1.4.49
SQLAlchemy-JSONField==1.0.1.post0
SQLAlchemy-Utils==0.41.1
sqlparse==0.4.4
tabulate==0.9.0
tenacity==8.2.3
termcolor==2.3.0
text-unidecode==1.3
typing_extensions==4.7.1
uc-micro-py==1.0.2
unicodecsv==0.14.1
urllib3==2.0.4
Werkzeug==2.2.3
wrapt==1.15.0
WTForms==3.0.1
yarl==1.9.2

--------------------------------------------------------------------------------
/script/entrypoint.sh:
--------------------------------------------------------------------------------
#!/bin/bash
set -e

if [ -e "/opt/airflow/requirements.txt" ]; then
  python -m pip install --upgrade pip
  $(command -v pip) install --user -r /opt/airflow/requirements.txt
fi

if [ ! -f "/opt/airflow/airflow.db" ]; then
  airflow db init && \
  airflow users create \
    --username admin \
    --firstname admin \
    --lastname admin \
    --role Admin \
    --email admin@example.com \
    --password admin
fi

$(command -v airflow) db upgrade

exec airflow webserver

--------------------------------------------------------------------------------
/spark_stream.py:
--------------------------------------------------------------------------------
import logging

from cassandra.cluster import Cluster
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col
from pyspark.sql.types import StructType, StructField, StringType


def create_keyspace(session):
    session.execute("""
        CREATE KEYSPACE IF NOT EXISTS spark_streams
        WITH replication = {'class': 'SimpleStrategy', 'replication_factor': '1'};
    """)

    print("Keyspace created successfully!")


def create_table(session):
    session.execute("""
    CREATE TABLE IF NOT EXISTS spark_streams.created_users (
        id UUID PRIMARY KEY,
        first_name TEXT,
        last_name TEXT,
        gender TEXT,
        address TEXT,
        post_code TEXT,
        email TEXT,
        username TEXT,
        dob TEXT,
        registered_date TEXT,
        phone TEXT,
        picture TEXT);
    """)

    print("Table created successfully!")


def insert_data(session, **kwargs):
    print("inserting data...")

    user_id = kwargs.get('id')
    first_name = kwargs.get('first_name')
    last_name = kwargs.get('last_name')
    gender = kwargs.get('gender')
    address = kwargs.get('address')
    postcode = kwargs.get('post_code')
    email = kwargs.get('email')
    username = kwargs.get('username')
    dob = kwargs.get('dob')
    registered_date = kwargs.get('registered_date')
    phone = kwargs.get('phone')
    picture = kwargs.get('picture')

    try:
        session.execute("""
            INSERT INTO spark_streams.created_users(id, first_name, last_name, gender, address,
                post_code, email, username, dob, registered_date, phone, picture)
                VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
        """, (user_id, first_name, last_name, gender, address,
              postcode, email, username, dob, registered_date, phone, picture))
        logging.info(f"Data inserted for {first_name} {last_name}")

    except Exception as e:
        logging.error(f'could not insert data due to {e}')


def create_spark_connection():
    s_conn = None

    try:
        s_conn = SparkSession.builder \
            .appName('SparkDataStreaming') \
            .config('spark.jars.packages', "com.datastax.spark:spark-cassandra-connector_2.13:3.4.1,"
                                           "org.apache.spark:spark-sql-kafka-0-10_2.13:3.4.1") \
            .config('spark.cassandra.connection.host', 'localhost') \
            .getOrCreate()

        s_conn.sparkContext.setLogLevel("ERROR")
        logging.info("Spark connection created successfully!")
    except Exception as e:
        logging.error(f"Couldn't create the spark session due to exception {e}")

    return s_conn


def connect_to_kafka(spark_conn):
    spark_df = None
    try:
        spark_df = spark_conn.readStream \
            .format('kafka') \
            .option('kafka.bootstrap.servers', 'localhost:9092') \
            .option('subscribe', 'users_created') \
            .option('startingOffsets', 'earliest') \
            .load()
        logging.info("kafka dataframe created successfully")
    except Exception as e:
        logging.warning(f"kafka dataframe could not be created because: {e}")

    return spark_df


def create_cassandra_connection():
    try:
        # connecting to the cassandra cluster
        cluster = Cluster(['localhost'])

        cas_session = cluster.connect()

        return cas_session
    except Exception as e:
        logging.error(f"Could not create cassandra connection due to {e}")
        return None


def create_selection_df_from_kafka(spark_df):
    schema = StructType([
        StructField("id", StringType(), False),
        StructField("first_name", StringType(), False),
        StructField("last_name", StringType(), False),
        StructField("gender", StringType(), False),
        StructField("address", StringType(), False),
        StructField("post_code", StringType(), False),
        StructField("email", StringType(), False),
        StructField("username", StringType(), False),
        StructField("dob", StringType(), False),
        StructField("registered_date", StringType(), False),
        StructField("phone", StringType(), False),
        StructField("picture", StringType(), False)
    ])

    sel = spark_df.selectExpr("CAST(value AS STRING)") \
        .select(from_json(col('value'), schema).alias('data')).select("data.*")
    print(sel)

    return sel


if __name__ == "__main__":
    # create spark connection
    spark_conn = create_spark_connection()

    if spark_conn is not None:
        # connect to kafka with spark connection
        spark_df = connect_to_kafka(spark_conn)
        selection_df = create_selection_df_from_kafka(spark_df)
        session = create_cassandra_connection()

        if session is not None:
            create_keyspace(session)
            create_table(session)

            logging.info("Streaming is being started...")

            streaming_query = (selection_df.writeStream.format("org.apache.spark.sql.cassandra")
                               .option('checkpointLocation', '/tmp/checkpoint')
                               .option('keyspace', 'spark_streams')
                               .option('table', 'created_users')
                               .start())

            streaming_query.awaitTermination()

--------------------------------------------------------------------------------
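To check the end of the pipeline, a few rows can be read back from `spark_streams.created_users` once the streaming job has run. This is a minimal sketch, assuming `cassandra-driver` is installed on the host and Cassandra's `9042` port is exposed on `localhost` as configured in `compose.yml`.

```python
# verify_cassandra.py -- read back a few rows written by the streaming job.
# Assumes `pip install cassandra-driver` and Cassandra reachable on localhost:9042.
from cassandra.cluster import Cluster

cluster = Cluster(['localhost'])
session = cluster.connect('spark_streams')  # keyspace created by spark_stream.py

rows = session.execute("SELECT id, first_name, last_name, email FROM created_users LIMIT 5")
for row in rows:
    print(row.id, row.first_name, row.last_name, row.email)

cluster.shutdown()
```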