├── .github
│   └── workflows
│       └── docker-image.yml
├── .gitignore
├── README.md
├── compose.yml
├── dags
│   └── kafka_stream.py
├── dockerfile
├── image
│   └── architecture.png
├── requirements.txt
├── script
│   └── entrypoint.sh
└── spark_stream.py

/.github/workflows/docker-image.yml:
--------------------------------------------------------------------------------
name: Build and Push Docker Image

on:
  push:
    branches:
      - main

jobs:
  build:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout code
        uses: actions/checkout@v2

      - name: Login to Docker Hub
        uses: docker/login-action@v1
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_TOKEN }}

      - name: Build and push Docker image
        uses: docker/build-push-action@v2
        with:
          context: .
          file: ./dockerfile
          push: true
          tags: labchiri/data-streaming

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
venv

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
## Architecture

![Architecture diagram](image/architecture.png)

## Description

I used Airflow, PostgreSQL, Kafka, Spark and Cassandra to build a fully automated ETL pipeline that runs entirely in containers, with a GitHub Actions CI workflow that rebuilds the service's Docker image and pushes it to Docker Hub on every push to `main`.

## Get Started

- Clone the repository
  - `git clone https://github.com/moontucer/Data-Streaming-Project/`
- Go to the project folder
  - `cd Data-Streaming-Project`
- Build the environment with Docker Compose (a quick smoke test follows below)
  - `docker compose up`
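Once the stack is up, a quick way to confirm that the `user_automation` DAG is publishing events is to read a few messages back from the `users_created` topic. The snippet below is a minimal sketch, assuming `kafka-python` is installed on the host and the broker's `9092` port is exposed on `localhost` (as configured in `compose.yml`); trigger the DAG from the Airflow UI at `http://localhost:8080` first.

```python
# smoke_test_kafka.py -- minimal check that events reach the users_created topic.
# Assumes `pip install kafka-python` on the host and the broker exposed on localhost:9092.
import json

from kafka import KafkaConsumer

consumer = KafkaConsumer(
    'users_created',
    bootstrap_servers=['localhost:9092'],
    auto_offset_reset='earliest',   # read from the beginning of the topic
    consumer_timeout_ms=10000,      # stop iterating after 10 seconds of silence
)

for i, message in enumerate(consumer):
    print(json.loads(message.value.decode('utf-8')))
    if i >= 4:                      # five messages are enough for a smoke test
        break

consumer.close()
```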
## Link to the Medium article

https://medium.com/@moontucer/data-streaming-project-real-time-end-to-end-data-pipeline-082f0d9cfbdb

--------------------------------------------------------------------------------
/compose.yml:
--------------------------------------------------------------------------------
version: '3'

services:
  zookeeper:
    image: confluentinc/cp-zookeeper:7.4.0
    hostname: zookeeper
    container_name: zookeeper
    ports:
      - "2181:2181"
    environment:
      ZOOKEEPER_CLIENT_PORT: 2181
      ZOOKEEPER_TICK_TIME: 2000
    healthcheck:
      test: ['CMD', 'bash', '-c', "echo 'ruok' | nc localhost 2181"]
      interval: 10s
      timeout: 5s
      retries: 5
    networks:
      - confluent

  broker:
    image: confluentinc/cp-server:7.4.0
    hostname: broker
    container_name: broker
    depends_on:
      zookeeper:
        condition: service_healthy
    ports:
      - "9092:9092"
      - "9101:9101"
    environment:
      KAFKA_BROKER_ID: 1
      KAFKA_ZOOKEEPER_CONNECT: 'zookeeper:2181'
      KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: PLAINTEXT:PLAINTEXT,PLAINTEXT_HOST:PLAINTEXT
      KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://broker:29092,PLAINTEXT_HOST://localhost:9092
      KAFKA_METRIC_REPORTERS: io.confluent.metrics.reporter.ConfluentMetricsReporter
      KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1
      KAFKA_GROUP_INITIAL_REBALANCE_DELAY_MS: 0
      KAFKA_CONFLUENT_LICENSE_TOPIC_REPLICATION_FACTOR: 1
      KAFKA_CONFLUENT_BALANCER_TOPIC_REPLICATION_FACTOR: 1
      KAFKA_TRANSACTION_STATE_LOG_MIN_ISR: 1
      KAFKA_TRANSACTION_STATE_LOG_REPLICATION_FACTOR: 1
      KAFKA_JMX_PORT: 9101
      KAFKA_JMX_HOSTNAME: localhost
      KAFKA_CONFLUENT_SCHEMA_REGISTRY_URL: http://schema-registry:8081
      CONFLUENT_METRICS_REPORTER_BOOTSTRAP_SERVERS: broker:29092
      CONFLUENT_METRICS_REPORTER_TOPIC_REPLICAS: 1
      CONFLUENT_METRICS_ENABLE: 'false'
      CONFLUENT_SUPPORT_CUSTOMER_ID: 'anonymous'
    networks:
      - confluent
    healthcheck:
      test: [ "CMD", "bash", "-c", 'nc -z localhost 9092' ]
      interval: 10s
      timeout: 5s
      retries: 5

  schema-registry:
    image: confluentinc/cp-schema-registry:7.4.0
    hostname: schema-registry
    container_name: schema-registry
    depends_on:
      broker:
        condition: service_healthy
    ports:
      - "8081:8081"
    environment:
      SCHEMA_REGISTRY_HOST_NAME: schema-registry
      SCHEMA_REGISTRY_KAFKASTORE_BOOTSTRAP_SERVERS: 'broker:29092'
      SCHEMA_REGISTRY_LISTENERS: http://0.0.0.0:8081
    networks:
      - confluent
    healthcheck:
      test: [ "CMD", "curl", "-f", "http://localhost:8081/" ]
      interval: 30s
      timeout: 10s
      retries: 5

  control-center:
    image: confluentinc/cp-enterprise-control-center:7.4.0
    hostname: control-center
    container_name: control-center
    depends_on:
      broker:
        condition: service_healthy
      schema-registry:
        condition: service_healthy
    ports:
      - "9021:9021"
    environment:
      CONTROL_CENTER_BOOTSTRAP_SERVERS: 'broker:29092'
      CONTROL_CENTER_SCHEMA_REGISTRY_URL: "http://schema-registry:8081"
      CONTROL_CENTER_REPLICATION_FACTOR: 1
      CONTROL_CENTER_INTERNAL_TOPICS_PARTITIONS: 1
      CONTROL_CENTER_MONITORING_INTERCEPTOR_TOPIC_PARTITIONS: 1
      CONFLUENT_METRICS_TOPIC_REPLICATION: 1
      CONFLUENT_METRICS_ENABLE: 'false'
      PORT: 9021
    networks:
      - confluent
    healthcheck:
      test: [ "CMD", "curl", "-f", "http://localhost:9021/health" ]
      interval: 30s
      timeout: 10s
      retries: 5

  webserver:
    image: apache/airflow:2.6.0-python3.9
    command: webserver
    entrypoint: ['/opt/airflow/script/entrypoint.sh']
    depends_on:
      - postgres
    environment:
      - LOAD_EX=n
      - EXECUTOR=Sequential
      - AIRFLOW__DATABASE__SQL_ALCHEMY_CONN=postgresql+psycopg2://airflow:airflow@postgres:5432/airflow
      - AIRFLOW__WEBSERVER__SECRET_KEY=this_is_a_very_secured_key
    logging:
      options:
        max-size: 10m
        max-file: "3"
    volumes:
      - ./dags:/opt/airflow/dags
      - ./script/entrypoint.sh:/opt/airflow/script/entrypoint.sh
      - ./requirements.txt:/opt/airflow/requirements.txt
    ports:
      - "8080:8080"
    healthcheck:
      test: ['CMD-SHELL', "[ -f /opt/airflow/airflow-webserver.pid ]"]
      interval: 30s
      timeout: 30s
      retries: 3
    networks:
      - confluent

  scheduler:
    image: apache/airflow:2.6.0-python3.9
    depends_on:
      webserver:
        condition: service_healthy
    volumes:
      - ./dags:/opt/airflow/dags
      - ./script/entrypoint.sh:/opt/airflow/script/entrypoint.sh
      - ./requirements.txt:/opt/airflow/requirements.txt
    environment:
      - LOAD_EX=n
      - EXECUTOR=Sequential
      - AIRFLOW__DATABASE__SQL_ALCHEMY_CONN=postgresql+psycopg2://airflow:airflow@postgres:5432/airflow
      - AIRFLOW__WEBSERVER__SECRET_KEY=this_is_a_very_secured_key
    command: bash -c "pip install -r ./requirements.txt && airflow db upgrade && airflow scheduler"
    networks:
      - confluent

  postgres:
    image: postgres:14.0
    environment:
      - POSTGRES_USER=airflow
      - POSTGRES_PASSWORD=airflow
      - POSTGRES_DB=airflow
    logging:
      options:
        max-size: 10m
        max-file: "3"
    networks:
      - confluent

  spark-master:
    image: bitnami/spark:latest
    command: bin/spark-class org.apache.spark.deploy.master.Master
    ports:
      - "9090:8080"
      - "7077:7077"
    networks:
      - confluent

  spark-worker:
    image: bitnami/spark:latest
    command: bin/spark-class org.apache.spark.deploy.worker.Worker spark://spark-master:7077
    depends_on:
      - spark-master
    environment:
      SPARK_MODE: worker
      SPARK_WORKER_CORES: 2
      SPARK_WORKER_MEMORY: 1g
      SPARK_MASTER_URL: spark://spark-master:7077
    networks:
      - confluent

  cassandra_db:
    image: cassandra:latest
    container_name: cassandra
    hostname: cassandra
    ports:
      - "9042:9042"
    environment:
      - MAX_HEAP_SIZE=512M
      - HEAP_NEWSIZE=100M
      - CASSANDRA_USERNAME=cassandra
      - CASSANDRA_PASSWORD=cassandra
    networks:
      - confluent

  myapp:
    image: myapp:latest
    depends_on:
      - broker # the service needs to wait for the Kafka broker
    networks:
      - confluent

networks:
  confluent:

--------------------------------------------------------------------------------
/dags/kafka_stream.py:
--------------------------------------------------------------------------------
import uuid
from datetime import datetime

from airflow import DAG
from airflow.operators.python import PythonOperator

default_args = {
    'owner': 'airscholar',
    'start_date': datetime(2023, 9, 3, 10, 00)
}


def get_data():
    import requests

    res = requests.get("https://randomuser.me/api/")
    res = res.json()
    res = res['results'][0]

    return res


def format_data(res):
    data = {}
    location = res['location']
    data['id'] = str(uuid.uuid4())  # str() keeps the payload JSON-serializable
    data['first_name'] = res['name']['first']
    data['last_name'] = res['name']['last']
    data['gender'] = res['gender']
    data['address'] = f"{str(location['street']['number'])} {location['street']['name']}, " \
                      f"{location['city']}, {location['state']}, {location['country']}"
    data['post_code'] = location['postcode']
    data['email'] = res['email']
    data['username'] = res['login']['username']
    data['dob'] = res['dob']['date']
    data['registered_date'] = res['registered']['date']
    data['phone'] = res['phone']
    data['picture'] = res['picture']['medium']

    return data


def stream_data():
    import json
    import logging
    import time

    from kafka import KafkaProducer

    producer = KafkaProducer(bootstrap_servers=['broker:29092'], max_block_ms=5000)
    curr_time = time.time()

    while True:
        if time.time() > curr_time + 60:  # stream for 1 minute
            break
        try:
            res = get_data()
            res = format_data(res)

            producer.send('users_created', json.dumps(res).encode('utf-8'))
        except Exception as e:
            logging.error(f'An error occurred: {e}')
            continue


with DAG('user_automation',
         default_args=default_args,
         schedule_interval='@daily',
         catchup=False) as dag:

    streaming_task = PythonOperator(
        task_id='stream_data_from_api',
        python_callable=stream_data
    )

--------------------------------------------------------------------------------
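For reference, each message on `users_created` is a flat JSON object produced by `format_data`. The record below is purely illustrative (the values are made up, not real API output), but the field names are the ones the Cassandra table and the Spark schema in `spark_stream.py` expect.

```python
# Illustrative payload only -- values are hypothetical, field names mirror format_data().
sample_event = {
    "id": "0b7fa7a6-2c1e-4f6a-9a1d-3c5d2e8b9f10",
    "first_name": "Ada",
    "last_name": "Lovelace",
    "gender": "female",
    "address": "12 Example Street, Springfield, Oregon, United States",
    "post_code": "97401",
    "email": "ada.lovelace@example.com",
    "username": "ada123",
    "dob": "1990-01-01T00:00:00.000Z",
    "registered_date": "2020-05-17T08:30:00.000Z",
    "phone": "555-0100",
    "picture": "https://randomuser.me/api/portraits/med/women/1.jpg",
}
```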
/dockerfile:
--------------------------------------------------------------------------------
# Use an appropriate base image, for instance, python:3.x
FROM python:3.9-slim

# Install any OS-level dependencies if necessary
RUN apt-get update && apt-get install -y \
    gcc \
    python3-dev \
    && apt-get clean && rm -rf /var/lib/apt/lists/*

# Set the working directory in the container
WORKDIR /app

# Copy the requirements file into the container
COPY requirements.txt .

# Install Python dependencies with force reinstall
RUN pip install --no-cache-dir --force-reinstall -r requirements.txt

# Copy the entire project into the container (this can be narrowed down as required)
COPY . .

# The default command to run when starting the container
CMD ["python", "./spark_stream.py"]

--------------------------------------------------------------------------------
/image/architecture.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/moontucer/Data-Streaming-Project/f529dfbc60e2ddf583f1e54fc9cbada90fe39f7e/image/architecture.png

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
aiohttp==3.8.5
aiosignal==1.3.1
alembic==1.12.0
anyio==4.0.0
apache-airflow==2.7.0
apache-airflow-providers-common-sql==1.7.1
apache-airflow-providers-ftp==3.5.1
apache-airflow-providers-http==4.5.1
apache-airflow-providers-imap==3.3.1
apache-airflow-providers-sqlite==3.4.3
apispec==6.3.0
argcomplete==3.1.1
asgiref==3.7.2
async-timeout==4.0.3
attrs==23.1.0
Babel==2.12.1
backoff==2.2.1
blinker==1.6.2
cachelib==0.9.0
cassandra-driver==3.28.0
cattrs==23.1.2
certifi==2023.7.22
cffi==1.15.1
charset-normalizer==3.2.0
click==8.1.7
clickclick==20.10.2
colorama==0.4.6
colorlog==4.8.0
ConfigUpdater==3.1.1
connexion==2.14.2
cron-descriptor==1.4.0
croniter==1.4.1
cryptography==41.0.3
Deprecated==1.2.14
dill==0.3.7
dnspython==2.4.2
docutils==0.20.1
email-validator==1.3.1
Flask==2.2.5
Flask-AppBuilder==4.3.3
Flask-Babel==2.0.0
Flask-Caching==2.0.2
Flask-JWT-Extended==4.5.2
Flask-Limiter==3.5.0
Flask-Login==0.6.2
Flask-Session==0.5.0
Flask-SQLAlchemy==2.5.1
Flask-WTF==1.1.1
frozenlist==1.4.0
geomet==0.2.1.post1
google-re2==1.1
googleapis-common-protos==1.60.0
graphviz==0.20.1
grpcio==1.57.0
gunicorn==21.2.0
h11==0.14.0
httpcore==0.17.3
httpx==0.24.1
idna==3.4
importlib-resources==6.0.1
inflection==0.5.1
itsdangerous==2.1.2
Jinja2==3.1.2
jsonschema==4.19.0
jsonschema-specifications==2023.7.1
kafka-python==2.0.2
lazy-object-proxy==1.9.0
limits==3.6.0
linkify-it-py==2.0.2
lockfile==0.12.2
Mako==1.2.4
Markdown==3.4.4
markdown-it-py==3.0.0
MarkupSafe==2.1.3
marshmallow==3.20.1
marshmallow-oneofschema==3.0.1
marshmallow-sqlalchemy==0.26.1
mdit-py-plugins==0.4.0
mdurl==0.1.2
multidict==6.0.4
opentelemetry-api==1.15.0
opentelemetry-exporter-otlp==1.15.0
opentelemetry-exporter-otlp-proto-grpc==1.15.0
opentelemetry-exporter-otlp-proto-http==1.15.0
opentelemetry-proto==1.15.0
opentelemetry-sdk==1.15.0
opentelemetry-semantic-conventions==0.36b0
ordered-set==4.1.0
packaging==23.1
pathspec==0.11.2
pendulum==2.1.2
pluggy==1.3.0
prison==0.2.1
protobuf==4.24.2
psutil==5.9.5
py4j==0.10.9.7
pycparser==2.21
pydantic==1.10.12
Pygments==2.16.1
PyJWT==2.8.0
pyspark==3.4.1
python-daemon==3.0.1
python-dateutil==2.8.2
python-nvd3==0.15.0
python-slugify==8.0.1
pytz==2023.3
pytzdata==2020.1
PyYAML==6.0.1
referencing==0.30.2
requests==2.31.0
requests-toolbelt==1.0.0
rfc3339-validator==0.1.4
rich==13.5.2
rich-argparse==1.3.0
rpds-py==0.10.0
setproctitle==1.3.2
six==1.16.0
sniffio==1.3.0
SQLAlchemy==1.4.49
SQLAlchemy-JSONField==1.0.1.post0
SQLAlchemy-Utils==0.41.1
sqlparse==0.4.4
tabulate==0.9.0
tenacity==8.2.3
termcolor==2.3.0
text-unidecode==1.3
typing_extensions==4.7.1
uc-micro-py==1.0.2
unicodecsv==0.14.1
urllib3==2.0.4
Werkzeug==2.2.3
wrapt==1.15.0
WTForms==3.0.1
yarl==1.9.2

--------------------------------------------------------------------------------
/script/entrypoint.sh:
--------------------------------------------------------------------------------
#!/bin/bash
set -e

if [ -e "/opt/airflow/requirements.txt" ]; then
  python -m pip install --upgrade pip
  $(command -v pip) install --user -r /opt/airflow/requirements.txt
fi

if [ ! -f "/opt/airflow/airflow.db" ]; then
  airflow db init && \
  airflow users create \
    --username admin \
    --firstname admin \
    --lastname admin \
    --role Admin \
    --email admin@example.com \
    --password admin
fi

$(command -v airflow) db upgrade

exec airflow webserver

--------------------------------------------------------------------------------
/spark_stream.py:
--------------------------------------------------------------------------------
import logging

from cassandra.cluster import Cluster
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col
from pyspark.sql.types import StructType, StructField, StringType


def create_keyspace(session):
    session.execute("""
        CREATE KEYSPACE IF NOT EXISTS spark_streams
        WITH replication = {'class': 'SimpleStrategy', 'replication_factor': '1'};
    """)

    print("Keyspace created successfully!")


def create_table(session):
    session.execute("""
    CREATE TABLE IF NOT EXISTS spark_streams.created_users (
        id UUID PRIMARY KEY,
        first_name TEXT,
        last_name TEXT,
        gender TEXT,
        address TEXT,
        post_code TEXT,
        email TEXT,
        username TEXT,
        dob TEXT,
        registered_date TEXT,
        phone TEXT,
        picture TEXT);
    """)

    print("Table created successfully!")


def insert_data(session, **kwargs):
    print("inserting data...")

    user_id = kwargs.get('id')
    first_name = kwargs.get('first_name')
    last_name = kwargs.get('last_name')
    gender = kwargs.get('gender')
    address = kwargs.get('address')
    postcode = kwargs.get('post_code')
    email = kwargs.get('email')
    username = kwargs.get('username')
    dob = kwargs.get('dob')
    registered_date = kwargs.get('registered_date')
    phone = kwargs.get('phone')
    picture = kwargs.get('picture')

    try:
        session.execute("""
            INSERT INTO spark_streams.created_users(id, first_name, last_name, gender, address,
                post_code, email, username, dob, registered_date, phone, picture)
                VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
        """, (user_id, first_name, last_name, gender, address,
              postcode, email, username, dob, registered_date, phone, picture))
        logging.info(f"Data inserted for {first_name} {last_name}")

    except Exception as e:
        logging.error(f'could not insert data due to {e}')


def create_spark_connection():
    s_conn = None

    try:
        s_conn = SparkSession.builder \
            .appName('SparkDataStreaming') \
            .config('spark.jars.packages', "com.datastax.spark:spark-cassandra-connector_2.13:3.4.1,"
                                           "org.apache.spark:spark-sql-kafka-0-10_2.13:3.4.1") \
            .config('spark.cassandra.connection.host', 'localhost') \
            .getOrCreate()

        s_conn.sparkContext.setLogLevel("ERROR")
        logging.info("Spark connection created successfully!")
    except Exception as e:
        logging.error(f"Couldn't create the spark session due to exception {e}")

    return s_conn


def connect_to_kafka(spark_conn):
    spark_df = None
    try:
        spark_df = spark_conn.readStream \
            .format('kafka') \
            .option('kafka.bootstrap.servers', 'localhost:9092') \
            .option('subscribe', 'users_created') \
            .option('startingOffsets', 'earliest') \
            .load()
        logging.info("kafka dataframe created successfully")
    except Exception as e:
        logging.warning(f"kafka dataframe could not be created because: {e}")

    return spark_df


def create_cassandra_connection():
    try:
        # connecting to the cassandra cluster
        cluster = Cluster(['localhost'])

        cas_session = cluster.connect()

        return cas_session
    except Exception as e:
        logging.error(f"Could not create cassandra connection due to {e}")
        return None


def create_selection_df_from_kafka(spark_df):
    schema = StructType([
        StructField("id", StringType(), False),
        StructField("first_name", StringType(), False),
        StructField("last_name", StringType(), False),
        StructField("gender", StringType(), False),
        StructField("address", StringType(), False),
        StructField("post_code", StringType(), False),
        StructField("email", StringType(), False),
        StructField("username", StringType(), False),
        StructField("dob", StringType(), False),
        StructField("registered_date", StringType(), False),
        StructField("phone", StringType(), False),
        StructField("picture", StringType(), False)
    ])

    sel = spark_df.selectExpr("CAST(value AS STRING)") \
        .select(from_json(col('value'), schema).alias('data')).select("data.*")
    print(sel)

    return sel


if __name__ == "__main__":
    # create spark connection
    spark_conn = create_spark_connection()

    if spark_conn is not None:
        # connect to kafka with spark connection
        spark_df = connect_to_kafka(spark_conn)
        selection_df = create_selection_df_from_kafka(spark_df)
        session = create_cassandra_connection()

        if session is not None:
            create_keyspace(session)
            create_table(session)

            logging.info("Streaming is being started...")

            streaming_query = (selection_df.writeStream.format("org.apache.spark.sql.cassandra")
                               .option('checkpointLocation', '/tmp/checkpoint')
                               .option('keyspace', 'spark_streams')
                               .option('table', 'created_users')
                               .start())

            streaming_query.awaitTermination()

--------------------------------------------------------------------------------
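To check the end of the pipeline, a few rows can be read back from `spark_streams.created_users` once the streaming job has run. This is a minimal sketch, assuming `cassandra-driver` is installed on the host and Cassandra's `9042` port is exposed on `localhost` as configured in `compose.yml`.

```python
# verify_cassandra.py -- read back a few rows written by the streaming job.
# Assumes `pip install cassandra-driver` and Cassandra reachable on localhost:9042.
from cassandra.cluster import Cluster

cluster = Cluster(['localhost'])
session = cluster.connect('spark_streams')  # keyspace created by spark_stream.py

rows = session.execute("SELECT id, first_name, last_name, email FROM created_users LIMIT 5")
for row in rows:
    print(row.id, row.first_name, row.last_name, row.email)

cluster.shutdown()
```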