├── .gitignore
├── Dockerfile-kafka-connect
├── README.md
├── clickhouse-client.sh
├── demo-start.sh
├── demo-stop.sh
├── docker-compose.yml
├── ksql-custom-udfs
│   ├── pom.xml
│   └── src
│       └── main
│           └── java
│               └── io
│                   └── streamthoughts
│                       └── ksql
│                           └── udfs
│                               ├── ArrayToString.java
│                               └── ExtractArrayField.java
└── sql
    ├── ch_create_table_tweets.sql
    ├── ksql_create_connector_sink_jdbc_clickhouse.sql
    ├── ksql_create_connector_source_twitter.sql
    ├── ksql_create_stream_tweets.sql
    ├── ksql_create_stream_tweets_json.sql
    └── ksql_create_stream_tweets_normalized.sql

/.gitignore:
--------------------------------------------------------------------------------
### Scala template
*.class
*.log

# sbt specific
.cache
.history
.lib/
dist/*
target/
lib_managed/
src_managed/
project/boot/
project/plugins/project/

# Scala-IDE specific
.scala_dependencies
.worksheet
### Java template
*.class

# Mobile Tools for Java (J2ME)
.mtj.tmp/

# Package Files #
*.jar
*.war
*.ear

# virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml
hs_err_pid*
### Maven template
target/
pom.xml.tag
pom.xml.releaseBackup
pom.xml.versionsBackup
pom.xml.next
release.properties
dependency-reduced-pom.xml
buildNumber.properties
.mvn/timing.properties
### Eclipse template
*.pydevproject
.metadata
.gradle
tmp/
*.tmp
*.bak
*.swp
*~.nib
local.properties
.settings/
.loadpath

# Eclipse Core
.project

# External tool builders
.externalToolBuilders/

# Locally stored "Eclipse launch configurations"
*.launch

# CDT-specific
.cproject

# JDT-specific (Eclipse Java Development Tools)
.classpath

# Java annotation processor (APT)
.factorypath

# PDT-specific
.buildpath

# sbteclipse plugin
.target

# TeXlipse plugin
.texlipse
### JetBrains template
# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio

*.iml

## Directory-based project format:
.idea/
# if you remove the above rule, at least ignore the following:

# User-specific stuff:
# .idea/workspace.xml
# .idea/tasks.xml
# .idea/dictionaries

# Sensitive or high-churn files:
# .idea/dataSources.ids
# .idea/dataSources.xml
# .idea/sqlDataSources.xml
# .idea/dynamic.xml
# .idea/uiDesigner.xml

# Gradle:
# .idea/gradle.xml
# .idea/libraries

# Mongo Explorer plugin:
# .idea/mongoSettings.xml

## File-based project format:
*.ipr
*.iws

## Plugin-specific files:

# IntelliJ
/out/

# mpeltonen/sbt-idea plugin
.idea_modules/

# JIRA plugin
atlassian-ide-plugin.xml

# Crashlytics plugin (for Android Studio and IntelliJ)
com_crashlytics_export_strings.xml
crashlytics.properties
crashlytics-build.properties

# Created by .ignore support plugin (hsz.mobi)
azkarra-*/docs
azkarra-server/src/main/resources/io/
*/node_modules
site/resources

--------------------------------------------------------------------------------
/Dockerfile-kafka-connect:
--------------------------------------------------------------------------------
FROM confluentinc/cp-kafka-connect-base:5.4.1

ARG clickHouseVersion
ARG connectTwitterVersion

ENV JDBC_DRIVER_PATH=/usr/share/confluent-hub-components/confluentinc-kafka-connect-jdbc/lib
ENV JDBC_DRIVER=clickhouse-jdbc-$clickHouseVersion.jar

RUN confluent-hub install --no-prompt jcustenborder/kafka-connect-twitter:$connectTwitterVersion
RUN confluent-hub install --no-prompt confluentinc/kafka-connect-jdbc:5.4.1

RUN mkdir -p $JDBC_DRIVER_PATH && \
    echo "Downloading JDBC Driver for ClickHouse v$clickHouseVersion" && \
    wget -O ${JDBC_DRIVER_PATH}/${JDBC_DRIVER} https://github.com/ClickHouse/clickhouse-jdbc/releases/download/release_$clickHouseVersion/${JDBC_DRIVER}

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# PoC: Fast Data Analytics Platform with ClickHouse and Apache Kafka


## Prerequisites

* Git
* Maven (we recommend version 3.5.3)
* Java 11
* Docker, Docker Compose

## Project Tree

```
├── clickhouse-client.sh      // Utility script to start a ClickHouse client
├── demo-start.sh             // Utility script to start the project
├── demo-stop.sh              // Utility script to stop the project
├── docker-compose.yml
├── Dockerfile-kafka-connect
├── ksql-custom-udfs          // Maven project containing User Defined Functions (UDFs) for ksqlDB
│   ├── pom.xml
│   └── src
│       └── main
│           └── java
│               └── io
│                   └── streamthoughts
│                       └── ksql
│                           └── udfs
│                               ├── ArrayToString.java
│                               └── ExtractArrayField.java
├── README.md
└── sql                       // ksqlDB and ClickHouse queries
    ├── ch_create_table_tweets.sql
    ├── ksql_create_connector_sink_jdbc_clickhouse.sql
    ├── ksql_create_connector_source_twitter.sql
    ├── ksql_create_stream_tweets_json.sql
    ├── ksql_create_stream_tweets_normalized.sql
    └── ksql_create_stream_tweets.sql
```

## Twitter API OAuth

To run this demo project, you need credentials for the [Twitter API](https://developer.twitter.com/en/docs/basics/authentication/oauth-1-0a).

* Edit the file `./sql/ksql_create_connector_source_twitter.sql` to set your Twitter API credentials:

```sql
CREATE SOURCE CONNECTOR tweeterconnector WITH (
    'connector.class'='com.github.jcustenborder.kafka.connect.twitter.TwitterSourceConnector',
    'twitter.oauth.accessTokenSecret'='%ACCESS_TOKEN_SECRET%',
    'twitter.oauth.consumerSecret'='%CONSUMER_SECRET%',
    'twitter.oauth.accessToken'='%ACCESS_TOKEN%',
    'twitter.oauth.consumerKey'='%CONSUMER_KEY%',
    'kafka.status.topic'='tweets',
    'process.deletes'=false,
    'filter.keywords'='coronavirus,2019nCoV,SARSCoV2,covid19,cov19'
);
```

## Starting the Project

**Start the demonstration:**

```bash
$ ./demo-start.sh
```

**Example ClickHouse SQL queries** (interactively via the helper script, or as a one-off command):

```bash
$ ./clickhouse-client.sh
$ docker exec -it clickhouse bash -c "clickhouse-client -q 'SELECT COUNT(*) AS COUNT, LANG FROM tweets GROUP BY LANG ORDER BY (COUNT) DESC LIMIT 10;'"
```

**Stopping:**

```bash
$ ./demo-stop.sh
```

--------------------------------------------------------------------------------
/clickhouse-client.sh:
--------------------------------------------------------------------------------
#!/bin/bash

set -e

docker exec -it clickhouse bash -c "clickhouse-client --multiline"

--------------------------------------------------------------------------------
/demo-start.sh:
--------------------------------------------------------------------------------
#!/bin/bash

set -e


# Functions
function exec_clickhouse_query() {
    SQL=$1
    echo -e "\n🚀 Executing ClickHouse query\n"
    echo "${SQL}"
    docker exec -it clickhouse bash -c "echo \"$SQL\" | clickhouse-client --multiline"
    echo "----------------------------------------------"
}

function exec_ksql_query() {
    echo -e "\n🚀 Executing KSQL query\n"
    KSQL_QUERY=$1
    docker exec -it ksqldb-server bash -c "echo \"$KSQL_QUERY\" | ksql"
}

# Main
echo "--------------------------------------------------------------------------------"
echo "--- Demo : Fast Analytic Platform with Kafka, ClickHouse & Superset ---"
echo "--------------------------------------------------------------------------------"

export COMPOSE_PROJECT_NAME=demo-twitter-streams

echo -e "\n🐳 Stopping all previously started Docker containers"
docker-compose down -v

echo -e "\n🏭 Building Maven project (cd ksql-custom-udfs; mvn clean -q package)\n"
(cd ksql-custom-udfs; mvn clean -q package)

echo -e "\n🐳 Starting all docker containers"
docker-compose up -d

KAFKA_CONTAINER_NAME=kafka

echo -e "\n⏳ Waiting for Kafka Broker to be up and running"
while true
do
    if [ $(docker logs $KAFKA_CONTAINER_NAME 2>&1 | grep "started (kafka.server.KafkaServer)" >/dev/null; echo $?) -eq 0 ]; then
        echo
        break
    fi
    printf "."
    sleep 1
done;

echo -e "\n⏳ Creating Kafka topics"
docker exec -it kafka kafka-topics --zookeeper zookeeper:2181 --create --topic tweets --partitions 4 --replication-factor 1
docker exec -it kafka kafka-topics --zookeeper zookeeper:2181 --create --topic tweets-normalized --partitions 4 --replication-factor 1

exec_clickhouse_query "$(cat ./sql/ch_create_table_tweets.sql)"

echo -e "\n⏳ Waiting for Kafka Connect to be up and running."
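# Probe the Connect REST API root until it returns HTTP 200: `curl -sI` fetches
# only the response headers, and `cut` extracts the status code from the first
# line of the response.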
while true
do
    res=$(curl -sI http://localhost:8083 | head -n 1 | cut -d$' ' -f2)
    if [ "$res" == "200" ]; then
        echo
        break
    fi
    printf "."
    sleep 1
done;

echo -e "\n⏳ Waiting for KSQL to be available before launching CLI\n"
while [ $(curl -s -o /dev/null -w %{http_code} http://localhost:8088/) -eq 000 ]
do
    echo -e $(date) "KSQL Server HTTP state: " $(curl -s -o /dev/null -w %{http_code} http://localhost:8088/) " (waiting for 200)"
    sleep 5
done

echo -e "\n⏳ Starting JdbcSinkConnector for ClickHouse\n"

exec_ksql_query "$(cat ./sql/ksql_create_connector_sink_jdbc_clickhouse.sql)"

sleep 5

echo -e "\n⏳ Starting TwitterSourceConnector\n"
exec_ksql_query "$(cat ./sql/ksql_create_connector_source_twitter.sql)"

exec_ksql_query "SET 'auto.offset.reset' = 'earliest';"
exec_ksql_query "$(cat ./sql/ksql_create_stream_tweets.sql)"
exec_ksql_query "$(cat ./sql/ksql_create_stream_tweets_normalized.sql)"
exec_ksql_query "$(cat ./sql/ksql_create_stream_tweets_json.sql)"

exit 0

--------------------------------------------------------------------------------
/demo-stop.sh:
--------------------------------------------------------------------------------
#!/bin/bash

set -e

export COMPOSE_PROJECT_NAME=demo-twitter-streams

echo -e "\n🐳 Stopping all previously started Docker containers"
docker-compose down -v

exit 0

--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
#
# Copyright 2020 StreamThoughts.
#
version: '3'
services:
  zookeeper:
    image: confluentinc/cp-zookeeper:5.4.1
    hostname: zookeeper
    container_name: zookeeper
    ports:
      - "2181:2181"
    environment:
      ZOOKEEPER_CLIENT_PORT: 2181
      ZOOKEEPER_TICK_TIME: 2000

  kafka:
    image: confluentinc/cp-kafka:5.4.1
    hostname: kafka
    container_name: kafka
    depends_on:
      - zookeeper
    ports:
      - "29092:29092"
      - "9092:9092"
    environment:
      KAFKA_BROKER_ID: 1
      KAFKA_ZOOKEEPER_CONNECT: 'zookeeper:2181'
      KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: PLAINTEXT:PLAINTEXT,PLAINTEXT_HOST:PLAINTEXT
      KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://kafka:29092,PLAINTEXT_HOST://localhost:9092
      KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1
      KAFKA_GROUP_INITIAL_REBALANCE_DELAY_MS: 0
      KAFKA_TRANSACTION_STATE_LOG_REPLICATION_FACTOR: 1
      KAFKA_TRANSACTION_STATE_LOG_MIN_ISR: 1
      CONFLUENT_METRICS_REPORTER_BOOTSTRAP_SERVERS: kafka:29092
      CONFLUENT_METRICS_REPORTER_ZOOKEEPER_CONNECT: zookeeper:2181
      CONFLUENT_METRICS_REPORTER_TOPIC_REPLICAS: 1
      CONFLUENT_METRICS_ENABLE: 'false'

  schema-registry:
    image: confluentinc/cp-schema-registry:5.4.1
    hostname: schema-registry
    container_name: schema-registry
    depends_on:
      - zookeeper
      - kafka
    ports:
      - "8081:8081"
    environment:
      SCHEMA_REGISTRY_HOST_NAME: schema-registry
      SCHEMA_REGISTRY_KAFKASTORE_CONNECTION_URL: 'zookeeper:2181'

  kafka-connect:
    container_name: kafka-connect
    build:
      context: ./
      dockerfile: Dockerfile-kafka-connect
      args:
        clickHouseVersion: "0.2.4"
        connectTwitterVersion: "0.3.33"
    depends_on:
      - zookeeper
      - schema-registry
      - kafka
    ports:
      - "8083:8083"
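    # Each CONNECT_* variable below is translated by the image's launcher into the
    # matching Kafka Connect worker property (e.g. CONNECT_GROUP_ID -> group.id).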
    environment:
      CONNECT_BOOTSTRAP_SERVERS: 'kafka:29092'
      CONNECT_REST_ADVERTISED_HOST_NAME: kafka-connect
      CONNECT_REST_PORT: 8083
      CONNECT_GROUP_ID: compose-connect-group
      CONNECT_CONFIG_STORAGE_TOPIC: docker-connect-configs
      CONNECT_CONFIG_STORAGE_REPLICATION_FACTOR: 1
      CONNECT_OFFSET_FLUSH_INTERVAL_MS: 10000
      CONNECT_OFFSET_STORAGE_TOPIC: docker-connect-offsets
      CONNECT_OFFSET_STORAGE_REPLICATION_FACTOR: 1
      CONNECT_STATUS_STORAGE_TOPIC: docker-connect-status
      CONNECT_STATUS_STORAGE_REPLICATION_FACTOR: 1
      CONNECT_KEY_CONVERTER: org.apache.kafka.connect.storage.StringConverter
      CONNECT_VALUE_CONVERTER: io.confluent.connect.avro.AvroConverter
      CONNECT_VALUE_CONVERTER_SCHEMA_REGISTRY_URL: http://schema-registry:8081
      CONNECT_INTERNAL_KEY_CONVERTER: "org.apache.kafka.connect.json.JsonConverter"
      CONNECT_INTERNAL_VALUE_CONVERTER: "org.apache.kafka.connect.json.JsonConverter"
      CONNECT_ZOOKEEPER_CONNECT: 'zookeeper:2181'
      CONNECT_PLUGIN_PATH: "/usr/share/java,/usr/share/confluent-hub-components/"
      CONNECT_LOG4J_ROOT_LOGLEVEL: "INFO"
      CONNECT_LOG4J_LOGGERS: org.apache.zookeeper=ERROR,org.I0Itec.zkclient=ERROR,org.reflections=ERROR

  ksqldb-server:
    #image: confluentinc/cp-ksql-server:5.4.1
    image: confluentinc/ksqldb-server:0.8.1
    hostname: ksqldb-server
    container_name: ksqldb-server
    depends_on:
      - zookeeper
      - kafka
      - schema-registry
      - kafka-connect
    ports:
      - "8088:8088"
    environment:
      KSQL_BOOTSTRAP_SERVERS: kafka:29092
      KSQL_LISTENERS: http://0.0.0.0:8088
      KSQL_KSQL_CONNECT_URL: http://kafka-connect:8083
      KSQL_KSQL_SERVICE_ID: ksql-docker
      KSQL_KSQL_SCHEMA_REGISTRY_URL: http://schema-registry:8081
      KSQL_KSQL_EXTENSION_DIR: /var/lib/ksql/extension
    volumes:
      - ./ksql-custom-udfs/target/ksql-custom-udfs-1.0.jar:/var/lib/ksql/extension/ksql-custom-udfs-1.0.jar

  clickhouse:
    image: yandex/clickhouse-server:20.4
    hostname: clickhouse
    container_name: clickhouse
    # ports:
      # Default port for HTTP interfaces (https://clickhouse.yandex/docs/en/interfaces/http_interface/)
      # - "8123:8123"
      # Default port for native client (https://clickhouse.yandex/docs/en/interfaces/tcp/)
      # - "9000:9000"
    # volumes:
      # - /tmp/clickhouse/data:/var/lib/clickhouse
      # - /tmp/clickhouse/log:/var/log/clickhouse-server
      # - /tmp/clickhouse-config.xml:/etc/clickhouse-server/config.xml
    # Zookeeper is required for replicated tables (https://clickhouse.tech/docs/en/operations/table_engines/replication/)
    # depends_on:
    #   - zookeeper

  superset:
    image: amancevice/superset:latest
    hostname: superset
    container_name: superset
    ports:
      - "8080:8088"

--------------------------------------------------------------------------------
/ksql-custom-udfs/pom.xml:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>io.streamthoughts</groupId>
    <artifactId>ksql-custom-udfs</artifactId>
    <version>1.0</version>

    <repositories>
        <repository>
            <id>confluent</id>
            <url>http://packages.confluent.io/maven/</url>
        </repository>
    </repositories>

    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.8.1</version>
                <configuration>
                    <source>1.8</source>
                    <target>1.8</target>
                    <encoding>UTF-8</encoding>
                    <compilerArgs>
                        <arg>-parameters</arg>
                    </compilerArgs>
                </configuration>
            </plugin>
        </plugins>
    </build>

    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    </properties>

    <dependencies>
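        <!-- The ksqlDB UDF API and the Kafka Connect data API are already on the
             ksqldb-server classpath at runtime, hence the 'provided' scope. -->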
        <dependency>
            <groupId>io.confluent.ksql</groupId>
            <artifactId>ksql-common</artifactId>
            <version>5.4.1</version>
            <scope>provided</scope>
        </dependency>

        <dependency>
            <groupId>io.confluent.ksql</groupId>
            <artifactId>ksql-udf</artifactId>
            <version>5.4.1</version>
            <scope>provided</scope>
        </dependency>

        <dependency>
            <groupId>org.apache.kafka</groupId>
            <artifactId>connect-api</artifactId>
            <version>2.4.1</version>
            <scope>provided</scope>
        </dependency>
    </dependencies>
</project>

--------------------------------------------------------------------------------
/ksql-custom-udfs/src/main/java/io/streamthoughts/ksql/udfs/ArrayToString.java:
--------------------------------------------------------------------------------
/*
 * Copyright 2020 StreamThoughts.
 *
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package io.streamthoughts.ksql.udfs;

import io.confluent.ksql.function.udf.Udf;
import io.confluent.ksql.function.udf.UdfDescription;
import io.confluent.ksql.function.udf.UdfParameter;

import java.util.List;
import java.util.stream.Collectors;

@UdfDescription(
    name = "array_to_string",
    description = "Concatenates array elements using supplied delimiter and null string")
public class ArrayToString {

    @Udf
    public String arrayToString(@UdfParameter("array") final List<String> array,
                                @UdfParameter("delimiter") final String delimiter,
                                @UdfParameter("nullString") final String nullString) {
        if (array == null) return nullString;
        return array
            .stream()
            .map(Object::toString)
            .collect(Collectors.joining(delimiter));
    }
}

--------------------------------------------------------------------------------
/ksql-custom-udfs/src/main/java/io/streamthoughts/ksql/udfs/ExtractArrayField.java:
--------------------------------------------------------------------------------
/*
 * Copyright 2020 StreamThoughts.
 *
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package io.streamthoughts.ksql.udfs;

import io.confluent.ksql.function.udf.Udf;
import io.confluent.ksql.function.udf.UdfDescription;
import io.confluent.ksql.function.udf.UdfParameter;
import org.apache.kafka.connect.data.Field;
import org.apache.kafka.connect.data.Struct;

import java.util.List;
import java.util.Objects;
import java.util.stream.Collectors;

@UdfDescription(name = "extract_array_field", description = "Extract a single field from an array of struct")
public class ExtractArrayField {

    @Udf
    @SuppressWarnings("unchecked")
    public List<String> extractArrayField(@UdfParameter(value = "array") final List array,
                                          @UdfParameter(value = "field") final String field) {

        if (array == null) return null;

        final String ufield = field.toUpperCase();

        final List<Struct> structs = (List<Struct>) array;
        return structs.stream()
            .map(record -> {
                final Field sf = record.schema().field(ufield);
                return sf != null ? record.getString(ufield) : null;
            })
            .filter(Objects::nonNull)
            .collect(Collectors.toList());
    }
}

--------------------------------------------------------------------------------
/sql/ch_create_table_tweets.sql:
--------------------------------------------------------------------------------
CREATE TABLE IF NOT EXISTS default.tweets
(
    ID String,
    CREATEDAT DateTime,
    TEXT String,
    LANG String,
    RETWEETED UInt8,
    USERID String,
    USERNAME String,
    USERDESCRIPTION String,
    USERLOCATION String,
    HASHTAGS String,
    MENTIONS String
) ENGINE = MergeTree()
PARTITION BY toYYYYMM(CREATEDAT)
ORDER BY (CREATEDAT, LANG);

--------------------------------------------------------------------------------
/sql/ksql_create_connector_sink_jdbc_clickhouse.sql:
--------------------------------------------------------------------------------
CREATE SINK CONNECTOR clickhousejdbcconnector WITH (
    'connector.class'='io.confluent.connect.jdbc.JdbcSinkConnector',
    'topics'='tweets-normalized',
    'tasks.max'='1',
    'connection.url'='jdbc:clickhouse://clickhouse:8123/default',
    'table.name.format'='tweets'
);

--------------------------------------------------------------------------------
/sql/ksql_create_connector_source_twitter.sql:
--------------------------------------------------------------------------------
CREATE SOURCE CONNECTOR tweeterconnector WITH (
    'connector.class'='com.github.jcustenborder.kafka.connect.twitter.TwitterSourceConnector',
    'twitter.oauth.accessTokenSecret'='%OAUTH_ACCESS_TOKEN_SECRET%',
    'twitter.oauth.consumerSecret'='%OAUTH_CONSUMER_SECRET%',
    'twitter.oauth.accessToken'='%OAUTH_ACCESS_TOKEN%',
    'twitter.oauth.consumerKey'='%OAUTH_CONSUMER_KEY%',
    'kafka.status.topic'='tweets',
    'process.deletes'=false,
    'filter.keywords'='coronavirus,2019nCoV,SARSCoV2,covid19,cov19'
);

--------------------------------------------------------------------------------
/sql/ksql_create_stream_tweets.sql:
--------------------------------------------------------------------------------
CREATE STREAM tweets WITH (KAFKA_TOPIC = 'tweets', VALUE_FORMAT='AVRO');

--------------------------------------------------------------------------------
/sql/ksql_create_stream_tweets_json.sql:
--------------------------------------------------------------------------------
CREATE STREAM TWEETS_NORMALIZED_JSON
WITH (KAFKA_TOPIC='tweets-normalized-json', VALUE_FORMAT='JSON')
AS SELECT * FROM TWEETS_NORMALIZED;

--------------------------------------------------------------------------------
/sql/ksql_create_stream_tweets_normalized.sql:
--------------------------------------------------------------------------------
CREATE STREAM TWEETS_NORMALIZED
WITH (kafka_topic = 'tweets-normalized') AS
SELECT
    Id,
    CreatedAt / 1000 as CreatedAt,
    Text,
    Lang,
    Retweeted,
    User->Id as UserId,
    User->Name as UserName,
    IFNULL(User->Description, '') as UserDescription,
    IFNULL(User->Location, '') as UserLocation,
    ARRAY_TO_STRING(EXTRACT_ARRAY_FIELD(UserMentionEntities, 'Name'), ',', '') as Mentions,
    ARRAY_TO_STRING(EXTRACT_ARRAY_FIELD(HashtagEntities, 'Text'), ',', '') as Hashtags
FROM tweets EMIT CHANGES;

--------------------------------------------------------------------------------
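
Once `demo-start.sh` completes, a quick end-to-end check is to sample the normalized stream from the ksqlDB CLI and then count the rows landed in ClickHouse. A minimal sketch of such a session (column names match the stream definition above; `EMIT CHANGES LIMIT` assumes the ksqlDB 0.8.1 image used in `docker-compose.yml`):

```sql
-- In the ksqlDB CLI (docker exec -it ksqldb-server ksql):
SET 'auto.offset.reset' = 'earliest';
SELECT USERNAME, LANG, HASHTAGS FROM TWEETS_NORMALIZED EMIT CHANGES LIMIT 5;

-- In the ClickHouse client (./clickhouse-client.sh):
SELECT COUNT(*) FROM default.tweets;
```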