├── .gitignore
├── Dockerfile-kafka-connect
├── README.md
├── clickhouse-client.sh
├── demo-start.sh
├── demo-stop.sh
├── docker-compose.yml
├── ksql-custom-udfs
│   ├── pom.xml
│   └── src
│       └── main
│           └── java
│               └── io
│                   └── streamthoughts
│                       └── ksql
│                           └── udfs
│                               ├── ArrayToString.java
│                               └── ExtractArrayField.java
└── sql
    ├── ch_create_table_tweets.sql
    ├── ksql_create_connector_sink_jdbc_clickhouse.sql
    ├── ksql_create_connector_source_twitter.sql
    ├── ksql_create_stream_tweets.sql
    ├── ksql_create_stream_tweets_json.sql
    └── ksql_create_stream_tweets_normalized.sql
/.gitignore:
--------------------------------------------------------------------------------
### Scala template
*.class
*.log

# sbt specific
.cache
.history
.lib/
dist/*
target/
lib_managed/
src_managed/
project/boot/
project/plugins/project/

# Scala-IDE specific
.scala_dependencies
.worksheet
### Java template
*.class

# Mobile Tools for Java (J2ME)
.mtj.tmp/

# Package Files #
*.jar
*.war
*.ear

# virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml
hs_err_pid*
### Maven template
target/
pom.xml.tag
pom.xml.releaseBackup
pom.xml.versionsBackup
pom.xml.next
release.properties
dependency-reduced-pom.xml
buildNumber.properties
.mvn/timing.properties
### Eclipse template
*.pydevproject
.metadata
.gradle
tmp/
*.tmp
*.bak
*.swp
*~.nib
local.properties
.settings/
.loadpath

# Eclipse Core
.project

# External tool builders
.externalToolBuilders/

# Locally stored "Eclipse launch configurations"
*.launch

# CDT-specific
.cproject

# JDT-specific (Eclipse Java Development Tools)
.classpath

# Java annotation processor (APT)
.factorypath

# PDT-specific
.buildpath

# sbteclipse plugin
.target

# TeXlipse plugin
.texlipse
### JetBrains template
# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio

*.iml

## Directory-based project format:
.idea/
# if you remove the above rule, at least ignore the following:

# User-specific stuff:
# .idea/workspace.xml
# .idea/tasks.xml
# .idea/dictionaries

# Sensitive or high-churn files:
# .idea/dataSources.ids
# .idea/dataSources.xml
# .idea/sqlDataSources.xml
# .idea/dynamic.xml
# .idea/uiDesigner.xml

# Gradle:
# .idea/gradle.xml
# .idea/libraries

# Mongo Explorer plugin:
# .idea/mongoSettings.xml

## File-based project format:
*.ipr
*.iws

## Plugin-specific files:

# IntelliJ
/out/

# mpeltonen/sbt-idea plugin
.idea_modules/

# JIRA plugin
atlassian-ide-plugin.xml

# Crashlytics plugin (for Android Studio and IntelliJ)
com_crashlytics_export_strings.xml
crashlytics.properties
crashlytics-build.properties

# Created by .ignore support plugin (hsz.mobi)
azkarra-*/docs
azkarra-server/src/main/resources/io/
*/node_modules
site/resources
--------------------------------------------------------------------------------
/Dockerfile-kafka-connect:
--------------------------------------------------------------------------------
FROM confluentinc/cp-kafka-connect-base:5.4.1

ARG clickHouseVersion
ARG connectTwitterVersion

# The ClickHouse JDBC driver is installed into the lib directory of the Confluent
# JDBC connector so that Kafka Connect loads it alongside the connector plugin.
ENV JDBC_DRIVER_PATH=/usr/share/confluent-hub-components/confluentinc-kafka-connect-jdbc/lib
ENV JDBC_DRIVER=clickhouse-jdbc-$clickHouseVersion.jar

RUN confluent-hub install --no-prompt jcustenborder/kafka-connect-twitter:$connectTwitterVersion
RUN confluent-hub install --no-prompt confluentinc/kafka-connect-jdbc:5.4.1

RUN mkdir -p $JDBC_DRIVER_PATH && \
    echo "Downloading JDBC Driver for ClickHouse v$clickHouseVersion" && \
    wget -O ${JDBC_DRIVER_PATH}/${JDBC_DRIVER} https://github.com/ClickHouse/clickhouse-jdbc/releases/download/release_$clickHouseVersion/${JDBC_DRIVER}
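
# A sketch of a standalone build (the image tag is illustrative), using the same
# build arguments that docker-compose.yml passes in:
#
#   docker build -f Dockerfile-kafka-connect \
#     --build-arg clickHouseVersion=0.2.4 \
#     --build-arg connectTwitterVersion=0.3.33 \
#     -t demo-kafka-connect .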
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# PoC: Fast Data Analytics Platform with ClickHouse and Apache Kafka


## Prerequisites

* Git
* Maven (we recommend version 3.5.3)
* Java 11
* Docker, Docker-compose

## Project Tree

```
├── clickhouse-client.sh        // Utility to start a ClickHouse client
├── demo-start.sh               // Utility script to start the project
├── demo-stop.sh                // Utility script to stop the project
├── docker-compose.yml
├── Dockerfile-kafka-connect
├── ksql-custom-udfs            // Maven project that contains User Defined Functions (UDFs) for ksqlDB
│   ├── pom.xml
│   └── src
│       └── main
│           └── java
│               └── io
│                   └── streamthoughts
│                       └── ksql
│                           └── udfs
│                               ├── ArrayToString.java
│                               └── ExtractArrayField.java
├── README.md
└── sql                         // ksqlDB and ClickHouse queries
    ├── ch_create_table_tweets.sql
    ├── ksql_create_connector_sink_jdbc_clickhouse.sql
    ├── ksql_create_connector_source_twitter.sql
    ├── ksql_create_stream_tweets_json.sql
    ├── ksql_create_stream_tweets_normalized.sql
    └── ksql_create_stream_tweets.sql
```

## Twitter API OAuth

To run this demo project, you need credentials for the [Twitter API](https://developer.twitter.com/en/docs/basics/authentication/oauth-1-0a).

* Edit the file `./sql/ksql_create_connector_source_twitter.sql` to set your Twitter API credentials:

```sql
CREATE SOURCE CONNECTOR tweeterconnector WITH (
    'connector.class'='com.github.jcustenborder.kafka.connect.twitter.TwitterSourceConnector',
    'twitter.oauth.accessTokenSecret'='%OAUTH_ACCESS_TOKEN_SECRET%',
    'twitter.oauth.consumerSecret'='%OAUTH_CONSUMER_SECRET%',
    'twitter.oauth.accessToken'='%OAUTH_ACCESS_TOKEN%',
    'twitter.oauth.consumerKey'='%OAUTH_CONSUMER_KEY%',
    'kafka.status.topic'='tweets',
    'process.deletes'=false,
    'filter.keywords'='coronavirus,2019nCoV,SARSCoV2,covid19,cov19'
);
```
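
Optionally, instead of editing the file by hand, the placeholders can be substituted at start time; a minimal sketch, assuming you export the (illustrative) `TWITTER_*` environment variables:

```bash
$ sed -i -e "s/%OAUTH_ACCESS_TOKEN_SECRET%/$TWITTER_ACCESS_TOKEN_SECRET/" \
         -e "s/%OAUTH_CONSUMER_SECRET%/$TWITTER_CONSUMER_SECRET/" \
         -e "s/%OAUTH_ACCESS_TOKEN%/$TWITTER_ACCESS_TOKEN/" \
         -e "s/%OAUTH_CONSUMER_KEY%/$TWITTER_CONSUMER_KEY/" \
         sql/ksql_create_connector_source_twitter.sql
```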

## Starting Project

**Start demonstration**

```bash
$ ./demo-start.sh
```
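
Once the containers are up, you can sanity-check the pipeline (a sketch; service names, ports, and topics come from `docker-compose.yml` and the scripts above):

```bash
# List the connectors registered with Kafka Connect (REST API on port 8083)
$ curl -s http://localhost:8083/connectors

# Consume a few raw Avro records from the 'tweets' topic
$ docker exec -it schema-registry kafka-avro-console-consumer \
    --bootstrap-server kafka:29092 \
    --property schema.registry.url=http://schema-registry:8081 \
    --topic tweets --from-beginning --max-messages 5
```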

**Example ClickHouse SQL query:**

```bash
$ ./clickhouse-client.sh
$ docker exec -it clickhouse bash -c "clickhouse-client -q 'SELECT COUNT(*) AS COUNT, LANG FROM tweets GROUP BY LANG ORDER BY (COUNT) DESC LIMIT 10;'"
```
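
Other aggregations can be run the same way; for example, a per-hour tweet count (a sketch based on the schema in `sql/ch_create_table_tweets.sql`):

```bash
$ docker exec -it clickhouse bash -c "clickhouse-client -q 'SELECT toStartOfHour(CREATEDAT) AS HOUR, COUNT(*) AS TWEETS FROM tweets GROUP BY HOUR ORDER BY HOUR DESC LIMIT 24;'"
```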

**Stopping**

```bash
$ ./demo-stop.sh
```
--------------------------------------------------------------------------------
/clickhouse-client.sh:
--------------------------------------------------------------------------------
#!/bin/bash

set -e

docker exec -it clickhouse bash -c "clickhouse-client --multiline"
--------------------------------------------------------------------------------
/demo-start.sh:
--------------------------------------------------------------------------------
#!/bin/bash

set -e


# Functions
function exec_clickhouse_query() {
  SQL=$1
  echo -e "\n🚀 Executing ClickHouse query\n"
  echo "${SQL}"
  docker exec -it clickhouse bash -c "echo \"$SQL\" | clickhouse-client --multiline"
  echo "----------------------------------------------"
}

function exec_ksql_query() {
  echo -e "\n🚀 Executing KSQL query\n"
  KSQL_QUERY=$1
  docker exec -it ksqldb-server bash -c "echo \"$KSQL_QUERY\" | ksql"
}

# Main
echo "--------------------------------------------------------------------------------"
echo "--- Demo: Fast Analytics Platform with Kafka, ClickHouse & Superset ---"
echo "--------------------------------------------------------------------------------"

export COMPOSE_PROJECT_NAME=demo-twitter-streams

echo -e "\n🐳 Stopping all previously started Docker containers"
docker-compose down -v

echo -e "\n🏭 Building Maven project (cd ksql-custom-udfs; mvn clean -q package)\n"
(cd ksql-custom-udfs; mvn clean -q package)

echo -e "\n🐳 Starting all docker containers"
docker-compose up -d

KAFKA_CONTAINER_NAME=kafka

echo -e "\n⏳ Waiting for Kafka Broker to be up and running"
while true
do
  if [ $(docker logs $KAFKA_CONTAINER_NAME 2>&1 | grep "started (kafka.server.KafkaServer)" >/dev/null; echo $?) -eq 0 ]; then
    echo
    break
  fi
  printf "."
  sleep 1
done;

echo -e "\n⏳ Creating Kafka topics"
docker exec -it kafka kafka-topics --zookeeper zookeeper:2181 --create --topic tweets --partitions 4 --replication-factor 1
docker exec -it kafka kafka-topics --zookeeper zookeeper:2181 --create --topic tweets-normalized --partitions 4 --replication-factor 1

exec_clickhouse_query "$(cat ./sql/ch_create_table_tweets.sql)"

echo -e "\n⏳ Waiting for Kafka Connect to be up and running."
while true
do
  res=$(curl -sI http://localhost:8083 | head -n 1 | cut -d' ' -f2)
  if [ "$res" == "200" ]; then
    echo
    break
  fi
  printf "."
  sleep 1
done;

echo -e "\n⏳ Waiting for KSQL to be available before launching CLI\n"
while [ $(curl -s -o /dev/null -w %{http_code} http://localhost:8088/) -eq 000 ]
do
  echo -e $(date) "KSQL Server HTTP state: " $(curl -s -o /dev/null -w %{http_code} http://localhost:8088/) " (waiting for 200)"
  sleep 5
done

echo -e "\n⏳ Starting JdbcSinkConnector for ClickHouse\n"

exec_ksql_query "$(cat ./sql/ksql_create_connector_sink_jdbc_clickhouse.sql)"

sleep 5

echo -e "\n⏳ Starting TwitterSourceConnector\n"
exec_ksql_query "$(cat ./sql/ksql_create_connector_source_twitter.sql)"

exec_ksql_query "SET 'auto.offset.reset' = 'earliest';"
exec_ksql_query "$(cat ./sql/ksql_create_stream_tweets.sql)"
exec_ksql_query "$(cat ./sql/ksql_create_stream_tweets_normalized.sql)"
exec_ksql_query "$(cat ./sql/ksql_create_stream_tweets_json.sql)"

exit 0
--------------------------------------------------------------------------------
/demo-stop.sh:
--------------------------------------------------------------------------------
#!/bin/bash

set -e

export COMPOSE_PROJECT_NAME=demo-twitter-streams

echo -e "\n🐳 Stopping all previously started Docker containers"
docker-compose down -v

exit 0
--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
#
# Copyright 2020 StreamThoughts.
#
version: '3'
services:
  zookeeper:
    image: confluentinc/cp-zookeeper:5.4.1
    hostname: zookeeper
    container_name: zookeeper
    ports:
      - "2181:2181"
    environment:
      ZOOKEEPER_CLIENT_PORT: 2181
      ZOOKEEPER_TICK_TIME: 2000

  kafka:
    image: confluentinc/cp-kafka:5.4.1
    hostname: kafka
    container_name: kafka
    depends_on:
      - zookeeper
    ports:
      - "29092:29092"
      - "9092:9092"
    environment:
      KAFKA_BROKER_ID: 1
      KAFKA_ZOOKEEPER_CONNECT: 'zookeeper:2181'
      KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: PLAINTEXT:PLAINTEXT,PLAINTEXT_HOST:PLAINTEXT
      KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://kafka:29092,PLAINTEXT_HOST://localhost:9092
      KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1
      KAFKA_GROUP_INITIAL_REBALANCE_DELAY_MS: 0
      KAFKA_TRANSACTION_STATE_LOG_REPLICATION_FACTOR: 1
      KAFKA_TRANSACTION_STATE_LOG_MIN_ISR: 1
      CONFLUENT_METRICS_REPORTER_BOOTSTRAP_SERVERS: kafka:29092
      CONFLUENT_METRICS_REPORTER_ZOOKEEPER_CONNECT: zookeeper:2181
      CONFLUENT_METRICS_REPORTER_TOPIC_REPLICAS: 1
      CONFLUENT_METRICS_ENABLE: 'false'

  schema-registry:
    image: confluentinc/cp-schema-registry:5.4.1
    hostname: schema-registry
    container_name: schema-registry
    depends_on:
      - zookeeper
      - kafka
    ports:
      - "8081:8081"
    environment:
      SCHEMA_REGISTRY_HOST_NAME: schema-registry
      SCHEMA_REGISTRY_KAFKASTORE_CONNECTION_URL: 'zookeeper:2181'

  kafka-connect:
    container_name: kafka-connect
    build:
      context: ./
      dockerfile: Dockerfile-kafka-connect
      args:
        clickHouseVersion: "0.2.4"
        connectTwitterVersion: "0.3.33"
    depends_on:
      - zookeeper
      - schema-registry
      - kafka
    ports:
      - "8083:8083"
    environment:
      CONNECT_BOOTSTRAP_SERVERS: 'kafka:29092'
      CONNECT_REST_ADVERTISED_HOST_NAME: connect
      CONNECT_REST_PORT: 8083
      CONNECT_GROUP_ID: compose-connect-group
      CONNECT_CONFIG_STORAGE_TOPIC: docker-connect-configs
      CONNECT_CONFIG_STORAGE_REPLICATION_FACTOR: 1
      CONNECT_OFFSET_FLUSH_INTERVAL_MS: 10000
      CONNECT_OFFSET_STORAGE_TOPIC: docker-connect-offsets
      CONNECT_OFFSET_STORAGE_REPLICATION_FACTOR: 1
      CONNECT_STATUS_STORAGE_TOPIC: docker-connect-status
      CONNECT_STATUS_STORAGE_REPLICATION_FACTOR: 1
      CONNECT_KEY_CONVERTER: org.apache.kafka.connect.storage.StringConverter
      CONNECT_VALUE_CONVERTER: io.confluent.connect.avro.AvroConverter
      CONNECT_VALUE_CONVERTER_SCHEMA_REGISTRY_URL: http://schema-registry:8081
      CONNECT_INTERNAL_KEY_CONVERTER: "org.apache.kafka.connect.json.JsonConverter"
      CONNECT_INTERNAL_VALUE_CONVERTER: "org.apache.kafka.connect.json.JsonConverter"
      CONNECT_ZOOKEEPER_CONNECT: 'zookeeper:2181'
      CONNECT_PLUGIN_PATH: "/usr/share/java,/usr/share/confluent-hub-components/"
      CONNECT_LOG4J_ROOT_LOGLEVEL: "INFO"
      CONNECT_LOG4J_LOGGERS: org.apache.zookeeper=ERROR,org.I0Itec.zkclient=ERROR,org.reflections=ERROR

  ksqldb-server:
    #image: confluentinc/cp-ksql-server:5.4.1
    image: confluentinc/ksqldb-server:0.8.1
    hostname: ksqldb-server
    container_name: ksqldb-server
    depends_on:
      - zookeeper
      - kafka
      - schema-registry
      - kafka-connect
    ports:
      - "8088:8088"
    environment:
      KSQL_BOOTSTRAP_SERVERS: kafka:29092
      KSQL_LISTENERS: http://0.0.0.0:8088
      KSQL_KSQL_CONNECT_URL: http://kafka-connect:8083
      KSQL_KSQL_SERVICE_ID: ksql-docker
      KSQL_KSQL_SCHEMA_REGISTRY_URL: http://schema-registry:8081
      KSQL_KSQL_EXTENSION_DIR: /var/lib/ksql/extension
    volumes:
      - ./ksql-custom-udfs/target/ksql-custom-udfs-1.0.jar:/var/lib/ksql/extension/ksql-custom-udfs-1.0.jar

  clickhouse:
    image: yandex/clickhouse-server:20.4
    hostname: clickhouse
    container_name: clickhouse
    # ports:
    #   # Default port for the HTTP interface (https://clickhouse.yandex/docs/en/interfaces/http_interface/)
    #   - "8123:8123"
    #   # Default port for the native client (https://clickhouse.yandex/docs/en/interfaces/tcp/)
    #   - "9000:9000"
    # volumes:
    #   - /tmp/clickhouse/data:/var/lib/clickhouse
    #   - /tmp/clickhouse/log:/var/log/clickhouse-server
    #   - /tmp/clickhouse-config.xml:/etc/clickhouse-server/config.xml
    # ZooKeeper is required for replicated tables (https://clickhouse.tech/docs/en/operations/table_engines/replication/)
    # depends_on:
    #   - zookeeper

  superset:
    image: amancevice/superset:latest
    hostname: superset
    container_name: superset
    ports:
      - "8080:8088"
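# Note: the Superset UI is published on the host at http://localhost:8080,
# mapped from container port 8088 to avoid clashing with ksqldb-server's 8088.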
--------------------------------------------------------------------------------
/ksql-custom-udfs/pom.xml:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>io.streamthoughts</groupId>
    <artifactId>ksql-custom-udfs</artifactId>
    <version>1.0</version>

    <repositories>
        <repository>
            <id>confluent</id>
            <url>http://packages.confluent.io/maven/</url>
        </repository>
    </repositories>

    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.8.1</version>
                <configuration>
                    <source>8</source>
                    <target>8</target>
                    <encoding>UTF-8</encoding>
                    <compilerArgs>
                        <!-- -parameters preserves method parameter names, which KSQL
                             can read when registering UDF parameters. -->
                        <arg>-parameters</arg>
                    </compilerArgs>
                </configuration>
            </plugin>
        </plugins>
    </build>

    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    </properties>

    <dependencies>
        <dependency>
            <groupId>io.confluent.ksql</groupId>
            <artifactId>ksql-common</artifactId>
            <version>5.4.1</version>
            <scope>provided</scope>
        </dependency>

        <dependency>
            <groupId>io.confluent.ksql</groupId>
            <artifactId>ksql-udf</artifactId>
            <version>5.4.1</version>
            <scope>provided</scope>
        </dependency>

        <dependency>
            <groupId>org.apache.kafka</groupId>
            <artifactId>connect-api</artifactId>
            <version>2.4.1</version>
            <scope>provided</scope>
        </dependency>
    </dependencies>
</project>
--------------------------------------------------------------------------------
/ksql-custom-udfs/src/main/java/io/streamthoughts/ksql/udfs/ArrayToString.java:
--------------------------------------------------------------------------------
/*
 * Copyright 2020 StreamThoughts.
 *
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package io.streamthoughts.ksql.udfs;

import io.confluent.ksql.function.udf.Udf;
import io.confluent.ksql.function.udf.UdfDescription;
import io.confluent.ksql.function.udf.UdfParameter;

import java.util.List;
import java.util.stream.Collectors;

@UdfDescription(
    name = "array_to_string",
    description = "Concatenates array elements using the supplied delimiter and null string")
public class ArrayToString {

    @Udf
    @SuppressWarnings("unchecked")
    public String arrayToString(@UdfParameter("array") final List array,
                                @UdfParameter("delimiter") final String delimiter,
                                @UdfParameter("nullString") final String nullString) {
        // A null array is rendered as the supplied null string.
        if (array == null) return nullString;
        final List<Object> elements = (List<Object>) array;
        // Substitute the null string for null elements so that joining never NPEs.
        return elements
            .stream()
            .map(o -> o == null ? nullString : o.toString())
            .collect(Collectors.joining(delimiter));
    }
}
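// Illustrative ksqlDB usage (see sql/ksql_create_stream_tweets_normalized.sql):
//   ARRAY_TO_STRING(EXTRACT_ARRAY_FIELD(HashtagEntities, 'Text'), ',', '')
// e.g. an array ['a', 'b'] is rendered as the string 'a,b'.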
--------------------------------------------------------------------------------
/ksql-custom-udfs/src/main/java/io/streamthoughts/ksql/udfs/ExtractArrayField.java:
--------------------------------------------------------------------------------
/*
 * Copyright 2020 StreamThoughts.
 *
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package io.streamthoughts.ksql.udfs;

import io.confluent.ksql.function.udf.Udf;
import io.confluent.ksql.function.udf.UdfDescription;
import io.confluent.ksql.function.udf.UdfParameter;
import org.apache.kafka.connect.data.Field;
import org.apache.kafka.connect.data.Struct;

import java.util.List;
import java.util.Objects;
import java.util.stream.Collectors;

@UdfDescription(name = "extract_array_field", description = "Extracts a single field from an array of structs")
public class ExtractArrayField {

    @Udf
    @SuppressWarnings("unchecked")
    public List extractArrayField(@UdfParameter(value = "array") final List array,
                                  @UdfParameter(value = "field") final String field) {

        if (array == null) return null;

        // ksqlDB upper-cases identifiers, so look the field up by its upper-case name.
        final String ufield = field.toUpperCase();

        final List<Struct> structs = (List<Struct>) array;
        return structs.stream()
            .map(record -> {
                // Skip structs whose schema does not define the requested field.
                final Field sf = record.schema().field(ufield);
                return sf != null ? record.getString(ufield) : null;
            })
            .filter(Objects::nonNull)
            .collect(Collectors.toList());
    }
}
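// Illustrative ksqlDB usage (see sql/ksql_create_stream_tweets_normalized.sql):
//   EXTRACT_ARRAY_FIELD(UserMentionEntities, 'Name')
// returns the 'Name' string of each struct in the array, dropping null entries.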
--------------------------------------------------------------------------------
/sql/ch_create_table_tweets.sql:
--------------------------------------------------------------------------------
CREATE TABLE IF NOT EXISTS default.tweets
(
    ID String,
    CREATEDAT DateTime,
    TEXT String,
    LANG String,
    RETWEETED UInt8,
    USERID String,
    USERNAME String,
    USERDESCRIPTION String,
    USERLOCATION String,
    HASHTAGS String,
    MENTIONS String
) ENGINE = MergeTree()
PARTITION BY toYYYYMM(CREATEDAT)
ORDER BY (CREATEDAT, LANG);
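-- Note: RETWEETED is stored as UInt8 because ClickHouse has no native BOOLEAN
-- type; boolean values are conventionally represented as 0/1.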
--------------------------------------------------------------------------------
/sql/ksql_create_connector_sink_jdbc_clickhouse.sql:
--------------------------------------------------------------------------------
CREATE SINK CONNECTOR clickhousejdbcconnector WITH (
    'connector.class'='io.confluent.connect.jdbc.JdbcSinkConnector',
    'topics'='tweets-normalized',
    'tasks.max'='1',
    'connection.url'='jdbc:clickhouse://clickhouse:8123/default',
    'table.name.format'='tweets'
);
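-- Once created, the connector can be inspected from the ksql CLI with:
--   SHOW CONNECTORS;
--   DESCRIBE CONNECTOR clickhousejdbcconnector;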
--------------------------------------------------------------------------------
/sql/ksql_create_connector_source_twitter.sql:
--------------------------------------------------------------------------------
CREATE SOURCE CONNECTOR tweeterconnector WITH (
    'connector.class'='com.github.jcustenborder.kafka.connect.twitter.TwitterSourceConnector',
    'twitter.oauth.accessTokenSecret'='%OAUTH_ACCESS_TOKEN_SECRET%',
    'twitter.oauth.consumerSecret'='%OAUTH_CONSUMER_SECRET%',
    'twitter.oauth.accessToken'='%OAUTH_ACCESS_TOKEN%',
    'twitter.oauth.consumerKey'='%OAUTH_CONSUMER_KEY%',
    'kafka.status.topic'='tweets',
    'process.deletes'=false,
    'filter.keywords'='coronavirus,2019nCoV,SARSCoV2,covid19,cov19'
);
--------------------------------------------------------------------------------
/sql/ksql_create_stream_tweets.sql:
--------------------------------------------------------------------------------
CREATE STREAM tweets WITH (KAFKA_TOPIC = 'tweets', VALUE_FORMAT = 'AVRO');
--------------------------------------------------------------------------------
/sql/ksql_create_stream_tweets_json.sql:
--------------------------------------------------------------------------------
CREATE STREAM TWEETS_NORMALIZED_JSON
    WITH (KAFKA_TOPIC = 'tweets-normalized-json', VALUE_FORMAT = 'JSON')
    AS SELECT * FROM TWEETS_NORMALIZED;
--------------------------------------------------------------------------------
/sql/ksql_create_stream_tweets_normalized.sql:
--------------------------------------------------------------------------------
CREATE STREAM TWEETS_NORMALIZED
    WITH (KAFKA_TOPIC = 'tweets-normalized') AS
    SELECT
        Id,
        CreatedAt / 1000 AS CreatedAt,
        Text,
        Lang,
        Retweeted,
        User->Id AS UserId,
        User->Name AS UserName,
        IFNULL(User->Description, '') AS UserDescription,
        IFNULL(User->Location, '') AS UserLocation,
        ARRAY_TO_STRING(EXTRACT_ARRAY_FIELD(UserMentionEntities, 'Name'), ',', '') AS Mentions,
        ARRAY_TO_STRING(EXTRACT_ARRAY_FIELD(HashtagEntities, 'Text'), ',', '') AS Hashtags
    FROM tweets EMIT CHANGES;
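-- Note: CreatedAt is divided by 1000 to convert the source's epoch-milliseconds
-- into the epoch-seconds expected by the ClickHouse DateTime column; the two
-- custom UDFs flatten nested arrays of structs into comma-separated strings.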
--------------------------------------------------------------------------------